@zack/web-page-extract
Fetch a web page and return cleaned content, metadata, and discovered links.
Install
agentpm install @zack/web-page-extract@0.1.1
Load
import { load } from '@agentpm/sdk';
const t = await load('@zack/web-page-extract@0.1.1');

from agentpm import load
t = load("@zack/web-page-extract@0.1.1")
Weekly downloads
1
0%
Last publish
1d ago
v0.1.1
agent.json
{
"name": "web-page-extract",
"version": "0.1.1",
"description": "Fetch a web page and return cleaned content, metadata, and discovered links.",
"files": [
"dist/"
],
"entrypoint": {
"args": [
"dist/index.js"
],
"command": "node",
"timeout_ms": 30000
},
"inputs": {
"type": "object",
"required": [
"url"
],
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "HTTP(S) URL to fetch and extract."
},
"format": {
"enum": [
"markdown",
"text"
],
"type": "string",
"default": "markdown",
"description": "Preferred output format for the extracted page body."
},
"max_chars": {
"type": "integer",
"default": 40000,
"maximum": 200000,
"minimum": 1,
"description": "Maximum number of characters to return in the cleaned content."
},
"timeout_ms": {
"type": "integer",
"default": 15000,
"maximum": 120000,
"minimum": 1,
"description": "Fetch timeout in milliseconds."
},
"include_links": {
"type": "boolean",
"default": true,
"description": "Whether to include normalized links discovered in the page body."
}
},
"additionalProperties": false
},
"outputs": {
"oneOf": [
{
"type": "object",
"required": [
"ok",
"url",
"final_url",
"format",
"content",
"metadata"
],
"properties": {
"ok": {
"const": true,
"description": "True when the page was fetched and parsed successfully."
},
"url": {
"type": "string",
"description": "Original URL requested by the caller."
},
"links": {
"type": "array",
"items": {
"type": "object",
"required": [
"href"
],
"properties": {
"href": {
"type": "string",
"description": "Absolute URL for the discovered link."
},
"text": {
"type": "string",
"description": "Visible link text associated with the discovered URL, when available."
}
},
"additionalProperties": false
},
"description": "Normalized links discovered in the page body when include_links is true."
},
"title": {
"type": "string",
"description": "Best-effort page title derived from title tags or social metadata."
},
"byline": {
"type": "string",
"description": "Best-effort author or byline extracted from page metadata."
},
"format": {
"type": "string",
"description": "Output format used for the cleaned page body."
},
"content": {
"type": "string",
"description": "Cleaned main page content in the requested format."
},
"excerpt": {
"type": "string",
"description": "Short summary or description extracted from page metadata."
},
"metadata": {
"type": "object",
"description": "Additional extraction metadata such as truncation state and source size.",
"additionalProperties": true
},
"final_url": {
"type": "string",
"description": "Final URL after any redirects applied by the fetch layer."
},
"published_at": {
"type": "string",
"description": "Best-effort publish timestamp extracted from page metadata."
},
"canonical_url": {
"type": "string",
"description": "Canonical URL declared by the page, resolved against the fetched URL when present."
}
},
"additionalProperties": false
},
{
"type": "object",
"required": [
"ok",
"error"
],
"properties": {
"ok": {
"const": false,
"description": "False when the request or extraction failed."
},
"error": {
"type": "object",
"required": [
"code",
"message"
],
"properties": {
"code": {
"type": "string",
"description": "Stable machine-readable error code."
},
"details": {
"type": "object",
"description": "Optional structured context about the failure.",
"additionalProperties": true
},
"message": {
"type": "string",
"description": "Human-readable explanation of the failure."
}
},
"description": "Structured error returned by the tool.",
"additionalProperties": true
}
},
"additionalProperties": false
}
]
},
"readme": "README.md",
"license": {
"spdx": "MIT"
},
"runtime": {
"type": "node",
"version": "20"
}
}
Compatibility
Node, Python