AgentPM™

@zack/web-page-extract

Fetch a web page and return cleaned content, metadata, and discovered links.

Install
agentpm install @zack/web-page-extract@0.1.1
Load
Weekly downloads
1
0%
Last publish
1d ago
v0.1.1
agent.json
{
  "name": "web-page-extract",
  "version": "0.1.1",
  "description": "Fetch a web page and return cleaned content, metadata, and discovered links.",
  "files": [
    "dist/"
  ],
  "entrypoint": {
    "args": [
      "dist/index.js"
    ],
    "command": "node",
    "timeout_ms": 30000
  },
  "inputs": {
    "type": "object",
    "required": [
      "url"
    ],
    "properties": {
      "url": {
        "type": "string",
        "format": "uri",
        "description": "HTTP(S) URL to fetch and extract."
      },
      "format": {
        "enum": [
          "markdown",
          "text"
        ],
        "type": "string",
        "default": "markdown",
        "description": "Preferred output format for the extracted page body."
      },
      "max_chars": {
        "type": "integer",
        "default": 40000,
        "maximum": 200000,
        "minimum": 1,
        "description": "Maximum number of characters to return in the cleaned content."
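      },
      "timeout_ms": {
        "type": "integer",
        "default": 15000,
        "maximum": 120000,
        "minimum": 1,
        "description": "Fetch timeout in milliseconds. Values above the entrypoint timeout_ms (30000) may never be reached if the runner stops the process first."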
      },
      "include_links": {
        "type": "boolean",
        "default": true,
        "description": "Whether to include normalized links discovered in the page body."
      }
    },
    "additionalProperties": false
  },
  "outputs": {
    "oneOf": [
      {
        "type": "object",
        "required": [
          "ok",
          "url",
          "final_url",
          "format",
          "content",
          "metadata"
        ],
        "properties": {
          "ok": {
            "const": true,
            "description": "True when the page was fetched and parsed successfully."
          },
          "url": {
            "type": "string",
            "description": "Original URL requested by the caller."
          },
          "links": {
            "type": "array",
            "items": {
              "type": "object",
              "required": [
                "href"
              ],
              "properties": {
                "href": {
                  "type": "string",
                  "description": "Absolute URL for the discovered link."
                },
                "text": {
                  "type": "string",
                  "description": "Visible link text associated with the discovered URL, when available."
                }
              },
              "additionalProperties": false
            },
            "description": "Normalized links discovered in the page body when include_links is true."
          },
          "title": {
            "type": "string",
            "description": "Best-effort page title derived from title tags or social metadata."
          },
          "byline": {
            "type": "string",
            "description": "Best-effort author or byline extracted from page metadata."
          },
          "format": {
            "type": "string",
            "description": "Output format used for the cleaned page body (markdown or text, matching the format input)."
          },
          "content": {
            "type": "string",
            "description": "Cleaned main page content in the requested format."
          },
          "excerpt": {
            "type": "string",
            "description": "Short summary or description extracted from page metadata."
          },
          "metadata": {
            "type": "object",
            "description": "Additional extraction metadata such as truncation state and source size.",
            "additionalProperties": true
          },
          "final_url": {
            "type": "string",
            "description": "Final URL after any redirects applied by the fetch layer."
          },
          "published_at": {
            "type": "string",
            "description": "Best-effort publish timestamp extracted from page metadata."
          },
          "canonical_url": {
            "type": "string",
            "description": "Canonical URL declared by the page, resolved against the fetched URL when present."
          }
        },
        "additionalProperties": false
      },
      {
        "type": "object",
        "required": [
          "ok",
          "error"
        ],
        "properties": {
          "ok": {
            "const": false,
            "description": "False when the request or extraction failed."
          },
          "error": {
            "type": "object",
            "required": [
              "code",
              "message"
            ],
            "properties": {
              "code": {
                "type": "string",
                "description": "Stable machine-readable error code."
              },
              "details": {
                "type": "object",
                "description": "Optional structured context about the failure.",
                "additionalProperties": true
              },
              "message": {
                "type": "string",
                "description": "Human-readable explanation of the failure."
              }
            },
            "description": "Structured error returned by the tool.",
            "additionalProperties": true
          }
        },
        "additionalProperties": false
      }
    ]
  },
  "readme": "README.md",
  "license": {
    "spdx": "MIT"
  },
  "runtime": {
    "type": "node",
    "version": "20"
  }
}
Compatibility
Node, Python