AgentPM™

@zack/robots-aware-crawl

Crawl a bounded set of pages while respecting robots.txt, depth limits, and URL filters.

Install
agentpm install @zack/robots-aware-crawl@0.1.1
Weekly downloads: 1 (0%)
Last publish: 1d ago
Version: v0.1.1
agent.json
{
  "name": "robots-aware-crawl",
  "version": "0.1.1",
  "description": "Crawl a bounded set of pages while respecting robots.txt, depth limits, and URL filters.",
  "files": [
    "dist/"
  ],
  "entrypoint": {
    "args": [
      "dist/index.js"
    ],
    "command": "node",
    "timeout_ms": 60000
  },
  "inputs": {
    "type": "object",
    "required": [
      "start_urls"
    ],
    "properties": {
      "max_depth": {
        "type": "integer",
        "default": 1,
        "maximum": 5,
        "minimum": 0,
        "description": "Maximum crawl depth relative to the start URLs."
      },
      "max_pages": {
        "type": "integer",
        "default": 10,
        "maximum": 100,
        "minimum": 1,
        "description": "Maximum number of pages to fetch."
      },
      "start_urls": {
        "type": "array",
        "items": {
          "type": "string",
          "format": "uri",
          "description": "One absolute URL to begin crawling from."
        },
        "description": "Seed URLs used to start the crawl."
      },
      "respect_robots": {
        "type": "boolean",
        "default": true,
        "description": "Whether to fetch and honor robots.txt disallow rules."
      },
      "allowed_domains": {
        "type": "array",
        "items": {
          "type": "string",
          "description": "One allowed domain, such as example.com."
        },
        "description": "Optional domain allowlist for visited URLs."
      },
      "exclude_patterns": {
        "type": "array",
        "items": {
          "type": "string",
          "description": "One JavaScript regex pattern."
        },
        "description": "Optional regex patterns that prevent a URL from being crawled."
      },
      "include_patterns": {
        "type": "array",
        "items": {
          "type": "string",
          "description": "One JavaScript regex pattern."
        },
        "description": "Optional regex patterns that a URL must match to be crawled."
      },
      "same_origin_only": {
        "type": "boolean",
        "default": false,
        "description": "Restrict crawled links to the same origin as the current seed URL."
      }
    },
    "additionalProperties": false
  },
  "outputs": {
    "oneOf": [
      {
        "type": "object",
        "required": [
          "ok",
          "pages",
          "visited_count",
          "skipped",
          "errors",
          "metadata"
        ],
        "properties": {
          "ok": {
            "const": true,
            "description": "True when the crawl completed successfully."
          },
          "pages": {
            "type": "array",
            "items": {
              "type": "object",
              "description": "One crawled page with extracted metadata and discovered links.",
              "additionalProperties": true
            },
            "description": "Fetched pages in crawl order."
          },
          "errors": {
            "type": "array",
            "items": {
              "type": "object",
              "description": "One crawl error entry.",
              "additionalProperties": true
            },
            "description": "Fetch or parsing errors encountered during the crawl."
          },
          "skipped": {
            "type": "array",
            "items": {
              "type": "object",
              "description": "One skipped URL entry.",
              "additionalProperties": true
            },
            "description": "URLs that were skipped, along with the reason."
          },
          "metadata": {
            "type": "object",
            "description": "Summary metadata about the crawl.",
            "additionalProperties": true
          },
          "visited_count": {
            "type": "integer",
            "description": "Number of pages actually fetched."
          }
        },
        "additionalProperties": false
      },
      {
        "type": "object",
        "required": [
          "ok",
          "error"
        ],
        "properties": {
          "ok": {
            "const": false,
            "description": "False when validation or the crawl setup failed."
          },
          "error": {
            "type": "object",
            "required": [
              "code",
              "message"
            ],
            "properties": {
              "code": {
                "type": "string",
                "description": "Stable machine-readable error code."
              },
              "details": {
                "type": "object",
                "description": "Optional structured context about the failure.",
                "additionalProperties": true
              },
              "message": {
                "type": "string",
                "description": "Human-readable explanation of the failure."
              }
            },
            "description": "Structured error returned by the tool.",
            "additionalProperties": true
          }
        },
        "additionalProperties": false
      }
    ]
  },
  "readme": "README.md",
  "license": {
    "spdx": "MIT"
  },
  "runtime": {
    "type": "node",
    "version": "20"
  }
}
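Examples

The manifest does not ship sample payloads, so the objects below are illustrative sketches written to validate against the schemas above. Every concrete value, and every field inside the pages, skipped, and errors entries (which the schema deliberately leaves open via additionalProperties), is an assumption rather than documented behavior.

A plausible input that crawls a documentation tree two levels deep, stays on example.com, and skips PDFs:

{
  "start_urls": ["https://example.com/docs/"],
  "max_depth": 2,
  "max_pages": 25,
  "respect_robots": true,
  "allowed_domains": ["example.com"],
  "include_patterns": ["^https://example\\.com/docs/"],
  "exclude_patterns": ["\\.pdf$"],
  "same_origin_only": true
}

With respect_robots enabled, the tool honors standard robots.txt disallow rules; a rule like the following would send matching URLs to the skipped list rather than the crawl queue:

User-agent: *
Disallow: /private/

A success result sets ok to true and carries the five required collections. The entry fields shown here (url, depth, links, reason, duration_ms) are hypothetical names chosen for illustration only:

{
  "ok": true,
  "pages": [
    {
      "url": "https://example.com/docs/",
      "depth": 0,
      "links": ["https://example.com/docs/getting-started"]
    }
  ],
  "visited_count": 1,
  "skipped": [
    { "url": "https://example.com/private/", "reason": "robots_disallow" }
  ],
  "errors": [],
  "metadata": { "duration_ms": 412 }
}

A failure result returns ok: false with a structured error. The manifest does not enumerate error codes, so INVALID_INPUT is a placeholder:

{
  "ok": false,
  "error": {
    "code": "INVALID_INPUT",
    "message": "start_urls must contain at least one absolute URL.",
    "details": { "field": "start_urls" }
  }
}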
Compatibility
Node 20