@zack/robots-aware-crawl
Crawl a bounded set of pages while respecting robots.txt, depth limits, and URL filters.
Install
agentpm install @zack/robots-aware-crawl@0.1.1
Load
Node
import { load } from '@agentpm/sdk';
const t = await load('@zack/robots-aware-crawl@0.1.1');
Python
from agentpm import load
t = load("@zack/robots-aware-crawl@0.1.1")
Weekly downloads
1
0%
Last publish
1d ago
v0.1.1
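Once loaded, the tool takes inputs described by the agent.json manifest below. A minimal invocation sketch in Node; note that t.run is an assumed method name for executing a loaded tool, since this page does not document the SDK call itself:
// Sketch only: `t.run` is an assumed @agentpm/sdk method; check the SDK docs
// for the actual call signature. Input fields follow the manifest below.
const result = await t.run({
  start_urls: ['https://example.com'],
  max_depth: 1,
  max_pages: 10
});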
agent.json
{
"name": "robots-aware-crawl",
"version": "0.1.1",
"description": "Crawl a bounded set of pages while respecting robots.txt, depth limits, and URL filters.",
"files": [
"dist/"
],
"entrypoint": {
"args": [
"dist/index.js"
],
"command": "node",
"timeout_ms": 60000
},
"inputs": {
"type": "object",
"required": [
"start_urls"
],
"properties": {
"max_depth": {
"type": "integer",
"default": 1,
"maximum": 5,
"minimum": 0,
"description": "Maximum crawl depth relative to the start URLs."
},
"max_pages": {
"type": "integer",
"default": 10,
"maximum": 100,
"minimum": 1,
"description": "Maximum number of pages to fetch."
},
"start_urls": {
"type": "array",
"items": {
"type": "string",
"format": "uri",
"description": "One absolute URL to begin crawling from."
},
"description": "Seed URLs used to start the crawl."
},
"respect_robots": {
"type": "boolean",
"default": true,
"description": "Whether to fetch and honor robots.txt disallow rules."
},
"allowed_domains": {
"type": "array",
"items": {
"type": "string",
"description": "One allowed domain, such as example.com."
},
"description": "Optional domain allowlist for visited URLs."
},
"exclude_patterns": {
"type": "array",
"items": {
"type": "string",
"description": "One JavaScript regex pattern."
},
"description": "Optional regex patterns that prevent a URL from being crawled."
},
"include_patterns": {
"type": "array",
"items": {
"type": "string",
"description": "One JavaScript regex pattern."
},
"description": "Optional regex patterns that a URL must match to be crawled."
},
"same_origin_only": {
"type": "boolean",
"default": false,
"description": "Restrict crawled links to the same origin as the current seed URL."
}
},
"additionalProperties": false
},
"outputs": {
"oneOf": [
{
"type": "object",
"required": [
"ok",
"pages",
"visited_count",
"skipped",
"errors",
"metadata"
],
"properties": {
"ok": {
"const": true,
"description": "True when the crawl completed successfully."
},
"pages": {
"type": "array",
"items": {
"type": "object",
"description": "One crawled page with extracted metadata and discovered links.",
"additionalProperties": true
},
"description": "Fetched pages in crawl order."
},
"errors": {
"type": "array",
"items": {
"type": "object",
"description": "One crawl error entry.",
"additionalProperties": true
},
"description": "Fetch or parsing errors encountered during the crawl."
},
"skipped": {
"type": "array",
"items": {
"type": "object",
"description": "One skipped URL entry.",
"additionalProperties": true
},
"description": "URLs that were skipped, along with the reason."
},
"metadata": {
"type": "object",
"description": "Summary metadata about the crawl.",
"additionalProperties": true
},
"visited_count": {
"type": "integer",
"description": "Number of pages actually fetched."
}
},
"additionalProperties": false
},
{
"type": "object",
"required": [
"ok",
"error"
],
"properties": {
"ok": {
"const": false,
"description": "False when validation or the crawl setup failed."
},
"error": {
"type": "object",
"required": [
"code",
"message"
],
"properties": {
"code": {
"type": "string",
"description": "Stable machine-readable error code."
},
"details": {
"type": "object",
"description": "Optional structured context about the failure.",
"additionalProperties": true
},
"message": {
"type": "string",
"description": "Human-readable explanation of the failure."
}
},
"description": "Structured error returned by the tool.",
"additionalProperties": true
}
},
"additionalProperties": false
}
]
},
"readme": "README.md",
"license": {
"spdx": "MIT"
},
"runtime": {
"type": "node",
"version": "20"
}
}
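For reference, an illustrative inputs object that validates against the schema above; all values are examples only:
{
  "start_urls": ["https://example.com/docs"],
  "max_depth": 2,
  "max_pages": 50,
  "respect_robots": true,
  "allowed_domains": ["example.com"],
  "include_patterns": ["^https://example\\.com/docs/"],
  "exclude_patterns": ["\\.pdf$"],
  "same_origin_only": true
}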
Compatibility
Node
Python
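The outputs schema is a discriminated union on the ok flag, so callers can branch on success or failure. A sketch of handling both shapes, assuming result holds the tool's parsed output (field names follow the manifest above):
if (result.ok) {
  // Success shape: pages, visited_count, skipped, errors, metadata.
  console.log(`Fetched ${result.visited_count} pages`);
  for (const page of result.pages) {
    // Each entry is an object with extracted metadata and discovered links.
    console.log(page);
  }
  for (const skip of result.skipped) {
    // Skipped URLs carry the reason they were not crawled.
    console.log(skip);
  }
} else {
  // Failure shape: a structured error with a stable code and a message.
  console.error(result.error.code, result.error.message);
}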