AgentPM™

@zack/pdf-to-text

Extract text from PDFs, either from a URL or base64-encoded PDF data.

Install
agentpm install @zack/pdf-to-text@0.1.1
Load
Weekly downloads
1
0%
Last publish
6d ago
v0.1.1
agent.json
{
  "name": "pdf-to-text",
  "version": "0.1.1",
  "description": "Extract text from PDFs, either from a URL or base64-encoded PDF data.",
  "files": [
    "pdf_to_text/"
  ],
  "entrypoint": {
    "args": [
      "-u",
      "pdf_to_text/__main__.py"
    ],
    "command": "python",
    "timeout_ms": 120000
  },
  "inputs": {
    "type": "object",
    "required": [],
    "properties": {
      "pdf_url": {
        "type": "string",
        "format": "uri",
        "description": "HTTP(S) URL of the PDF to download and extract text from."
      },
      "split_by": {
        "enum": [
          "page",
          "chunk"
        ],
        "type": "string",
        "default": "page",
        "description": "How to split the extracted text. 'page' returns per-page text only; 'chunk' returns per-page text and additionally returns contiguous text chunks in the 'chunks' output field."
      },
      "max_pages": {
        "type": "integer",
        "minimum": 1,
        "description": "Maximum number of pages to process, starting from page 1."
      },
      "pdf_base64": {
        "type": "string",
        "description": "Base64-encoded PDF bytes. Use this when you already have the PDF contents."
      },
      "user_agent": {
        "type": "string",
        "description": "Optional User-Agent header to send when fetching pdf_url. Overrides the default and any environment-provided value."
      },
      "strip_whitespace": {
        "type": "boolean",
        "default": true,
        "description": "If true, collapse excessive whitespace/newlines in extracted text."
      }
    },
    "description": "Exactly one of pdf_url or pdf_base64 must be provided. This constraint is enforced at runtime rather than by this schema, which is why 'required' is empty.",
    "additionalProperties": false
  },
  "outputs": {
    "oneOf": [
      {
        "type": "object",
        "required": [
          "ok",
          "pages",
          "raw_text"
        ],
        "properties": {
          "ok": {
            "const": true
          },
          "pages": {
            "type": "array",
            "items": {
              "type": "object",
              "required": [
                "page_number",
                "text"
              ],
              "properties": {
                "text": {
                  "type": "string",
                  "description": "Extracted text for this page."
                },
                "page_number": {
                  "type": "integer",
                  "description": "1-based page index."
                }
              },
              "additionalProperties": false
            },
            "description": "Per-page extracted text."
          },
          "chunks": {
            "type": "array",
            "items": {
              "type": "object",
              "required": [
                "index",
                "text"
              ],
              "properties": {
                "text": {
                  "type": "string",
                  "description": "Chunk text."
                },
                "index": {
                  "type": "integer",
                  "description": "0-based index of the chunk."
                },
                "end_page": {
                  "type": "integer",
                  "description": "1-based index of the last page that contributed text to this chunk."
                },
                "start_page": {
                  "type": "integer",
                  "description": "1-based index of the first page that contributed text to this chunk."
                }
              },
              "additionalProperties": false
            },
            "description": "Optional text chunks when split_by='chunk'. Omitted when split_by='page'."
          },
          "metadata": {
            "type": "object",
            "properties": {
              "url": {
                "type": "string",
                "description": "The URL used to fetch the PDF, if any."
              },
              "source": {
                "type": "string",
                "description": "Source type used for extraction, e.g. 'url' or 'base64'."
              },
              "truncated": {
                "type": "boolean",
                "description": "True if extraction stopped early due to max_pages."
              },
              "page_count": {
                "type": "integer",
                "description": "Total number of pages in the PDF (may be greater than pages.length if truncated by max_pages)."
              }
            },
            "description": "Metadata about the extracted PDF.",
            "additionalProperties": true
          },
          "raw_text": {
            "type": "string",
            "description": "All page texts concatenated together (typically separated by newlines)."
          }
        },
        "additionalProperties": false
      },
      {
        "type": "object",
        "required": [
          "ok",
          "error"
        ],
        "properties": {
          "ok": {
            "const": false
          },
          "error": {
            "type": "object",
            "required": [
              "message"
            ],
            "properties": {
              "code": {
                "type": "string",
                "description": "Stable machine-readable error code (e.g. INPUT_INVALID, FETCH_FAILED, PARSE_FAILED)."
              },
              "details": {
                "type": "object",
                "description": "Optional structured context (e.g. HTTP status, URL, page index)."
              },
              "message": {
                "type": "string",
                "description": "Human-readable error message."
              }
            },
            "additionalProperties": true
          }
        },
        "additionalProperties": false
      }
    ]
  },
  "readme": "README.md",
  "license": {
    "file": "LICENSE"
  },
  "runtime": {
    "type": "python",
    "version": "3.11"
  }
}
Compatibility
Node, Python