@zack/pdf-to-text
Extract text from PDFs, either from a URL or base64-encoded PDF data.
Install
agentpm install @zack/pdf-to-text@0.1.1
Load
import { load } from '@agentpm/sdk';
const t = await load('@zack/pdf-to-text@0.1.1');
from agentpm import load
t = load("@zack/pdf-to-text@0.1.1")
Weekly downloads
1
0%
Last publish
6d ago
v0.1.1
agent.json
{
"name": "pdf-to-text",
"version": "0.1.1",
"description": "Extract text from PDFs, either from a URL or base64-encoded PDF data.",
"files": [
"pdf_to_text/"
],
"entrypoint": {
"args": [
"-u",
"pdf_to_text/__main__.py"
],
"command": "python",
"timeout_ms": 120000
},
"inputs": {
"type": "object",
"required": [],
"properties": {
"pdf_url": {
"type": "string",
"format": "uri",
"description": "HTTP(S) URL of the PDF to download and extract text from."
},
"split_by": {
"enum": [
"page",
"chunk"
],
"type": "string",
"default": "page",
"description": "How to split the extracted text. 'page' returns per-page text; 'chunk' additionally returns contiguous text chunks."
},
"max_pages": {
"type": "integer",
"minimum": 1,
"description": "Maximum number of pages to process, starting from page 1."
},
"pdf_base64": {
"type": "string",
"description": "Base64-encoded PDF bytes. Use this when you already have the PDF contents."
},
"user_agent": {
"type": "string",
"description": "Optional User-Agent header when fetching pdf_url. Overrides default and any environment value."
},
"strip_whitespace": {
"type": "boolean",
"default": true,
"description": "If true, collapse excessive whitespace/newlines in extracted text."
}
},
"description": "Exactly one of pdf_url or pdf_base64 must be provided. This is enforced at runtime.",
"additionalProperties": false
},
"outputs": {
"oneOf": [
{
"type": "object",
"required": [
"ok",
"pages",
"raw_text"
],
"properties": {
"ok": {
"const": true
},
"pages": {
"type": "array",
"items": {
"type": "object",
"required": [
"page_number",
"text"
],
"properties": {
"text": {
"type": "string",
"description": "Extracted text for this page."
},
"page_number": {
"type": "integer",
"description": "1-based page index."
}
},
"additionalProperties": false
},
"description": "Per-page extracted text."
},
"chunks": {
"type": "array",
"items": {
"type": "object",
"required": [
"index",
"text"
],
"properties": {
"text": {
"type": "string",
"description": "Chunk text."
},
"index": {
"type": "integer",
"description": "0-based index of the chunk."
},
"end_page": {
"type": "integer",
"description": "1-based index of the last page that contributed text to this chunk."
},
"start_page": {
"type": "integer",
"description": "1-based index of the first page that contributed text to this chunk."
}
},
"additionalProperties": false
},
"description": "Optional text chunks when split_by='chunk'. Omitted when split_by='page'."
},
"metadata": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL used to fetch the PDF, if any."
},
"source": {
"type": "string",
"description": "Source type used for extraction, e.g. 'url' or 'base64'."
},
"truncated": {
"type": "boolean",
"description": "True if extraction stopped early due to max_pages."
},
"page_count": {
"type": "integer",
"description": "Total number of pages in the PDF (may be greater than pages.length if truncated by max_pages)."
}
},
"description": "Metadata about the extracted PDF.",
"additionalProperties": true
},
"raw_text": {
"type": "string",
"description": "All page texts concatenated together (typically separated by newlines)."
}
},
"additionalProperties": false
},
{
"type": "object",
"required": [
"ok",
"error"
],
"properties": {
"ok": {
"const": false
},
"error": {
"type": "object",
"required": [
"message"
],
"properties": {
"code": {
"type": "string",
"description": "Stable machine-readable error code (e.g. INPUT_INVALID, FETCH_FAILED, PARSE_FAILED)."
},
"details": {
"type": "object",
"description": "Optional structured context (e.g. HTTP status, URL, page index)."
},
"message": {
"type": "string",
"description": "Human-readable error message."
}
},
"additionalProperties": true
}
},
"additionalProperties": false
}
]
},
"readme": "README.md",
"license": {
"file": "LICENSE"
},
"runtime": {
"type": "python",
"version": "3.11"
}
}
Compatibility
Node
Python