Text · Active
PDF Text Extraction
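Extract text from a PDF. Send the file as pdf_base64 (base64-encoded PDF, max ~10 MB decoded); optionally set max_pages (default 50) to cap how many pages are extracted. Returns text (full concatenated text), a pages array (per-page text + char_count), page_count, metadata (title, author, subject, creator, producer, creation_date, when available), file_size_bytes, and extracted_at. Encode in Node.js with: Buffer.from(pdfBytes).toString('base64'). Ideal for RAG pipelines, document QA, or LLM ingestion.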
Input Schema
{
"type": "object",
"required": [
"pdf_base64"
],
"properties": {
"pdf_base64": {
"type": "string",
"description": "Base64-encoded PDF file content. Encode the PDF file's bytes as base64 and pass the resulting string here. Max ~10 MB (decoded size)."
},
"max_pages": {
"type": "integer",
"default": 50,
"description": "Maximum number of pages to extract. Default: 50. Use to limit processing time for large PDFs."
}
}
}
Output Schema
{
"type": "object",
"required": [
"text",
"page_count",
"extracted_at"
],
"properties": {
"text": {
"type": "string",
"description": "Full extracted text from all pages, joined with newlines. Preserves paragraph structure where possible."
},
"page_count": {
"type": "integer",
"description": "Total number of pages in the PDF"
},
"pages": {
"type": "array",
"description": "Per-page text content (first max_pages pages)",
"items": {
"type": "object",
"properties": {
"page": {
"type": "integer",
"description": "Page number (1-based)"
},
"text": {
"type": "string",
"description": "Extracted text for this page"
},
"char_count": {
"type": "integer",
"description": "Number of characters on this page"
}
}
}
},
"metadata": {
"type": "object",
"description": "PDF document metadata (if available)",
"properties": {
"title": {
"type": "string"
},
"author": {
"type": "string"
},
"subject": {
"type": "string"
},
"creator": {
"type": "string"
},
"producer": {
"type": "string"
},
"creation_date": {
"type": "string"
}
}
},
"file_size_bytes": {
"type": "integer",
"description": "Size of the decoded PDF in bytes"
},
"extracted_at": {
"type": "string",
"format": "date-time",
"description": "ISO 8601 timestamp of extraction"
}
},
"example": {
"text": "Quarterly Financial Report Q1 2026\n\nExecutive Summary\nTotal revenue increased by 23% year-over-year...",
"page_count": 12,
"pages": [
{
"page": 1,
"text": "Quarterly Financial Report Q1 2026\n\nExecutive Summary",
"char_count": 53
},
{
"page": 2,
"text": "Table of Contents\n1. Revenue Overview\n2. Cost Analysis",
"char_count": 54
}
],
"metadata": {
"title": "Q1 2026 Financial Report",
"author": "Finance Department",
"creator": "Adobe Acrobat"
},
"file_size_bytes": 245760,
"extracted_at": "2026-04-10T14:00:00.000Z"
}
}