Files

2672 lines
63 KiB
Python
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""Configuration options for the conversion API."""
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum
from typing import TYPE_CHECKING, Any, TypedDict
from ._kreuzberg import (
ChunkerType, # noqa: TC001
CodeContentMode, # noqa: TC001
ExecutionProviderType, # noqa: TC001
ExtractionMethod, # noqa: TC001
HtmlTheme, # noqa: TC001
KeywordAlgorithm, # noqa: TC001
OcrElementLevel, # noqa: TC001
ReductionLevel, # noqa: TC001
ResultFormat, # noqa: TC001
TableModel, # noqa: TC001
TextDirection, # noqa: TC001
)
if TYPE_CHECKING:
from ._kreuzberg import (
ArchiveEntry, # noqa: F401
CellChange, # noqa: F401
Chunk, # noqa: F401
ContributorRole, # noqa: F401
DbfFieldInfo, # noqa: F401
DjotContent, # noqa: F401
DocumentNode, # noqa: F401
DocumentRelationship, # noqa: F401
DocumentRevision, # noqa: F401
Element, # noqa: F401
ErrorMetadata, # noqa: F401
ExtractedImage, # noqa: F401
ExtractedUri, # noqa: F401
GridCell, # noqa: F401
HeaderMetadata, # noqa: F401
ImageMetadataType, # noqa: F401
ImagePreprocessingMetadata, # noqa: F401
Keyword, # noqa: F401
LinkMetadata, # noqa: F401
OcrPipelineConfig, # noqa: F401
OcrRotation, # noqa: F401
PageContent, # noqa: F401
PageStructure, # noqa: F401
PdfAnnotation, # noqa: F401
ProcessingWarning, # noqa: F401
StructuredData, # noqa: F401
StructuredExtractionConfig, # noqa: F401
YearRange, # noqa: F401
)
class ListType(str, Enum):
    """Kinds of list that list detection can report.

    Members subclass ``str``, so each one compares equal to its
    serialized value (e.g. ``ListType.bullet == "bullet"``).
    """

    bullet = "bullet"
    numbered = "numbered"
    lettered = "lettered"
    indented = "indented"
class OcrBackendType(str, Enum):
    """Identifiers for the available OCR backends.

    Each member is a ``str`` whose value is the serialized backend name.
    """

    tesseract = "tesseract"
    easy_ocr = "easy_ocr"
    paddle_ocr = "paddle_ocr"
    custom = "custom"
class ProcessingStage(str, Enum):
    """Stage at which a post-processor runs."""

    early = "early"
    middle = "middle"
    late = "late"
class PdfAnnotationType(str, Enum):
    """Classification of a PDF annotation.

    ``other`` is the catch-all member alongside the named annotation kinds.
    """

    text = "text"
    highlight = "highlight"
    link = "link"
    stamp = "stamp"
    underline = "underline"
    strike_out = "strike_out"
    other = "other"
class BlockType(str, Enum):
    """Block-level element kinds found in Djot content."""

    # Text blocks
    paragraph = "paragraph"
    heading = "heading"
    blockquote = "blockquote"
    code_block = "code_block"

    # Lists and their items
    list_item = "list_item"
    ordered_list = "ordered_list"
    bullet_list = "bullet_list"
    task_list = "task_list"
    definition_list = "definition_list"
    definition_term = "definition_term"
    definition_description = "definition_description"

    # Containers and other blocks
    div = "div"
    section = "section"
    thematic_break = "thematic_break"
    raw_block = "raw_block"
    math_display = "math_display"
class InlineType(str, Enum):
    """Inline element kinds found in Djot content."""

    # Plain and styled text
    text = "text"
    strong = "strong"
    emphasis = "emphasis"
    highlight = "highlight"
    subscript = "subscript"
    superscript = "superscript"
    insert = "insert"
    delete = "delete"
    code = "code"

    # References, media and other inline constructs
    link = "link"
    image = "image"
    span = "span"
    math = "math"
    raw_inline = "raw_inline"
    footnote_ref = "footnote_ref"
    symbol = "symbol"
class RelationshipKind(str, Enum):
    """Semantic kind of a relationship linking two document elements."""

    footnote_reference = "footnote_reference"
    citation_reference = "citation_reference"
    internal_link = "internal_link"
    caption = "caption"
    label = "label"
    toc_entry = "toc_entry"
    cross_reference = "cross_reference"
class ContentLayer(str, Enum):
    """Layer of the document that a node's content belongs to."""

    body = "body"
    header = "header"
    footer = "footer"
    footnote = "footnote"
class ChunkType(str, Enum):
    """Semantic/structural class assigned to a text chunk.

    ``unknown`` is the fallback member listed after the specific classes.
    """

    heading = "heading"
    party_list = "party_list"
    definitions = "definitions"
    operative_clause = "operative_clause"
    signature_block = "signature_block"
    schedule = "schedule"
    table_like = "table_like"
    formula = "formula"
    code_block = "code_block"
    image = "image"
    org_chart = "org_chart"
    diagram = "diagram"
    unknown = "unknown"
class ImageKind(str, Enum):
    """Heuristic guess at what an extracted image depicts."""

    photograph = "photograph"
    diagram = "diagram"
    chart = "chart"
    drawing = "drawing"
    text_block = "text_block"
    decoration = "decoration"
    logo = "logo"
    icon = "icon"
    tile_fragment = "tile_fragment"
    mask = "mask"
    page_raster = "page_raster"
    unknown = "unknown"
class ElementType(str, Enum):
    """Semantic classification of a document element."""

    # Named ``title_`` while serializing as "title" -- presumably so the
    # member does not shadow ``str.title`` on this str-based enum.
    title_ = "title"
    narrative_text = "narrative_text"
    heading = "heading"
    list_item = "list_item"
    table = "table"
    image = "image"
    page_break = "page_break"
    code_block = "code_block"
    block_quote = "block_quote"
    footer = "footer"
    header = "header"
class LinkType(str, Enum):
    """Classification of a link by the kind of target it points at."""

    anchor = "anchor"
    internal = "internal"
    external = "external"
    email = "email"
    phone = "phone"
    other = "other"
class ImageType(str, Enum):
    """Classification of an image by how it is referenced or embedded.

    Two members serialize with hyphens, so their values differ from their
    Python names (``data_uri`` -> ``"data-uri"``, ``inline_svg`` ->
    ``"inline-svg"``).
    """

    data_uri = "data-uri"
    inline_svg = "inline-svg"
    external = "external"
    relative = "relative"
class StructuredDataType(str, Enum):
    """Classification of a structured-data block by its markup format.

    Serialized values differ from the Python names for two members
    (``json_ld`` -> ``"json-ld"``, ``rd_fa`` -> ``"rdfa"``).
    """

    json_ld = "json-ld"
    microdata = "microdata"
    rd_fa = "rdfa"
class PageUnitType(str, Enum):
    """Kind of paginated unit a document is divided into."""

    page = "page"
    slide = "slide"
    sheet = "sheet"
class RevisionKind(str, Enum):
    """Semantic class of a tracked change in a document."""

    insertion = "insertion"
    deletion = "deletion"
    format_change = "format_change"
    comment = "comment"
class UriKind(str, Enum):
    """Semantic class of a URI extracted from a document."""

    hyperlink = "hyperlink"
    image = "image"
    anchor = "anchor"
    citation = "citation"
    reference = "reference"
    email = "email"
class PSMMode(str, Enum):
    """Tesseract OCR page segmentation modes (PSM)."""

    # Orientation/script detection and automatic segmentation
    osd_only = "osd_only"
    auto_osd = "auto_osd"
    auto_only = "auto_only"
    auto = "auto"

    # Explicit layout assumptions
    single_column = "single_column"
    single_block_vertical = "single_block_vertical"
    single_block = "single_block"
    single_line = "single_line"
    single_word = "single_word"
    circle_word = "circle_word"
    single_char = "single_char"
class PaddleLanguage(str, Enum):
    """Languages and script families supported by PaddleOCR."""

    english = "english"
    chinese = "chinese"
    japanese = "japanese"
    korean = "korean"
    german = "german"
    french = "french"
    latin = "latin"
    cyrillic = "cyrillic"
    traditional_chinese = "traditional_chinese"
    thai = "thai"
    greek = "greek"
    east_slavic = "east_slavic"
    arabic = "arabic"
    devanagari = "devanagari"
    tamil = "tamil"
    telugu = "telugu"
class LayoutClass(str, Enum):
    """Canonical document layout classes (17 members)."""

    caption = "caption"
    footnote = "footnote"
    formula = "formula"
    list_item = "list_item"
    page_footer = "page_footer"
    page_header = "page_header"
    picture = "picture"
    section_header = "section_header"
    table = "table"
    text = "text"
    # Named ``title_`` while serializing as "title" -- presumably so the
    # member does not shadow ``str.title`` on this str-based enum.
    title_ = "title"
    document_index = "document_index"
    code = "code"
    checkbox_selected = "checkbox_selected"
    checkbox_unselected = "checkbox_unselected"
    form = "form"
    key_value_region = "key_value_region"
@dataclass(frozen=True, slots=True)
class AccelerationConfig:
"""Hardware acceleration configuration for ONNX Runtime models."""
provider: ExecutionProviderType | str = "auto"
"""Execution provider to use for ONNX inference."""
device_id: int = 0
"""GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto."""
@dataclass(frozen=True, slots=True)
class ContentFilterConfig:
"""Cross-extractor content filtering configuration."""
include_headers: bool = False
"""Include running headers in extraction output."""
include_footers: bool = False
"""Include running footers in extraction output."""
strip_repeating_text: bool = True
"""Enable the heuristic cross-page repeating text detector."""
include_watermarks: bool = False
"""Include watermark text in extraction output."""
@dataclass(frozen=True, slots=True)
class EmailConfig:
"""Configuration for email extraction."""
msg_fallback_codepage: int | None = None
"""Windows codepage number to use when an MSG file contains no codepage property. Defaults to `None`, which falls back to windows-1252."""
class ExtractionConfig(TypedDict, total=False):
    """Top-level extraction configuration, expressed as a plain ``dict``.

    Declared with ``total=False``, so every key is optional and callers
    supply only the settings they want to set.
    """

    # --- Caching and quality -------------------------------------------
    use_cache: bool
    """Enable caching of extraction results."""

    enable_quality_processing: bool
    """Enable quality post-processing."""

    # --- OCR ------------------------------------------------------------
    ocr: OcrConfig | None
    """OCR configuration (None = OCR disabled)."""

    force_ocr: bool
    """Force OCR even for searchable PDFs."""

    force_ocr_pages: list[int] | None
    """Force OCR on specific pages only (1-indexed page numbers, must be >= 1)."""

    disable_ocr: bool
    """Disable OCR entirely, even for images."""

    # --- Per-feature configuration --------------------------------------
    chunking: ChunkingConfig | None
    """Text chunking configuration (None = chunking disabled)."""

    content_filter: ContentFilterConfig | None
    """Content filtering configuration (None = use extractor defaults)."""

    images: ImageExtractionConfig | None
    """Image extraction configuration (None = no image extraction)."""

    pdf_options: PdfConfig | None
    """PDF-specific options (None = use defaults)."""

    token_reduction: TokenReductionOptions | None
    """Token reduction configuration (None = no token reduction)."""

    language_detection: LanguageDetectionConfig | None
    """Language detection configuration (None = no language detection)."""

    pages: PageConfig | None
    """Page extraction configuration (None = no page tracking)."""

    keywords: KeywordConfig | None
    """Keyword extraction configuration (None = no keyword extraction)."""

    postprocessor: PostProcessorConfig | None
    """Post-processor configuration (None = use defaults)."""

    html_options: str | None
    """HTML to Markdown conversion options (None = use defaults)."""

    html_output: HtmlOutputConfig | None
    """Styled HTML output configuration."""

    # --- Batch limits ---------------------------------------------------
    extraction_timeout_secs: int | None
    """Default per-file timeout in seconds for batch extraction."""

    max_concurrent_extractions: int | None
    """Maximum concurrent extractions in batch operations (None = ceil(num_cpus x 1.5))."""

    # --- Output shape and safety limits ---------------------------------
    result_format: ResultFormat | str
    """Result structure format."""

    security_limits: SecurityLimits | None
    """Security limits for archive extraction."""

    max_embedded_file_bytes: int | None
    """Maximum uncompressed size in bytes for a single embedded file before
    recursive extraction is attempted (default: 50 MiB)."""

    output_format: OutputFormat
    """Content text format (default: Plain)."""

    # --- Layout and structure -------------------------------------------
    layout: LayoutDetectionConfig | None
    """Layout detection configuration (None = layout detection disabled)."""

    use_layout_for_markdown: bool
    """Run layout detection on the non-OCR PDF markdown path."""

    include_document_structure: bool
    """Enable structured document tree output."""

    acceleration: AccelerationConfig | None
    """Hardware acceleration configuration for ONNX Runtime models."""

    # --- Cache scoping --------------------------------------------------
    cache_namespace: str | None
    """Cache namespace for tenant isolation."""

    cache_ttl_secs: int | None
    """Per-request cache TTL in seconds."""

    # --- Miscellaneous --------------------------------------------------
    email: EmailConfig | None
    """Email extraction configuration (None = use defaults)."""

    concurrency: str | None
    """Concurrency limits for constrained environments (None = use defaults)."""

    max_archive_depth: int
    """Maximum recursion depth for archive extraction (default: 3).

    Set to 0 to disable recursive extraction (legacy behavior).
    """

    tree_sitter: TreeSitterConfig | None
    """Tree-sitter language pack configuration (None = tree-sitter disabled)."""

    structured_extraction: StructuredExtractionConfig | None
    """Structured extraction via LLM (None = disabled)."""

    cancel_token: str | None
    """Cancellation token for this extraction (None = no external cancellation)."""
@dataclass(frozen=True, slots=True)
class FileExtractionConfig:
"""Per-file extraction configuration overrides for batch processing."""
enable_quality_processing: bool | None = None
"""Override quality post-processing for this file."""
ocr: OcrConfig | None = None
"""Override OCR configuration for this file (None in the Option = use batch default)."""
force_ocr: bool | None = None
"""Override force OCR for this file."""
force_ocr_pages: list[int] | None = None
"""Override force OCR pages for this file (1-indexed page numbers)."""
disable_ocr: bool | None = None
"""Override disable OCR for this file."""
chunking: ChunkingConfig | None = None
"""Override chunking configuration for this file."""
content_filter: ContentFilterConfig | None = None
"""Override content filtering configuration for this file."""
images: ImageExtractionConfig | None = None
"""Override image extraction configuration for this file."""
pdf_options: PdfConfig | None = None
"""Override PDF options for this file."""
token_reduction: TokenReductionOptions | None = None
"""Override token reduction for this file."""
language_detection: LanguageDetectionConfig | None = None
"""Override language detection for this file."""
pages: PageConfig | None = None
"""Override page extraction for this file."""
keywords: KeywordConfig | None = None
"""Override keyword extraction for this file."""
postprocessor: PostProcessorConfig | None = None
"""Override post-processor for this file."""
html_options: str | None = None
"""Override HTML conversion options for this file."""
result_format: ResultFormat | str | None = None
"""Override result format for this file."""
output_format: OutputFormat | None = None
"""Override output content format for this file."""
include_document_structure: bool | None = None
"""Override document structure output for this file."""
layout: LayoutDetectionConfig | None = None
"""Override layout detection for this file."""
timeout_secs: int | None = None
"""Override per-file extraction timeout in seconds."""
tree_sitter: TreeSitterConfig | None = None
"""Override tree-sitter configuration for this file."""
structured_extraction: StructuredExtractionConfig | None = None
"""Override structured extraction configuration for this file."""
@dataclass(frozen=True, slots=True)
class ImageExtractionConfig:
"""Image extraction configuration."""
extract_images: bool = True
"""Extract images from documents"""
target_dpi: int = 300
"""Target DPI for image normalization"""
max_image_dimension: int = 4096
"""Maximum dimension for images (width or height)"""
inject_placeholders: bool = True
"""Whether to inject image reference placeholders into markdown output. When `true` (default), image references like `![Image 1](embedded:p1_i0)` are appended to the markdown. Set to `false` to extract images as data without polluting the markdown output."""
auto_adjust_dpi: bool = True
"""Automatically adjust DPI based on image content"""
min_dpi: int = 72
"""Minimum DPI threshold"""
max_dpi: int = 600
"""Maximum DPI threshold"""
max_images_per_page: int | None = None
"""Maximum number of image objects to extract per PDF page."""
classify: bool = True
"""When `true` (default), extracted images are classified by kind and grouped into clusters where they appear to belong to one figure."""
include_page_rasters: bool = False
"""When `true`, full-page renders produced during OCR preprocessing are captured and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`."""
run_ocr_on_images: bool = True
"""Run OCR on extracted images and include the recognized text in the document content."""
ocr_text_only: bool = False
"""When `true`, image OCR results are rendered as plain text without the `![...](...)` markdown placeholder. Only takes effect when `run_ocr_on_images` is also `true`."""
append_ocr_text: bool = False
"""When `true` and `ocr_text_only` is `false`, append the OCR text after the image placeholder in the rendered output."""
@dataclass(frozen=True, slots=True)
class TokenReductionOptions:
"""Token reduction configuration."""
mode: str = ""
"""Reduction mode: "off", "light", "moderate", "aggressive", "maximum" """
preserve_important_words: bool = True
"""Preserve important words (capitalized, technical terms)"""
@dataclass(frozen=True, slots=True)
class LanguageDetectionConfig:
"""Language detection configuration."""
enabled: bool = True
"""Enable language detection"""
min_confidence: float = 0.8
"""Minimum confidence threshold (0.0-1.0)"""
detect_multiple: bool = False
"""Detect multiple languages in the document"""
@dataclass(frozen=True, slots=True)
class HtmlOutputConfig:
"""Configuration for styled HTML output."""
css: str | None = None
"""Inline CSS string injected into the output after the theme stylesheet. Concatenated after `css_file` content when both are set."""
css_file: str | None = None
"""Path to a CSS file loaded once at renderer construction time. Concatenated before `css` when both are set."""
theme: HtmlTheme | str = "unstyled"
"""Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`]."""
class_prefix: str = ""
"""CSS class prefix applied to every emitted class name."""
embed_css: bool = True
"""When `true` (default), write the resolved CSS into a `<style>` block immediately after the opening `<div class="{prefix}doc">`."""
@dataclass(frozen=True, slots=True)
class LayoutDetectionConfig:
"""Layout detection configuration."""
confidence_threshold: float | None = None
"""Confidence threshold override (None = use model default)."""
apply_heuristics: bool = True
"""Whether to apply postprocessing heuristics (default: true)."""
table_model: TableModel | str = "tatr"
"""Table structure recognition model."""
acceleration: AccelerationConfig | None = None
"""Hardware acceleration for ONNX models (layout detection + table structure)."""
@dataclass(frozen=True, slots=True)
class LlmConfig:
"""Configuration for an LLM provider/model via liter-llm."""
model: str = ""
"""Provider/model string using liter-llm routing format."""
api_key: str | None = None
"""API key for the provider. When `None`, liter-llm falls back to the provider's standard environment variable (e.g., `OPENAI_API_KEY`)."""
base_url: str | None = None
"""Custom base URL override for the provider endpoint."""
timeout_secs: int | None = None
"""Request timeout in seconds (default: 60)."""
max_retries: int | None = None
"""Maximum retry attempts (default: 3)."""
temperature: float | None = None
"""Sampling temperature for generation tasks."""
max_tokens: int | None = None
"""Maximum tokens to generate."""
@dataclass(frozen=True, slots=True)
class OcrQualityThresholds:
"""Quality thresholds for OCR fallback decisions and pipeline quality gating."""
min_total_non_whitespace: int = 64
"""Minimum total non-whitespace characters to consider text substantive."""
min_non_whitespace_per_page: float = 32
"""Minimum non-whitespace characters per page on average."""
min_meaningful_word_len: int = 4
"""Minimum character count for a word to be "meaningful"."""
min_meaningful_words: int = 3
"""Minimum count of meaningful words before text is accepted."""
min_alnum_ratio: float = 0.3
"""Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric)."""
min_garbage_chars: int = 5
"""Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback."""
max_fragmented_word_ratio: float = 0.6
"""Maximum fraction of short (1-2 char) words before text is considered fragmented."""
critical_fragmented_word_ratio: float = 0.8
"""Critical fragmentation threshold -- triggers OCR regardless of meaningful words. Normal English text has ~20-30% short words. 80%+ is definitive garbage."""
min_avg_word_length: float = 2
"""Minimum average word length. Below this with enough words indicates garbled extraction."""
min_words_for_avg_length_check: int = 50
"""Minimum word count before average word length check applies."""
min_consecutive_repeat_ratio: float = 0.08
"""Minimum consecutive word repetition ratio to detect column scrambling."""
min_words_for_repeat_check: int = 50
"""Minimum word count before consecutive repetition check is applied."""
substantive_min_chars: int = 100
"""Minimum character count for "substantive markdown" OCR skip gate."""
non_text_min_chars: int = 20
"""Minimum character count for "non-text content" OCR skip gate."""
alnum_ws_ratio_threshold: float = 0.4
"""Alphanumeric+whitespace ratio threshold for skip decisions."""
pipeline_min_quality: float = 0.5
"""Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted. If the result from a backend scores below this, try the next backend."""
@dataclass(frozen=True, slots=True)
class OcrConfig:
"""OCR configuration."""
enabled: bool = True
"""Whether OCR is enabled."""
backend: str = ""
"""OCR backend: tesseract, easyocr, paddleocr"""
language: str = ""
"""Language code (e.g., "eng", "deu")"""
tesseract_config: TesseractConfig | None = None
"""Tesseract-specific configuration (optional)"""
output_format: OutputFormat | None = None
"""Output format for OCR results (optional, for format conversion)"""
paddle_ocr_config: str | None = None
"""PaddleOCR-specific configuration (optional, JSON passthrough)"""
backend_options: str | None = None
"""Arbitrary per-call options passed through to the backend unchanged."""
element_config: OcrElementConfig | None = None
"""OCR element extraction configuration"""
quality_thresholds: OcrQualityThresholds | None = None
"""Quality thresholds for the native-text-to-OCR fallback decision. When None, uses compiled defaults (matching previous hardcoded behavior)."""
pipeline: OcrPipelineConfig | None = None
"""Multi-backend OCR pipeline configuration. When set, enables weighted fallback across multiple OCR backends based on output quality. When None, uses the single `backend` field (same as today)."""
auto_rotate: bool = False
"""Enable automatic page rotation based on orientation detection."""
vlm_config: LlmConfig | None = None
"""VLM (Vision Language Model) OCR configuration."""
vlm_prompt: str | None = None
"""Custom Jinja2 prompt template for VLM OCR."""
acceleration: AccelerationConfig | None = None
"""Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection)."""
tessdata_bytes: dict[str, bytes] | None = None
"""Caller-supplied Tesseract `traineddata` bytes per language code."""
@dataclass(frozen=True, slots=True)
class PageConfig:
"""Page extraction and tracking configuration."""
extract_pages: bool = False
"""Extract pages as separate array (ExtractionResult.pages)"""
insert_page_markers: bool = False
"""Insert page markers in main content string"""
marker_format: str = "\n\n<!-- PAGE {page_num} -->\n\n"
"""Page marker format (use {page_num} placeholder) Default: "\n\n<!-- PAGE {page_num} -->\n\n" """
@dataclass(frozen=True, slots=True)
class PdfConfig:
"""PDF-specific configuration."""
extract_images: bool = False
"""Extract images from PDF"""
extract_tables: bool = True
"""Extract tables from PDF."""
passwords: list[str] | None = None
"""List of passwords to try when opening encrypted PDFs"""
extract_metadata: bool = True
"""Extract PDF metadata"""
hierarchy: HierarchyConfig | None = None
"""Hierarchy extraction configuration (None = hierarchy extraction disabled)"""
extract_annotations: bool = False
"""Extract PDF annotations (text notes, highlights, links, stamps). Default: false"""
top_margin_fraction: float | None = None
"""Top margin fraction (0.0-1.0) of page height to exclude headers/running heads. Default: 0.06 (6%)"""
bottom_margin_fraction: float | None = None
"""Bottom margin fraction (0.0-1.0) of page height to exclude footers/page numbers. Default: 0.05 (5%)"""
allow_single_column_tables: bool = False
"""Allow single-column pseudo tables in extraction results."""
ocr_inline_images: bool = False
"""Perform OCR on inline images extracted from PDF pages and attach the recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract to be available; if `ExtractionConfig.ocr` is `None` the extractor falls back to `TesseractConfig::default()`. Per-image failures degrade gracefully (the image is returned without OCR text rather than failing the whole extraction). Default: `false`."""
@dataclass(frozen=True, slots=True)
class HierarchyConfig:
"""Hierarchy extraction configuration for PDF text structure analysis."""
enabled: bool = True
"""Enable hierarchy extraction"""
k_clusters: int = 3
"""Number of font size clusters to use for hierarchy levels (1-7)"""
include_bbox: bool = True
"""Include bounding box information in hierarchy blocks"""
ocr_coverage_threshold: float | None = None
"""OCR coverage threshold for smart OCR triggering (0.0-1.0)"""
@dataclass(frozen=True, slots=True)
class PostProcessorConfig:
"""Post-processor configuration."""
enabled: bool = True
"""Enable post-processors"""
enabled_processors: list[str] | None = None
"""Whitelist of processor names to run (None = all enabled)"""
disabled_processors: list[str] | None = None
"""Blacklist of processor names to skip (None = none disabled)"""
enabled_set: list[str] | None = None
"""Pre-computed AHashSet for O(1) enabled processor lookup"""
disabled_set: list[str] | None = None
"""Pre-computed AHashSet for O(1) disabled processor lookup"""
@dataclass(frozen=True, slots=True)
class ChunkingConfig:
"""Chunking configuration."""
max_characters: int = 1000
"""Maximum size per chunk (in units determined by `sizing`)."""
overlap: int = 200
"""Overlap between chunks (in units determined by `sizing`)."""
trim: bool = True
"""Whether to trim whitespace from chunk boundaries."""
chunker_type: ChunkerType | str = "text"
"""Type of chunker to use (Text or Markdown)."""
embedding: EmbeddingConfig | None = None
"""Optional embedding configuration for chunk embeddings."""
preset: str | None = None
"""Use a preset configuration (overrides individual settings if provided)."""
sizing: ChunkSizing | None = None
"""How to measure chunk size."""
prepend_heading_context: bool = False
"""When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string."""
topic_threshold: float | None = None
"""Optional cosine similarity threshold for semantic topic boundary detection."""
@dataclass(frozen=True, slots=True)
class EmbeddingConfig:
"""Embedding configuration for text chunks."""
model: EmbeddingModelType | None = None
"""The embedding model to use (defaults to "balanced" preset if not specified)"""
normalize: bool = True
"""Whether to normalize embedding vectors (recommended for cosine similarity)"""
batch_size: int = 32
"""Batch size for embedding generation"""
show_download_progress: bool = False
"""Show model download progress"""
cache_dir: str | None = None
"""Custom cache directory for model files"""
acceleration: AccelerationConfig | None = None
"""Hardware acceleration for the embedding ONNX model."""
max_embed_duration_secs: int | None = None
"""Maximum wall-clock duration (in seconds) for a single `embed()` call when using [`EmbeddingModelType::Plugin`]."""
@dataclass(frozen=True, slots=True)
class TreeSitterConfig:
"""Configuration for tree-sitter language pack integration."""
enabled: bool = True
"""Enable code intelligence processing (default: true)."""
cache_dir: str | None = None
"""Custom cache directory for downloaded grammars."""
languages: list[str] | None = None
"""Languages to pre-download on init (e.g., `["python", "rust"]`)."""
groups: list[str] | None = None
"""Language groups to pre-download (e.g., `["web", "systems", "scripting"]`)."""
process: TreeSitterProcessConfig | None = None
"""Processing options for code analysis."""
@dataclass(frozen=True, slots=True)
class TreeSitterProcessConfig:
"""Processing options for tree-sitter code analysis."""
structure: bool = True
"""Extract structural items (functions, classes, structs, etc.). Default: true."""
imports: bool = True
"""Extract import statements. Default: true."""
exports: bool = True
"""Extract export statements. Default: true."""
comments: bool = False
"""Extract comments. Default: false."""
docstrings: bool = False
"""Extract docstrings. Default: false."""
symbols: bool = False
"""Extract symbol definitions. Default: false."""
diagnostics: bool = False
"""Include parse diagnostics. Default: false."""
chunk_max_size: int | None = None
"""Maximum chunk size in bytes. `None` disables chunking."""
content_mode: CodeContentMode | str = "chunks"
"""Content rendering mode for code extraction."""
@dataclass(frozen=True, slots=True)
class ServerConfig:
"""API server configuration."""
host: str = ""
"""Server host address (e.g., "127.0.0.1", "0.0.0.0")"""
port: int = 0
"""Server port number"""
cors_origins: list[str] = field(default_factory=list)
"""CORS allowed origins. Empty vector means allow all origins."""
max_request_body_bytes: int = 0
"""Maximum size of request body in bytes (default: 100 MB)"""
max_multipart_field_bytes: int = 0
"""Maximum size of multipart fields in bytes (default: 100 MB)"""
@dataclass(frozen=True, slots=True)
class DocxAppProperties:
"""Application properties from docProps/app.xml for DOCX."""
application: str | None = None
"""Application name (e.g., "Microsoft Office Word")"""
app_version: str | None = None
"""Application version"""
template: str | None = None
"""Template filename"""
total_time: int | None = None
"""Total editing time in minutes"""
pages: int | None = None
"""Number of pages"""
words: int | None = None
"""Number of words"""
characters: int | None = None
"""Number of characters (excluding spaces)"""
characters_with_spaces: int | None = None
"""Number of characters (including spaces)"""
lines: int | None = None
"""Number of lines"""
paragraphs: int | None = None
"""Number of paragraphs"""
company: str | None = None
"""Company name"""
doc_security: int | None = None
"""Document security level"""
scale_crop: bool | None = None
"""Scale crop flag"""
links_up_to_date: bool | None = None
"""Links up to date flag"""
shared_doc: bool | None = None
"""Shared document flag"""
hyperlinks_changed: bool | None = None
"""Hyperlinks changed flag"""
@dataclass(frozen=True, slots=True)
class XlsxAppProperties:
"""Application properties from docProps/app.xml for XLSX."""
application: str | None = None
"""Application name (e.g., "Microsoft Excel")"""
app_version: str | None = None
"""Application version"""
doc_security: int | None = None
"""Document security level"""
scale_crop: bool | None = None
"""Scale crop flag"""
links_up_to_date: bool | None = None
"""Links up to date flag"""
shared_doc: bool | None = None
"""Shared document flag"""
hyperlinks_changed: bool | None = None
"""Hyperlinks changed flag"""
company: str | None = None
"""Company name"""
worksheet_names: list[str] = field(default_factory=list)
"""Worksheet names"""
@dataclass(frozen=True, slots=True)
class PptxAppProperties:
"""Application properties from docProps/app.xml for PPTX."""
application: str | None = None
"""Application name (e.g., "Microsoft Office PowerPoint")"""
app_version: str | None = None
"""Application version"""
total_time: int | None = None
"""Total editing time in minutes"""
company: str | None = None
"""Company name"""
doc_security: int | None = None
"""Document security level"""
scale_crop: bool | None = None
"""Scale crop flag"""
links_up_to_date: bool | None = None
"""Links up to date flag"""
shared_doc: bool | None = None
"""Shared document flag"""
hyperlinks_changed: bool | None = None
"""Hyperlinks changed flag"""
slides: int | None = None
"""Number of slides"""
notes: int | None = None
"""Number of notes"""
hidden_slides: int | None = None
"""Number of hidden slides"""
multimedia_clips: int | None = None
"""Number of multimedia clips"""
presentation_format: str | None = None
"""Presentation format (e.g., "Widescreen", "Standard")"""
slide_titles: list[str] = field(default_factory=list)
"""Slide titles"""
@dataclass(frozen=True, slots=True)
class CoreProperties:
"""Dublin Core metadata from docProps/core.xml."""
title: str | None = None
"""Document title"""
subject: str | None = None
"""Document subject/topic"""
creator: str | None = None
"""Document creator/author"""
keywords: str | None = None
"""Keywords or tags"""
description: str | None = None
"""Document description/abstract"""
last_modified_by: str | None = None
"""User who last modified the document"""
revision: str | None = None
"""Revision number"""
created: str | None = None
"""Creation timestamp (ISO 8601)"""
modified: str | None = None
"""Last modification timestamp (ISO 8601)"""
category: str | None = None
"""Document category"""
content_status: str | None = None
"""Content status (Draft, Final, etc.)"""
language: str | None = None
"""Document language"""
identifier: str | None = None
"""Unique identifier"""
version: str | None = None
"""Document version"""
last_printed: str | None = None
"""Last print timestamp (ISO 8601)"""
@dataclass(frozen=True, slots=True)
class SecurityLimits:
"""Configuration for security limits across extractors."""
max_archive_size: int = 524288000
"""Maximum uncompressed size for archives (500 MB)"""
max_compression_ratio: int = 100
"""Maximum compression ratio before flagging as potential bomb (100:1)"""
max_files_in_archive: int = 10000
"""Maximum number of files in archive (10,000)"""
max_nesting_depth: int = 1024
"""Maximum nesting depth for structures (100)"""
max_entity_length: int = 1048576
"""Maximum length of any single XML entity / attribute / token (1 MiB). This is a per-token cap, NOT a total cap -- billion-laughs class attacks where a single entity expands to hundreds of MB are caught here, while normal long text content (a paragraph, a CDATA block) is caught by `max_content_size` instead."""
max_content_size: int = 104857600
"""Maximum string growth per document (100 MB)"""
max_iterations: int = 10000000
"""Maximum iterations per operation"""
max_xml_depth: int = 1024
"""Maximum XML depth (100 levels)"""
max_table_cells: int = 100000
"""Maximum cells per table (100,000)"""
@dataclass(frozen=True, slots=True)
class TokenReductionConfig:
"""Token reduction config."""
level: ReductionLevel | str = "moderate"
language_hint: str | None = None
preserve_markdown: bool = False
preserve_code: bool = True
semantic_threshold: float = 0.3
enable_parallel: bool = True
use_simd: bool = True
custom_stopwords: dict[str, list[str]] | None = None
preserve_patterns: list[str] = field(default_factory=list)
target_reduction: float | None = None
enable_semantic_clustering: bool = False
@dataclass(frozen=True, slots=True)
class DocumentStructure:
"""Top-level structured document representation."""
nodes: list[DocumentNode] = field(default_factory=list)
"""All nodes in document/reading order."""
source_format: str | None = None
"""Origin format identifier (e.g. "docx", "pptx", "html", "pdf")."""
relationships: list[DocumentRelationship] = field(default_factory=list)
"""Resolved relationships between nodes (footnote refs, citations, anchor links, etc.)."""
node_types: list[str] = field(default_factory=list)
"""Sorted, deduplicated list of node type names present in this document."""
@dataclass(frozen=True, slots=True)
class TableGrid:
"""Structured table grid with cell-level metadata."""
rows: int = 0
"""Number of rows in the table."""
cols: int = 0
"""Number of columns in the table."""
cells: list[GridCell] = field(default_factory=list)
"""All cells in row-major order."""
class ExtractionResult(TypedDict, total=False):
    """General extraction result used by the core extraction API.

    Declared with ``total=False``, so every key is optional at the type level.
    """

    content: str
    mime_type: str
    metadata: Metadata

    extraction_method: ExtractionMethod | str | None
    """Extraction strategy that produced the returned text."""

    tables: list[Table]
    detected_languages: list[str] | None

    chunks: list[Chunk] | None
    """Text chunks, when chunking is enabled."""

    images: list[ExtractedImage] | None
    """Images extracted from the document."""

    pages: list[PageContent] | None
    """Per-page content, when page extraction is enabled."""

    elements: list[Element] | None
    """Semantic elements, when the element-based result format is enabled."""

    djot_content: DjotContent | None
    """Rich Djot content structure (when extracting Djot documents)."""

    ocr_elements: list[OcrElement] | None
    """OCR elements with full spatial and confidence metadata."""

    document: DocumentStructure | None
    """Structured document tree (when document structure extraction is enabled)."""

    extracted_keywords: list[Keyword] | None
    """Extracted keywords, when keyword extraction is enabled."""

    quality_score: float | None
    """Document quality score from quality analysis."""

    processing_warnings: list[ProcessingWarning]
    """Non-fatal warnings collected during processing pipeline stages."""

    annotations: list[PdfAnnotation] | None
    """PDF annotations extracted from the document."""

    children: list[ArchiveEntry] | None
    """Nested extraction results from archive contents."""

    uris: list[ExtractedUri] | None
    """URIs/links discovered during document extraction."""

    revisions: list[DocumentRevision] | None
    """Tracked changes embedded in the source document."""

    structured_output: str | None
    """Structured extraction output from LLM-based JSON schema extraction."""

    code_intelligence: str | None
    """Code intelligence results from tree-sitter analysis."""

    llm_usage: list[LlmUsage] | None
    """LLM token usage and cost data for all LLM calls made during this extraction."""

    formatted_content: str | None
    """Pre-rendered content in the requested output format."""

    ocr_internal_document: str | None
    """Structured hOCR document for the OCR+layout pipeline."""
@dataclass(frozen=True, slots=True)
class LlmUsage:
"""Token usage and cost data for a single LLM call made during extraction."""
model: str = ""
"""The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514")."""
source: str = ""
"""The pipeline stage that triggered this LLM call (e.g. "vlm_ocr", "structured_extraction", "embeddings")."""
input_tokens: int | None = None
"""Number of input/prompt tokens consumed."""
output_tokens: int | None = None
"""Number of output/completion tokens generated."""
total_tokens: int | None = None
"""Total tokens (input + output)."""
estimated_cost: float | None = None
"""Estimated cost in USD based on the provider's published pricing."""
finish_reason: str | None = None
"""Why the model stopped generating (e.g. "stop", "length", "content_filter")."""
@dataclass(frozen=True, slots=True)
class BoundingBox:
"""Bounding box coordinates for element positioning."""
x0: float = 0.0
"""Left x-coordinate"""
y0: float = 0.0
"""Bottom y-coordinate"""
x1: float = 0.0
"""Right x-coordinate"""
y1: float = 0.0
"""Top y-coordinate"""
@dataclass(frozen=True, slots=True)
class ImagePreprocessingConfig:
"""Image preprocessing configuration for OCR."""
target_dpi: int = 300
"""Target DPI for the image (300 is standard, 600 for small text)."""
auto_rotate: bool = True
"""Auto-detect and correct image rotation."""
deskew: bool = True
"""Correct skew (tilted images)."""
denoise: bool = False
"""Remove noise from the image."""
contrast_enhance: bool = False
"""Enhance contrast for better text visibility."""
binarization_method: str = "otsu"
"""Binarization method: "otsu", "sauvola", "adaptive"."""
invert_colors: bool = False
"""Invert colors (white text on black → black on white)."""
@dataclass(frozen=True, slots=True)
class TesseractConfig:
"""Tesseract OCR configuration."""
language: str = "eng"
"""Language code (e.g., "eng", "deu", "fra")"""
psm: int = 3
"""Page Segmentation Mode (0-13)."""
output_format: str = "markdown"
"""Output format ("text" or "markdown")"""
oem: int = 3
"""OCR Engine Mode (0-3)."""
min_confidence: float = 0
"""Minimum confidence threshold (0.0-100.0)."""
preprocessing: ImagePreprocessingConfig | None = None
"""Image preprocessing configuration."""
enable_table_detection: bool = True
"""Enable automatic table detection and reconstruction"""
table_min_confidence: float = 0
"""Minimum confidence threshold for table detection (0.0-1.0)"""
table_column_threshold: int = 50
"""Column threshold for table detection (pixels)"""
table_row_threshold_ratio: float = 0.5
"""Row threshold ratio for table detection (0.0-1.0)"""
use_cache: bool = True
"""Enable OCR result caching"""
classify_use_pre_adapted_templates: bool = True
"""Use pre-adapted templates for character classification"""
language_model_ngram_on: bool = False
"""Enable N-gram language model"""
tessedit_dont_blkrej_good_wds: bool = True
"""Don't reject good words during block-level processing"""
tessedit_dont_rowrej_good_wds: bool = True
"""Don't reject good words during row-level processing"""
tessedit_enable_dict_correction: bool = True
"""Enable dictionary correction"""
tessedit_char_whitelist: str = ""
"""Whitelist of allowed characters (empty = all allowed)"""
tessedit_char_blacklist: str = ""
"""Blacklist of forbidden characters (empty = none forbidden)"""
tessedit_use_primary_params_model: bool = True
"""Use primary language params model"""
textord_space_size_is_variable: bool = True
"""Variable-width space detection"""
thresholding_method: bool = False
"""Use adaptive thresholding method"""
@dataclass(frozen=True, slots=True)
class Metadata:
"""Extraction result metadata."""
title: str | None = None
"""Document title"""
subject: str | None = None
"""Document subject or description"""
authors: list[str] | None = None
"""Primary author(s) - always Vec for consistency"""
keywords: list[str] | None = None
"""Keywords/tags - always Vec for consistency"""
language: str | None = None
"""Primary language (ISO 639 code)"""
created_at: str | None = None
"""Creation timestamp (ISO 8601 format)"""
modified_at: str | None = None
"""Last modification timestamp (ISO 8601 format)"""
created_by: str | None = None
"""User who created the document"""
modified_by: str | None = None
"""User who last modified the document"""
pages: PageStructure | None = None
"""Page/slide/sheet structure with boundaries"""
format: FormatMetadata | None = None
"""Format-specific metadata (discriminated union)"""
image_preprocessing: ImagePreprocessingMetadata | None = None
"""Image preprocessing metadata (when OCR preprocessing was applied)"""
json_schema: str | None = None
"""JSON schema (for structured data extraction)"""
error: ErrorMetadata | None = None
"""Error metadata (for batch operations)"""
extraction_duration_ms: int | None = None
"""Extraction duration in milliseconds (for benchmarking)."""
category: str | None = None
"""Document category (from frontmatter or classification)."""
tags: list[str] | None = None
"""Document tags (from frontmatter)."""
document_version: str | None = None
"""Document version string (from frontmatter)."""
abstract_text: str | None = None
"""Abstract or summary text (from frontmatter)."""
output_format: str | None = None
"""Output format identifier (e.g., "markdown", "html", "text")."""
ocr_used: bool = False
"""Whether OCR was used during extraction."""
additional: dict[str, str] = field(default_factory=dict)
"""Additional custom fields from postprocessors."""
@dataclass(frozen=True, slots=True)
class ExcelMetadata:
"""Excel/spreadsheet format metadata."""
sheet_count: int | None = None
"""Number of sheets in the workbook."""
sheet_names: list[str] | None = None
"""Names of all sheets in the workbook."""
@dataclass(frozen=True, slots=True)
class EmailMetadata:
"""Email metadata extracted from .eml and .msg files."""
from_email: str | None = None
"""Sender's email address"""
from_name: str | None = None
"""Sender's display name"""
to_emails: list[str] = field(default_factory=list)
"""Primary recipients"""
cc_emails: list[str] = field(default_factory=list)
"""CC recipients"""
bcc_emails: list[str] = field(default_factory=list)
"""BCC recipients"""
message_id: str | None = None
"""Message-ID header value"""
attachments: list[str] = field(default_factory=list)
"""List of attachment filenames"""
@dataclass(frozen=True, slots=True)
class ArchiveMetadata:
"""Archive (ZIP/TAR/7Z) metadata."""
format: str = ""
"""Archive format ("ZIP", "TAR", "7Z", etc.)"""
file_count: int = 0
"""Total number of files in the archive"""
file_list: list[str] = field(default_factory=list)
"""List of file paths within the archive"""
total_size: int = 0
"""Total uncompressed size in bytes"""
compressed_size: int | None = None
"""Compressed size in bytes (if available)"""
@dataclass(frozen=True, slots=True)
class ImageMetadata:
"""Image metadata extracted from image files."""
width: int = 0
"""Image width in pixels"""
height: int = 0
"""Image height in pixels"""
format: str = ""
"""Image format (e.g., "PNG", "JPEG", "TIFF")"""
exif: dict[str, str] = field(default_factory=dict)
"""EXIF metadata tags"""
@dataclass(frozen=True, slots=True)
class XmlMetadata:
"""XML metadata extracted during XML parsing."""
element_count: int = 0
"""Total number of XML elements processed"""
unique_elements: list[str] = field(default_factory=list)
"""List of unique element tag names (sorted)"""
@dataclass(frozen=True, slots=True)
class TextMetadata:
"""Text/Markdown metadata."""
line_count: int = 0
"""Number of lines in the document"""
word_count: int = 0
"""Number of words"""
character_count: int = 0
"""Number of characters"""
headers: list[str] | None = None
"""Markdown headers (headings text only, for Markdown files)"""
links: list[list[str]] | None = None
"""Markdown links as (text, url) tuples (for Markdown files)"""
code_blocks: list[list[str]] | None = None
"""Code blocks as (language, code) tuples (for Markdown files)"""
@dataclass(frozen=True, slots=True)
class HtmlMetadata:
"""HTML metadata extracted from HTML documents."""
title: str | None = None
"""Document title from `<title>` tag"""
description: str | None = None
"""Document description from `<meta name="description">` tag"""
keywords: list[str] = field(default_factory=list)
"""Document keywords from `<meta name="keywords">` tag, split on commas"""
author: str | None = None
"""Document author from `<meta name="author">` tag"""
canonical_url: str | None = None
"""Canonical URL from `<link rel="canonical">` tag"""
base_href: str | None = None
"""Base URL from `<base href="">` tag for resolving relative URLs"""
language: str | None = None
"""Document language from `lang` attribute"""
text_direction: TextDirection | str | None = None
"""Document text direction from `dir` attribute"""
open_graph: dict[str, str] = field(default_factory=dict)
"""Open Graph metadata (og:* properties) for social media Keys like "title", "description", "image", "url", etc."""
twitter_card: dict[str, str] = field(default_factory=dict)
"""Twitter Card metadata (twitter:* properties) Keys like "card", "site", "creator", "title", "description", "image", etc."""
meta_tags: dict[str, str] = field(default_factory=dict)
"""Additional meta tags not covered by specific fields Keys are meta name/property attributes, values are content"""
headers: list[HeaderMetadata] = field(default_factory=list)
"""Extracted header elements with hierarchy"""
links: list[LinkMetadata] = field(default_factory=list)
"""Extracted hyperlinks with type classification"""
images: list[ImageMetadataType] = field(default_factory=list)
"""Extracted images with source and dimensions"""
structured_data: list[StructuredData] = field(default_factory=list)
"""Extracted structured data blocks"""
@dataclass(frozen=True, slots=True)
class OcrMetadata:
"""OCR processing metadata."""
language: str = ""
"""OCR language code(s) used"""
psm: int = 0
"""Tesseract Page Segmentation Mode (PSM)"""
output_format: str = ""
"""Output format (e.g., "text", "hocr")"""
table_count: int = 0
"""Number of tables detected"""
table_rows: int | None = None
table_cols: int | None = None
@dataclass(frozen=True, slots=True)
class PptxMetadata:
"""PowerPoint presentation metadata."""
slide_count: int = 0
"""Total number of slides in the presentation"""
slide_names: list[str] = field(default_factory=list)
"""Names of slides (if available)"""
image_count: int | None = None
"""Number of embedded images"""
table_count: int | None = None
"""Number of tables"""
@dataclass(frozen=True, slots=True)
class DocxMetadata:
"""Word document metadata."""
core_properties: CoreProperties | None = None
"""Core properties from docProps/core.xml (Dublin Core metadata)"""
app_properties: DocxAppProperties | None = None
"""Application properties from docProps/app.xml (Word-specific statistics)"""
custom_properties: dict[str, str] | None = None
"""Custom properties from docProps/custom.xml (user-defined properties)"""
@dataclass(frozen=True, slots=True)
class CsvMetadata:
"""CSV/TSV file metadata."""
row_count: int = 0
column_count: int = 0
delimiter: str | None = None
has_header: bool = False
column_types: list[str] | None = None
@dataclass(frozen=True, slots=True)
class BibtexMetadata:
"""BibTeX bibliography metadata."""
entry_count: int = 0
"""Number of entries in the bibliography."""
citation_keys: list[str] = field(default_factory=list)
authors: list[str] = field(default_factory=list)
year_range: YearRange | None = None
entry_types: dict[str, int] | None = None
@dataclass(frozen=True, slots=True)
class CitationMetadata:
"""Citation file metadata (RIS, PubMed, EndNote)."""
citation_count: int = 0
format: str | None = None
authors: list[str] = field(default_factory=list)
year_range: YearRange | None = None
dois: list[str] = field(default_factory=list)
keywords: list[str] = field(default_factory=list)
@dataclass(frozen=True, slots=True)
class FictionBookMetadata:
"""FictionBook (FB2) metadata."""
genres: list[str] = field(default_factory=list)
sequences: list[str] = field(default_factory=list)
annotation: str | None = None
@dataclass(frozen=True, slots=True)
class DbfMetadata:
"""dBASE (DBF) file metadata."""
record_count: int = 0
field_count: int = 0
fields: list[DbfFieldInfo] = field(default_factory=list)
@dataclass(frozen=True, slots=True)
class JatsMetadata:
"""JATS (Journal Article Tag Suite) metadata."""
copyright: str | None = None
license: str | None = None
history_dates: dict[str, str] = field(default_factory=dict)
contributor_roles: list[ContributorRole] = field(default_factory=list)
@dataclass(frozen=True, slots=True)
class EpubMetadata:
"""EPUB metadata (Dublin Core extensions)."""
coverage: str | None = None
dc_format: str | None = None
relation: str | None = None
source: str | None = None
dc_type: str | None = None
cover_image: str | None = None
@dataclass(frozen=True, slots=True)
class PstMetadata:
"""Outlook PST archive metadata."""
message_count: int = 0
@dataclass(frozen=True, slots=True)
class OcrConfidence:
"""Confidence scores for an OCR element."""
detection: float | None = None
"""Detection confidence: how confident the OCR engine is that text exists here."""
recognition: float = 0.0
"""Recognition confidence: how confident about the text content."""
@dataclass(frozen=True, slots=True)
class OcrElement:
"""A unified OCR element representing detected text with full metadata."""
text: str = ""
"""The recognized text content."""
geometry: OcrBoundingGeometry | None = None
"""Bounding geometry (rectangle or quadrilateral)."""
confidence: OcrConfidence | None = None
"""Confidence scores for detection and recognition."""
level: OcrElementLevel | str = "line"
"""Hierarchical level (word, line, block, page)."""
rotation: OcrRotation | None = None
"""Rotation information (if detected)."""
page_number: int = 0
"""Page number (1-indexed)."""
parent_id: str | None = None
"""Parent element ID for hierarchical relationships."""
backend_metadata: dict[str, str] = field(default_factory=dict)
"""Backend-specific metadata that doesn't fit the unified schema."""
@dataclass(frozen=True, slots=True)
class OcrElementConfig:
"""Configuration for OCR element extraction."""
include_elements: bool = False
"""Whether to include OCR elements in the extraction result."""
min_level: OcrElementLevel | str = "line"
"""Minimum hierarchical level to include."""
min_confidence: float = 0.0
"""Minimum recognition confidence threshold (0.0-1.0)."""
build_hierarchy: bool = False
"""Whether to build hierarchical relationships between elements."""
@dataclass(frozen=True, slots=True)
class LayoutRegion:
"""A detected layout region on a page."""
class_name: str = ""
"""Layout class name (e.g. "picture", "table", "text", "section_header")."""
confidence: float = 0.0
"""Confidence score from the layout detection model (0.0 to 1.0)."""
bounding_box: BoundingBox | None = None
"""Bounding box in document coordinate space."""
area_fraction: float = 0.0
"""Fraction of the page area covered by this region (0.0 to 1.0)."""
@dataclass(frozen=True, slots=True)
class RevisionDelta:
"""The content changes that make up a single revision."""
content: list[DiffLine] = field(default_factory=list)
"""Line-level content changes for this revision."""
table_changes: list[CellChange] = field(default_factory=list)
"""Cell-level table changes for this revision."""
@dataclass(frozen=True, slots=True)
class Table:
"""Extracted table structure."""
cells: list[list[str]] = field(default_factory=list)
"""Table cells as a 2D vector (rows x columns)"""
markdown: str = ""
"""Markdown representation of the table"""
page_number: int = 0
"""Page number where the table was found (1-indexed)"""
bounding_box: BoundingBox | None = None
"""Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top). Only populated for PDF-extracted tables when position data is available."""
@dataclass(frozen=True, slots=True)
class TableCell:
"""Individual table cell with content and optional styling."""
content: str = ""
"""Cell content as text"""
row_span: int = 0
"""Row span (number of rows this cell spans)"""
col_span: int = 0
"""Column span (number of columns this cell spans)"""
is_header: bool = False
"""Whether this is a header cell"""
@dataclass(frozen=True, slots=True)
class DiffOptions:
"""Options controlling how two `ExtractionResult` values are compared."""
include_metadata: bool = True
"""Include metadata changes in the diff. Default: `true`."""
include_embedded: bool = True
"""Include embedded-children changes in the diff. Default: `true`."""
max_content_chars: int | None = None
"""Truncate content to this many characters before diffing."""
@dataclass(frozen=True, slots=True)
class YakeParams:
"""YAKE-specific parameters."""
window_size: int = 2
"""Window size for co-occurrence analysis (default: 2)."""
@dataclass(frozen=True, slots=True)
class RakeParams:
"""RAKE-specific parameters."""
min_word_length: int = 1
"""Minimum word length to consider (default: 1)."""
max_words_per_phrase: int = 3
"""Maximum words in a keyword phrase (default: 3)."""
@dataclass(frozen=True, slots=True)
class KeywordConfig:
"""Keyword extraction configuration."""
algorithm: KeywordAlgorithm | str = "yake"
"""Algorithm to use for extraction."""
max_keywords: int = 10
"""Maximum number of keywords to extract (default: 10)."""
min_score: float = 0
"""Minimum score threshold (0.0-1.0, default: 0.0)."""
ngram_range: list[int] = field(default_factory=list)
"""N-gram range for keyword extraction (min, max)."""
language: str | None = None
"""Language code for stopword filtering (e.g., "en", "de", "fr")."""
yake_params: YakeParams | None = None
"""YAKE-specific tuning parameters."""
rake_params: RakeParams | None = None
"""RAKE-specific tuning parameters."""
@dataclass(frozen=True, slots=True)
class PaddleOcrConfig:
"""Configuration for PaddleOCR backend."""
language: str = ""
"""Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra")"""
cache_dir: str | None = None
"""Optional custom cache directory for model files"""
use_angle_cls: bool = False
"""Enable angle classification for rotated text (default: false). Can misfire on short text regions, rotating crops incorrectly before recognition."""
enable_table_detection: bool = False
"""Enable table structure detection (default: false)"""
det_db_thresh: float = 0.0
"""Database threshold for text detection (default: 0.3) Range: 0.0-1.0, higher values require more confident detections"""
det_db_box_thresh: float = 0.0
"""Box threshold for text bounding box refinement (default: 0.5) Range: 0.0-1.0"""
det_db_unclip_ratio: float = 0.0
"""Unclip ratio for expanding text bounding boxes (default: 1.6) Controls the expansion of detected text regions"""
det_limit_side_len: int = 0
"""Maximum side length for detection image (default: 960) Larger images may be resized to this limit for faster inference"""
rec_batch_num: int = 0
"""Batch size for recognition inference (default: 6) Number of text regions to process simultaneously"""
padding: int = 0
"""Padding in pixels added around the image before detection (default: 10). Large values can include surrounding content like table gridlines."""
drop_score: float = 0.0
"""Minimum recognition confidence score for text lines (default: 0.5). Text regions with recognition confidence below this threshold are discarded. Matches PaddleOCR Python's `drop_score` parameter. Range: 0.0-1.0"""
model_tier: str = ""
"""Model tier controlling detection/recognition model size and accuracy trade-off. - `"mobile"` (default): Lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference - `"server"`: Large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents"""
@dataclass(frozen=True, slots=True)
class PdfMetadata:
"""PDF-specific metadata."""
pdf_version: str | None = None
"""PDF version (e.g., "1.7", "2.0")"""
producer: str | None = None
"""PDF producer (application that created the PDF)"""
is_encrypted: bool | None = None
"""Whether the PDF is encrypted/password-protected"""
width: int | None = None
"""First page width in points (1/72 inch)"""
height: int | None = None
"""First page height in points (1/72 inch)"""
page_count: int | None = None
"""Total number of pages in the PDF document"""
# Output format for extraction results (plain-string alias).
OutputFormat = str

# How chunk size is measured. The duplicated ``str`` member of the original
# union was redundant; ``str | str | None`` already evaluates to ``str | None``.
ChunkSizing = str | None

# Embedding model types supported by Kreuzberg.
EmbeddingModelType = str | int | LlmConfig

# Format-specific metadata (discriminated union). Member order is preserved;
# the trailing ``str`` member is kept as declared.
FormatMetadata = (
    PdfMetadata
    | DocxMetadata
    | ExcelMetadata
    | EmailMetadata
    | PptxMetadata
    | ArchiveMetadata
    | ImageMetadata
    | XmlMetadata
    | TextMetadata
    | HtmlMetadata
    | OcrMetadata
    | CsvMetadata
    | BibtexMetadata
    | CitationMetadata
    | FictionBookMetadata
    | DbfMetadata
    | JatsMetadata
    | EpubMetadata
    | PstMetadata
    | str
)

# Bounding geometry for an OCR element.
OcrBoundingGeometry = int | str

# A single line in a unified-diff hunk (plain-string alias).
DiffLine = str