2672 lines
63 KiB
Python
2672 lines
63 KiB
Python
|
|
# This file is auto-generated by alef — DO NOT EDIT.
|
||
|
|
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||
|
|
# To regenerate: alef generate
|
||
|
|
# To verify freshness: alef verify --exit-code
|
||
|
|
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||
|
|
"""Configuration options for the conversion API."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from dataclasses import dataclass, field
|
||
|
|
from enum import Enum
|
||
|
|
from typing import TYPE_CHECKING, Any, TypedDict
|
||
|
|
|
||
|
|
from ._kreuzberg import (
|
||
|
|
ChunkerType, # noqa: TC001
|
||
|
|
CodeContentMode, # noqa: TC001
|
||
|
|
ExecutionProviderType, # noqa: TC001
|
||
|
|
ExtractionMethod, # noqa: TC001
|
||
|
|
HtmlTheme, # noqa: TC001
|
||
|
|
KeywordAlgorithm, # noqa: TC001
|
||
|
|
OcrElementLevel, # noqa: TC001
|
||
|
|
ReductionLevel, # noqa: TC001
|
||
|
|
ResultFormat, # noqa: TC001
|
||
|
|
TableModel, # noqa: TC001
|
||
|
|
TextDirection, # noqa: TC001
|
||
|
|
)
|
||
|
|
|
||
|
|
if TYPE_CHECKING:
|
||
|
|
from ._kreuzberg import (
|
||
|
|
ArchiveEntry, # noqa: F401
|
||
|
|
CellChange, # noqa: F401
|
||
|
|
Chunk, # noqa: F401
|
||
|
|
ContributorRole, # noqa: F401
|
||
|
|
DbfFieldInfo, # noqa: F401
|
||
|
|
DjotContent, # noqa: F401
|
||
|
|
DocumentNode, # noqa: F401
|
||
|
|
DocumentRelationship, # noqa: F401
|
||
|
|
DocumentRevision, # noqa: F401
|
||
|
|
Element, # noqa: F401
|
||
|
|
ErrorMetadata, # noqa: F401
|
||
|
|
ExtractedImage, # noqa: F401
|
||
|
|
ExtractedUri, # noqa: F401
|
||
|
|
GridCell, # noqa: F401
|
||
|
|
HeaderMetadata, # noqa: F401
|
||
|
|
ImageMetadataType, # noqa: F401
|
||
|
|
ImagePreprocessingMetadata, # noqa: F401
|
||
|
|
Keyword, # noqa: F401
|
||
|
|
LinkMetadata, # noqa: F401
|
||
|
|
OcrPipelineConfig, # noqa: F401
|
||
|
|
OcrRotation, # noqa: F401
|
||
|
|
PageContent, # noqa: F401
|
||
|
|
PageStructure, # noqa: F401
|
||
|
|
PdfAnnotation, # noqa: F401
|
||
|
|
ProcessingWarning, # noqa: F401
|
||
|
|
StructuredData, # noqa: F401
|
||
|
|
StructuredExtractionConfig, # noqa: F401
|
||
|
|
YearRange, # noqa: F401
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
class ListType(str, Enum):
    """Type of list detection.

    Members subclass ``str``, so each compares equal to its string value.
    """

    bullet = "bullet"
    numbered = "numbered"
    lettered = "lettered"
    indented = "indented"
|
||
|
|
|
||
|
|
|
||
|
|
class OcrBackendType(str, Enum):
    """OCR backend types.

    Members subclass ``str``, so each compares equal to its string value.
    """

    tesseract = "tesseract"
    easy_ocr = "easy_ocr"
    paddle_ocr = "paddle_ocr"
    custom = "custom"
|
||
|
|
|
||
|
|
|
||
|
|
class ProcessingStage(str, Enum):
    """Processing stages for post-processors.

    Members subclass ``str``, so each compares equal to its string value.
    """

    early = "early"
    middle = "middle"
    late = "late"
|
||
|
|
|
||
|
|
|
||
|
|
class PdfAnnotationType(str, Enum):
    """Type of PDF annotation.

    Members subclass ``str``, so each compares equal to its string value.
    """

    text = "text"
    highlight = "highlight"
    link = "link"
    stamp = "stamp"
    underline = "underline"
    strike_out = "strike_out"
    other = "other"
|
||
|
|
|
||
|
|
|
||
|
|
class BlockType(str, Enum):
    """Types of block-level elements in Djot.

    Members subclass ``str``, so each compares equal to its string value.
    """

    paragraph = "paragraph"
    heading = "heading"
    blockquote = "blockquote"
    code_block = "code_block"
    list_item = "list_item"
    ordered_list = "ordered_list"
    bullet_list = "bullet_list"
    task_list = "task_list"
    definition_list = "definition_list"
    definition_term = "definition_term"
    definition_description = "definition_description"
    div = "div"
    section = "section"
    thematic_break = "thematic_break"
    raw_block = "raw_block"
    math_display = "math_display"
|
||
|
|
|
||
|
|
|
||
|
|
class InlineType(str, Enum):
    """Types of inline elements in Djot.

    Members subclass ``str``, so each compares equal to its string value.
    """

    text = "text"
    strong = "strong"
    emphasis = "emphasis"
    highlight = "highlight"
    subscript = "subscript"
    superscript = "superscript"
    insert = "insert"
    delete = "delete"
    code = "code"
    link = "link"
    image = "image"
    span = "span"
    math = "math"
    raw_inline = "raw_inline"
    footnote_ref = "footnote_ref"
    symbol = "symbol"
|
||
|
|
|
||
|
|
|
||
|
|
class RelationshipKind(str, Enum):
    """Semantic kind of a relationship between document elements.

    Members subclass ``str``, so each compares equal to its string value.
    """

    footnote_reference = "footnote_reference"
    citation_reference = "citation_reference"
    internal_link = "internal_link"
    caption = "caption"
    label = "label"
    toc_entry = "toc_entry"
    cross_reference = "cross_reference"
|
||
|
|
|
||
|
|
|
||
|
|
class ContentLayer(str, Enum):
    """Content layer classification for document nodes.

    Members subclass ``str``, so each compares equal to its string value.
    """

    body = "body"
    header = "header"
    footer = "footer"
    footnote = "footnote"
|
||
|
|
|
||
|
|
|
||
|
|
class ChunkType(str, Enum):
    """Semantic structural classification of a text chunk.

    Members subclass ``str``, so each compares equal to its string value.
    """

    heading = "heading"
    party_list = "party_list"
    definitions = "definitions"
    operative_clause = "operative_clause"
    signature_block = "signature_block"
    schedule = "schedule"
    table_like = "table_like"
    formula = "formula"
    code_block = "code_block"
    image = "image"
    org_chart = "org_chart"
    diagram = "diagram"
    unknown = "unknown"
|
||
|
|
|
||
|
|
|
||
|
|
class ImageKind(str, Enum):
    """Heuristic classification of what an image likely depicts.

    Members subclass ``str``, so each compares equal to its string value.
    """

    photograph = "photograph"
    diagram = "diagram"
    chart = "chart"
    drawing = "drawing"
    text_block = "text_block"
    decoration = "decoration"
    logo = "logo"
    icon = "icon"
    tile_fragment = "tile_fragment"
    mask = "mask"
    page_raster = "page_raster"
    unknown = "unknown"
|
||
|
|
|
||
|
|
|
||
|
|
class ElementType(str, Enum):
    """Semantic element type classification.

    Members subclass ``str``, so each compares equal to its string value.
    """

    # Member is named ``title_`` while its value is "title"; a bare ``title``
    # member would shadow the inherited ``str.title`` method.
    title_ = "title"
    narrative_text = "narrative_text"
    heading = "heading"
    list_item = "list_item"
    table = "table"
    image = "image"
    page_break = "page_break"
    code_block = "code_block"
    block_quote = "block_quote"
    footer = "footer"
    header = "header"
|
||
|
|
|
||
|
|
|
||
|
|
class LinkType(str, Enum):
    """Link type classification.

    Members subclass ``str``, so each compares equal to its string value.
    """

    anchor = "anchor"
    internal = "internal"
    external = "external"
    email = "email"
    phone = "phone"
    other = "other"
|
||
|
|
|
||
|
|
|
||
|
|
class ImageType(str, Enum):
    """Image type classification.

    Members subclass ``str``, so each compares equal to its string value.
    Note that two values are hyphenated while their member names use
    underscores.
    """

    data_uri = "data-uri"
    inline_svg = "inline-svg"
    external = "external"
    relative = "relative"
|
||
|
|
|
||
|
|
|
||
|
|
class StructuredDataType(str, Enum):
    """Structured data type classification.

    Members subclass ``str``, so each compares equal to its string value.
    Member names differ from the values for ``json_ld`` ("json-ld") and
    ``rd_fa`` ("rdfa").
    """

    json_ld = "json-ld"
    microdata = "microdata"
    rd_fa = "rdfa"
|
||
|
|
|
||
|
|
|
||
|
|
class PageUnitType(str, Enum):
    """Type of paginated unit in a document.

    Members subclass ``str``, so each compares equal to its string value.
    """

    page = "page"
    slide = "slide"
    sheet = "sheet"
|
||
|
|
|
||
|
|
|
||
|
|
class RevisionKind(str, Enum):
    """Semantic classification of a tracked change.

    Members subclass ``str``, so each compares equal to its string value.
    """

    insertion = "insertion"
    deletion = "deletion"
    format_change = "format_change"
    comment = "comment"
|
||
|
|
|
||
|
|
|
||
|
|
class UriKind(str, Enum):
    """Semantic classification of an extracted URI.

    Members subclass ``str``, so each compares equal to its string value.
    """

    hyperlink = "hyperlink"
    image = "image"
    anchor = "anchor"
    citation = "citation"
    reference = "reference"
    email = "email"
|
||
|
|
|
||
|
|
|
||
|
|
class PSMMode(str, Enum):
    """Page Segmentation Mode for Tesseract OCR.

    Members subclass ``str``, so each compares equal to its string value.
    """

    osd_only = "osd_only"
    auto_osd = "auto_osd"
    auto_only = "auto_only"
    auto = "auto"
    single_column = "single_column"
    single_block_vertical = "single_block_vertical"
    single_block = "single_block"
    single_line = "single_line"
    single_word = "single_word"
    circle_word = "circle_word"
    single_char = "single_char"
|
||
|
|
|
||
|
|
|
||
|
|
class PaddleLanguage(str, Enum):
    """Supported languages in PaddleOCR.

    Members subclass ``str``, so each compares equal to its string value.
    """

    english = "english"
    chinese = "chinese"
    japanese = "japanese"
    korean = "korean"
    german = "german"
    french = "french"
    latin = "latin"
    cyrillic = "cyrillic"
    traditional_chinese = "traditional_chinese"
    thai = "thai"
    greek = "greek"
    east_slavic = "east_slavic"
    arabic = "arabic"
    devanagari = "devanagari"
    tamil = "tamil"
    telugu = "telugu"
|
||
|
|
|
||
|
|
|
||
|
|
class LayoutClass(str, Enum):
    """The 17 canonical document layout classes.

    Members subclass ``str``, so each compares equal to its string value.
    """

    caption = "caption"
    footnote = "footnote"
    formula = "formula"
    list_item = "list_item"
    page_footer = "page_footer"
    page_header = "page_header"
    picture = "picture"
    section_header = "section_header"
    table = "table"
    text = "text"
    # Member is named ``title_`` while its value is "title"; a bare ``title``
    # member would shadow the inherited ``str.title`` method.
    title_ = "title"
    document_index = "document_index"
    code = "code"
    checkbox_selected = "checkbox_selected"
    checkbox_unselected = "checkbox_unselected"
    form = "form"
    key_value_region = "key_value_region"
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class AccelerationConfig:
|
||
|
|
"""Hardware acceleration configuration for ONNX Runtime models."""
|
||
|
|
|
||
|
|
provider: ExecutionProviderType | str = "auto"
|
||
|
|
|
||
|
|
"""Execution provider to use for ONNX inference."""
|
||
|
|
|
||
|
|
device_id: int = 0
|
||
|
|
|
||
|
|
"""GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class ContentFilterConfig:
|
||
|
|
"""Cross-extractor content filtering configuration."""
|
||
|
|
|
||
|
|
include_headers: bool = False
|
||
|
|
|
||
|
|
"""Include running headers in extraction output."""
|
||
|
|
|
||
|
|
include_footers: bool = False
|
||
|
|
|
||
|
|
"""Include running footers in extraction output."""
|
||
|
|
|
||
|
|
strip_repeating_text: bool = True
|
||
|
|
|
||
|
|
"""Enable the heuristic cross-page repeating text detector."""
|
||
|
|
|
||
|
|
include_watermarks: bool = False
|
||
|
|
|
||
|
|
"""Include watermark text in extraction output."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class EmailConfig:
|
||
|
|
"""Configuration for email extraction."""
|
||
|
|
|
||
|
|
msg_fallback_codepage: int | None = None
|
||
|
|
|
||
|
|
"""Windows codepage number to use when an MSG file contains no codepage property. Defaults to `None`, which falls back to windows-1252."""
|
||
|
|
|
||
|
|
|
||
|
|
class ExtractionConfig(TypedDict, total=False):
    """Main extraction configuration.

    Declared with ``total=False``, so every key is optional.
    """

    use_cache: bool
    """Enable caching of extraction results"""

    enable_quality_processing: bool
    """Enable quality post-processing"""

    ocr: OcrConfig | None
    """OCR configuration (None = OCR disabled)"""

    force_ocr: bool
    """Force OCR even for searchable PDFs"""

    force_ocr_pages: list[int] | None
    """Force OCR on specific pages only (1-indexed page numbers, must be >= 1)."""

    disable_ocr: bool
    """Disable OCR entirely, even for images."""

    chunking: ChunkingConfig | None
    """Text chunking configuration (None = chunking disabled)"""

    content_filter: ContentFilterConfig | None
    """Content filtering configuration (None = use extractor defaults)."""

    images: ImageExtractionConfig | None
    """Image extraction configuration (None = no image extraction)"""

    pdf_options: PdfConfig | None
    """PDF-specific options (None = use defaults)"""

    token_reduction: TokenReductionOptions | None
    """Token reduction configuration (None = no token reduction)"""

    language_detection: LanguageDetectionConfig | None
    """Language detection configuration (None = no language detection)"""

    pages: PageConfig | None
    """Page extraction configuration (None = no page tracking)"""

    keywords: KeywordConfig | None
    """Keyword extraction configuration (None = no keyword extraction)"""

    postprocessor: PostProcessorConfig | None
    """Post-processor configuration (None = use defaults)"""

    html_options: str | None
    """HTML to Markdown conversion options (None = use defaults)"""

    html_output: HtmlOutputConfig | None
    """Styled HTML output configuration."""

    extraction_timeout_secs: int | None
    """Default per-file timeout in seconds for batch extraction."""

    max_concurrent_extractions: int | None
    """Maximum concurrent extractions in batch operations
    (None = (num_cpus x 1.5).ceil())."""

    result_format: ResultFormat | str
    """Result structure format"""

    security_limits: SecurityLimits | None
    """Security limits for archive extraction."""

    max_embedded_file_bytes: int | None
    """Maximum uncompressed size in bytes for a single embedded file before
    recursive extraction is attempted (default: 50 MiB)."""

    output_format: OutputFormat
    """Content text format (default: Plain)."""

    layout: LayoutDetectionConfig | None
    """Layout detection configuration (None = layout detection disabled)."""

    use_layout_for_markdown: bool
    """Run layout detection on the non-OCR PDF markdown path."""

    include_document_structure: bool
    """Enable structured document tree output."""

    acceleration: AccelerationConfig | None
    """Hardware acceleration configuration for ONNX Runtime models."""

    cache_namespace: str | None
    """Cache namespace for tenant isolation."""

    cache_ttl_secs: int | None
    """Per-request cache TTL in seconds."""

    email: EmailConfig | None
    """Email extraction configuration (None = use defaults)."""

    concurrency: str | None
    """Concurrency limits for constrained environments (None = use defaults)."""

    max_archive_depth: int
    """Maximum recursion depth for archive extraction (default: 3). Set to 0
    to disable recursive extraction (legacy behavior)."""

    tree_sitter: TreeSitterConfig | None
    """Tree-sitter language pack configuration (None = tree-sitter disabled)."""

    structured_extraction: StructuredExtractionConfig | None
    """Structured extraction via LLM (None = disabled)."""

    cancel_token: str | None
    """Cancellation token for this extraction (None = no external cancellation)."""
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class FileExtractionConfig:
|
||
|
|
"""Per-file extraction configuration overrides for batch processing."""
|
||
|
|
|
||
|
|
enable_quality_processing: bool | None = None
|
||
|
|
|
||
|
|
"""Override quality post-processing for this file."""
|
||
|
|
|
||
|
|
ocr: OcrConfig | None = None
|
||
|
|
|
||
|
|
"""Override OCR configuration for this file (None in the Option = use batch default)."""
|
||
|
|
|
||
|
|
force_ocr: bool | None = None
|
||
|
|
|
||
|
|
"""Override force OCR for this file."""
|
||
|
|
|
||
|
|
force_ocr_pages: list[int] | None = None
|
||
|
|
|
||
|
|
"""Override force OCR pages for this file (1-indexed page numbers)."""
|
||
|
|
|
||
|
|
disable_ocr: bool | None = None
|
||
|
|
|
||
|
|
"""Override disable OCR for this file."""
|
||
|
|
|
||
|
|
chunking: ChunkingConfig | None = None
|
||
|
|
|
||
|
|
"""Override chunking configuration for this file."""
|
||
|
|
|
||
|
|
content_filter: ContentFilterConfig | None = None
|
||
|
|
|
||
|
|
"""Override content filtering configuration for this file."""
|
||
|
|
|
||
|
|
images: ImageExtractionConfig | None = None
|
||
|
|
|
||
|
|
"""Override image extraction configuration for this file."""
|
||
|
|
|
||
|
|
pdf_options: PdfConfig | None = None
|
||
|
|
|
||
|
|
"""Override PDF options for this file."""
|
||
|
|
|
||
|
|
token_reduction: TokenReductionOptions | None = None
|
||
|
|
|
||
|
|
"""Override token reduction for this file."""
|
||
|
|
|
||
|
|
language_detection: LanguageDetectionConfig | None = None
|
||
|
|
|
||
|
|
"""Override language detection for this file."""
|
||
|
|
|
||
|
|
pages: PageConfig | None = None
|
||
|
|
|
||
|
|
"""Override page extraction for this file."""
|
||
|
|
|
||
|
|
keywords: KeywordConfig | None = None
|
||
|
|
|
||
|
|
"""Override keyword extraction for this file."""
|
||
|
|
|
||
|
|
postprocessor: PostProcessorConfig | None = None
|
||
|
|
|
||
|
|
"""Override post-processor for this file."""
|
||
|
|
|
||
|
|
html_options: str | None = None
|
||
|
|
|
||
|
|
"""Override HTML conversion options for this file."""
|
||
|
|
|
||
|
|
result_format: ResultFormat | str | None = None
|
||
|
|
|
||
|
|
"""Override result format for this file."""
|
||
|
|
|
||
|
|
output_format: OutputFormat | None = None
|
||
|
|
|
||
|
|
"""Override output content format for this file."""
|
||
|
|
|
||
|
|
include_document_structure: bool | None = None
|
||
|
|
|
||
|
|
"""Override document structure output for this file."""
|
||
|
|
|
||
|
|
layout: LayoutDetectionConfig | None = None
|
||
|
|
|
||
|
|
"""Override layout detection for this file."""
|
||
|
|
|
||
|
|
timeout_secs: int | None = None
|
||
|
|
|
||
|
|
"""Override per-file extraction timeout in seconds."""
|
||
|
|
|
||
|
|
tree_sitter: TreeSitterConfig | None = None
|
||
|
|
|
||
|
|
"""Override tree-sitter configuration for this file."""
|
||
|
|
|
||
|
|
structured_extraction: StructuredExtractionConfig | None = None
|
||
|
|
|
||
|
|
"""Override structured extraction configuration for this file."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class ImageExtractionConfig:
|
||
|
|
"""Image extraction configuration."""
|
||
|
|
|
||
|
|
extract_images: bool = True
|
||
|
|
|
||
|
|
"""Extract images from documents"""
|
||
|
|
|
||
|
|
target_dpi: int = 300
|
||
|
|
|
||
|
|
"""Target DPI for image normalization"""
|
||
|
|
|
||
|
|
max_image_dimension: int = 4096
|
||
|
|
|
||
|
|
"""Maximum dimension for images (width or height)"""
|
||
|
|
|
||
|
|
inject_placeholders: bool = True
|
||
|
|
|
||
|
|
"""Whether to inject image reference placeholders into markdown output. When `true` (default), image references like `` are appended to the markdown. Set to `false` to extract images as data without polluting the markdown output."""
|
||
|
|
|
||
|
|
auto_adjust_dpi: bool = True
|
||
|
|
|
||
|
|
"""Automatically adjust DPI based on image content"""
|
||
|
|
|
||
|
|
min_dpi: int = 72
|
||
|
|
|
||
|
|
"""Minimum DPI threshold"""
|
||
|
|
|
||
|
|
max_dpi: int = 600
|
||
|
|
|
||
|
|
"""Maximum DPI threshold"""
|
||
|
|
|
||
|
|
max_images_per_page: int | None = None
|
||
|
|
|
||
|
|
"""Maximum number of image objects to extract per PDF page."""
|
||
|
|
|
||
|
|
classify: bool = True
|
||
|
|
|
||
|
|
"""When `true` (default), extracted images are classified by kind and grouped into clusters where they appear to belong to one figure."""
|
||
|
|
|
||
|
|
include_page_rasters: bool = False
|
||
|
|
|
||
|
|
"""When `true`, full-page renders produced during OCR preprocessing are captured and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`."""
|
||
|
|
|
||
|
|
run_ocr_on_images: bool = True
|
||
|
|
|
||
|
|
"""Run OCR on extracted images and include the recognized text in the document content."""
|
||
|
|
|
||
|
|
ocr_text_only: bool = False
|
||
|
|
|
||
|
|
"""When `true`, image OCR results are rendered as plain text without the `` markdown placeholder. Only takes effect when `run_ocr_on_images` is also `true`."""
|
||
|
|
|
||
|
|
append_ocr_text: bool = False
|
||
|
|
|
||
|
|
"""When `true` and `ocr_text_only` is `false`, append the OCR text after the image placeholder in the rendered output."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class TokenReductionOptions:
|
||
|
|
"""Token reduction configuration."""
|
||
|
|
|
||
|
|
mode: str = ""
|
||
|
|
|
||
|
|
"""Reduction mode: "off", "light", "moderate", "aggressive", "maximum" """
|
||
|
|
|
||
|
|
preserve_important_words: bool = True
|
||
|
|
|
||
|
|
"""Preserve important words (capitalized, technical terms)"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class LanguageDetectionConfig:
|
||
|
|
"""Language detection configuration."""
|
||
|
|
|
||
|
|
enabled: bool = True
|
||
|
|
|
||
|
|
"""Enable language detection"""
|
||
|
|
|
||
|
|
min_confidence: float = 0.8
|
||
|
|
|
||
|
|
"""Minimum confidence threshold (0.0-1.0)"""
|
||
|
|
|
||
|
|
detect_multiple: bool = False
|
||
|
|
|
||
|
|
"""Detect multiple languages in the document"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class HtmlOutputConfig:
|
||
|
|
"""Configuration for styled HTML output."""
|
||
|
|
|
||
|
|
css: str | None = None
|
||
|
|
|
||
|
|
"""Inline CSS string injected into the output after the theme stylesheet. Concatenated after `css_file` content when both are set."""
|
||
|
|
|
||
|
|
css_file: str | None = None
|
||
|
|
|
||
|
|
"""Path to a CSS file loaded once at renderer construction time. Concatenated before `css` when both are set."""
|
||
|
|
|
||
|
|
theme: HtmlTheme | str = "unstyled"
|
||
|
|
|
||
|
|
"""Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`]."""
|
||
|
|
|
||
|
|
class_prefix: str = ""
|
||
|
|
|
||
|
|
"""CSS class prefix applied to every emitted class name."""
|
||
|
|
|
||
|
|
embed_css: bool = True
|
||
|
|
|
||
|
|
"""When `true` (default), write the resolved CSS into a `<style>` block immediately after the opening `<div class="{prefix}doc">`."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class LayoutDetectionConfig:
|
||
|
|
"""Layout detection configuration."""
|
||
|
|
|
||
|
|
confidence_threshold: float | None = None
|
||
|
|
|
||
|
|
"""Confidence threshold override (None = use model default)."""
|
||
|
|
|
||
|
|
apply_heuristics: bool = True
|
||
|
|
|
||
|
|
"""Whether to apply postprocessing heuristics (default: true)."""
|
||
|
|
|
||
|
|
table_model: TableModel | str = "tatr"
|
||
|
|
|
||
|
|
"""Table structure recognition model."""
|
||
|
|
|
||
|
|
acceleration: AccelerationConfig | None = None
|
||
|
|
|
||
|
|
"""Hardware acceleration for ONNX models (layout detection + table structure)."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class LlmConfig:
|
||
|
|
"""Configuration for an LLM provider/model via liter-llm."""
|
||
|
|
|
||
|
|
model: str = ""
|
||
|
|
|
||
|
|
"""Provider/model string using liter-llm routing format."""
|
||
|
|
|
||
|
|
api_key: str | None = None
|
||
|
|
|
||
|
|
"""API key for the provider. When `None`, liter-llm falls back to the provider's standard environment variable (e.g., `OPENAI_API_KEY`)."""
|
||
|
|
|
||
|
|
base_url: str | None = None
|
||
|
|
|
||
|
|
"""Custom base URL override for the provider endpoint."""
|
||
|
|
|
||
|
|
timeout_secs: int | None = None
|
||
|
|
|
||
|
|
"""Request timeout in seconds (default: 60)."""
|
||
|
|
|
||
|
|
max_retries: int | None = None
|
||
|
|
|
||
|
|
"""Maximum retry attempts (default: 3)."""
|
||
|
|
|
||
|
|
temperature: float | None = None
|
||
|
|
|
||
|
|
"""Sampling temperature for generation tasks."""
|
||
|
|
|
||
|
|
max_tokens: int | None = None
|
||
|
|
|
||
|
|
"""Maximum tokens to generate."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class OcrQualityThresholds:
|
||
|
|
"""Quality thresholds for OCR fallback decisions and pipeline quality gating."""
|
||
|
|
|
||
|
|
min_total_non_whitespace: int = 64
|
||
|
|
|
||
|
|
"""Minimum total non-whitespace characters to consider text substantive."""
|
||
|
|
|
||
|
|
min_non_whitespace_per_page: float = 32
|
||
|
|
|
||
|
|
"""Minimum non-whitespace characters per page on average."""
|
||
|
|
|
||
|
|
min_meaningful_word_len: int = 4
|
||
|
|
|
||
|
|
"""Minimum character count for a word to be "meaningful"."""
|
||
|
|
|
||
|
|
min_meaningful_words: int = 3
|
||
|
|
|
||
|
|
"""Minimum count of meaningful words before text is accepted."""
|
||
|
|
|
||
|
|
min_alnum_ratio: float = 0.3
|
||
|
|
|
||
|
|
"""Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric)."""
|
||
|
|
|
||
|
|
min_garbage_chars: int = 5
|
||
|
|
|
||
|
|
"""Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback."""
|
||
|
|
|
||
|
|
max_fragmented_word_ratio: float = 0.6
|
||
|
|
|
||
|
|
"""Maximum fraction of short (1-2 char) words before text is considered fragmented."""
|
||
|
|
|
||
|
|
critical_fragmented_word_ratio: float = 0.8
|
||
|
|
|
||
|
|
"""Critical fragmentation threshold -- triggers OCR regardless of meaningful words. Normal English text has ~20-30% short words. 80%+ is definitive garbage."""
|
||
|
|
|
||
|
|
min_avg_word_length: float = 2
|
||
|
|
|
||
|
|
"""Minimum average word length. Below this with enough words indicates garbled extraction."""
|
||
|
|
|
||
|
|
min_words_for_avg_length_check: int = 50
|
||
|
|
|
||
|
|
"""Minimum word count before average word length check applies."""
|
||
|
|
|
||
|
|
min_consecutive_repeat_ratio: float = 0.08
|
||
|
|
|
||
|
|
"""Minimum consecutive word repetition ratio to detect column scrambling."""
|
||
|
|
|
||
|
|
min_words_for_repeat_check: int = 50
|
||
|
|
|
||
|
|
"""Minimum word count before consecutive repetition check is applied."""
|
||
|
|
|
||
|
|
substantive_min_chars: int = 100
|
||
|
|
|
||
|
|
"""Minimum character count for "substantive markdown" OCR skip gate."""
|
||
|
|
|
||
|
|
non_text_min_chars: int = 20
|
||
|
|
|
||
|
|
"""Minimum character count for "non-text content" OCR skip gate."""
|
||
|
|
|
||
|
|
alnum_ws_ratio_threshold: float = 0.4
|
||
|
|
|
||
|
|
"""Alphanumeric+whitespace ratio threshold for skip decisions."""
|
||
|
|
|
||
|
|
pipeline_min_quality: float = 0.5
|
||
|
|
|
||
|
|
"""Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted. If the result from a backend scores below this, try the next backend."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class OcrConfig:
|
||
|
|
"""OCR configuration."""
|
||
|
|
|
||
|
|
enabled: bool = True
|
||
|
|
|
||
|
|
"""Whether OCR is enabled."""
|
||
|
|
|
||
|
|
backend: str = ""
|
||
|
|
|
||
|
|
"""OCR backend: tesseract, easyocr, paddleocr"""
|
||
|
|
|
||
|
|
language: str = ""
|
||
|
|
|
||
|
|
"""Language code (e.g., "eng", "deu")"""
|
||
|
|
|
||
|
|
tesseract_config: TesseractConfig | None = None
|
||
|
|
|
||
|
|
"""Tesseract-specific configuration (optional)"""
|
||
|
|
|
||
|
|
output_format: OutputFormat | None = None
|
||
|
|
|
||
|
|
"""Output format for OCR results (optional, for format conversion)"""
|
||
|
|
|
||
|
|
paddle_ocr_config: str | None = None
|
||
|
|
|
||
|
|
"""PaddleOCR-specific configuration (optional, JSON passthrough)"""
|
||
|
|
|
||
|
|
backend_options: str | None = None
|
||
|
|
|
||
|
|
"""Arbitrary per-call options passed through to the backend unchanged."""
|
||
|
|
|
||
|
|
element_config: OcrElementConfig | None = None
|
||
|
|
|
||
|
|
"""OCR element extraction configuration"""
|
||
|
|
|
||
|
|
quality_thresholds: OcrQualityThresholds | None = None
|
||
|
|
|
||
|
|
"""Quality thresholds for the native-text-to-OCR fallback decision. When None, uses compiled defaults (matching previous hardcoded behavior)."""
|
||
|
|
|
||
|
|
pipeline: OcrPipelineConfig | None = None
|
||
|
|
|
||
|
|
"""Multi-backend OCR pipeline configuration. When set, enables weighted fallback across multiple OCR backends based on output quality. When None, uses the single `backend` field (same as today)."""
|
||
|
|
|
||
|
|
auto_rotate: bool = False
|
||
|
|
|
||
|
|
"""Enable automatic page rotation based on orientation detection."""
|
||
|
|
|
||
|
|
vlm_config: LlmConfig | None = None
|
||
|
|
|
||
|
|
"""VLM (Vision Language Model) OCR configuration."""
|
||
|
|
|
||
|
|
vlm_prompt: str | None = None
|
||
|
|
|
||
|
|
"""Custom Jinja2 prompt template for VLM OCR."""
|
||
|
|
|
||
|
|
acceleration: AccelerationConfig | None = None
|
||
|
|
|
||
|
|
"""Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection)."""
|
||
|
|
|
||
|
|
tessdata_bytes: dict[str, bytes] | None = None
|
||
|
|
|
||
|
|
"""Caller-supplied Tesseract `traineddata` bytes per language code."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class PageConfig:
|
||
|
|
"""Page extraction and tracking configuration."""
|
||
|
|
|
||
|
|
extract_pages: bool = False
|
||
|
|
|
||
|
|
"""Extract pages as separate array (ExtractionResult.pages)"""
|
||
|
|
|
||
|
|
insert_page_markers: bool = False
|
||
|
|
|
||
|
|
"""Insert page markers in main content string"""
|
||
|
|
|
||
|
|
marker_format: str = "\n\n<!-- PAGE {page_num} -->\n\n"
|
||
|
|
|
||
|
|
"""Page marker format (use {page_num} placeholder) Default: "\n\n<!-- PAGE {page_num} -->\n\n" """
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class PdfConfig:
|
||
|
|
"""PDF-specific configuration."""
|
||
|
|
|
||
|
|
extract_images: bool = False
|
||
|
|
|
||
|
|
"""Extract images from PDF"""
|
||
|
|
|
||
|
|
extract_tables: bool = True
|
||
|
|
|
||
|
|
"""Extract tables from PDF."""
|
||
|
|
|
||
|
|
passwords: list[str] | None = None
|
||
|
|
|
||
|
|
"""List of passwords to try when opening encrypted PDFs"""
|
||
|
|
|
||
|
|
extract_metadata: bool = True
|
||
|
|
|
||
|
|
"""Extract PDF metadata"""
|
||
|
|
|
||
|
|
hierarchy: HierarchyConfig | None = None
|
||
|
|
|
||
|
|
"""Hierarchy extraction configuration (None = hierarchy extraction disabled)"""
|
||
|
|
|
||
|
|
extract_annotations: bool = False
|
||
|
|
|
||
|
|
"""Extract PDF annotations (text notes, highlights, links, stamps). Default: false"""
|
||
|
|
|
||
|
|
top_margin_fraction: float | None = None
|
||
|
|
|
||
|
|
"""Top margin fraction (0.0-1.0) of page height to exclude headers/running heads. Default: 0.06 (6%)"""
|
||
|
|
|
||
|
|
bottom_margin_fraction: float | None = None
|
||
|
|
|
||
|
|
"""Bottom margin fraction (0.0-1.0) of page height to exclude footers/page numbers. Default: 0.05 (5%)"""
|
||
|
|
|
||
|
|
allow_single_column_tables: bool = False
|
||
|
|
|
||
|
|
"""Allow single-column pseudo tables in extraction results."""
|
||
|
|
|
||
|
|
ocr_inline_images: bool = False
|
||
|
|
|
||
|
|
"""Perform OCR on inline images extracted from PDF pages and attach the recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract to be available; if `ExtractionConfig.ocr` is `None` the extractor falls back to `TesseractConfig::default()`. Per-image failures degrade gracefully (the image is returned without OCR text rather than failing the whole extraction). Default: `false`."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class HierarchyConfig:
|
||
|
|
"""Hierarchy extraction configuration for PDF text structure analysis."""
|
||
|
|
|
||
|
|
enabled: bool = True
|
||
|
|
|
||
|
|
"""Enable hierarchy extraction"""
|
||
|
|
|
||
|
|
k_clusters: int = 3
|
||
|
|
|
||
|
|
"""Number of font size clusters to use for hierarchy levels (1-7)"""
|
||
|
|
|
||
|
|
include_bbox: bool = True
|
||
|
|
|
||
|
|
"""Include bounding box information in hierarchy blocks"""
|
||
|
|
|
||
|
|
ocr_coverage_threshold: float | None = None
|
||
|
|
|
||
|
|
"""OCR coverage threshold for smart OCR triggering (0.0-1.0)"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class PostProcessorConfig:
|
||
|
|
"""Post-processor configuration."""
|
||
|
|
|
||
|
|
enabled: bool = True
|
||
|
|
|
||
|
|
"""Enable post-processors"""
|
||
|
|
|
||
|
|
enabled_processors: list[str] | None = None
|
||
|
|
|
||
|
|
"""Whitelist of processor names to run (None = all enabled)"""
|
||
|
|
|
||
|
|
disabled_processors: list[str] | None = None
|
||
|
|
|
||
|
|
"""Blacklist of processor names to skip (None = none disabled)"""
|
||
|
|
|
||
|
|
enabled_set: list[str] | None = None
|
||
|
|
|
||
|
|
"""Pre-computed AHashSet for O(1) enabled processor lookup"""
|
||
|
|
|
||
|
|
disabled_set: list[str] | None = None
|
||
|
|
|
||
|
|
"""Pre-computed AHashSet for O(1) disabled processor lookup"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class ChunkingConfig:
|
||
|
|
"""Chunking configuration."""
|
||
|
|
|
||
|
|
max_characters: int = 1000
|
||
|
|
|
||
|
|
"""Maximum size per chunk (in units determined by `sizing`)."""
|
||
|
|
|
||
|
|
overlap: int = 200
|
||
|
|
|
||
|
|
"""Overlap between chunks (in units determined by `sizing`)."""
|
||
|
|
|
||
|
|
trim: bool = True
|
||
|
|
|
||
|
|
"""Whether to trim whitespace from chunk boundaries."""
|
||
|
|
|
||
|
|
chunker_type: ChunkerType | str = "text"
|
||
|
|
|
||
|
|
"""Type of chunker to use (Text or Markdown)."""
|
||
|
|
|
||
|
|
embedding: EmbeddingConfig | None = None
|
||
|
|
|
||
|
|
"""Optional embedding configuration for chunk embeddings."""
|
||
|
|
|
||
|
|
preset: str | None = None
|
||
|
|
|
||
|
|
"""Use a preset configuration (overrides individual settings if provided)."""
|
||
|
|
|
||
|
|
sizing: ChunkSizing | None = None
|
||
|
|
|
||
|
|
"""How to measure chunk size."""
|
||
|
|
|
||
|
|
prepend_heading_context: bool = False
|
||
|
|
|
||
|
|
"""When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string."""
|
||
|
|
|
||
|
|
topic_threshold: float | None = None
|
||
|
|
|
||
|
|
"""Optional cosine similarity threshold for semantic topic boundary detection."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class EmbeddingConfig:
|
||
|
|
"""Embedding configuration for text chunks."""
|
||
|
|
|
||
|
|
model: EmbeddingModelType | None = None
|
||
|
|
|
||
|
|
"""The embedding model to use (defaults to "balanced" preset if not specified)"""
|
||
|
|
|
||
|
|
normalize: bool = True
|
||
|
|
|
||
|
|
"""Whether to normalize embedding vectors (recommended for cosine similarity)"""
|
||
|
|
|
||
|
|
batch_size: int = 32
|
||
|
|
|
||
|
|
"""Batch size for embedding generation"""
|
||
|
|
|
||
|
|
show_download_progress: bool = False
|
||
|
|
|
||
|
|
"""Show model download progress"""
|
||
|
|
|
||
|
|
cache_dir: str | None = None
|
||
|
|
|
||
|
|
"""Custom cache directory for model files"""
|
||
|
|
|
||
|
|
acceleration: AccelerationConfig | None = None
|
||
|
|
|
||
|
|
"""Hardware acceleration for the embedding ONNX model."""
|
||
|
|
|
||
|
|
max_embed_duration_secs: int | None = None
|
||
|
|
|
||
|
|
"""Maximum wall-clock duration (in seconds) for a single `embed()` call when using [`EmbeddingModelType::Plugin`]."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class TreeSitterConfig:
|
||
|
|
"""Configuration for tree-sitter language pack integration."""
|
||
|
|
|
||
|
|
enabled: bool = True
|
||
|
|
|
||
|
|
"""Enable code intelligence processing (default: true)."""
|
||
|
|
|
||
|
|
cache_dir: str | None = None
|
||
|
|
|
||
|
|
"""Custom cache directory for downloaded grammars."""
|
||
|
|
|
||
|
|
languages: list[str] | None = None
|
||
|
|
|
||
|
|
"""Languages to pre-download on init (e.g., `["python", "rust"]`)."""
|
||
|
|
|
||
|
|
groups: list[str] | None = None
|
||
|
|
|
||
|
|
"""Language groups to pre-download (e.g., `["web", "systems", "scripting"]`)."""
|
||
|
|
|
||
|
|
process: TreeSitterProcessConfig | None = None
|
||
|
|
|
||
|
|
"""Processing options for code analysis."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class TreeSitterProcessConfig:
|
||
|
|
"""Processing options for tree-sitter code analysis."""
|
||
|
|
|
||
|
|
structure: bool = True
|
||
|
|
|
||
|
|
"""Extract structural items (functions, classes, structs, etc.). Default: true."""
|
||
|
|
|
||
|
|
imports: bool = True
|
||
|
|
|
||
|
|
"""Extract import statements. Default: true."""
|
||
|
|
|
||
|
|
exports: bool = True
|
||
|
|
|
||
|
|
"""Extract export statements. Default: true."""
|
||
|
|
|
||
|
|
comments: bool = False
|
||
|
|
|
||
|
|
"""Extract comments. Default: false."""
|
||
|
|
|
||
|
|
docstrings: bool = False
|
||
|
|
|
||
|
|
"""Extract docstrings. Default: false."""
|
||
|
|
|
||
|
|
symbols: bool = False
|
||
|
|
|
||
|
|
"""Extract symbol definitions. Default: false."""
|
||
|
|
|
||
|
|
diagnostics: bool = False
|
||
|
|
|
||
|
|
"""Include parse diagnostics. Default: false."""
|
||
|
|
|
||
|
|
chunk_max_size: int | None = None
|
||
|
|
|
||
|
|
"""Maximum chunk size in bytes. `None` disables chunking."""
|
||
|
|
|
||
|
|
content_mode: CodeContentMode | str = "chunks"
|
||
|
|
|
||
|
|
"""Content rendering mode for code extraction."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class ServerConfig:
|
||
|
|
"""API server configuration."""
|
||
|
|
|
||
|
|
host: str = ""
|
||
|
|
|
||
|
|
"""Server host address (e.g., "127.0.0.1", "0.0.0.0")"""
|
||
|
|
|
||
|
|
port: int = 0
|
||
|
|
|
||
|
|
"""Server port number"""
|
||
|
|
|
||
|
|
cors_origins: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""CORS allowed origins. Empty vector means allow all origins."""
|
||
|
|
|
||
|
|
max_request_body_bytes: int = 0
|
||
|
|
|
||
|
|
"""Maximum size of request body in bytes (default: 100 MB)"""
|
||
|
|
|
||
|
|
max_multipart_field_bytes: int = 0
|
||
|
|
|
||
|
|
"""Maximum size of multipart fields in bytes (default: 100 MB)"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class DocxAppProperties:
|
||
|
|
"""Application properties from docProps/app.xml for DOCX."""
|
||
|
|
|
||
|
|
application: str | None = None
|
||
|
|
|
||
|
|
"""Application name (e.g., "Microsoft Office Word")"""
|
||
|
|
|
||
|
|
app_version: str | None = None
|
||
|
|
|
||
|
|
"""Application version"""
|
||
|
|
|
||
|
|
template: str | None = None
|
||
|
|
|
||
|
|
"""Template filename"""
|
||
|
|
|
||
|
|
total_time: int | None = None
|
||
|
|
|
||
|
|
"""Total editing time in minutes"""
|
||
|
|
|
||
|
|
pages: int | None = None
|
||
|
|
|
||
|
|
"""Number of pages"""
|
||
|
|
|
||
|
|
words: int | None = None
|
||
|
|
|
||
|
|
"""Number of words"""
|
||
|
|
|
||
|
|
characters: int | None = None
|
||
|
|
|
||
|
|
"""Number of characters (excluding spaces)"""
|
||
|
|
|
||
|
|
characters_with_spaces: int | None = None
|
||
|
|
|
||
|
|
"""Number of characters (including spaces)"""
|
||
|
|
|
||
|
|
lines: int | None = None
|
||
|
|
|
||
|
|
"""Number of lines"""
|
||
|
|
|
||
|
|
paragraphs: int | None = None
|
||
|
|
|
||
|
|
"""Number of paragraphs"""
|
||
|
|
|
||
|
|
company: str | None = None
|
||
|
|
|
||
|
|
"""Company name"""
|
||
|
|
|
||
|
|
doc_security: int | None = None
|
||
|
|
|
||
|
|
"""Document security level"""
|
||
|
|
|
||
|
|
scale_crop: bool | None = None
|
||
|
|
|
||
|
|
"""Scale crop flag"""
|
||
|
|
|
||
|
|
links_up_to_date: bool | None = None
|
||
|
|
|
||
|
|
"""Links up to date flag"""
|
||
|
|
|
||
|
|
shared_doc: bool | None = None
|
||
|
|
|
||
|
|
"""Shared document flag"""
|
||
|
|
|
||
|
|
hyperlinks_changed: bool | None = None
|
||
|
|
|
||
|
|
"""Hyperlinks changed flag"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class XlsxAppProperties:
|
||
|
|
"""Application properties from docProps/app.xml for XLSX."""
|
||
|
|
|
||
|
|
application: str | None = None
|
||
|
|
|
||
|
|
"""Application name (e.g., "Microsoft Excel")"""
|
||
|
|
|
||
|
|
app_version: str | None = None
|
||
|
|
|
||
|
|
"""Application version"""
|
||
|
|
|
||
|
|
doc_security: int | None = None
|
||
|
|
|
||
|
|
"""Document security level"""
|
||
|
|
|
||
|
|
scale_crop: bool | None = None
|
||
|
|
|
||
|
|
"""Scale crop flag"""
|
||
|
|
|
||
|
|
links_up_to_date: bool | None = None
|
||
|
|
|
||
|
|
"""Links up to date flag"""
|
||
|
|
|
||
|
|
shared_doc: bool | None = None
|
||
|
|
|
||
|
|
"""Shared document flag"""
|
||
|
|
|
||
|
|
hyperlinks_changed: bool | None = None
|
||
|
|
|
||
|
|
"""Hyperlinks changed flag"""
|
||
|
|
|
||
|
|
company: str | None = None
|
||
|
|
|
||
|
|
"""Company name"""
|
||
|
|
|
||
|
|
worksheet_names: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Worksheet names"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class PptxAppProperties:
|
||
|
|
"""Application properties from docProps/app.xml for PPTX."""
|
||
|
|
|
||
|
|
application: str | None = None
|
||
|
|
|
||
|
|
"""Application name (e.g., "Microsoft Office PowerPoint")"""
|
||
|
|
|
||
|
|
app_version: str | None = None
|
||
|
|
|
||
|
|
"""Application version"""
|
||
|
|
|
||
|
|
total_time: int | None = None
|
||
|
|
|
||
|
|
"""Total editing time in minutes"""
|
||
|
|
|
||
|
|
company: str | None = None
|
||
|
|
|
||
|
|
"""Company name"""
|
||
|
|
|
||
|
|
doc_security: int | None = None
|
||
|
|
|
||
|
|
"""Document security level"""
|
||
|
|
|
||
|
|
scale_crop: bool | None = None
|
||
|
|
|
||
|
|
"""Scale crop flag"""
|
||
|
|
|
||
|
|
links_up_to_date: bool | None = None
|
||
|
|
|
||
|
|
"""Links up to date flag"""
|
||
|
|
|
||
|
|
shared_doc: bool | None = None
|
||
|
|
|
||
|
|
"""Shared document flag"""
|
||
|
|
|
||
|
|
hyperlinks_changed: bool | None = None
|
||
|
|
|
||
|
|
"""Hyperlinks changed flag"""
|
||
|
|
|
||
|
|
slides: int | None = None
|
||
|
|
|
||
|
|
"""Number of slides"""
|
||
|
|
|
||
|
|
notes: int | None = None
|
||
|
|
|
||
|
|
"""Number of notes"""
|
||
|
|
|
||
|
|
hidden_slides: int | None = None
|
||
|
|
|
||
|
|
"""Number of hidden slides"""
|
||
|
|
|
||
|
|
multimedia_clips: int | None = None
|
||
|
|
|
||
|
|
"""Number of multimedia clips"""
|
||
|
|
|
||
|
|
presentation_format: str | None = None
|
||
|
|
|
||
|
|
"""Presentation format (e.g., "Widescreen", "Standard")"""
|
||
|
|
|
||
|
|
slide_titles: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Slide titles"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class CoreProperties:
|
||
|
|
"""Dublin Core metadata from docProps/core.xml."""
|
||
|
|
|
||
|
|
title: str | None = None
|
||
|
|
|
||
|
|
"""Document title"""
|
||
|
|
|
||
|
|
subject: str | None = None
|
||
|
|
|
||
|
|
"""Document subject/topic"""
|
||
|
|
|
||
|
|
creator: str | None = None
|
||
|
|
|
||
|
|
"""Document creator/author"""
|
||
|
|
|
||
|
|
keywords: str | None = None
|
||
|
|
|
||
|
|
"""Keywords or tags"""
|
||
|
|
|
||
|
|
description: str | None = None
|
||
|
|
|
||
|
|
"""Document description/abstract"""
|
||
|
|
|
||
|
|
last_modified_by: str | None = None
|
||
|
|
|
||
|
|
"""User who last modified the document"""
|
||
|
|
|
||
|
|
revision: str | None = None
|
||
|
|
|
||
|
|
"""Revision number"""
|
||
|
|
|
||
|
|
created: str | None = None
|
||
|
|
|
||
|
|
"""Creation timestamp (ISO 8601)"""
|
||
|
|
|
||
|
|
modified: str | None = None
|
||
|
|
|
||
|
|
"""Last modification timestamp (ISO 8601)"""
|
||
|
|
|
||
|
|
category: str | None = None
|
||
|
|
|
||
|
|
"""Document category"""
|
||
|
|
|
||
|
|
content_status: str | None = None
|
||
|
|
|
||
|
|
"""Content status (Draft, Final, etc.)"""
|
||
|
|
|
||
|
|
language: str | None = None
|
||
|
|
|
||
|
|
"""Document language"""
|
||
|
|
|
||
|
|
identifier: str | None = None
|
||
|
|
|
||
|
|
"""Unique identifier"""
|
||
|
|
|
||
|
|
version: str | None = None
|
||
|
|
|
||
|
|
"""Document version"""
|
||
|
|
|
||
|
|
last_printed: str | None = None
|
||
|
|
|
||
|
|
"""Last print timestamp (ISO 8601)"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class SecurityLimits:
|
||
|
|
"""Configuration for security limits across extractors."""
|
||
|
|
|
||
|
|
max_archive_size: int = 524288000
|
||
|
|
|
||
|
|
"""Maximum uncompressed size for archives (500 MB)"""
|
||
|
|
|
||
|
|
max_compression_ratio: int = 100
|
||
|
|
|
||
|
|
"""Maximum compression ratio before flagging as potential bomb (100:1)"""
|
||
|
|
|
||
|
|
max_files_in_archive: int = 10000
|
||
|
|
|
||
|
|
"""Maximum number of files in archive (10,000)"""
|
||
|
|
|
||
|
|
max_nesting_depth: int = 1024
|
||
|
|
|
||
|
|
"""Maximum nesting depth for structures (100)"""
|
||
|
|
|
||
|
|
max_entity_length: int = 1048576
|
||
|
|
|
||
|
|
"""Maximum length of any single XML entity / attribute / token (1 MiB). This is a per-token cap, NOT a total cap -- billion-laughs class attacks where a single entity expands to hundreds of MB are caught here, while normal long text content (a paragraph, a CDATA block) is caught by `max_content_size` instead."""
|
||
|
|
|
||
|
|
max_content_size: int = 104857600
|
||
|
|
|
||
|
|
"""Maximum string growth per document (100 MB)"""
|
||
|
|
|
||
|
|
max_iterations: int = 10000000
|
||
|
|
|
||
|
|
"""Maximum iterations per operation"""
|
||
|
|
|
||
|
|
max_xml_depth: int = 1024
|
||
|
|
|
||
|
|
"""Maximum XML depth (100 levels)"""
|
||
|
|
|
||
|
|
max_table_cells: int = 100000
|
||
|
|
|
||
|
|
"""Maximum cells per table (100,000)"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class TokenReductionConfig:
|
||
|
|
"""Token reduction config."""
|
||
|
|
|
||
|
|
level: ReductionLevel | str = "moderate"
|
||
|
|
|
||
|
|
language_hint: str | None = None
|
||
|
|
|
||
|
|
preserve_markdown: bool = False
|
||
|
|
|
||
|
|
preserve_code: bool = True
|
||
|
|
|
||
|
|
semantic_threshold: float = 0.3
|
||
|
|
|
||
|
|
enable_parallel: bool = True
|
||
|
|
|
||
|
|
use_simd: bool = True
|
||
|
|
|
||
|
|
custom_stopwords: dict[str, list[str]] | None = None
|
||
|
|
|
||
|
|
preserve_patterns: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
target_reduction: float | None = None
|
||
|
|
|
||
|
|
enable_semantic_clustering: bool = False
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class DocumentStructure:
|
||
|
|
"""Top-level structured document representation."""
|
||
|
|
|
||
|
|
nodes: list[DocumentNode] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""All nodes in document/reading order."""
|
||
|
|
|
||
|
|
source_format: str | None = None
|
||
|
|
|
||
|
|
"""Origin format identifier (e.g. "docx", "pptx", "html", "pdf")."""
|
||
|
|
|
||
|
|
relationships: list[DocumentRelationship] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Resolved relationships between nodes (footnote refs, citations, anchor links, etc.)."""
|
||
|
|
|
||
|
|
node_types: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Sorted, deduplicated list of node type names present in this document."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class TableGrid:
|
||
|
|
"""Structured table grid with cell-level metadata."""
|
||
|
|
|
||
|
|
rows: int = 0
|
||
|
|
|
||
|
|
"""Number of rows in the table."""
|
||
|
|
|
||
|
|
cols: int = 0
|
||
|
|
|
||
|
|
"""Number of columns in the table."""
|
||
|
|
|
||
|
|
cells: list[GridCell] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""All cells in row-major order."""
|
||
|
|
|
||
|
|
|
||
|
|
class ExtractionResult(TypedDict, total=False):
    """General extraction result used by the core extraction API."""

    # NOTE(review): `content`, `mime_type`, `metadata`, `tables` and
    # `detected_languages` carry no upstream documentation in this file.
    # Because the TypedDict is declared with total=False, every key is optional.

    content: str

    mime_type: str

    metadata: Metadata

    extraction_method: ExtractionMethod | str | None
    """Extraction strategy used to produce the returned text."""

    tables: list[Table]

    detected_languages: list[str] | None

    chunks: list[Chunk] | None
    """Text chunks when chunking is enabled."""

    images: list[ExtractedImage] | None
    """Extracted images from the document."""

    pages: list[PageContent] | None
    """Per-page content when page extraction is enabled."""

    elements: list[Element] | None
    """Semantic elements when element-based result format is enabled."""

    djot_content: DjotContent | None
    """Rich Djot content structure (when extracting Djot documents)."""

    ocr_elements: list[OcrElement] | None
    """OCR elements with full spatial and confidence metadata."""

    document: DocumentStructure | None
    """Structured document tree (when document structure extraction is enabled)."""

    extracted_keywords: list[Keyword] | None
    """Extracted keywords when keyword extraction is enabled."""

    quality_score: float | None
    """Document quality score from quality analysis."""

    processing_warnings: list[ProcessingWarning]
    """Non-fatal warnings collected during processing pipeline stages."""

    annotations: list[PdfAnnotation] | None
    """PDF annotations extracted from the document."""

    children: list[ArchiveEntry] | None
    """Nested extraction results from archive contents."""

    uris: list[ExtractedUri] | None
    """URIs/links discovered during document extraction."""

    revisions: list[DocumentRevision] | None
    """Tracked changes embedded in the source document."""

    structured_output: str | None
    """Structured extraction output from LLM-based JSON schema extraction."""

    code_intelligence: str | None
    """Code intelligence results from tree-sitter analysis."""

    llm_usage: list[LlmUsage] | None
    """LLM token usage and cost data for all LLM calls made during this extraction."""

    formatted_content: str | None
    """Pre-rendered content in the requested output format."""

    ocr_internal_document: str | None
    """Structured hOCR document for the OCR+layout pipeline."""
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class LlmUsage:
|
||
|
|
"""Token usage and cost data for a single LLM call made during extraction."""
|
||
|
|
|
||
|
|
model: str = ""
|
||
|
|
|
||
|
|
"""The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514")."""
|
||
|
|
|
||
|
|
source: str = ""
|
||
|
|
|
||
|
|
"""The pipeline stage that triggered this LLM call (e.g. "vlm_ocr", "structured_extraction", "embeddings")."""
|
||
|
|
|
||
|
|
input_tokens: int | None = None
|
||
|
|
|
||
|
|
"""Number of input/prompt tokens consumed."""
|
||
|
|
|
||
|
|
output_tokens: int | None = None
|
||
|
|
|
||
|
|
"""Number of output/completion tokens generated."""
|
||
|
|
|
||
|
|
total_tokens: int | None = None
|
||
|
|
|
||
|
|
"""Total tokens (input + output)."""
|
||
|
|
|
||
|
|
estimated_cost: float | None = None
|
||
|
|
|
||
|
|
"""Estimated cost in USD based on the provider's published pricing."""
|
||
|
|
|
||
|
|
finish_reason: str | None = None
|
||
|
|
|
||
|
|
"""Why the model stopped generating (e.g. "stop", "length", "content_filter")."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class BoundingBox:
|
||
|
|
"""Bounding box coordinates for element positioning."""
|
||
|
|
|
||
|
|
x0: float = 0.0
|
||
|
|
|
||
|
|
"""Left x-coordinate"""
|
||
|
|
|
||
|
|
y0: float = 0.0
|
||
|
|
|
||
|
|
"""Bottom y-coordinate"""
|
||
|
|
|
||
|
|
x1: float = 0.0
|
||
|
|
|
||
|
|
"""Right x-coordinate"""
|
||
|
|
|
||
|
|
y1: float = 0.0
|
||
|
|
|
||
|
|
"""Top y-coordinate"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class ImagePreprocessingConfig:
|
||
|
|
"""Image preprocessing configuration for OCR."""
|
||
|
|
|
||
|
|
target_dpi: int = 300
|
||
|
|
|
||
|
|
"""Target DPI for the image (300 is standard, 600 for small text)."""
|
||
|
|
|
||
|
|
auto_rotate: bool = True
|
||
|
|
|
||
|
|
"""Auto-detect and correct image rotation."""
|
||
|
|
|
||
|
|
deskew: bool = True
|
||
|
|
|
||
|
|
"""Correct skew (tilted images)."""
|
||
|
|
|
||
|
|
denoise: bool = False
|
||
|
|
|
||
|
|
"""Remove noise from the image."""
|
||
|
|
|
||
|
|
contrast_enhance: bool = False
|
||
|
|
|
||
|
|
"""Enhance contrast for better text visibility."""
|
||
|
|
|
||
|
|
binarization_method: str = "otsu"
|
||
|
|
|
||
|
|
"""Binarization method: "otsu", "sauvola", "adaptive"."""
|
||
|
|
|
||
|
|
invert_colors: bool = False
|
||
|
|
|
||
|
|
"""Invert colors (white text on black → black on white)."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class TesseractConfig:
|
||
|
|
"""Tesseract OCR configuration."""
|
||
|
|
|
||
|
|
language: str = "eng"
|
||
|
|
|
||
|
|
"""Language code (e.g., "eng", "deu", "fra")"""
|
||
|
|
|
||
|
|
psm: int = 3
|
||
|
|
|
||
|
|
"""Page Segmentation Mode (0-13)."""
|
||
|
|
|
||
|
|
output_format: str = "markdown"
|
||
|
|
|
||
|
|
"""Output format ("text" or "markdown")"""
|
||
|
|
|
||
|
|
oem: int = 3
|
||
|
|
|
||
|
|
"""OCR Engine Mode (0-3)."""
|
||
|
|
|
||
|
|
min_confidence: float = 0
|
||
|
|
|
||
|
|
"""Minimum confidence threshold (0.0-100.0)."""
|
||
|
|
|
||
|
|
preprocessing: ImagePreprocessingConfig | None = None
|
||
|
|
|
||
|
|
"""Image preprocessing configuration."""
|
||
|
|
|
||
|
|
enable_table_detection: bool = True
|
||
|
|
|
||
|
|
"""Enable automatic table detection and reconstruction"""
|
||
|
|
|
||
|
|
table_min_confidence: float = 0
|
||
|
|
|
||
|
|
"""Minimum confidence threshold for table detection (0.0-1.0)"""
|
||
|
|
|
||
|
|
table_column_threshold: int = 50
|
||
|
|
|
||
|
|
"""Column threshold for table detection (pixels)"""
|
||
|
|
|
||
|
|
table_row_threshold_ratio: float = 0.5
|
||
|
|
|
||
|
|
"""Row threshold ratio for table detection (0.0-1.0)"""
|
||
|
|
|
||
|
|
use_cache: bool = True
|
||
|
|
|
||
|
|
"""Enable OCR result caching"""
|
||
|
|
|
||
|
|
classify_use_pre_adapted_templates: bool = True
|
||
|
|
|
||
|
|
"""Use pre-adapted templates for character classification"""
|
||
|
|
|
||
|
|
language_model_ngram_on: bool = False
|
||
|
|
|
||
|
|
"""Enable N-gram language model"""
|
||
|
|
|
||
|
|
tessedit_dont_blkrej_good_wds: bool = True
|
||
|
|
|
||
|
|
"""Don't reject good words during block-level processing"""
|
||
|
|
|
||
|
|
tessedit_dont_rowrej_good_wds: bool = True
|
||
|
|
|
||
|
|
"""Don't reject good words during row-level processing"""
|
||
|
|
|
||
|
|
tessedit_enable_dict_correction: bool = True
|
||
|
|
|
||
|
|
"""Enable dictionary correction"""
|
||
|
|
|
||
|
|
tessedit_char_whitelist: str = ""
|
||
|
|
|
||
|
|
"""Whitelist of allowed characters (empty = all allowed)"""
|
||
|
|
|
||
|
|
tessedit_char_blacklist: str = ""
|
||
|
|
|
||
|
|
"""Blacklist of forbidden characters (empty = none forbidden)"""
|
||
|
|
|
||
|
|
tessedit_use_primary_params_model: bool = True
|
||
|
|
|
||
|
|
"""Use primary language params model"""
|
||
|
|
|
||
|
|
textord_space_size_is_variable: bool = True
|
||
|
|
|
||
|
|
"""Variable-width space detection"""
|
||
|
|
|
||
|
|
thresholding_method: bool = False
|
||
|
|
|
||
|
|
"""Use adaptive thresholding method"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class Metadata:
|
||
|
|
"""Extraction result metadata."""
|
||
|
|
|
||
|
|
title: str | None = None
|
||
|
|
|
||
|
|
"""Document title"""
|
||
|
|
|
||
|
|
subject: str | None = None
|
||
|
|
|
||
|
|
"""Document subject or description"""
|
||
|
|
|
||
|
|
authors: list[str] | None = None
|
||
|
|
|
||
|
|
"""Primary author(s) - always Vec for consistency"""
|
||
|
|
|
||
|
|
keywords: list[str] | None = None
|
||
|
|
|
||
|
|
"""Keywords/tags - always Vec for consistency"""
|
||
|
|
|
||
|
|
language: str | None = None
|
||
|
|
|
||
|
|
"""Primary language (ISO 639 code)"""
|
||
|
|
|
||
|
|
created_at: str | None = None
|
||
|
|
|
||
|
|
"""Creation timestamp (ISO 8601 format)"""
|
||
|
|
|
||
|
|
modified_at: str | None = None
|
||
|
|
|
||
|
|
"""Last modification timestamp (ISO 8601 format)"""
|
||
|
|
|
||
|
|
created_by: str | None = None
|
||
|
|
|
||
|
|
"""User who created the document"""
|
||
|
|
|
||
|
|
modified_by: str | None = None
|
||
|
|
|
||
|
|
"""User who last modified the document"""
|
||
|
|
|
||
|
|
pages: PageStructure | None = None
|
||
|
|
|
||
|
|
"""Page/slide/sheet structure with boundaries"""
|
||
|
|
|
||
|
|
format: FormatMetadata | None = None
|
||
|
|
|
||
|
|
"""Format-specific metadata (discriminated union)"""
|
||
|
|
|
||
|
|
image_preprocessing: ImagePreprocessingMetadata | None = None
|
||
|
|
|
||
|
|
"""Image preprocessing metadata (when OCR preprocessing was applied)"""
|
||
|
|
|
||
|
|
json_schema: str | None = None
|
||
|
|
|
||
|
|
"""JSON schema (for structured data extraction)"""
|
||
|
|
|
||
|
|
error: ErrorMetadata | None = None
|
||
|
|
|
||
|
|
"""Error metadata (for batch operations)"""
|
||
|
|
|
||
|
|
extraction_duration_ms: int | None = None
|
||
|
|
|
||
|
|
"""Extraction duration in milliseconds (for benchmarking)."""
|
||
|
|
|
||
|
|
category: str | None = None
|
||
|
|
|
||
|
|
"""Document category (from frontmatter or classification)."""
|
||
|
|
|
||
|
|
tags: list[str] | None = None
|
||
|
|
|
||
|
|
"""Document tags (from frontmatter)."""
|
||
|
|
|
||
|
|
document_version: str | None = None
|
||
|
|
|
||
|
|
"""Document version string (from frontmatter)."""
|
||
|
|
|
||
|
|
abstract_text: str | None = None
|
||
|
|
|
||
|
|
"""Abstract or summary text (from frontmatter)."""
|
||
|
|
|
||
|
|
output_format: str | None = None
|
||
|
|
|
||
|
|
"""Output format identifier (e.g., "markdown", "html", "text")."""
|
||
|
|
|
||
|
|
ocr_used: bool = False
|
||
|
|
|
||
|
|
"""Whether OCR was used during extraction."""
|
||
|
|
|
||
|
|
additional: dict[str, str] = field(default_factory=dict)
|
||
|
|
|
||
|
|
"""Additional custom fields from postprocessors."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class ExcelMetadata:
|
||
|
|
"""Excel/spreadsheet format metadata."""
|
||
|
|
|
||
|
|
sheet_count: int | None = None
|
||
|
|
|
||
|
|
"""Number of sheets in the workbook."""
|
||
|
|
|
||
|
|
sheet_names: list[str] | None = None
|
||
|
|
|
||
|
|
"""Names of all sheets in the workbook."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class EmailMetadata:
|
||
|
|
"""Email metadata extracted from .eml and .msg files."""
|
||
|
|
|
||
|
|
from_email: str | None = None
|
||
|
|
|
||
|
|
"""Sender's email address"""
|
||
|
|
|
||
|
|
from_name: str | None = None
|
||
|
|
|
||
|
|
"""Sender's display name"""
|
||
|
|
|
||
|
|
to_emails: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Primary recipients"""
|
||
|
|
|
||
|
|
cc_emails: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""CC recipients"""
|
||
|
|
|
||
|
|
bcc_emails: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""BCC recipients"""
|
||
|
|
|
||
|
|
message_id: str | None = None
|
||
|
|
|
||
|
|
"""Message-ID header value"""
|
||
|
|
|
||
|
|
attachments: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""List of attachment filenames"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class ArchiveMetadata:
|
||
|
|
"""Archive (ZIP/TAR/7Z) metadata."""
|
||
|
|
|
||
|
|
format: str = ""
|
||
|
|
|
||
|
|
"""Archive format ("ZIP", "TAR", "7Z", etc.)"""
|
||
|
|
|
||
|
|
file_count: int = 0
|
||
|
|
|
||
|
|
"""Total number of files in the archive"""
|
||
|
|
|
||
|
|
file_list: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""List of file paths within the archive"""
|
||
|
|
|
||
|
|
total_size: int = 0
|
||
|
|
|
||
|
|
"""Total uncompressed size in bytes"""
|
||
|
|
|
||
|
|
compressed_size: int | None = None
|
||
|
|
|
||
|
|
"""Compressed size in bytes (if available)"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class ImageMetadata:
|
||
|
|
"""Image metadata extracted from image files."""
|
||
|
|
|
||
|
|
width: int = 0
|
||
|
|
|
||
|
|
"""Image width in pixels"""
|
||
|
|
|
||
|
|
height: int = 0
|
||
|
|
|
||
|
|
"""Image height in pixels"""
|
||
|
|
|
||
|
|
format: str = ""
|
||
|
|
|
||
|
|
"""Image format (e.g., "PNG", "JPEG", "TIFF")"""
|
||
|
|
|
||
|
|
exif: dict[str, str] = field(default_factory=dict)
|
||
|
|
|
||
|
|
"""EXIF metadata tags"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class XmlMetadata:
|
||
|
|
"""XML metadata extracted during XML parsing."""
|
||
|
|
|
||
|
|
element_count: int = 0
|
||
|
|
|
||
|
|
"""Total number of XML elements processed"""
|
||
|
|
|
||
|
|
unique_elements: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""List of unique element tag names (sorted)"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class TextMetadata:
|
||
|
|
"""Text/Markdown metadata."""
|
||
|
|
|
||
|
|
line_count: int = 0
|
||
|
|
|
||
|
|
"""Number of lines in the document"""
|
||
|
|
|
||
|
|
word_count: int = 0
|
||
|
|
|
||
|
|
"""Number of words"""
|
||
|
|
|
||
|
|
character_count: int = 0
|
||
|
|
|
||
|
|
"""Number of characters"""
|
||
|
|
|
||
|
|
headers: list[str] | None = None
|
||
|
|
|
||
|
|
"""Markdown headers (headings text only, for Markdown files)"""
|
||
|
|
|
||
|
|
links: list[list[str]] | None = None
|
||
|
|
|
||
|
|
"""Markdown links as (text, url) tuples (for Markdown files)"""
|
||
|
|
|
||
|
|
code_blocks: list[list[str]] | None = None
|
||
|
|
|
||
|
|
"""Code blocks as (language, code) tuples (for Markdown files)"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class HtmlMetadata:
|
||
|
|
"""HTML metadata extracted from HTML documents."""
|
||
|
|
|
||
|
|
title: str | None = None
|
||
|
|
|
||
|
|
"""Document title from `<title>` tag"""
|
||
|
|
|
||
|
|
description: str | None = None
|
||
|
|
|
||
|
|
"""Document description from `<meta name="description">` tag"""
|
||
|
|
|
||
|
|
keywords: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Document keywords from `<meta name="keywords">` tag, split on commas"""
|
||
|
|
|
||
|
|
author: str | None = None
|
||
|
|
|
||
|
|
"""Document author from `<meta name="author">` tag"""
|
||
|
|
|
||
|
|
canonical_url: str | None = None
|
||
|
|
|
||
|
|
"""Canonical URL from `<link rel="canonical">` tag"""
|
||
|
|
|
||
|
|
base_href: str | None = None
|
||
|
|
|
||
|
|
"""Base URL from `<base href="">` tag for resolving relative URLs"""
|
||
|
|
|
||
|
|
language: str | None = None
|
||
|
|
|
||
|
|
"""Document language from `lang` attribute"""
|
||
|
|
|
||
|
|
text_direction: TextDirection | str | None = None
|
||
|
|
|
||
|
|
"""Document text direction from `dir` attribute"""
|
||
|
|
|
||
|
|
open_graph: dict[str, str] = field(default_factory=dict)
|
||
|
|
|
||
|
|
"""Open Graph metadata (og:* properties) for social media Keys like "title", "description", "image", "url", etc."""
|
||
|
|
|
||
|
|
twitter_card: dict[str, str] = field(default_factory=dict)
|
||
|
|
|
||
|
|
"""Twitter Card metadata (twitter:* properties) Keys like "card", "site", "creator", "title", "description", "image", etc."""
|
||
|
|
|
||
|
|
meta_tags: dict[str, str] = field(default_factory=dict)
|
||
|
|
|
||
|
|
"""Additional meta tags not covered by specific fields Keys are meta name/property attributes, values are content"""
|
||
|
|
|
||
|
|
headers: list[HeaderMetadata] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Extracted header elements with hierarchy"""
|
||
|
|
|
||
|
|
links: list[LinkMetadata] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Extracted hyperlinks with type classification"""
|
||
|
|
|
||
|
|
images: list[ImageMetadataType] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Extracted images with source and dimensions"""
|
||
|
|
|
||
|
|
structured_data: list[StructuredData] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Extracted structured data blocks"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class OcrMetadata:
|
||
|
|
"""OCR processing metadata."""
|
||
|
|
|
||
|
|
language: str = ""
|
||
|
|
|
||
|
|
"""OCR language code(s) used"""
|
||
|
|
|
||
|
|
psm: int = 0
|
||
|
|
|
||
|
|
"""Tesseract Page Segmentation Mode (PSM)"""
|
||
|
|
|
||
|
|
output_format: str = ""
|
||
|
|
|
||
|
|
"""Output format (e.g., "text", "hocr")"""
|
||
|
|
|
||
|
|
table_count: int = 0
|
||
|
|
|
||
|
|
"""Number of tables detected"""
|
||
|
|
|
||
|
|
table_rows: int | None = None
|
||
|
|
|
||
|
|
table_cols: int | None = None
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class PptxMetadata:
|
||
|
|
"""PowerPoint presentation metadata."""
|
||
|
|
|
||
|
|
slide_count: int = 0
|
||
|
|
|
||
|
|
"""Total number of slides in the presentation"""
|
||
|
|
|
||
|
|
slide_names: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Names of slides (if available)"""
|
||
|
|
|
||
|
|
image_count: int | None = None
|
||
|
|
|
||
|
|
"""Number of embedded images"""
|
||
|
|
|
||
|
|
table_count: int | None = None
|
||
|
|
|
||
|
|
"""Number of tables"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class DocxMetadata:
|
||
|
|
"""Word document metadata."""
|
||
|
|
|
||
|
|
core_properties: CoreProperties | None = None
|
||
|
|
|
||
|
|
"""Core properties from docProps/core.xml (Dublin Core metadata)"""
|
||
|
|
|
||
|
|
app_properties: DocxAppProperties | None = None
|
||
|
|
|
||
|
|
"""Application properties from docProps/app.xml (Word-specific statistics)"""
|
||
|
|
|
||
|
|
custom_properties: dict[str, str] | None = None
|
||
|
|
|
||
|
|
"""Custom properties from docProps/custom.xml (user-defined properties)"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class CsvMetadata:
|
||
|
|
"""CSV/TSV file metadata."""
|
||
|
|
|
||
|
|
row_count: int = 0
|
||
|
|
|
||
|
|
column_count: int = 0
|
||
|
|
|
||
|
|
delimiter: str | None = None
|
||
|
|
|
||
|
|
has_header: bool = False
|
||
|
|
|
||
|
|
column_types: list[str] | None = None
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class BibtexMetadata:
|
||
|
|
"""BibTeX bibliography metadata."""
|
||
|
|
|
||
|
|
entry_count: int = 0
|
||
|
|
|
||
|
|
"""Number of entries in the bibliography."""
|
||
|
|
|
||
|
|
citation_keys: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
authors: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
year_range: YearRange | None = None
|
||
|
|
|
||
|
|
entry_types: dict[str, int] | None = None
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class CitationMetadata:
|
||
|
|
"""Citation file metadata (RIS, PubMed, EndNote)."""
|
||
|
|
|
||
|
|
citation_count: int = 0
|
||
|
|
|
||
|
|
format: str | None = None
|
||
|
|
|
||
|
|
authors: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
year_range: YearRange | None = None
|
||
|
|
|
||
|
|
dois: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
keywords: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class FictionBookMetadata:
|
||
|
|
"""FictionBook (FB2) metadata."""
|
||
|
|
|
||
|
|
genres: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
sequences: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
annotation: str | None = None
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class DbfMetadata:
|
||
|
|
"""dBASE (DBF) file metadata."""
|
||
|
|
|
||
|
|
record_count: int = 0
|
||
|
|
|
||
|
|
field_count: int = 0
|
||
|
|
|
||
|
|
fields: list[DbfFieldInfo] = field(default_factory=list)
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class JatsMetadata:
|
||
|
|
"""JATS (Journal Article Tag Suite) metadata."""
|
||
|
|
|
||
|
|
copyright: str | None = None
|
||
|
|
|
||
|
|
license: str | None = None
|
||
|
|
|
||
|
|
history_dates: dict[str, str] = field(default_factory=dict)
|
||
|
|
|
||
|
|
contributor_roles: list[ContributorRole] = field(default_factory=list)
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class EpubMetadata:
|
||
|
|
"""EPUB metadata (Dublin Core extensions)."""
|
||
|
|
|
||
|
|
coverage: str | None = None
|
||
|
|
|
||
|
|
dc_format: str | None = None
|
||
|
|
|
||
|
|
relation: str | None = None
|
||
|
|
|
||
|
|
source: str | None = None
|
||
|
|
|
||
|
|
dc_type: str | None = None
|
||
|
|
|
||
|
|
cover_image: str | None = None
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class PstMetadata:
|
||
|
|
"""Outlook PST archive metadata."""
|
||
|
|
|
||
|
|
message_count: int = 0
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class OcrConfidence:
|
||
|
|
"""Confidence scores for an OCR element."""
|
||
|
|
|
||
|
|
detection: float | None = None
|
||
|
|
|
||
|
|
"""Detection confidence: how confident the OCR engine is that text exists here."""
|
||
|
|
|
||
|
|
recognition: float = 0.0
|
||
|
|
|
||
|
|
"""Recognition confidence: how confident about the text content."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class OcrElement:
|
||
|
|
"""A unified OCR element representing detected text with full metadata."""
|
||
|
|
|
||
|
|
text: str = ""
|
||
|
|
|
||
|
|
"""The recognized text content."""
|
||
|
|
|
||
|
|
geometry: OcrBoundingGeometry | None = None
|
||
|
|
|
||
|
|
"""Bounding geometry (rectangle or quadrilateral)."""
|
||
|
|
|
||
|
|
confidence: OcrConfidence | None = None
|
||
|
|
|
||
|
|
"""Confidence scores for detection and recognition."""
|
||
|
|
|
||
|
|
level: OcrElementLevel | str = "line"
|
||
|
|
|
||
|
|
"""Hierarchical level (word, line, block, page)."""
|
||
|
|
|
||
|
|
rotation: OcrRotation | None = None
|
||
|
|
|
||
|
|
"""Rotation information (if detected)."""
|
||
|
|
|
||
|
|
page_number: int = 0
|
||
|
|
|
||
|
|
"""Page number (1-indexed)."""
|
||
|
|
|
||
|
|
parent_id: str | None = None
|
||
|
|
|
||
|
|
"""Parent element ID for hierarchical relationships."""
|
||
|
|
|
||
|
|
backend_metadata: dict[str, str] = field(default_factory=dict)
|
||
|
|
|
||
|
|
"""Backend-specific metadata that doesn't fit the unified schema."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class OcrElementConfig:
|
||
|
|
"""Configuration for OCR element extraction."""
|
||
|
|
|
||
|
|
include_elements: bool = False
|
||
|
|
|
||
|
|
"""Whether to include OCR elements in the extraction result."""
|
||
|
|
|
||
|
|
min_level: OcrElementLevel | str = "line"
|
||
|
|
|
||
|
|
"""Minimum hierarchical level to include."""
|
||
|
|
|
||
|
|
min_confidence: float = 0.0
|
||
|
|
|
||
|
|
"""Minimum recognition confidence threshold (0.0-1.0)."""
|
||
|
|
|
||
|
|
build_hierarchy: bool = False
|
||
|
|
|
||
|
|
"""Whether to build hierarchical relationships between elements."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class LayoutRegion:
|
||
|
|
"""A detected layout region on a page."""
|
||
|
|
|
||
|
|
class_name: str = ""
|
||
|
|
|
||
|
|
"""Layout class name (e.g. "picture", "table", "text", "section_header")."""
|
||
|
|
|
||
|
|
confidence: float = 0.0
|
||
|
|
|
||
|
|
"""Confidence score from the layout detection model (0.0 to 1.0)."""
|
||
|
|
|
||
|
|
bounding_box: BoundingBox | None = None
|
||
|
|
|
||
|
|
"""Bounding box in document coordinate space."""
|
||
|
|
|
||
|
|
area_fraction: float = 0.0
|
||
|
|
|
||
|
|
"""Fraction of the page area covered by this region (0.0 to 1.0)."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class RevisionDelta:
|
||
|
|
"""The content changes that make up a single revision."""
|
||
|
|
|
||
|
|
content: list[DiffLine] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Line-level content changes for this revision."""
|
||
|
|
|
||
|
|
table_changes: list[CellChange] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Cell-level table changes for this revision."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class Table:
|
||
|
|
"""Extracted table structure."""
|
||
|
|
|
||
|
|
cells: list[list[str]] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""Table cells as a 2D vector (rows x columns)"""
|
||
|
|
|
||
|
|
markdown: str = ""
|
||
|
|
|
||
|
|
"""Markdown representation of the table"""
|
||
|
|
|
||
|
|
page_number: int = 0
|
||
|
|
|
||
|
|
"""Page number where the table was found (1-indexed)"""
|
||
|
|
|
||
|
|
bounding_box: BoundingBox | None = None
|
||
|
|
|
||
|
|
"""Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top). Only populated for PDF-extracted tables when position data is available."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class TableCell:
|
||
|
|
"""Individual table cell with content and optional styling."""
|
||
|
|
|
||
|
|
content: str = ""
|
||
|
|
|
||
|
|
"""Cell content as text"""
|
||
|
|
|
||
|
|
row_span: int = 0
|
||
|
|
|
||
|
|
"""Row span (number of rows this cell spans)"""
|
||
|
|
|
||
|
|
col_span: int = 0
|
||
|
|
|
||
|
|
"""Column span (number of columns this cell spans)"""
|
||
|
|
|
||
|
|
is_header: bool = False
|
||
|
|
|
||
|
|
"""Whether this is a header cell"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class DiffOptions:
|
||
|
|
"""Options controlling how two `ExtractionResult` values are compared."""
|
||
|
|
|
||
|
|
include_metadata: bool = True
|
||
|
|
|
||
|
|
"""Include metadata changes in the diff. Default: `true`."""
|
||
|
|
|
||
|
|
include_embedded: bool = True
|
||
|
|
|
||
|
|
"""Include embedded-children changes in the diff. Default: `true`."""
|
||
|
|
|
||
|
|
max_content_chars: int | None = None
|
||
|
|
|
||
|
|
"""Truncate content to this many characters before diffing."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class YakeParams:
|
||
|
|
"""YAKE-specific parameters."""
|
||
|
|
|
||
|
|
window_size: int = 2
|
||
|
|
|
||
|
|
"""Window size for co-occurrence analysis (default: 2)."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class RakeParams:
|
||
|
|
"""RAKE-specific parameters."""
|
||
|
|
|
||
|
|
min_word_length: int = 1
|
||
|
|
|
||
|
|
"""Minimum word length to consider (default: 1)."""
|
||
|
|
|
||
|
|
max_words_per_phrase: int = 3
|
||
|
|
|
||
|
|
"""Maximum words in a keyword phrase (default: 3)."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class KeywordConfig:
|
||
|
|
"""Keyword extraction configuration."""
|
||
|
|
|
||
|
|
algorithm: KeywordAlgorithm | str = "yake"
|
||
|
|
|
||
|
|
"""Algorithm to use for extraction."""
|
||
|
|
|
||
|
|
max_keywords: int = 10
|
||
|
|
|
||
|
|
"""Maximum number of keywords to extract (default: 10)."""
|
||
|
|
|
||
|
|
min_score: float = 0
|
||
|
|
|
||
|
|
"""Minimum score threshold (0.0-1.0, default: 0.0)."""
|
||
|
|
|
||
|
|
ngram_range: list[int] = field(default_factory=list)
|
||
|
|
|
||
|
|
"""N-gram range for keyword extraction (min, max)."""
|
||
|
|
|
||
|
|
language: str | None = None
|
||
|
|
|
||
|
|
"""Language code for stopword filtering (e.g., "en", "de", "fr")."""
|
||
|
|
|
||
|
|
yake_params: YakeParams | None = None
|
||
|
|
|
||
|
|
"""YAKE-specific tuning parameters."""
|
||
|
|
|
||
|
|
rake_params: RakeParams | None = None
|
||
|
|
|
||
|
|
"""RAKE-specific tuning parameters."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class PaddleOcrConfig:
|
||
|
|
"""Configuration for PaddleOCR backend."""
|
||
|
|
|
||
|
|
language: str = ""
|
||
|
|
|
||
|
|
"""Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra")"""
|
||
|
|
|
||
|
|
cache_dir: str | None = None
|
||
|
|
|
||
|
|
"""Optional custom cache directory for model files"""
|
||
|
|
|
||
|
|
use_angle_cls: bool = False
|
||
|
|
|
||
|
|
"""Enable angle classification for rotated text (default: false). Can misfire on short text regions, rotating crops incorrectly before recognition."""
|
||
|
|
|
||
|
|
enable_table_detection: bool = False
|
||
|
|
|
||
|
|
"""Enable table structure detection (default: false)"""
|
||
|
|
|
||
|
|
det_db_thresh: float = 0.0
|
||
|
|
|
||
|
|
"""Database threshold for text detection (default: 0.3) Range: 0.0-1.0, higher values require more confident detections"""
|
||
|
|
|
||
|
|
det_db_box_thresh: float = 0.0
|
||
|
|
|
||
|
|
"""Box threshold for text bounding box refinement (default: 0.5) Range: 0.0-1.0"""
|
||
|
|
|
||
|
|
det_db_unclip_ratio: float = 0.0
|
||
|
|
|
||
|
|
"""Unclip ratio for expanding text bounding boxes (default: 1.6) Controls the expansion of detected text regions"""
|
||
|
|
|
||
|
|
det_limit_side_len: int = 0
|
||
|
|
|
||
|
|
"""Maximum side length for detection image (default: 960) Larger images may be resized to this limit for faster inference"""
|
||
|
|
|
||
|
|
rec_batch_num: int = 0
|
||
|
|
|
||
|
|
"""Batch size for recognition inference (default: 6) Number of text regions to process simultaneously"""
|
||
|
|
|
||
|
|
padding: int = 0
|
||
|
|
|
||
|
|
"""Padding in pixels added around the image before detection (default: 10). Large values can include surrounding content like table gridlines."""
|
||
|
|
|
||
|
|
drop_score: float = 0.0
|
||
|
|
|
||
|
|
"""Minimum recognition confidence score for text lines (default: 0.5). Text regions with recognition confidence below this threshold are discarded. Matches PaddleOCR Python's `drop_score` parameter. Range: 0.0-1.0"""
|
||
|
|
|
||
|
|
model_tier: str = ""
|
||
|
|
|
||
|
|
"""Model tier controlling detection/recognition model size and accuracy trade-off. - `"mobile"` (default): Lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference - `"server"`: Large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents"""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True, slots=True)
|
||
|
|
class PdfMetadata:
|
||
|
|
"""PDF-specific metadata."""
|
||
|
|
|
||
|
|
pdf_version: str | None = None
|
||
|
|
|
||
|
|
"""PDF version (e.g., "1.7", "2.0")"""
|
||
|
|
|
||
|
|
producer: str | None = None
|
||
|
|
|
||
|
|
"""PDF producer (application that created the PDF)"""
|
||
|
|
|
||
|
|
is_encrypted: bool | None = None
|
||
|
|
|
||
|
|
"""Whether the PDF is encrypted/password-protected"""
|
||
|
|
|
||
|
|
width: int | None = None
|
||
|
|
|
||
|
|
"""First page width in points (1/72 inch)"""
|
||
|
|
|
||
|
|
height: int | None = None
|
||
|
|
|
||
|
|
"""First page height in points (1/72 inch)"""
|
||
|
|
|
||
|
|
page_count: int | None = None
|
||
|
|
|
||
|
|
"""Total number of pages in the PDF document"""
|
||
|
|
|
||
|
|
|
||
|
|
# Output format for extraction results (plain alias of ``str``).
OutputFormat = str
|
||
|
|
# How chunk size is measured.
|
||
|
|
|
||
|
|
ChunkSizing = str | str | None
|
||
|
|
# Embedding model types supported by Kreuzberg.
# NOTE(review): this union is evaluated at import time, so ``LlmConfig`` must
# already be bound at runtime when this line executes - confirm.
EmbeddingModelType = str | int | LlmConfig
|
||
|
|
# Format-specific metadata (discriminated union).
# NOTE(review): evaluated at import time, so every member class must already
# be defined at this point in the module - confirm for the classes declared
# outside this section (ExcelMetadata, EmailMetadata, ArchiveMetadata).
FormatMetadata = (
    PdfMetadata
    | DocxMetadata
    | ExcelMetadata
    | EmailMetadata
    | PptxMetadata
    | ArchiveMetadata
    | ImageMetadata
    | XmlMetadata
    | TextMetadata
    | HtmlMetadata
    | OcrMetadata
    | CsvMetadata
    | BibtexMetadata
    | CitationMetadata
    | FictionBookMetadata
    | DbfMetadata
    | JatsMetadata
    | EpubMetadata
    | PstMetadata
    | str
)
|
||
|
|
|
||
|
|
# Bounding geometry for an OCR element.
|
||
|
|
|
||
|
|
OcrBoundingGeometry = int | str
|
||
|
|
# A single line in a unified-diff hunk (plain alias of ``str``).
DiffLine = str
|