# packages/python/kreuzberg/options.py

# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""Configuration options for the conversion API."""

from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from typing import TYPE_CHECKING, Any, TypedDict

from ._kreuzberg import (
    ChunkerType,  # noqa: TC001
    CodeContentMode,  # noqa: TC001
    ExecutionProviderType,  # noqa: TC001
    ExtractionMethod,  # noqa: TC001
    HtmlTheme,  # noqa: TC001
    KeywordAlgorithm,  # noqa: TC001
    OcrElementLevel,  # noqa: TC001
    ReductionLevel,  # noqa: TC001
    ResultFormat,  # noqa: TC001
    TableModel,  # noqa: TC001
    TextDirection,  # noqa: TC001
)

if TYPE_CHECKING:
    from ._kreuzberg import (
        ArchiveEntry,  # noqa: F401
        CellChange,  # noqa: F401
        Chunk,  # noqa: F401
        ContributorRole,  # noqa: F401
        DbfFieldInfo,  # noqa: F401
        DjotContent,  # noqa: F401
        DocumentNode,  # noqa: F401
        DocumentRelationship,  # noqa: F401
        DocumentRevision,  # noqa: F401
        Element,  # noqa: F401
        ErrorMetadata,  # noqa: F401
        ExtractedImage,  # noqa: F401
        ExtractedUri,  # noqa: F401
        GridCell,  # noqa: F401
        HeaderMetadata,  # noqa: F401
        ImageMetadataType,  # noqa: F401
        ImagePreprocessingMetadata,  # noqa: F401
        Keyword,  # noqa: F401
        LinkMetadata,  # noqa: F401
        OcrPipelineConfig,  # noqa: F401
        OcrRotation,  # noqa: F401
        PageContent,  # noqa: F401
        PageStructure,  # noqa: F401
        PdfAnnotation,  # noqa: F401
        ProcessingWarning,  # noqa: F401
        StructuredData,  # noqa: F401
        StructuredExtractionConfig,  # noqa: F401
        YearRange,  # noqa: F401
    )


class ListType(str, Enum):
    """Type of list recognised by list detection.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    bullet = "bullet"
    numbered = "numbered"
    lettered = "lettered"
    indented = "indented"


class OcrBackendType(str, Enum):
    """Identifiers for the available OCR backend types.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    tesseract = "tesseract"
    easy_ocr = "easy_ocr"
    paddle_ocr = "paddle_ocr"
    custom = "custom"


class ProcessingStage(str, Enum):
    """Stages at which a post-processor can run.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    early = "early"
    middle = "middle"
    late = "late"


class PdfAnnotationType(str, Enum):
    """Kind of annotation found in a PDF.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    text = "text"
    highlight = "highlight"
    link = "link"
    stamp = "stamp"
    underline = "underline"
    strike_out = "strike_out"
    other = "other"


class BlockType(str, Enum):
    """Block-level element types in Djot.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    paragraph = "paragraph"
    heading = "heading"
    blockquote = "blockquote"
    code_block = "code_block"
    list_item = "list_item"
    ordered_list = "ordered_list"
    bullet_list = "bullet_list"
    task_list = "task_list"
    definition_list = "definition_list"
    definition_term = "definition_term"
    definition_description = "definition_description"
    div = "div"
    section = "section"
    thematic_break = "thematic_break"
    raw_block = "raw_block"
    math_display = "math_display"


class InlineType(str, Enum):
    """Inline element types in Djot.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    text = "text"
    strong = "strong"
    emphasis = "emphasis"
    highlight = "highlight"
    subscript = "subscript"
    superscript = "superscript"
    insert = "insert"
    delete = "delete"
    code = "code"
    link = "link"
    image = "image"
    span = "span"
    math = "math"
    raw_inline = "raw_inline"
    footnote_ref = "footnote_ref"
    symbol = "symbol"


class RelationshipKind(str, Enum):
    """Semantic kind of a relationship linking two document elements.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    footnote_reference = "footnote_reference"
    citation_reference = "citation_reference"
    internal_link = "internal_link"
    caption = "caption"
    label = "label"
    toc_entry = "toc_entry"
    cross_reference = "cross_reference"


class ContentLayer(str, Enum):
    """Content layer a document node is classified into.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    body = "body"
    header = "header"
    footer = "footer"
    footnote = "footnote"


class ChunkType(str, Enum):
    """Semantic, structural classification assigned to a text chunk.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    heading = "heading"
    party_list = "party_list"
    definitions = "definitions"
    operative_clause = "operative_clause"
    signature_block = "signature_block"
    schedule = "schedule"
    table_like = "table_like"
    formula = "formula"
    code_block = "code_block"
    image = "image"
    org_chart = "org_chart"
    diagram = "diagram"
    unknown = "unknown"


class ImageKind(str, Enum):
    """Heuristic guess at what an image most likely depicts.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    photograph = "photograph"
    diagram = "diagram"
    chart = "chart"
    drawing = "drawing"
    text_block = "text_block"
    decoration = "decoration"
    logo = "logo"
    icon = "icon"
    tile_fragment = "tile_fragment"
    mask = "mask"
    page_raster = "page_raster"
    unknown = "unknown"


class ElementType(str, Enum):
    """Semantic classification of an element.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    # The member is spelled ``title_`` while its value is plain "title".
    title_ = "title"
    narrative_text = "narrative_text"
    heading = "heading"
    list_item = "list_item"
    table = "table"
    image = "image"
    page_break = "page_break"
    code_block = "code_block"
    block_quote = "block_quote"
    footer = "footer"
    header = "header"


class LinkType(str, Enum):
    """Classification of a link.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    anchor = "anchor"
    internal = "internal"
    external = "external"
    email = "email"
    phone = "phone"
    other = "other"


class ImageType(str, Enum):
    """Classification of an image reference.

    Members subclass ``str``, so each one compares equal to its string value.
    Note that two values are hyphenated while their member names use
    underscores.
    """

    data_uri = "data-uri"
    inline_svg = "inline-svg"
    external = "external"
    relative = "relative"


class StructuredDataType(str, Enum):
    """Classification of a structured-data block.

    Members subclass ``str``, so each one compares equal to its string value.
    Member names differ from the values for ``json_ld`` and ``rd_fa``.
    """

    json_ld = "json-ld"
    microdata = "microdata"
    rd_fa = "rdfa"


class PageUnitType(str, Enum):
    """Kind of paginated unit a document is divided into.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    page = "page"
    slide = "slide"
    sheet = "sheet"


class RevisionKind(str, Enum):
    """Semantic classification of a tracked change.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    insertion = "insertion"
    deletion = "deletion"
    format_change = "format_change"
    comment = "comment"


class UriKind(str, Enum):
    """Semantic classification of a URI extracted from a document.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    hyperlink = "hyperlink"
    image = "image"
    anchor = "anchor"
    citation = "citation"
    reference = "reference"
    email = "email"


class PSMMode(str, Enum):
    """Page Segmentation Mode (PSM) options for Tesseract OCR.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    osd_only = "osd_only"
    auto_osd = "auto_osd"
    auto_only = "auto_only"
    auto = "auto"
    single_column = "single_column"
    single_block_vertical = "single_block_vertical"
    single_block = "single_block"
    single_line = "single_line"
    single_word = "single_word"
    circle_word = "circle_word"
    single_char = "single_char"


class PaddleLanguage(str, Enum):
    """Languages supported in PaddleOCR.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    english = "english"
    chinese = "chinese"
    japanese = "japanese"
    korean = "korean"
    german = "german"
    french = "french"
    latin = "latin"
    cyrillic = "cyrillic"
    traditional_chinese = "traditional_chinese"
    thai = "thai"
    greek = "greek"
    east_slavic = "east_slavic"
    arabic = "arabic"
    devanagari = "devanagari"
    tamil = "tamil"
    telugu = "telugu"


class LayoutClass(str, Enum):
    """The 17 canonical layout classes a document region can belong to.

    Members subclass ``str``, so each one compares equal to its string value.
    """

    caption = "caption"
    footnote = "footnote"
    formula = "formula"
    list_item = "list_item"
    page_footer = "page_footer"
    page_header = "page_header"
    picture = "picture"
    section_header = "section_header"
    table = "table"
    text = "text"
    # The member is spelled ``title_`` while its value is plain "title".
    title_ = "title"
    document_index = "document_index"
    code = "code"
    checkbox_selected = "checkbox_selected"
    checkbox_unselected = "checkbox_unselected"
    form = "form"
    key_value_region = "key_value_region"


@dataclass(frozen=True, slots=True)
class AccelerationConfig:
    """Hardware acceleration configuration for ONNX Runtime models."""

    provider: ExecutionProviderType | str = "auto"

    """Execution provider to use for ONNX inference."""

    device_id: int = 0

    """GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto."""


@dataclass(frozen=True, slots=True)
class ContentFilterConfig:
    """Cross-extractor content filtering configuration."""

    include_headers: bool = False

    """Include running headers in extraction output."""

    include_footers: bool = False

    """Include running footers in extraction output."""

    strip_repeating_text: bool = True

    """Enable the heuristic cross-page repeating text detector."""

    include_watermarks: bool = False

    """Include watermark text in extraction output."""


@dataclass(frozen=True, slots=True)
class EmailConfig:
    """Configuration for email extraction."""

    msg_fallback_codepage: int | None = None

    """Windows codepage number to use when an MSG file contains no codepage property. Defaults to `None`, which falls back to windows-1252."""


class ExtractionConfig(TypedDict, total=False):
    """Main extraction configuration.

    Declared with ``total=False``, so every key is optional.
    """

    use_cache: bool
    """Enable caching of extraction results."""

    enable_quality_processing: bool
    """Enable quality post-processing."""

    ocr: OcrConfig | None
    """OCR configuration (None = OCR disabled)."""

    force_ocr: bool
    """Force OCR even for searchable PDFs."""

    force_ocr_pages: list[int] | None
    """Force OCR on specific pages only (1-indexed page numbers, must be >= 1)."""

    disable_ocr: bool
    """Disable OCR entirely, even for images."""

    chunking: ChunkingConfig | None
    """Text chunking configuration (None = chunking disabled)."""

    content_filter: ContentFilterConfig | None
    """Content filtering configuration (None = use extractor defaults)."""

    images: ImageExtractionConfig | None
    """Image extraction configuration (None = no image extraction)."""

    pdf_options: PdfConfig | None
    """PDF-specific options (None = use defaults)."""

    token_reduction: TokenReductionOptions | None
    """Token reduction configuration (None = no token reduction)."""

    language_detection: LanguageDetectionConfig | None
    """Language detection configuration (None = no language detection)."""

    pages: PageConfig | None
    """Page extraction configuration (None = no page tracking)."""

    keywords: KeywordConfig | None
    """Keyword extraction configuration (None = no keyword extraction)."""

    postprocessor: PostProcessorConfig | None
    """Post-processor configuration (None = use defaults)."""

    html_options: str | None
    """HTML to Markdown conversion options (None = use defaults)."""

    html_output: HtmlOutputConfig | None
    """Styled HTML output configuration."""

    extraction_timeout_secs: int | None
    """Default per-file timeout in seconds for batch extraction."""

    max_concurrent_extractions: int | None
    """Maximum concurrent extractions in batch operations (None = (num_cpus x 1.5).ceil())."""

    result_format: ResultFormat | str
    """Result structure format."""

    security_limits: SecurityLimits | None
    """Security limits for archive extraction."""

    max_embedded_file_bytes: int | None
    """Maximum uncompressed size in bytes of a single embedded file before
    recursive extraction is attempted (default: 50 MiB)."""

    output_format: OutputFormat
    """Content text format (default: Plain)."""

    layout: LayoutDetectionConfig | None
    """Layout detection configuration (None = layout detection disabled)."""

    use_layout_for_markdown: bool
    """Run layout detection on the non-OCR PDF markdown path."""

    include_document_structure: bool
    """Enable structured document tree output."""

    acceleration: AccelerationConfig | None
    """Hardware acceleration configuration for ONNX Runtime models."""

    cache_namespace: str | None
    """Cache namespace for tenant isolation."""

    cache_ttl_secs: int | None
    """Per-request cache TTL in seconds."""

    email: EmailConfig | None
    """Email extraction configuration (None = use defaults)."""

    concurrency: str | None
    """Concurrency limits for constrained environments (None = use defaults)."""

    max_archive_depth: int
    """Maximum recursion depth for archive extraction (default: 3).

    Set to 0 to disable recursive extraction (legacy behavior).
    """

    tree_sitter: TreeSitterConfig | None
    """Tree-sitter language pack configuration (None = tree-sitter disabled)."""

    structured_extraction: StructuredExtractionConfig | None
    """Structured extraction via LLM (None = disabled)."""

    cancel_token: str | None
    """Cancellation token for this extraction (None = no external cancellation)."""

@dataclass(frozen=True, slots=True)
class FileExtractionConfig:
    """Per-file extraction configuration overrides for batch processing."""

    enable_quality_processing: bool | None = None

    """Override quality post-processing for this file."""

    ocr: OcrConfig | None = None

    """Override OCR configuration for this file (None in the Option = use batch default)."""

    force_ocr: bool | None = None

    """Override force OCR for this file."""

    force_ocr_pages: list[int] | None = None

    """Override force OCR pages for this file (1-indexed page numbers)."""

    disable_ocr: bool | None = None

    """Override disable OCR for this file."""

    chunking: ChunkingConfig | None = None

    """Override chunking configuration for this file."""

    content_filter: ContentFilterConfig | None = None

    """Override content filtering configuration for this file."""

    images: ImageExtractionConfig | None = None

    """Override image extraction configuration for this file."""

    pdf_options: PdfConfig | None = None

    """Override PDF options for this file."""

    token_reduction: TokenReductionOptions | None = None

    """Override token reduction for this file."""

    language_detection: LanguageDetectionConfig | None = None

    """Override language detection for this file."""

    pages: PageConfig | None = None

    """Override page extraction for this file."""

    keywords: KeywordConfig | None = None

    """Override keyword extraction for this file."""

    postprocessor: PostProcessorConfig | None = None

    """Override post-processor for this file."""

    html_options: str | None = None

    """Override HTML conversion options for this file."""

    result_format: ResultFormat | str | None = None

    """Override result format for this file."""

    output_format: OutputFormat | None = None

    """Override output content format for this file."""

    include_document_structure: bool | None = None

    """Override document structure output for this file."""

    layout: LayoutDetectionConfig | None = None

    """Override layout detection for this file."""

    timeout_secs: int | None = None

    """Override per-file extraction timeout in seconds."""

    tree_sitter: TreeSitterConfig | None = None

    """Override tree-sitter configuration for this file."""

    structured_extraction: StructuredExtractionConfig | None = None

    """Override structured extraction configuration for this file."""


@dataclass(frozen=True, slots=True)
class ImageExtractionConfig:
    """Image extraction configuration."""

    extract_images: bool = True

    """Extract images from documents"""

    target_dpi: int = 300

    """Target DPI for image normalization"""

    max_image_dimension: int = 4096

    """Maximum dimension for images (width or height)"""

    inject_placeholders: bool = True

    """Whether to inject image reference placeholders into markdown output. When `true` (default), image references like `![Image 1](embedded:p1_i0)` are appended to the markdown. Set to `false` to extract images as data without polluting the markdown output."""

    auto_adjust_dpi: bool = True

    """Automatically adjust DPI based on image content"""

    min_dpi: int = 72

    """Minimum DPI threshold"""

    max_dpi: int = 600

    """Maximum DPI threshold"""

    max_images_per_page: int | None = None

    """Maximum number of image objects to extract per PDF page."""

    classify: bool = True

    """When `true` (default), extracted images are classified by kind and grouped into clusters where they appear to belong to one figure."""

    include_page_rasters: bool = False

    """When `true`, full-page renders produced during OCR preprocessing are captured and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`."""

    run_ocr_on_images: bool = True

    """Run OCR on extracted images and include the recognized text in the document content."""

    ocr_text_only: bool = False

    """When `true`, image OCR results are rendered as plain text without the `![...](...)` markdown placeholder. Only takes effect when `run_ocr_on_images` is also `true`."""

    append_ocr_text: bool = False

    """When `true` and `ocr_text_only` is `false`, append the OCR text after the image placeholder in the rendered output."""


@dataclass(frozen=True, slots=True)
class TokenReductionOptions:
    """Token reduction configuration."""

    mode: str = ""

    """Reduction mode: "off", "light", "moderate", "aggressive", "maximum" """

    preserve_important_words: bool = True

    """Preserve important words (capitalized, technical terms)"""


@dataclass(frozen=True, slots=True)
class LanguageDetectionConfig:
    """Language detection configuration."""

    enabled: bool = True

    """Enable language detection"""

    min_confidence: float = 0.8

    """Minimum confidence threshold (0.0-1.0)"""

    detect_multiple: bool = False

    """Detect multiple languages in the document"""


@dataclass(frozen=True, slots=True)
class HtmlOutputConfig:
    """Configuration for styled HTML output."""

    css: str | None = None

    """Inline CSS string injected into the output after the theme stylesheet. Concatenated after `css_file` content when both are set."""

    css_file: str | None = None

    """Path to a CSS file loaded once at renderer construction time. Concatenated before `css` when both are set."""

    theme: HtmlTheme | str = "unstyled"

    """Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`]."""

    class_prefix: str = ""

    """CSS class prefix applied to every emitted class name."""

    embed_css: bool = True

    """When `true` (default), write the resolved CSS into a `<style>` block immediately after the opening `<div class="{prefix}doc">`."""


@dataclass(frozen=True, slots=True)
class LayoutDetectionConfig:
    """Layout detection configuration."""

    confidence_threshold: float | None = None

    """Confidence threshold override (None = use model default)."""

    apply_heuristics: bool = True

    """Whether to apply postprocessing heuristics (default: true)."""

    table_model: TableModel | str = "tatr"

    """Table structure recognition model."""

    acceleration: AccelerationConfig | None = None

    """Hardware acceleration for ONNX models (layout detection + table structure)."""


@dataclass(frozen=True, slots=True)
class LlmConfig:
    """Configuration for an LLM provider/model via liter-llm."""

    model: str = ""

    """Provider/model string using liter-llm routing format."""

    api_key: str | None = None

    """API key for the provider. When `None`, liter-llm falls back to the provider's standard environment variable (e.g., `OPENAI_API_KEY`)."""

    base_url: str | None = None

    """Custom base URL override for the provider endpoint."""

    timeout_secs: int | None = None

    """Request timeout in seconds (default: 60)."""

    max_retries: int | None = None

    """Maximum retry attempts (default: 3)."""

    temperature: float | None = None

    """Sampling temperature for generation tasks."""

    max_tokens: int | None = None

    """Maximum tokens to generate."""


@dataclass(frozen=True, slots=True)
class OcrQualityThresholds:
    """Quality thresholds for OCR fallback decisions and pipeline quality gating."""

    min_total_non_whitespace: int = 64

    """Minimum total non-whitespace characters to consider text substantive."""

    min_non_whitespace_per_page: float = 32

    """Minimum non-whitespace characters per page on average."""

    min_meaningful_word_len: int = 4

    """Minimum character count for a word to be "meaningful"."""

    min_meaningful_words: int = 3

    """Minimum count of meaningful words before text is accepted."""

    min_alnum_ratio: float = 0.3

    """Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric)."""

    min_garbage_chars: int = 5

    """Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback."""

    max_fragmented_word_ratio: float = 0.6

    """Maximum fraction of short (1-2 char) words before text is considered fragmented."""

    critical_fragmented_word_ratio: float = 0.8

    """Critical fragmentation threshold -- triggers OCR regardless of meaningful words. Normal English text has ~20-30% short words. 80%+ is definitive garbage."""

    min_avg_word_length: float = 2

    """Minimum average word length. Below this with enough words indicates garbled extraction."""

    min_words_for_avg_length_check: int = 50

    """Minimum word count before average word length check applies."""

    min_consecutive_repeat_ratio: float = 0.08

    """Minimum consecutive word repetition ratio to detect column scrambling."""

    min_words_for_repeat_check: int = 50

    """Minimum word count before consecutive repetition check is applied."""

    substantive_min_chars: int = 100

    """Minimum character count for "substantive markdown" OCR skip gate."""

    non_text_min_chars: int = 20

    """Minimum character count for "non-text content" OCR skip gate."""

    alnum_ws_ratio_threshold: float = 0.4

    """Alphanumeric+whitespace ratio threshold for skip decisions."""

    pipeline_min_quality: float = 0.5

    """Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted. If the result from a backend scores below this, try the next backend."""


@dataclass(frozen=True, slots=True)
class OcrConfig:
    """OCR configuration."""

    enabled: bool = True

    """Whether OCR is enabled."""

    backend: str = ""

    """OCR backend: tesseract, easyocr, paddleocr"""

    language: str = ""

    """Language code (e.g., "eng", "deu")"""

    tesseract_config: TesseractConfig | None = None

    """Tesseract-specific configuration (optional)"""

    output_format: OutputFormat | None = None

    """Output format for OCR results (optional, for format conversion)"""

    paddle_ocr_config: str | None = None

    """PaddleOCR-specific configuration (optional, JSON passthrough)"""

    backend_options: str | None = None

    """Arbitrary per-call options passed through to the backend unchanged."""

    element_config: OcrElementConfig | None = None

    """OCR element extraction configuration"""

    quality_thresholds: OcrQualityThresholds | None = None

    """Quality thresholds for the native-text-to-OCR fallback decision. When None, uses compiled defaults (matching previous hardcoded behavior)."""

    pipeline: OcrPipelineConfig | None = None

    """Multi-backend OCR pipeline configuration. When set, enables weighted fallback across multiple OCR backends based on output quality. When None, uses the single `backend` field (same as today)."""

    auto_rotate: bool = False

    """Enable automatic page rotation based on orientation detection."""

    vlm_config: LlmConfig | None = None

    """VLM (Vision Language Model) OCR configuration."""

    vlm_prompt: str | None = None

    """Custom Jinja2 prompt template for VLM OCR."""

    acceleration: AccelerationConfig | None = None

    """Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection)."""

    tessdata_bytes: dict[str, bytes] | None = None

    """Caller-supplied Tesseract `traineddata` bytes per language code."""


@dataclass(frozen=True, slots=True)
class PageConfig:
    """Page extraction and tracking configuration."""

    extract_pages: bool = False

    """Extract pages as separate array (ExtractionResult.pages)"""

    insert_page_markers: bool = False

    """Insert page markers in main content string"""

    marker_format: str = "\n\n<!-- PAGE {page_num} -->\n\n"

    """Page marker format (use {page_num} placeholder) Default: "\n\n<!-- PAGE {page_num} -->\n\n" """


@dataclass(frozen=True, slots=True)
class PdfConfig:
    """PDF-specific configuration."""

    extract_images: bool = False

    """Extract images from PDF"""

    extract_tables: bool = True

    """Extract tables from PDF."""

    passwords: list[str] | None = None

    """List of passwords to try when opening encrypted PDFs"""

    extract_metadata: bool = True

    """Extract PDF metadata"""

    hierarchy: HierarchyConfig | None = None

    """Hierarchy extraction configuration (None = hierarchy extraction disabled)"""

    extract_annotations: bool = False

    """Extract PDF annotations (text notes, highlights, links, stamps). Default: false"""

    top_margin_fraction: float | None = None

    """Top margin fraction (0.0-1.0) of page height to exclude headers/running heads. Default: 0.06 (6%)"""

    bottom_margin_fraction: float | None = None

    """Bottom margin fraction (0.0-1.0) of page height to exclude footers/page numbers. Default: 0.05 (5%)"""

    allow_single_column_tables: bool = False

    """Allow single-column pseudo tables in extraction results."""

    ocr_inline_images: bool = False

    """Perform OCR on inline images extracted from PDF pages and attach the recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract to be available; if `ExtractionConfig.ocr` is `None` the extractor falls back to `TesseractConfig::default()`. Per-image failures degrade gracefully (the image is returned without OCR text rather than failing the whole extraction). Default: `false`."""


@dataclass(frozen=True, slots=True)
class HierarchyConfig:
    """Hierarchy extraction configuration for PDF text structure analysis."""

    enabled: bool = True

    """Enable hierarchy extraction"""

    k_clusters: int = 3

    """Number of font size clusters to use for hierarchy levels (1-7)"""

    include_bbox: bool = True

    """Include bounding box information in hierarchy blocks"""

    ocr_coverage_threshold: float | None = None

    """OCR coverage threshold for smart OCR triggering (0.0-1.0)"""


@dataclass(frozen=True, slots=True)
class PostProcessorConfig:
    """Post-processor configuration."""

    enabled: bool = True

    """Enable post-processors"""

    enabled_processors: list[str] | None = None

    """Whitelist of processor names to run (None = all enabled)"""

    disabled_processors: list[str] | None = None

    """Blacklist of processor names to skip (None = none disabled)"""

    enabled_set: list[str] | None = None

    """Pre-computed AHashSet for O(1) enabled processor lookup"""

    disabled_set: list[str] | None = None

    """Pre-computed AHashSet for O(1) disabled processor lookup"""


@dataclass(frozen=True, slots=True)
class ChunkingConfig:
    """Chunking configuration."""

    max_characters: int = 1000

    """Maximum size per chunk (in units determined by `sizing`)."""

    overlap: int = 200

    """Overlap between chunks (in units determined by `sizing`)."""

    trim: bool = True

    """Whether to trim whitespace from chunk boundaries."""

    chunker_type: ChunkerType | str = "text"

    """Type of chunker to use (Text or Markdown)."""

    embedding: EmbeddingConfig | None = None

    """Optional embedding configuration for chunk embeddings."""

    preset: str | None = None

    """Use a preset configuration (overrides individual settings if provided)."""

    sizing: ChunkSizing | None = None

    """How to measure chunk size."""

    prepend_heading_context: bool = False

    """When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string."""

    topic_threshold: float | None = None

    """Optional cosine similarity threshold for semantic topic boundary detection."""


@dataclass(frozen=True, slots=True)
class EmbeddingConfig:
    """Embedding configuration for text chunks."""

    model: EmbeddingModelType | None = None

    """The embedding model to use (defaults to "balanced" preset if not specified)"""

    normalize: bool = True

    """Whether to normalize embedding vectors (recommended for cosine similarity)"""

    batch_size: int = 32

    """Batch size for embedding generation"""

    show_download_progress: bool = False

    """Show model download progress"""

    cache_dir: str | None = None

    """Custom cache directory for model files"""

    acceleration: AccelerationConfig | None = None

    """Hardware acceleration for the embedding ONNX model."""

    max_embed_duration_secs: int | None = None

    """Maximum wall-clock duration (in seconds) for a single `embed()` call when using [`EmbeddingModelType::Plugin`]."""


@dataclass(frozen=True, slots=True)
class TreeSitterConfig:
    """Configuration for tree-sitter language pack integration."""

    enabled: bool = True

    """Enable code intelligence processing (default: true)."""

    cache_dir: str | None = None

    """Custom cache directory for downloaded grammars."""

    languages: list[str] | None = None

    """Languages to pre-download on init (e.g., `["python", "rust"]`)."""

    groups: list[str] | None = None

    """Language groups to pre-download (e.g., `["web", "systems", "scripting"]`)."""

    process: TreeSitterProcessConfig | None = None

    """Processing options for code analysis."""


@dataclass(frozen=True, slots=True)
class TreeSitterProcessConfig:
    """Processing options for tree-sitter code analysis."""

    structure: bool = True

    """Extract structural items (functions, classes, structs, etc.). Default: true."""

    imports: bool = True

    """Extract import statements. Default: true."""

    exports: bool = True

    """Extract export statements. Default: true."""

    comments: bool = False

    """Extract comments. Default: false."""

    docstrings: bool = False

    """Extract docstrings. Default: false."""

    symbols: bool = False

    """Extract symbol definitions. Default: false."""

    diagnostics: bool = False

    """Include parse diagnostics. Default: false."""

    chunk_max_size: int | None = None

    """Maximum chunk size in bytes. `None` disables chunking."""

    content_mode: CodeContentMode | str = "chunks"

    """Content rendering mode for code extraction."""


@dataclass(frozen=True, slots=True)
class ServerConfig:
    """API server configuration."""

    host: str = ""

    """Server host address (e.g., "127.0.0.1", "0.0.0.0")"""

    port: int = 0

    """Server port number"""

    cors_origins: list[str] = field(default_factory=list)

    """CORS allowed origins. Empty vector means allow all origins."""

    max_request_body_bytes: int = 0

    """Maximum size of request body in bytes (default: 100 MB)"""

    max_multipart_field_bytes: int = 0

    """Maximum size of multipart fields in bytes (default: 100 MB)"""


@dataclass(frozen=True, slots=True)
class DocxAppProperties:
    """Application properties from docProps/app.xml for DOCX."""

    application: str | None = None

    """Application name (e.g., "Microsoft Office Word")"""

    app_version: str | None = None

    """Application version"""

    template: str | None = None

    """Template filename"""

    total_time: int | None = None

    """Total editing time in minutes"""

    pages: int | None = None

    """Number of pages"""

    words: int | None = None

    """Number of words"""

    characters: int | None = None

    """Number of characters (excluding spaces)"""

    characters_with_spaces: int | None = None

    """Number of characters (including spaces)"""

    lines: int | None = None

    """Number of lines"""

    paragraphs: int | None = None

    """Number of paragraphs"""

    company: str | None = None

    """Company name"""

    doc_security: int | None = None

    """Document security level"""

    scale_crop: bool | None = None

    """Scale crop flag"""

    links_up_to_date: bool | None = None

    """Links up to date flag"""

    shared_doc: bool | None = None

    """Shared document flag"""

    hyperlinks_changed: bool | None = None

    """Hyperlinks changed flag"""


@dataclass(frozen=True, slots=True)
class XlsxAppProperties:
    """Application properties from docProps/app.xml for XLSX."""

    application: str | None = None

    """Application name (e.g., "Microsoft Excel")"""

    app_version: str | None = None

    """Application version"""

    doc_security: int | None = None

    """Document security level"""

    scale_crop: bool | None = None

    """Scale crop flag"""

    links_up_to_date: bool | None = None

    """Links up to date flag"""

    shared_doc: bool | None = None

    """Shared document flag"""

    hyperlinks_changed: bool | None = None

    """Hyperlinks changed flag"""

    company: str | None = None

    """Company name"""

    worksheet_names: list[str] = field(default_factory=list)

    """Worksheet names"""


@dataclass(frozen=True, slots=True)
class PptxAppProperties:
    """Application properties from docProps/app.xml for PPTX."""

    application: str | None = None

    """Application name (e.g., "Microsoft Office PowerPoint")"""

    app_version: str | None = None

    """Application version"""

    total_time: int | None = None

    """Total editing time in minutes"""

    company: str | None = None

    """Company name"""

    doc_security: int | None = None

    """Document security level"""

    scale_crop: bool | None = None

    """Scale crop flag"""

    links_up_to_date: bool | None = None

    """Links up to date flag"""

    shared_doc: bool | None = None

    """Shared document flag"""

    hyperlinks_changed: bool | None = None

    """Hyperlinks changed flag"""

    slides: int | None = None

    """Number of slides"""

    notes: int | None = None

    """Number of notes"""

    hidden_slides: int | None = None

    """Number of hidden slides"""

    multimedia_clips: int | None = None

    """Number of multimedia clips"""

    presentation_format: str | None = None

    """Presentation format (e.g., "Widescreen", "Standard")"""

    slide_titles: list[str] = field(default_factory=list)

    """Slide titles"""


@dataclass(frozen=True, slots=True)
class CoreProperties:
    """Dublin Core metadata from docProps/core.xml."""

    title: str | None = None

    """Document title"""

    subject: str | None = None

    """Document subject/topic"""

    creator: str | None = None

    """Document creator/author"""

    keywords: str | None = None

    """Keywords or tags"""

    description: str | None = None

    """Document description/abstract"""

    last_modified_by: str | None = None

    """User who last modified the document"""

    revision: str | None = None

    """Revision number"""

    created: str | None = None

    """Creation timestamp (ISO 8601)"""

    modified: str | None = None

    """Last modification timestamp (ISO 8601)"""

    category: str | None = None

    """Document category"""

    content_status: str | None = None

    """Content status (Draft, Final, etc.)"""

    language: str | None = None

    """Document language"""

    identifier: str | None = None

    """Unique identifier"""

    version: str | None = None

    """Document version"""

    last_printed: str | None = None

    """Last print timestamp (ISO 8601)"""


@dataclass(frozen=True, slots=True)
class SecurityLimits:
    """Configuration for security limits across extractors."""

    max_archive_size: int = 524288000

    """Maximum uncompressed size for archives (500 MB)"""

    max_compression_ratio: int = 100

    """Maximum compression ratio before flagging as potential bomb (100:1)"""

    max_files_in_archive: int = 10000

    """Maximum number of files in archive (10,000)"""

    max_nesting_depth: int = 1024

    """Maximum nesting depth for structures (100)"""

    max_entity_length: int = 1048576

    """Maximum length of any single XML entity / attribute / token (1 MiB). This is a per-token cap, NOT a total cap -- billion-laughs class attacks where a single entity expands to hundreds of MB are caught here, while normal long text content (a paragraph, a CDATA block) is caught by `max_content_size` instead."""

    max_content_size: int = 104857600

    """Maximum string growth per document (100 MB)"""

    max_iterations: int = 10000000

    """Maximum iterations per operation"""

    max_xml_depth: int = 1024

    """Maximum XML depth (100 levels)"""

    max_table_cells: int = 100000

    """Maximum cells per table (100,000)"""


@dataclass(frozen=True, slots=True)
class TokenReductionConfig:
    """Token reduction config."""

    level: ReductionLevel | str = "moderate"

    language_hint: str | None = None

    preserve_markdown: bool = False

    preserve_code: bool = True

    semantic_threshold: float = 0.3

    enable_parallel: bool = True

    use_simd: bool = True

    custom_stopwords: dict[str, list[str]] | None = None

    preserve_patterns: list[str] = field(default_factory=list)

    target_reduction: float | None = None

    enable_semantic_clustering: bool = False


@dataclass(frozen=True, slots=True)
class DocumentStructure:
    """Top-level structured document representation."""

    nodes: list[DocumentNode] = field(default_factory=list)

    """All nodes in document/reading order."""

    source_format: str | None = None

    """Origin format identifier (e.g. "docx", "pptx", "html", "pdf")."""

    relationships: list[DocumentRelationship] = field(default_factory=list)

    """Resolved relationships between nodes (footnote refs, citations, anchor links, etc.)."""

    node_types: list[str] = field(default_factory=list)

    """Sorted, deduplicated list of node type names present in this document."""


@dataclass(frozen=True, slots=True)
class TableGrid:
    """Structured table grid with cell-level metadata."""

    rows: int = 0

    """Number of rows in the table."""

    cols: int = 0

    """Number of columns in the table."""

    cells: list[GridCell] = field(default_factory=list)

    """All cells in row-major order."""


class ExtractionResult(TypedDict, total=False):
    """Result mapping used by the core extraction API; every key is optional."""

    content: str

    mime_type: str

    metadata: Metadata

    extraction_method: ExtractionMethod | str | None
    """Which extraction strategy produced the returned text."""

    tables: list[Table]

    detected_languages: list[str] | None

    chunks: list[Chunk] | None
    """Chunks of text, present when chunking is enabled."""

    images: list[ExtractedImage] | None
    """Images extracted from the document."""

    pages: list[PageContent] | None
    """Content per page, present when page extraction is enabled."""

    elements: list[Element] | None
    """Semantic elements, present when the element-based result format is enabled."""

    djot_content: DjotContent | None
    """Rich Djot content structure (for extracted Djot documents)."""

    ocr_elements: list[OcrElement] | None
    """OCR elements carrying full spatial and confidence metadata."""

    document: DocumentStructure | None
    """Structured document tree (present when document structure extraction is enabled)."""

    extracted_keywords: list[Keyword] | None
    """Keywords, present when keyword extraction is enabled."""

    quality_score: float | None
    """Quality score of the document, produced by quality analysis."""

    processing_warnings: list[ProcessingWarning]
    """Non-fatal warnings gathered across processing pipeline stages."""

    annotations: list[PdfAnnotation] | None
    """PDF annotations taken from the document."""

    children: list[ArchiveEntry] | None
    """Nested extraction results for the contents of an archive."""

    uris: list[ExtractedUri] | None
    """URIs/links found while extracting the document."""

    revisions: list[DocumentRevision] | None
    """Tracked changes embedded in the source document."""

    structured_output: str | None
    """Output of LLM-based JSON schema (structured) extraction."""

    code_intelligence: str | None
    """Results of tree-sitter analysis (code intelligence)."""

    llm_usage: list[LlmUsage] | None
    """Token usage and cost data for every LLM call made during this extraction."""

    formatted_content: str | None
    """Content pre-rendered in the requested output format."""

    ocr_internal_document: str | None
    """Structured hOCR document for the OCR+layout pipeline."""

@dataclass(frozen=True, slots=True)
class LlmUsage:
    """Token usage and cost data for a single LLM call made during extraction."""

    model: str = ""

    """The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514")."""

    source: str = ""

    """The pipeline stage that triggered this LLM call (e.g. "vlm_ocr", "structured_extraction", "embeddings")."""

    input_tokens: int | None = None

    """Number of input/prompt tokens consumed."""

    output_tokens: int | None = None

    """Number of output/completion tokens generated."""

    total_tokens: int | None = None

    """Total tokens (input + output)."""

    estimated_cost: float | None = None

    """Estimated cost in USD based on the provider's published pricing."""

    finish_reason: str | None = None

    """Why the model stopped generating (e.g. "stop", "length", "content_filter")."""


@dataclass(frozen=True, slots=True)
class BoundingBox:
    """Bounding box coordinates for element positioning."""

    x0: float = 0.0

    """Left x-coordinate"""

    y0: float = 0.0

    """Bottom y-coordinate"""

    x1: float = 0.0

    """Right x-coordinate"""

    y1: float = 0.0

    """Top y-coordinate"""


@dataclass(frozen=True, slots=True)
class ImagePreprocessingConfig:
    """Image preprocessing configuration for OCR."""

    target_dpi: int = 300

    """Target DPI for the image (300 is standard, 600 for small text)."""

    auto_rotate: bool = True

    """Auto-detect and correct image rotation."""

    deskew: bool = True

    """Correct skew (tilted images)."""

    denoise: bool = False

    """Remove noise from the image."""

    contrast_enhance: bool = False

    """Enhance contrast for better text visibility."""

    binarization_method: str = "otsu"

    """Binarization method: "otsu", "sauvola", "adaptive"."""

    invert_colors: bool = False

    """Invert colors (white text on black → black on white)."""


@dataclass(frozen=True, slots=True)
class TesseractConfig:
    """Tesseract OCR configuration."""

    language: str = "eng"

    """Language code (e.g., "eng", "deu", "fra")"""

    psm: int = 3

    """Page Segmentation Mode (0-13)."""

    output_format: str = "markdown"

    """Output format ("text" or "markdown")"""

    oem: int = 3

    """OCR Engine Mode (0-3)."""

    min_confidence: float = 0

    """Minimum confidence threshold (0.0-100.0)."""

    preprocessing: ImagePreprocessingConfig | None = None

    """Image preprocessing configuration."""

    enable_table_detection: bool = True

    """Enable automatic table detection and reconstruction"""

    table_min_confidence: float = 0

    """Minimum confidence threshold for table detection (0.0-1.0)"""

    table_column_threshold: int = 50

    """Column threshold for table detection (pixels)"""

    table_row_threshold_ratio: float = 0.5

    """Row threshold ratio for table detection (0.0-1.0)"""

    use_cache: bool = True

    """Enable OCR result caching"""

    classify_use_pre_adapted_templates: bool = True

    """Use pre-adapted templates for character classification"""

    language_model_ngram_on: bool = False

    """Enable N-gram language model"""

    tessedit_dont_blkrej_good_wds: bool = True

    """Don't reject good words during block-level processing"""

    tessedit_dont_rowrej_good_wds: bool = True

    """Don't reject good words during row-level processing"""

    tessedit_enable_dict_correction: bool = True

    """Enable dictionary correction"""

    tessedit_char_whitelist: str = ""

    """Whitelist of allowed characters (empty = all allowed)"""

    tessedit_char_blacklist: str = ""

    """Blacklist of forbidden characters (empty = none forbidden)"""

    tessedit_use_primary_params_model: bool = True

    """Use primary language params model"""

    textord_space_size_is_variable: bool = True

    """Variable-width space detection"""

    thresholding_method: bool = False

    """Use adaptive thresholding method"""


@dataclass(frozen=True, slots=True)
class Metadata:
    """Extraction result metadata."""

    title: str | None = None

    """Document title"""

    subject: str | None = None

    """Document subject or description"""

    authors: list[str] | None = None

    """Primary author(s) - always Vec for consistency"""

    keywords: list[str] | None = None

    """Keywords/tags - always Vec for consistency"""

    language: str | None = None

    """Primary language (ISO 639 code)"""

    created_at: str | None = None

    """Creation timestamp (ISO 8601 format)"""

    modified_at: str | None = None

    """Last modification timestamp (ISO 8601 format)"""

    created_by: str | None = None

    """User who created the document"""

    modified_by: str | None = None

    """User who last modified the document"""

    pages: PageStructure | None = None

    """Page/slide/sheet structure with boundaries"""

    format: FormatMetadata | None = None

    """Format-specific metadata (discriminated union)"""

    image_preprocessing: ImagePreprocessingMetadata | None = None

    """Image preprocessing metadata (when OCR preprocessing was applied)"""

    json_schema: str | None = None

    """JSON schema (for structured data extraction)"""

    error: ErrorMetadata | None = None

    """Error metadata (for batch operations)"""

    extraction_duration_ms: int | None = None

    """Extraction duration in milliseconds (for benchmarking)."""

    category: str | None = None

    """Document category (from frontmatter or classification)."""

    tags: list[str] | None = None

    """Document tags (from frontmatter)."""

    document_version: str | None = None

    """Document version string (from frontmatter)."""

    abstract_text: str | None = None

    """Abstract or summary text (from frontmatter)."""

    output_format: str | None = None

    """Output format identifier (e.g., "markdown", "html", "text")."""

    ocr_used: bool = False

    """Whether OCR was used during extraction."""

    additional: dict[str, str] = field(default_factory=dict)

    """Additional custom fields from postprocessors."""


@dataclass(frozen=True, slots=True)
class ExcelMetadata:
    """Excel/spreadsheet format metadata."""

    sheet_count: int | None = None

    """Number of sheets in the workbook."""

    sheet_names: list[str] | None = None

    """Names of all sheets in the workbook."""


@dataclass(frozen=True, slots=True)
class EmailMetadata:
    """Email metadata extracted from .eml and .msg files."""

    from_email: str | None = None

    """Sender's email address"""

    from_name: str | None = None

    """Sender's display name"""

    to_emails: list[str] = field(default_factory=list)

    """Primary recipients"""

    cc_emails: list[str] = field(default_factory=list)

    """CC recipients"""

    bcc_emails: list[str] = field(default_factory=list)

    """BCC recipients"""

    message_id: str | None = None

    """Message-ID header value"""

    attachments: list[str] = field(default_factory=list)

    """List of attachment filenames"""


@dataclass(frozen=True, slots=True)
class ArchiveMetadata:
    """Archive (ZIP/TAR/7Z) metadata."""

    format: str = ""

    """Archive format ("ZIP", "TAR", "7Z", etc.)"""

    file_count: int = 0

    """Total number of files in the archive"""

    file_list: list[str] = field(default_factory=list)

    """List of file paths within the archive"""

    total_size: int = 0

    """Total uncompressed size in bytes"""

    compressed_size: int | None = None

    """Compressed size in bytes (if available)"""


@dataclass(frozen=True, slots=True)
class ImageMetadata:
    """Image metadata extracted from image files."""

    width: int = 0

    """Image width in pixels"""

    height: int = 0

    """Image height in pixels"""

    format: str = ""

    """Image format (e.g., "PNG", "JPEG", "TIFF")"""

    exif: dict[str, str] = field(default_factory=dict)

    """EXIF metadata tags"""


@dataclass(frozen=True, slots=True)
class XmlMetadata:
    """XML metadata extracted during XML parsing."""

    element_count: int = 0

    """Total number of XML elements processed"""

    unique_elements: list[str] = field(default_factory=list)

    """List of unique element tag names (sorted)"""


@dataclass(frozen=True, slots=True)
class TextMetadata:
    """Text/Markdown metadata."""

    line_count: int = 0

    """Number of lines in the document"""

    word_count: int = 0

    """Number of words"""

    character_count: int = 0

    """Number of characters"""

    headers: list[str] | None = None

    """Markdown headers (headings text only, for Markdown files)"""

    links: list[list[str]] | None = None

    """Markdown links as (text, url) tuples (for Markdown files)"""

    code_blocks: list[list[str]] | None = None

    """Code blocks as (language, code) tuples (for Markdown files)"""


@dataclass(frozen=True, slots=True)
class HtmlMetadata:
    """HTML metadata extracted from HTML documents."""

    title: str | None = None

    """Document title from `<title>` tag"""

    description: str | None = None

    """Document description from `<meta name="description">` tag"""

    keywords: list[str] = field(default_factory=list)

    """Document keywords from `<meta name="keywords">` tag, split on commas"""

    author: str | None = None

    """Document author from `<meta name="author">` tag"""

    canonical_url: str | None = None

    """Canonical URL from `<link rel="canonical">` tag"""

    base_href: str | None = None

    """Base URL from `<base href="">` tag for resolving relative URLs"""

    language: str | None = None

    """Document language from `lang` attribute"""

    text_direction: TextDirection | str | None = None

    """Document text direction from `dir` attribute"""

    open_graph: dict[str, str] = field(default_factory=dict)

    """Open Graph metadata (og:* properties) for social media Keys like "title", "description", "image", "url", etc."""

    twitter_card: dict[str, str] = field(default_factory=dict)

    """Twitter Card metadata (twitter:* properties) Keys like "card", "site", "creator", "title", "description", "image", etc."""

    meta_tags: dict[str, str] = field(default_factory=dict)

    """Additional meta tags not covered by specific fields Keys are meta name/property attributes, values are content"""

    headers: list[HeaderMetadata] = field(default_factory=list)

    """Extracted header elements with hierarchy"""

    links: list[LinkMetadata] = field(default_factory=list)

    """Extracted hyperlinks with type classification"""

    images: list[ImageMetadataType] = field(default_factory=list)

    """Extracted images with source and dimensions"""

    structured_data: list[StructuredData] = field(default_factory=list)

    """Extracted structured data blocks"""


@dataclass(frozen=True, slots=True)
class OcrMetadata:
    """OCR processing metadata."""

    language: str = ""

    """OCR language code(s) used"""

    psm: int = 0

    """Tesseract Page Segmentation Mode (PSM)"""

    output_format: str = ""

    """Output format (e.g., "text", "hocr")"""

    table_count: int = 0

    """Number of tables detected"""

    table_rows: int | None = None

    table_cols: int | None = None


@dataclass(frozen=True, slots=True)
class PptxMetadata:
    """PowerPoint presentation metadata."""

    slide_count: int = 0

    """Total number of slides in the presentation"""

    slide_names: list[str] = field(default_factory=list)

    """Names of slides (if available)"""

    image_count: int | None = None

    """Number of embedded images"""

    table_count: int | None = None

    """Number of tables"""


@dataclass(frozen=True, slots=True)
class DocxMetadata:
    """Word document metadata."""

    core_properties: CoreProperties | None = None

    """Core properties from docProps/core.xml (Dublin Core metadata)"""

    app_properties: DocxAppProperties | None = None

    """Application properties from docProps/app.xml (Word-specific statistics)"""

    custom_properties: dict[str, str] | None = None

    """Custom properties from docProps/custom.xml (user-defined properties)"""


@dataclass(frozen=True, slots=True)
class CsvMetadata:
    """CSV/TSV file metadata."""

    row_count: int = 0

    column_count: int = 0

    delimiter: str | None = None

    has_header: bool = False

    column_types: list[str] | None = None


@dataclass(frozen=True, slots=True)
class BibtexMetadata:
    """BibTeX bibliography metadata."""

    entry_count: int = 0

    """Number of entries in the bibliography."""

    citation_keys: list[str] = field(default_factory=list)

    authors: list[str] = field(default_factory=list)

    year_range: YearRange | None = None

    entry_types: dict[str, int] | None = None


@dataclass(frozen=True, slots=True)
class CitationMetadata:
    """Citation file metadata (RIS, PubMed, EndNote)."""

    citation_count: int = 0

    format: str | None = None

    authors: list[str] = field(default_factory=list)

    year_range: YearRange | None = None

    dois: list[str] = field(default_factory=list)

    keywords: list[str] = field(default_factory=list)


@dataclass(frozen=True, slots=True)
class FictionBookMetadata:
    """FictionBook (FB2) metadata."""

    genres: list[str] = field(default_factory=list)

    sequences: list[str] = field(default_factory=list)

    annotation: str | None = None


@dataclass(frozen=True, slots=True)
class DbfMetadata:
    """dBASE (DBF) file metadata."""

    record_count: int = 0

    field_count: int = 0

    fields: list[DbfFieldInfo] = field(default_factory=list)


@dataclass(frozen=True, slots=True)
class JatsMetadata:
    """JATS (Journal Article Tag Suite) metadata."""

    copyright: str | None = None

    license: str | None = None

    history_dates: dict[str, str] = field(default_factory=dict)

    contributor_roles: list[ContributorRole] = field(default_factory=list)


@dataclass(frozen=True, slots=True)
class EpubMetadata:
    """EPUB metadata (Dublin Core extensions)."""

    coverage: str | None = None

    dc_format: str | None = None

    relation: str | None = None

    source: str | None = None

    dc_type: str | None = None

    cover_image: str | None = None


@dataclass(frozen=True, slots=True)
class PstMetadata:
    """Outlook PST archive metadata."""

    message_count: int = 0


@dataclass(frozen=True, slots=True)
class OcrConfidence:
    """Confidence scores for an OCR element."""

    detection: float | None = None

    """Detection confidence: how confident the OCR engine is that text exists here."""

    recognition: float = 0.0

    """Recognition confidence: how confident about the text content."""


@dataclass(frozen=True, slots=True)
class OcrElement:
    """A unified OCR element representing detected text with full metadata."""

    text: str = ""

    """The recognized text content."""

    geometry: OcrBoundingGeometry | None = None

    """Bounding geometry (rectangle or quadrilateral)."""

    confidence: OcrConfidence | None = None

    """Confidence scores for detection and recognition."""

    level: OcrElementLevel | str = "line"

    """Hierarchical level (word, line, block, page)."""

    rotation: OcrRotation | None = None

    """Rotation information (if detected)."""

    page_number: int = 0

    """Page number (1-indexed)."""

    parent_id: str | None = None

    """Parent element ID for hierarchical relationships."""

    backend_metadata: dict[str, str] = field(default_factory=dict)

    """Backend-specific metadata that doesn't fit the unified schema."""


@dataclass(frozen=True, slots=True)
class OcrElementConfig:
    """Configuration for OCR element extraction."""

    include_elements: bool = False

    """Whether to include OCR elements in the extraction result."""

    min_level: OcrElementLevel | str = "line"

    """Minimum hierarchical level to include."""

    min_confidence: float = 0.0

    """Minimum recognition confidence threshold (0.0-1.0)."""

    build_hierarchy: bool = False

    """Whether to build hierarchical relationships between elements."""


@dataclass(frozen=True, slots=True)
class LayoutRegion:
    """A detected layout region on a page."""

    class_name: str = ""

    """Layout class name (e.g. "picture", "table", "text", "section_header")."""

    confidence: float = 0.0

    """Confidence score from the layout detection model (0.0 to 1.0)."""

    bounding_box: BoundingBox | None = None

    """Bounding box in document coordinate space."""

    area_fraction: float = 0.0

    """Fraction of the page area covered by this region (0.0 to 1.0)."""


@dataclass(frozen=True, slots=True)
class RevisionDelta:
    """The content changes that make up a single revision."""

    content: list[DiffLine] = field(default_factory=list)

    """Line-level content changes for this revision."""

    table_changes: list[CellChange] = field(default_factory=list)

    """Cell-level table changes for this revision."""


@dataclass(frozen=True, slots=True)
class Table:
    """Extracted table structure."""

    cells: list[list[str]] = field(default_factory=list)

    """Table cells as a 2D vector (rows x columns)"""

    markdown: str = ""

    """Markdown representation of the table"""

    page_number: int = 0

    """Page number where the table was found (1-indexed)"""

    bounding_box: BoundingBox | None = None

    """Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top). Only populated for PDF-extracted tables when position data is available."""


@dataclass(frozen=True, slots=True)
class TableCell:
    """Individual table cell with content and optional styling."""

    content: str = ""

    """Cell content as text"""

    row_span: int = 0

    """Row span (number of rows this cell spans)"""

    col_span: int = 0

    """Column span (number of columns this cell spans)"""

    is_header: bool = False

    """Whether this is a header cell"""


@dataclass(frozen=True, slots=True)
class DiffOptions:
    """Options controlling how two `ExtractionResult` values are compared."""

    include_metadata: bool = True

    """Include metadata changes in the diff. Default: `true`."""

    include_embedded: bool = True

    """Include embedded-children changes in the diff. Default: `true`."""

    max_content_chars: int | None = None

    """Truncate content to this many characters before diffing."""


@dataclass(frozen=True, slots=True)
class YakeParams:
    """YAKE-specific parameters."""

    window_size: int = 2

    """Window size for co-occurrence analysis (default: 2)."""


@dataclass(frozen=True, slots=True)
class RakeParams:
    """RAKE-specific parameters."""

    min_word_length: int = 1

    """Minimum word length to consider (default: 1)."""

    max_words_per_phrase: int = 3

    """Maximum words in a keyword phrase (default: 3)."""


@dataclass(frozen=True, slots=True)
class KeywordConfig:
    """Keyword extraction configuration."""

    algorithm: KeywordAlgorithm | str = "yake"

    """Algorithm to use for extraction."""

    max_keywords: int = 10

    """Maximum number of keywords to extract (default: 10)."""

    min_score: float = 0

    """Minimum score threshold (0.0-1.0, default: 0.0)."""

    ngram_range: list[int] = field(default_factory=list)

    """N-gram range for keyword extraction (min, max)."""

    language: str | None = None

    """Language code for stopword filtering (e.g., "en", "de", "fr")."""

    yake_params: YakeParams | None = None

    """YAKE-specific tuning parameters."""

    rake_params: RakeParams | None = None

    """RAKE-specific tuning parameters."""


@dataclass(frozen=True, slots=True)
class PaddleOcrConfig:
    """Configuration for PaddleOCR backend."""

    language: str = ""

    """Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra")"""

    cache_dir: str | None = None

    """Optional custom cache directory for model files"""

    use_angle_cls: bool = False

    """Enable angle classification for rotated text (default: false). Can misfire on short text regions, rotating crops incorrectly before recognition."""

    enable_table_detection: bool = False

    """Enable table structure detection (default: false)"""

    det_db_thresh: float = 0.0

    """Database threshold for text detection (default: 0.3) Range: 0.0-1.0, higher values require more confident detections"""

    det_db_box_thresh: float = 0.0

    """Box threshold for text bounding box refinement (default: 0.5) Range: 0.0-1.0"""

    det_db_unclip_ratio: float = 0.0

    """Unclip ratio for expanding text bounding boxes (default: 1.6) Controls the expansion of detected text regions"""

    det_limit_side_len: int = 0

    """Maximum side length for detection image (default: 960) Larger images may be resized to this limit for faster inference"""

    rec_batch_num: int = 0

    """Batch size for recognition inference (default: 6) Number of text regions to process simultaneously"""

    padding: int = 0

    """Padding in pixels added around the image before detection (default: 10). Large values can include surrounding content like table gridlines."""

    drop_score: float = 0.0

    """Minimum recognition confidence score for text lines (default: 0.5). Text regions with recognition confidence below this threshold are discarded. Matches PaddleOCR Python's `drop_score` parameter. Range: 0.0-1.0"""

    model_tier: str = ""

    """Model tier controlling detection/recognition model size and accuracy trade-off. - `"mobile"` (default): Lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference - `"server"`: Large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents"""


@dataclass(frozen=True, slots=True)
class PdfMetadata:
    """PDF-specific metadata."""

    pdf_version: str | None = None

    """PDF version (e.g., "1.7", "2.0")"""

    producer: str | None = None

    """PDF producer (application that created the PDF)"""

    is_encrypted: bool | None = None

    """Whether the PDF is encrypted/password-protected"""

    width: int | None = None

    """First page width in points (1/72 inch)"""

    height: int | None = None

    """First page height in points (1/72 inch)"""

    page_count: int | None = None

    """Total number of pages in the PDF document"""


# Output format for extraction results.
OutputFormat = str

# How chunk size is measured.
# The duplicate `str` member was dropped; `str | str | None` already
# evaluates to `str | None`.
ChunkSizing = str | None

# Embedding model types supported by Kreuzberg.
EmbeddingModelType = str | int | LlmConfig

# Format-specific metadata (discriminated union).
FormatMetadata = (
    PdfMetadata
    | DocxMetadata
    | ExcelMetadata
    | EmailMetadata
    | PptxMetadata
    | ArchiveMetadata
    | ImageMetadata
    | XmlMetadata
    | TextMetadata
    | HtmlMetadata
    | OcrMetadata
    | CsvMetadata
    | BibtexMetadata
    | CitationMetadata
    | FictionBookMetadata
    | DbfMetadata
    | JatsMetadata
    | EpubMetadata
    | PstMetadata
    | str
)

# Bounding geometry for an OCR element.
# NOTE(review): `OcrElement.geometry` is described as a rectangle or
# quadrilateral, yet this alias is `int | str` -- confirm it is intended.
OcrBoundingGeometry = int | str

# A single line in a unified-diff hunk.
DiffLine = str