# This file is auto-generated by alef — DO NOT EDIT. # alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75 # To regenerate: alef generate # To verify freshness: alef verify --exit-code # Issues & docs: https://github.com/kreuzberg-dev/alef """Configuration options for the conversion API.""" from __future__ import annotations from dataclasses import dataclass, field from enum import Enum from typing import TYPE_CHECKING, Any, TypedDict from ._kreuzberg import ( ChunkerType, # noqa: TC001 CodeContentMode, # noqa: TC001 ExecutionProviderType, # noqa: TC001 ExtractionMethod, # noqa: TC001 HtmlTheme, # noqa: TC001 KeywordAlgorithm, # noqa: TC001 OcrElementLevel, # noqa: TC001 ReductionLevel, # noqa: TC001 ResultFormat, # noqa: TC001 TableModel, # noqa: TC001 TextDirection, # noqa: TC001 ) if TYPE_CHECKING: from ._kreuzberg import ( ArchiveEntry, # noqa: F401 CellChange, # noqa: F401 Chunk, # noqa: F401 ContributorRole, # noqa: F401 DbfFieldInfo, # noqa: F401 DjotContent, # noqa: F401 DocumentNode, # noqa: F401 DocumentRelationship, # noqa: F401 DocumentRevision, # noqa: F401 Element, # noqa: F401 ErrorMetadata, # noqa: F401 ExtractedImage, # noqa: F401 ExtractedUri, # noqa: F401 GridCell, # noqa: F401 HeaderMetadata, # noqa: F401 ImageMetadataType, # noqa: F401 ImagePreprocessingMetadata, # noqa: F401 Keyword, # noqa: F401 LinkMetadata, # noqa: F401 OcrPipelineConfig, # noqa: F401 OcrRotation, # noqa: F401 PageContent, # noqa: F401 PageStructure, # noqa: F401 PdfAnnotation, # noqa: F401 ProcessingWarning, # noqa: F401 StructuredData, # noqa: F401 StructuredExtractionConfig, # noqa: F401 YearRange, # noqa: F401 ) class ListType(str, Enum): """Type of list detection.""" bullet = "bullet" numbered = "numbered" lettered = "lettered" indented = "indented" class OcrBackendType(str, Enum): """OCR backend types.""" tesseract = "tesseract" easy_ocr = "easy_ocr" paddle_ocr = "paddle_ocr" custom = "custom" class ProcessingStage(str, Enum): """Processing stages for post-processors.""" early = "early" middle = "middle" late = "late" class PdfAnnotationType(str, Enum): """Type of PDF annotation.""" text = "text" highlight = "highlight" link = "link" stamp = "stamp" underline = "underline" strike_out = "strike_out" other = "other" class BlockType(str, Enum): """Types of block-level elements in Djot.""" paragraph = "paragraph" heading = "heading" blockquote = "blockquote" code_block = "code_block" list_item = "list_item" ordered_list = "ordered_list" bullet_list = "bullet_list" task_list = "task_list" definition_list = "definition_list" definition_term = "definition_term" definition_description = "definition_description" div = "div" section = "section" thematic_break = "thematic_break" raw_block = "raw_block" math_display = "math_display" class InlineType(str, Enum): """Types of inline elements in Djot.""" text = "text" strong = "strong" emphasis = "emphasis" highlight = "highlight" subscript = "subscript" superscript = "superscript" insert = "insert" delete = "delete" code = "code" link = "link" image = "image" span = "span" math = "math" raw_inline = "raw_inline" footnote_ref = "footnote_ref" symbol = "symbol" class RelationshipKind(str, Enum): """Semantic kind of a relationship between document elements.""" footnote_reference = "footnote_reference" citation_reference = "citation_reference" internal_link = "internal_link" caption = "caption" label = "label" toc_entry = "toc_entry" cross_reference = "cross_reference" class ContentLayer(str, Enum): """Content layer classification for document nodes.""" body = "body" header = "header" footer = "footer" footnote = "footnote" class ChunkType(str, Enum): """Semantic structural classification of a text chunk.""" heading = "heading" party_list = "party_list" definitions = "definitions" operative_clause = "operative_clause" signature_block = "signature_block" schedule = "schedule" table_like = "table_like" formula = "formula" code_block = "code_block" image = "image" org_chart = "org_chart" diagram = "diagram" unknown = "unknown" class ImageKind(str, Enum): """Heuristic classification of what an image likely depicts.""" photograph = "photograph" diagram = "diagram" chart = "chart" drawing = "drawing" text_block = "text_block" decoration = "decoration" logo = "logo" icon = "icon" tile_fragment = "tile_fragment" mask = "mask" page_raster = "page_raster" unknown = "unknown" class ElementType(str, Enum): """Semantic element type classification.""" title_ = "title" narrative_text = "narrative_text" heading = "heading" list_item = "list_item" table = "table" image = "image" page_break = "page_break" code_block = "code_block" block_quote = "block_quote" footer = "footer" header = "header" class LinkType(str, Enum): """Link type classification.""" anchor = "anchor" internal = "internal" external = "external" email = "email" phone = "phone" other = "other" class ImageType(str, Enum): """Image type classification.""" data_uri = "data-uri" inline_svg = "inline-svg" external = "external" relative = "relative" class StructuredDataType(str, Enum): """Structured data type classification.""" json_ld = "json-ld" microdata = "microdata" rd_fa = "rdfa" class PageUnitType(str, Enum): """Type of paginated unit in a document.""" page = "page" slide = "slide" sheet = "sheet" class RevisionKind(str, Enum): """Semantic classification of a tracked change.""" insertion = "insertion" deletion = "deletion" format_change = "format_change" comment = "comment" class UriKind(str, Enum): """Semantic classification of an extracted URI.""" hyperlink = "hyperlink" image = "image" anchor = "anchor" citation = "citation" reference = "reference" email = "email" class PSMMode(str, Enum): """Page Segmentation Mode for Tesseract OCR.""" osd_only = "osd_only" auto_osd = "auto_osd" auto_only = "auto_only" auto = "auto" single_column = "single_column" single_block_vertical = "single_block_vertical" single_block = "single_block" single_line = "single_line" single_word = "single_word" circle_word = "circle_word" single_char = "single_char" class PaddleLanguage(str, Enum): """Supported languages in PaddleOCR.""" english = "english" chinese = "chinese" japanese = "japanese" korean = "korean" german = "german" french = "french" latin = "latin" cyrillic = "cyrillic" traditional_chinese = "traditional_chinese" thai = "thai" greek = "greek" east_slavic = "east_slavic" arabic = "arabic" devanagari = "devanagari" tamil = "tamil" telugu = "telugu" class LayoutClass(str, Enum): """The 17 canonical document layout classes.""" caption = "caption" footnote = "footnote" formula = "formula" list_item = "list_item" page_footer = "page_footer" page_header = "page_header" picture = "picture" section_header = "section_header" table = "table" text = "text" title_ = "title" document_index = "document_index" code = "code" checkbox_selected = "checkbox_selected" checkbox_unselected = "checkbox_unselected" form = "form" key_value_region = "key_value_region" @dataclass(frozen=True, slots=True) class AccelerationConfig: """Hardware acceleration configuration for ONNX Runtime models.""" provider: ExecutionProviderType | str = "auto" """Execution provider to use for ONNX inference.""" device_id: int = 0 """GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto.""" @dataclass(frozen=True, slots=True) class ContentFilterConfig: """Cross-extractor content filtering configuration.""" include_headers: bool = False """Include running headers in extraction output.""" include_footers: bool = False """Include running footers in extraction output.""" strip_repeating_text: bool = True """Enable the heuristic cross-page repeating text detector.""" include_watermarks: bool = False """Include watermark text in extraction output.""" @dataclass(frozen=True, slots=True) class EmailConfig: """Configuration for email extraction.""" msg_fallback_codepage: int | None = None """Windows codepage number to use when an MSG file contains no codepage property. Defaults to `None`, which falls back to windows-1252.""" class ExtractionConfig(TypedDict, total=False): """Main extraction configuration.""" use_cache: bool """Enable caching of extraction results""" enable_quality_processing: bool """Enable quality post-processing""" ocr: OcrConfig | None """OCR configuration (None = OCR disabled)""" force_ocr: bool """Force OCR even for searchable PDFs""" force_ocr_pages: list[int] | None """Force OCR on specific pages only (1-indexed page numbers, must be >= 1).""" disable_ocr: bool """Disable OCR entirely, even for images.""" chunking: ChunkingConfig | None """Text chunking configuration (None = chunking disabled)""" content_filter: ContentFilterConfig | None """Content filtering configuration (None = use extractor defaults).""" images: ImageExtractionConfig | None """Image extraction configuration (None = no image extraction)""" pdf_options: PdfConfig | None """PDF-specific options (None = use defaults)""" token_reduction: TokenReductionOptions | None """Token reduction configuration (None = no token reduction)""" language_detection: LanguageDetectionConfig | None """Language detection configuration (None = no language detection)""" pages: PageConfig | None """Page extraction configuration (None = no page tracking)""" keywords: KeywordConfig | None """Keyword extraction configuration (None = no keyword extraction)""" postprocessor: PostProcessorConfig | None """Post-processor configuration (None = use defaults)""" html_options: str | None """HTML to Markdown conversion options (None = use defaults)""" html_output: HtmlOutputConfig | None """Styled HTML output configuration.""" extraction_timeout_secs: int | None """Default per-file timeout in seconds for batch extraction.""" max_concurrent_extractions: int | None """Maximum concurrent extractions in batch operations (None = (num_cpus x 1.5).ceil()).""" result_format: ResultFormat | str """Result structure format""" security_limits: SecurityLimits | None """Security limits for archive extraction.""" max_embedded_file_bytes: int | None """Maximum uncompressed size in bytes for a single embedded file before recursive extraction is attempted (default: 50 MiB).""" output_format: OutputFormat """Content text format (default: Plain).""" layout: LayoutDetectionConfig | None """Layout detection configuration (None = layout detection disabled).""" use_layout_for_markdown: bool """Run layout detection on the non-OCR PDF markdown path.""" include_document_structure: bool """Enable structured document tree output.""" acceleration: AccelerationConfig | None """Hardware acceleration configuration for ONNX Runtime models.""" cache_namespace: str | None """Cache namespace for tenant isolation.""" cache_ttl_secs: int | None """Per-request cache TTL in seconds.""" email: EmailConfig | None """Email extraction configuration (None = use defaults).""" concurrency: str | None """Concurrency limits for constrained environments (None = use defaults).""" max_archive_depth: int """Maximum recursion depth for archive extraction (default: 3). Set to 0 to disable recursive extraction (legacy behavior).""" tree_sitter: TreeSitterConfig | None """Tree-sitter language pack configuration (None = tree-sitter disabled).""" structured_extraction: StructuredExtractionConfig | None """Structured extraction via LLM (None = disabled).""" cancel_token: str | None """Cancellation token for this extraction (None = no external cancellation).""" @dataclass(frozen=True, slots=True) class FileExtractionConfig: """Per-file extraction configuration overrides for batch processing.""" enable_quality_processing: bool | None = None """Override quality post-processing for this file.""" ocr: OcrConfig | None = None """Override OCR configuration for this file (None in the Option = use batch default).""" force_ocr: bool | None = None """Override force OCR for this file.""" force_ocr_pages: list[int] | None = None """Override force OCR pages for this file (1-indexed page numbers).""" disable_ocr: bool | None = None """Override disable OCR for this file.""" chunking: ChunkingConfig | None = None """Override chunking configuration for this file.""" content_filter: ContentFilterConfig | None = None """Override content filtering configuration for this file.""" images: ImageExtractionConfig | None = None """Override image extraction configuration for this file.""" pdf_options: PdfConfig | None = None """Override PDF options for this file.""" token_reduction: TokenReductionOptions | None = None """Override token reduction for this file.""" language_detection: LanguageDetectionConfig | None = None """Override language detection for this file.""" pages: PageConfig | None = None """Override page extraction for this file.""" keywords: KeywordConfig | None = None """Override keyword extraction for this file.""" postprocessor: PostProcessorConfig | None = None """Override post-processor for this file.""" html_options: str | None = None """Override HTML conversion options for this file.""" result_format: ResultFormat | str | None = None """Override result format for this file.""" output_format: OutputFormat | None = None """Override output content format for this file.""" include_document_structure: bool | None = None """Override document structure output for this file.""" layout: LayoutDetectionConfig | None = None """Override layout detection for this file.""" timeout_secs: int | None = None """Override per-file extraction timeout in seconds.""" tree_sitter: TreeSitterConfig | None = None """Override tree-sitter configuration for this file.""" structured_extraction: StructuredExtractionConfig | None = None """Override structured extraction configuration for this file.""" @dataclass(frozen=True, slots=True) class ImageExtractionConfig: """Image extraction configuration.""" extract_images: bool = True """Extract images from documents""" target_dpi: int = 300 """Target DPI for image normalization""" max_image_dimension: int = 4096 """Maximum dimension for images (width or height)""" inject_placeholders: bool = True """Whether to inject image reference placeholders into markdown output. When `true` (default), image references like `![Image 1](embedded:p1_i0)` are appended to the markdown. Set to `false` to extract images as data without polluting the markdown output.""" auto_adjust_dpi: bool = True """Automatically adjust DPI based on image content""" min_dpi: int = 72 """Minimum DPI threshold""" max_dpi: int = 600 """Maximum DPI threshold""" max_images_per_page: int | None = None """Maximum number of image objects to extract per PDF page.""" classify: bool = True """When `true` (default), extracted images are classified by kind and grouped into clusters where they appear to belong to one figure.""" include_page_rasters: bool = False """When `true`, full-page renders produced during OCR preprocessing are captured and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`.""" run_ocr_on_images: bool = True """Run OCR on extracted images and include the recognized text in the document content.""" ocr_text_only: bool = False """When `true`, image OCR results are rendered as plain text without the `![...](...)` markdown placeholder. Only takes effect when `run_ocr_on_images` is also `true`.""" append_ocr_text: bool = False """When `true` and `ocr_text_only` is `false`, append the OCR text after the image placeholder in the rendered output.""" @dataclass(frozen=True, slots=True) class TokenReductionOptions: """Token reduction configuration.""" mode: str = "" """Reduction mode: "off", "light", "moderate", "aggressive", "maximum" """ preserve_important_words: bool = True """Preserve important words (capitalized, technical terms)""" @dataclass(frozen=True, slots=True) class LanguageDetectionConfig: """Language detection configuration.""" enabled: bool = True """Enable language detection""" min_confidence: float = 0.8 """Minimum confidence threshold (0.0-1.0)""" detect_multiple: bool = False """Detect multiple languages in the document""" @dataclass(frozen=True, slots=True) class HtmlOutputConfig: """Configuration for styled HTML output.""" css: str | None = None """Inline CSS string injected into the output after the theme stylesheet. Concatenated after `css_file` content when both are set.""" css_file: str | None = None """Path to a CSS file loaded once at renderer construction time. Concatenated before `css` when both are set.""" theme: HtmlTheme | str = "unstyled" """Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`].""" class_prefix: str = "" """CSS class prefix applied to every emitted class name.""" embed_css: bool = True """When `true` (default), write the resolved CSS into a `