# fil/packages/python/kreuzberg/_kreuzberg.pyi

# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef

from typing import Any, Literal, TypedDict

class CacheStats:
    """Cache statistics record with five numeric fields.

    Every field is a required constructor argument; none has a default.
    Units are implied only by the field-name suffixes (``_mb``, ``_days``)
    -- confirm against the implementation.
    """

    total_files: int
    total_size_mb: float
    available_space_mb: float
    oldest_file_age_days: float
    newest_file_age_days: float
    def __init__(
        self,
        total_files: int,
        total_size_mb: float,
        available_space_mb: float,
        oldest_file_age_days: float,
        newest_file_age_days: float,
    ) -> None: ...

class AccelerationConfig:
    """Execution-provider selection plus a device index.

    Both constructor arguments are optional. ``provider`` may be passed as
    an ``ExecutionProviderType`` or a plain ``str``, while the attribute is
    typed as ``ExecutionProviderType``. This class is the type of the
    ``acceleration`` field on ``ExtractionConfig``, ``LayoutDetectionConfig``,
    ``OcrConfig`` and ``EmbeddingConfig``.
    """

    provider: ExecutionProviderType
    device_id: int
    def __init__(
        self,
        provider: ExecutionProviderType | str | None = None,
        device_id: int | None = None,
    ) -> None: ...

class ContentFilterConfig:
    """Four boolean content-filter switches.

    Used as the ``content_filter`` field of ``ExtractionConfig`` and
    ``FileExtractionConfig``. Every constructor argument accepts ``None``
    although the attributes are typed ``bool``; presumably ``None`` selects
    the implementation's default -- confirm.
    """

    include_headers: bool
    include_footers: bool
    strip_repeating_text: bool
    include_watermarks: bool
    def __init__(
        self,
        include_headers: bool | None = None,
        include_footers: bool | None = None,
        strip_repeating_text: bool | None = None,
        include_watermarks: bool | None = None,
    ) -> None: ...
    # Declared to return a ContentFilterConfig; the values it carries are not
    # visible in this stub.
    @staticmethod
    def default() -> ContentFilterConfig: ...

class EmailConfig:
    """Email options holding a single optional integer, ``msg_fallback_codepage``.

    Used as the ``email`` field of ``ExtractionConfig``.
    """

    msg_fallback_codepage: int | None
    def __init__(self, msg_fallback_codepage: int | None = None) -> None: ...

class ExtractionConfig:
    """Top-level extraction configuration aggregating the per-feature configs.

    All constructor arguments are optional and default to ``None``, including
    those whose attribute is typed non-optional (for example ``use_cache`` and
    ``max_archive_depth``); presumably ``None`` selects the implementation's
    default -- confirm.

    ``result_format`` accepts a ``ResultFormat`` or a ``str`` in the
    constructor, whereas ``output_format`` accepts only ``OutputFormat``.
    """

    use_cache: bool
    enable_quality_processing: bool
    ocr: OcrConfig | None
    force_ocr: bool
    force_ocr_pages: list[int] | None
    disable_ocr: bool
    chunking: ChunkingConfig | None
    content_filter: ContentFilterConfig | None
    images: ImageExtractionConfig | None
    pdf_options: PdfConfig | None
    token_reduction: TokenReductionOptions | None
    language_detection: LanguageDetectionConfig | None
    pages: PageConfig | None
    keywords: KeywordConfig | None
    postprocessor: PostProcessorConfig | None
    html_options: str | None
    html_output: HtmlOutputConfig | None
    extraction_timeout_secs: int | None
    max_concurrent_extractions: int | None
    result_format: ResultFormat
    security_limits: SecurityLimits | None
    max_embedded_file_bytes: int | None
    output_format: OutputFormat
    layout: LayoutDetectionConfig | None
    use_layout_for_markdown: bool
    include_document_structure: bool
    acceleration: AccelerationConfig | None
    cache_namespace: str | None
    cache_ttl_secs: int | None
    email: EmailConfig | None
    concurrency: str | None
    max_archive_depth: int
    tree_sitter: TreeSitterConfig | None
    structured_extraction: StructuredExtractionConfig | None
    cancel_token: str | None
    # NOTE(review): six attributes declared above have no matching constructor
    # parameter: pdf_options, keywords, html_options, html_output, layout and
    # tree_sitter. Confirm whether they are meant to be set only after
    # construction or whether the generator dropped them.
    def __init__(
        self,
        use_cache: bool | None = None,
        enable_quality_processing: bool | None = None,
        ocr: OcrConfig | None = None,
        force_ocr: bool | None = None,
        force_ocr_pages: list[int] | None = None,
        disable_ocr: bool | None = None,
        chunking: ChunkingConfig | None = None,
        content_filter: ContentFilterConfig | None = None,
        images: ImageExtractionConfig | None = None,
        token_reduction: TokenReductionOptions | None = None,
        language_detection: LanguageDetectionConfig | None = None,
        pages: PageConfig | None = None,
        postprocessor: PostProcessorConfig | None = None,
        extraction_timeout_secs: int | None = None,
        max_concurrent_extractions: int | None = None,
        result_format: ResultFormat | str | None = None,
        security_limits: SecurityLimits | None = None,
        max_embedded_file_bytes: int | None = None,
        output_format: OutputFormat | None = None,
        use_layout_for_markdown: bool | None = None,
        include_document_structure: bool | None = None,
        acceleration: AccelerationConfig | None = None,
        cache_namespace: str | None = None,
        cache_ttl_secs: int | None = None,
        email: EmailConfig | None = None,
        concurrency: str | None = None,
        max_archive_depth: int | None = None,
        structured_extraction: StructuredExtractionConfig | None = None,
        cancel_token: str | None = None,
    ) -> None: ...
    # Takes no arguments and returns a bool; which settings it inspects is not
    # visible in this stub.
    def needs_image_processing(self) -> bool: ...
    # Declared to return an ExtractionConfig; its values are not visible here.
    @staticmethod
    def default() -> ExtractionConfig: ...

class FileExtractionConfig:
    """Per-item extraction settings in which every field is optional.

    Used as the ``config`` field of ``BatchBytesItem`` and ``BatchFileItem``.
    Most field names also appear on ``ExtractionConfig``, but here every
    attribute type includes ``None``; presumably ``None`` means "inherit the
    batch-level setting" -- confirm.
    """

    enable_quality_processing: bool | None
    ocr: OcrConfig | None
    force_ocr: bool | None
    force_ocr_pages: list[int] | None
    disable_ocr: bool | None
    chunking: ChunkingConfig | None
    content_filter: ContentFilterConfig | None
    images: ImageExtractionConfig | None
    pdf_options: PdfConfig | None
    token_reduction: TokenReductionOptions | None
    language_detection: LanguageDetectionConfig | None
    pages: PageConfig | None
    keywords: KeywordConfig | None
    postprocessor: PostProcessorConfig | None
    html_options: str | None
    result_format: ResultFormat | None
    output_format: OutputFormat | None
    include_document_structure: bool | None
    layout: LayoutDetectionConfig | None
    timeout_secs: int | None
    tree_sitter: TreeSitterConfig | None
    structured_extraction: StructuredExtractionConfig | None
    # NOTE(review): five attributes declared above have no matching
    # constructor parameter: pdf_options, keywords, html_options, layout and
    # tree_sitter. Confirm whether that omission is intentional.
    def __init__(
        self,
        enable_quality_processing: bool | None = None,
        ocr: OcrConfig | None = None,
        force_ocr: bool | None = None,
        force_ocr_pages: list[int] | None = None,
        disable_ocr: bool | None = None,
        chunking: ChunkingConfig | None = None,
        content_filter: ContentFilterConfig | None = None,
        images: ImageExtractionConfig | None = None,
        token_reduction: TokenReductionOptions | None = None,
        language_detection: LanguageDetectionConfig | None = None,
        pages: PageConfig | None = None,
        postprocessor: PostProcessorConfig | None = None,
        result_format: ResultFormat | str | None = None,
        output_format: OutputFormat | None = None,
        include_document_structure: bool | None = None,
        timeout_secs: int | None = None,
        structured_extraction: StructuredExtractionConfig | None = None,
    ) -> None: ...

class BatchBytesItem:
    """Batch entry pairing raw ``bytes`` content with a MIME type string.

    ``content`` and ``mime_type`` are required; ``config`` is an optional
    ``FileExtractionConfig`` defaulting to ``None``.
    """

    content: bytes
    mime_type: str
    config: FileExtractionConfig | None
    def __init__(
        self,
        content: bytes,
        mime_type: str,
        config: FileExtractionConfig | None = None,
    ) -> None: ...

class BatchFileItem:
    """Batch entry identified by a ``path`` string.

    ``path`` is required; ``config`` is an optional ``FileExtractionConfig``
    defaulting to ``None``.
    """

    path: str
    config: FileExtractionConfig | None
    def __init__(self, path: str, config: FileExtractionConfig | None = None) -> None: ...

class ImageExtractionConfig:
    """Image-extraction settings: DPI and dimension integers plus boolean switches.

    Used as the ``images`` field of ``ExtractionConfig`` and
    ``FileExtractionConfig``. All constructor arguments default to ``None``.
    ``max_images_per_page`` is the only attribute whose own type is optional.
    """

    extract_images: bool
    target_dpi: int
    max_image_dimension: int
    inject_placeholders: bool
    auto_adjust_dpi: bool
    min_dpi: int
    max_dpi: int
    max_images_per_page: int | None
    classify: bool
    include_page_rasters: bool
    run_ocr_on_images: bool
    ocr_text_only: bool
    append_ocr_text: bool
    def __init__(
        self,
        extract_images: bool | None = None,
        target_dpi: int | None = None,
        max_image_dimension: int | None = None,
        inject_placeholders: bool | None = None,
        auto_adjust_dpi: bool | None = None,
        min_dpi: int | None = None,
        max_dpi: int | None = None,
        max_images_per_page: int | None = None,
        classify: bool | None = None,
        include_page_rasters: bool | None = None,
        run_ocr_on_images: bool | None = None,
        ocr_text_only: bool | None = None,
        append_ocr_text: bool | None = None,
    ) -> None: ...
    # Declared to return an ImageExtractionConfig; its values are not visible
    # in this stub.
    @staticmethod
    def default() -> ImageExtractionConfig: ...

class TokenReductionOptions:
    """Token-reduction options: a ``mode`` string and one boolean flag.

    Used as the ``token_reduction`` field of ``ExtractionConfig`` and
    ``FileExtractionConfig``. ``mode`` is typed as a free-form ``str``; the
    accepted values are not visible in this stub. This is a separate class
    from ``TokenReductionConfig`` declared later in the file.
    """

    mode: str
    preserve_important_words: bool
    def __init__(
        self,
        mode: str | None = None,
        preserve_important_words: bool | None = None,
    ) -> None: ...
    # Declared to return a TokenReductionOptions; its values are not visible
    # in this stub.
    @staticmethod
    def default() -> TokenReductionOptions: ...

class LanguageDetectionConfig:
    """Language-detection settings: two booleans and a float ``min_confidence``.

    Used as the ``language_detection`` field of ``ExtractionConfig`` and
    ``FileExtractionConfig``. All constructor arguments default to ``None``.
    The valid range of ``min_confidence`` is not stated in this stub.
    """

    enabled: bool
    min_confidence: float
    detect_multiple: bool
    def __init__(
        self,
        enabled: bool | None = None,
        min_confidence: float | None = None,
        detect_multiple: bool | None = None,
    ) -> None: ...
    # Declared to return a LanguageDetectionConfig; its values are not visible
    # in this stub.
    @staticmethod
    def default() -> LanguageDetectionConfig: ...

class HtmlOutputConfig:
    """HTML output settings: CSS strings, a theme, a class prefix and a flag.

    Used as the ``html_output`` field of ``ExtractionConfig``. ``theme`` may
    be passed as an ``HtmlTheme`` or a ``str``; the attribute is typed
    ``HtmlTheme``. All constructor arguments default to ``None``.
    """

    css: str | None
    css_file: str | None
    theme: HtmlTheme
    class_prefix: str
    embed_css: bool
    def __init__(
        self,
        css: str | None = None,
        css_file: str | None = None,
        theme: HtmlTheme | str | None = None,
        class_prefix: str | None = None,
        embed_css: bool | None = None,
    ) -> None: ...
    # Declared to return an HtmlOutputConfig; its values are not visible in
    # this stub.
    @staticmethod
    def default() -> HtmlOutputConfig: ...

class LayoutDetectionConfig:
    """Layout-detection settings with an optional ``AccelerationConfig``.

    Used as the ``layout`` field of ``ExtractionConfig`` and
    ``FileExtractionConfig``. ``table_model`` may be passed as a
    ``TableModel`` or a ``str``; the attribute is typed ``TableModel``.
    All constructor arguments default to ``None``.
    """

    confidence_threshold: float | None
    apply_heuristics: bool
    table_model: TableModel
    acceleration: AccelerationConfig | None
    def __init__(
        self,
        confidence_threshold: float | None = None,
        apply_heuristics: bool | None = None,
        table_model: TableModel | str | None = None,
        acceleration: AccelerationConfig | None = None,
    ) -> None: ...
    # Declared to return a LayoutDetectionConfig; its values are not visible
    # in this stub.
    @staticmethod
    def default() -> LayoutDetectionConfig: ...

class LlmConfig:
    """LLM connection and sampling parameters.

    Used as ``StructuredExtractionConfig.llm`` and as the ``vlm_config``
    field of ``OcrPipelineStage`` and ``OcrConfig``.

    NOTE(review): the ``model`` attribute is typed as a non-optional ``str``,
    yet the constructor lets it default to ``None``. What value results in
    that case is not visible here -- confirm.
    """

    model: str
    api_key: str | None
    base_url: str | None
    timeout_secs: int | None
    max_retries: int | None
    temperature: float | None
    max_tokens: int | None
    def __init__(
        self,
        model: str | None = None,
        api_key: str | None = None,
        base_url: str | None = None,
        timeout_secs: int | None = None,
        max_retries: int | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
    ) -> None: ...

class StructuredExtractionConfig:
    """Structured-extraction settings: a schema mapping plus an ``LlmConfig``.

    Used as the ``structured_extraction`` field of ``ExtractionConfig`` and
    ``FileExtractionConfig``. ``schema``, ``schema_name``, ``strict`` and
    ``llm`` are required. ``schema`` is typed ``dict[str, Any]``; its expected
    structure is not visible in this stub.
    """

    schema: dict[str, Any]
    schema_name: str
    schema_description: str | None
    strict: bool
    prompt: str | None
    llm: LlmConfig
    # Parameter order differs from the attribute order above: the four
    # required arguments come first, then the two optional ones. Positional
    # callers must follow the order below.
    def __init__(
        self,
        schema: dict[str, Any],
        schema_name: str,
        strict: bool,
        llm: LlmConfig,
        schema_description: str | None = None,
        prompt: str | None = None,
    ) -> None: ...

class OcrQualityThresholds:
    """Sixteen numeric OCR quality thresholds (``int`` counts and ``float`` ratios).

    Used as ``OcrPipelineConfig.quality_thresholds`` (required there) and
    ``OcrConfig.quality_thresholds`` (optional there). All constructor
    arguments default to ``None`` while every attribute is non-optional;
    presumably ``None`` selects the implementation's default -- confirm.
    How each threshold is applied is not visible in this stub.
    """

    min_total_non_whitespace: int
    min_non_whitespace_per_page: float
    min_meaningful_word_len: int
    min_meaningful_words: int
    min_alnum_ratio: float
    min_garbage_chars: int
    max_fragmented_word_ratio: float
    critical_fragmented_word_ratio: float
    min_avg_word_length: float
    min_words_for_avg_length_check: int
    min_consecutive_repeat_ratio: float
    min_words_for_repeat_check: int
    substantive_min_chars: int
    non_text_min_chars: int
    alnum_ws_ratio_threshold: float
    pipeline_min_quality: float
    def __init__(
        self,
        min_total_non_whitespace: int | None = None,
        min_non_whitespace_per_page: float | None = None,
        min_meaningful_word_len: int | None = None,
        min_meaningful_words: int | None = None,
        min_alnum_ratio: float | None = None,
        min_garbage_chars: int | None = None,
        max_fragmented_word_ratio: float | None = None,
        critical_fragmented_word_ratio: float | None = None,
        min_avg_word_length: float | None = None,
        min_words_for_avg_length_check: int | None = None,
        min_consecutive_repeat_ratio: float | None = None,
        min_words_for_repeat_check: int | None = None,
        substantive_min_chars: int | None = None,
        non_text_min_chars: int | None = None,
        alnum_ws_ratio_threshold: float | None = None,
        pipeline_min_quality: float | None = None,
    ) -> None: ...
    # Declared to return an OcrQualityThresholds; its values are not visible
    # in this stub.
    @staticmethod
    def default() -> OcrQualityThresholds: ...

class OcrPipelineStage:
    """One entry of ``OcrPipelineConfig.stages``.

    ``backend`` (a ``str``) and ``priority`` (an ``int``) are required; the
    remaining per-backend options are optional and default to ``None``.
    Whether a higher or lower ``priority`` runs first is not visible in this
    stub.
    """

    backend: str
    priority: int
    language: str | None
    tesseract_config: TesseractConfig | None
    paddle_ocr_config: dict[str, Any] | None
    vlm_config: LlmConfig | None
    backend_options: dict[str, Any] | None
    def __init__(
        self,
        backend: str,
        priority: int,
        language: str | None = None,
        tesseract_config: TesseractConfig | None = None,
        paddle_ocr_config: dict[str, Any] | None = None,
        vlm_config: LlmConfig | None = None,
        backend_options: dict[str, Any] | None = None,
    ) -> None: ...

class OcrPipelineConfig:
    """A list of ``OcrPipelineStage`` objects plus ``OcrQualityThresholds``.

    Used as the optional ``pipeline`` field of ``OcrConfig``. Both
    constructor arguments are required.
    """

    stages: list[OcrPipelineStage]
    quality_thresholds: OcrQualityThresholds
    def __init__(
        self,
        stages: list[OcrPipelineStage],
        quality_thresholds: OcrQualityThresholds,
    ) -> None: ...

class OcrConfig:
    """OCR settings: backend and language strings plus optional sub-configs.

    Used as the ``ocr`` field of ``ExtractionConfig`` and
    ``FileExtractionConfig``. All constructor arguments default to ``None``.
    ``enabled``, ``backend``, ``language`` and ``auto_rotate`` are the only
    attributes typed non-optional. ``tessdata_bytes`` maps ``str`` keys to
    ``bytes`` values; what the keys denote is not visible in this stub.
    """

    enabled: bool
    backend: str
    language: str
    tesseract_config: TesseractConfig | None
    output_format: OutputFormat | None
    paddle_ocr_config: dict[str, Any] | None
    backend_options: dict[str, Any] | None
    element_config: OcrElementConfig | None
    quality_thresholds: OcrQualityThresholds | None
    pipeline: OcrPipelineConfig | None
    auto_rotate: bool
    vlm_config: LlmConfig | None
    vlm_prompt: str | None
    acceleration: AccelerationConfig | None
    tessdata_bytes: dict[str, bytes] | None
    def __init__(
        self,
        enabled: bool | None = None,
        backend: str | None = None,
        language: str | None = None,
        tesseract_config: TesseractConfig | None = None,
        output_format: OutputFormat | None = None,
        paddle_ocr_config: dict[str, Any] | None = None,
        backend_options: dict[str, Any] | None = None,
        element_config: OcrElementConfig | None = None,
        quality_thresholds: OcrQualityThresholds | None = None,
        pipeline: OcrPipelineConfig | None = None,
        auto_rotate: bool | None = None,
        vlm_config: LlmConfig | None = None,
        vlm_prompt: str | None = None,
        acceleration: AccelerationConfig | None = None,
        tessdata_bytes: dict[str, bytes] | None = None,
    ) -> None: ...
    # Declared to return an OcrConfig; its values are not visible in this stub.
    @staticmethod
    def default() -> OcrConfig: ...

class PageConfig:
    """Page-handling settings: two booleans and a ``marker_format`` string.

    Used as the ``pages`` field of ``ExtractionConfig`` and
    ``FileExtractionConfig``. All constructor arguments default to ``None``.
    The syntax expected in ``marker_format`` is not visible in this stub.
    """

    extract_pages: bool
    insert_page_markers: bool
    marker_format: str
    def __init__(
        self,
        extract_pages: bool | None = None,
        insert_page_markers: bool | None = None,
        marker_format: str | None = None,
    ) -> None: ...
    # Declared to return a PageConfig; its values are not visible in this stub.
    @staticmethod
    def default() -> PageConfig: ...

class PdfConfig:
    """PDF-specific settings, including an optional ``HierarchyConfig``.

    Used as the ``pdf_options`` field of ``ExtractionConfig`` and
    ``FileExtractionConfig``. All constructor arguments default to ``None``.
    ``passwords`` is an optional list of strings. The two ``*_margin_fraction``
    fields are optional floats whose valid range is not stated in this stub.
    """

    extract_images: bool
    extract_tables: bool
    passwords: list[str] | None
    extract_metadata: bool
    hierarchy: HierarchyConfig | None
    extract_annotations: bool
    top_margin_fraction: float | None
    bottom_margin_fraction: float | None
    allow_single_column_tables: bool
    ocr_inline_images: bool
    def __init__(
        self,
        extract_images: bool | None = None,
        extract_tables: bool | None = None,
        passwords: list[str] | None = None,
        extract_metadata: bool | None = None,
        hierarchy: HierarchyConfig | None = None,
        extract_annotations: bool | None = None,
        top_margin_fraction: float | None = None,
        bottom_margin_fraction: float | None = None,
        allow_single_column_tables: bool | None = None,
        ocr_inline_images: bool | None = None,
    ) -> None: ...
    # Declared to return a PdfConfig; its values are not visible in this stub.
    @staticmethod
    def default() -> PdfConfig: ...

class HierarchyConfig:
    """Hierarchy settings used as the optional ``hierarchy`` field of ``PdfConfig``.

    All constructor arguments default to ``None``. ``ocr_coverage_threshold``
    is the only attribute whose own type is optional.
    """

    enabled: bool
    k_clusters: int
    include_bbox: bool
    ocr_coverage_threshold: float | None
    def __init__(
        self,
        enabled: bool | None = None,
        k_clusters: int | None = None,
        include_bbox: bool | None = None,
        ocr_coverage_threshold: float | None = None,
    ) -> None: ...
    # Declared to return a HierarchyConfig; its values are not visible in this
    # stub.
    @staticmethod
    def default() -> HierarchyConfig: ...

class PostProcessorConfig:
    """Post-processor settings: an ``enabled`` flag and four optional name lists.

    Used as the ``postprocessor`` field of ``ExtractionConfig`` and
    ``FileExtractionConfig``. All constructor arguments default to ``None``.

    NOTE(review): ``enabled_set`` and ``disabled_set`` have the same
    ``list[str] | None`` type as ``enabled_processors`` and
    ``disabled_processors``. How the two pairs relate is not visible in this
    stub -- confirm before relying on either.
    """

    enabled: bool
    enabled_processors: list[str] | None
    disabled_processors: list[str] | None
    enabled_set: list[str] | None
    disabled_set: list[str] | None
    def __init__(
        self,
        enabled: bool | None = None,
        enabled_processors: list[str] | None = None,
        disabled_processors: list[str] | None = None,
        enabled_set: list[str] | None = None,
        disabled_set: list[str] | None = None,
    ) -> None: ...
    # Declared to return a PostProcessorConfig; its values are not visible in
    # this stub.
    @staticmethod
    def default() -> PostProcessorConfig: ...

class ChunkingConfig:
    """Chunking settings with an optional ``EmbeddingConfig``.

    Used as the ``chunking`` field of ``ExtractionConfig`` and
    ``FileExtractionConfig``. All constructor arguments default to ``None``.
    ``chunker_type`` may be passed as a ``ChunkerType`` or a ``str``, whereas
    ``sizing`` accepts only ``ChunkSizing``. The unit of ``max_characters``
    and ``overlap`` is suggested by the former's name only -- confirm.
    """

    max_characters: int
    overlap: int
    trim: bool
    chunker_type: ChunkerType
    embedding: EmbeddingConfig | None
    preset: str | None
    sizing: ChunkSizing
    prepend_heading_context: bool
    topic_threshold: float | None
    def __init__(
        self,
        max_characters: int | None = None,
        overlap: int | None = None,
        trim: bool | None = None,
        chunker_type: ChunkerType | str | None = None,
        embedding: EmbeddingConfig | None = None,
        preset: str | None = None,
        sizing: ChunkSizing | None = None,
        prepend_heading_context: bool | None = None,
        topic_threshold: float | None = None,
    ) -> None: ...
    # Declared to return a ChunkingConfig; its values are not visible in this
    # stub.
    @staticmethod
    def default() -> ChunkingConfig: ...

class EmbeddingConfig:
    """Embedding settings used as the optional ``embedding`` field of ``ChunkingConfig``.

    All constructor arguments default to ``None``. ``model`` accepts only an
    ``EmbeddingModelType``; unlike several enum-typed parameters elsewhere in
    this file, it has no ``str`` alternative.
    """

    model: EmbeddingModelType
    normalize: bool
    batch_size: int
    show_download_progress: bool
    cache_dir: str | None
    acceleration: AccelerationConfig | None
    max_embed_duration_secs: int | None
    def __init__(
        self,
        model: EmbeddingModelType | None = None,
        normalize: bool | None = None,
        batch_size: int | None = None,
        show_download_progress: bool | None = None,
        cache_dir: str | None = None,
        acceleration: AccelerationConfig | None = None,
        max_embed_duration_secs: int | None = None,
    ) -> None: ...
    # Declared to return an EmbeddingConfig; its values are not visible in
    # this stub.
    @staticmethod
    def default() -> EmbeddingConfig: ...

class TreeSitterConfig:
    """Tree-sitter settings wrapping a ``TreeSitterProcessConfig``.

    Used as the ``tree_sitter`` field of ``ExtractionConfig`` and
    ``FileExtractionConfig``. All constructor arguments default to ``None``;
    the ``process`` attribute itself is typed non-optional.
    """

    enabled: bool
    cache_dir: str | None
    languages: list[str] | None
    groups: list[str] | None
    process: TreeSitterProcessConfig
    def __init__(
        self,
        enabled: bool | None = None,
        cache_dir: str | None = None,
        languages: list[str] | None = None,
        groups: list[str] | None = None,
        process: TreeSitterProcessConfig | None = None,
    ) -> None: ...
    # Declared to return a TreeSitterConfig; its values are not visible in
    # this stub.
    @staticmethod
    def default() -> TreeSitterConfig: ...

class TreeSitterProcessConfig:
    """Seven boolean switches, an optional chunk size and a ``CodeContentMode``.

    Used as ``TreeSitterConfig.process``. All constructor arguments default
    to ``None``. ``content_mode`` may be passed as a ``CodeContentMode`` or a
    ``str``.
    """

    structure: bool
    imports: bool
    exports: bool
    comments: bool
    docstrings: bool
    symbols: bool
    diagnostics: bool
    chunk_max_size: int | None
    content_mode: CodeContentMode
    def __init__(
        self,
        structure: bool | None = None,
        imports: bool | None = None,
        exports: bool | None = None,
        comments: bool | None = None,
        docstrings: bool | None = None,
        symbols: bool | None = None,
        diagnostics: bool | None = None,
        chunk_max_size: int | None = None,
        content_mode: CodeContentMode | str | None = None,
    ) -> None: ...
    # Declared to return a TreeSitterProcessConfig; its values are not visible
    # in this stub.
    @staticmethod
    def default() -> TreeSitterProcessConfig: ...

class SupportedFormat:
    """Pair of an ``extension`` string and a ``mime_type`` string.

    Both are required. Whether ``extension`` includes a leading dot is not
    visible in this stub.
    """

    extension: str
    mime_type: str
    def __init__(self, extension: str, mime_type: str) -> None: ...

class ServerConfig:
    """Server settings: host, port, CORS origins and two byte-size limits.

    All constructor arguments default to ``None`` while every attribute is
    typed non-optional; presumably ``None`` selects the implementation's
    default -- confirm.
    """

    host: str
    port: int
    cors_origins: list[str]
    max_request_body_bytes: int
    max_multipart_field_bytes: int
    def __init__(
        self,
        host: str | None = None,
        port: int | None = None,
        cors_origins: list[str] | None = None,
        max_request_body_bytes: int | None = None,
        max_multipart_field_bytes: int | None = None,
    ) -> None: ...
    # Returns a str; presumably built from host and port -- confirm the format.
    def listen_addr(self) -> str: ...
    # Returns a bool; the condition that counts as "allows all" is not visible
    # in this stub.
    def cors_allows_all(self) -> bool: ...
    # Takes one origin string and returns a bool; the matching rules are not
    # visible in this stub.
    def is_origin_allowed(self, origin: str) -> bool: ...
    # The two *_mb helpers return int. Presumably they convert the matching
    # *_bytes attribute to megabytes; rounding behaviour is unknown -- confirm.
    def max_request_body_mb(self) -> int: ...
    def max_multipart_field_mb(self) -> int: ...
    # Declared to return a ServerConfig; its values are not visible in this
    # stub.
    @staticmethod
    def default() -> ServerConfig: ...

class StructuredDataResult:
    """Structured-data result: content and format strings, string metadata, field names.

    All four constructor arguments are required. The ``format`` parameter
    shadows the builtin of the same name, which is why its line carries a
    ``noqa: A002`` marker.
    """

    content: str
    format: str
    metadata: dict[str, str]
    text_fields: list[str]
    def __init__(
        self,
        content: str,
        format: str,  # noqa: A002
        metadata: dict[str, str],
        text_fields: list[str],
    ) -> None: ...

class DocxAppProperties:
    """Application-properties record with sixteen optional fields.

    Every attribute and every constructor argument is optional and defaults
    to ``None``. The class name suggests these come from a DOCX document's
    application properties -- confirm against the implementation.
    """

    application: str | None
    app_version: str | None
    template: str | None
    total_time: int | None
    pages: int | None
    words: int | None
    characters: int | None
    characters_with_spaces: int | None
    lines: int | None
    paragraphs: int | None
    company: str | None
    doc_security: int | None
    scale_crop: bool | None
    links_up_to_date: bool | None
    shared_doc: bool | None
    hyperlinks_changed: bool | None
    def __init__(
        self,
        application: str | None = None,
        app_version: str | None = None,
        template: str | None = None,
        total_time: int | None = None,
        pages: int | None = None,
        words: int | None = None,
        characters: int | None = None,
        characters_with_spaces: int | None = None,
        lines: int | None = None,
        paragraphs: int | None = None,
        company: str | None = None,
        doc_security: int | None = None,
        scale_crop: bool | None = None,
        links_up_to_date: bool | None = None,
        shared_doc: bool | None = None,
        hyperlinks_changed: bool | None = None,
    ) -> None: ...

class XlsxAppProperties:
    """Application-properties record with eight optional fields and a name list.

    All constructor arguments default to ``None``. ``worksheet_names`` is the
    one attribute typed non-optional (``list[str]``); presumably it becomes
    an empty list when omitted -- confirm.
    """

    application: str | None
    app_version: str | None
    doc_security: int | None
    scale_crop: bool | None
    links_up_to_date: bool | None
    shared_doc: bool | None
    hyperlinks_changed: bool | None
    company: str | None
    worksheet_names: list[str]
    def __init__(
        self,
        application: str | None = None,
        app_version: str | None = None,
        doc_security: int | None = None,
        scale_crop: bool | None = None,
        links_up_to_date: bool | None = None,
        shared_doc: bool | None = None,
        hyperlinks_changed: bool | None = None,
        company: str | None = None,
        worksheet_names: list[str] | None = None,
    ) -> None: ...

class PptxAppProperties:
    """Application-properties record with fourteen optional fields and a title list.

    All constructor arguments default to ``None``. ``slide_titles`` is the
    one attribute typed non-optional (``list[str]``); presumably it becomes
    an empty list when omitted -- confirm.
    """

    application: str | None
    app_version: str | None
    total_time: int | None
    company: str | None
    doc_security: int | None
    scale_crop: bool | None
    links_up_to_date: bool | None
    shared_doc: bool | None
    hyperlinks_changed: bool | None
    slides: int | None
    notes: int | None
    hidden_slides: int | None
    multimedia_clips: int | None
    presentation_format: str | None
    slide_titles: list[str]
    def __init__(
        self,
        application: str | None = None,
        app_version: str | None = None,
        total_time: int | None = None,
        company: str | None = None,
        doc_security: int | None = None,
        scale_crop: bool | None = None,
        links_up_to_date: bool | None = None,
        shared_doc: bool | None = None,
        hyperlinks_changed: bool | None = None,
        slides: int | None = None,
        notes: int | None = None,
        hidden_slides: int | None = None,
        multimedia_clips: int | None = None,
        presentation_format: str | None = None,
        slide_titles: list[str] | None = None,
    ) -> None: ...

class CoreProperties:
    """Document core-properties record with fifteen optional string fields.

    Every attribute and constructor argument is ``str | None`` with a
    ``None`` default. That includes the date-like fields ``created``,
    ``modified`` and ``last_printed``, whose string format is not stated in
    this stub.
    """

    title: str | None
    subject: str | None
    creator: str | None
    keywords: str | None
    description: str | None
    last_modified_by: str | None
    revision: str | None
    created: str | None
    modified: str | None
    category: str | None
    content_status: str | None
    language: str | None
    identifier: str | None
    version: str | None
    last_printed: str | None
    def __init__(
        self,
        title: str | None = None,
        subject: str | None = None,
        creator: str | None = None,
        keywords: str | None = None,
        description: str | None = None,
        last_modified_by: str | None = None,
        revision: str | None = None,
        created: str | None = None,
        modified: str | None = None,
        category: str | None = None,
        content_status: str | None = None,
        language: str | None = None,
        identifier: str | None = None,
        version: str | None = None,
        last_printed: str | None = None,
    ) -> None: ...

class SecurityLimits:
    """Nine integer resource limits.

    Used as the optional ``security_limits`` field of ``ExtractionConfig``.
    All constructor arguments default to ``None`` while every attribute is a
    non-optional ``int``; presumably ``None`` selects the implementation's
    default -- confirm. The unit of each limit is not stated in this stub.
    """

    max_archive_size: int
    max_compression_ratio: int
    max_files_in_archive: int
    max_nesting_depth: int
    max_entity_length: int
    max_content_size: int
    max_iterations: int
    max_xml_depth: int
    max_table_cells: int
    def __init__(
        self,
        max_archive_size: int | None = None,
        max_compression_ratio: int | None = None,
        max_files_in_archive: int | None = None,
        max_nesting_depth: int | None = None,
        max_entity_length: int | None = None,
        max_content_size: int | None = None,
        max_iterations: int | None = None,
        max_xml_depth: int | None = None,
        max_table_cells: int | None = None,
    ) -> None: ...
    # Declared to return a SecurityLimits; its values are not visible in this
    # stub.
    @staticmethod
    def default() -> SecurityLimits: ...

class TokenReductionConfig:
    """Detailed token-reduction settings keyed by a ``ReductionLevel``.

    This is a separate class from ``TokenReductionOptions``, which is the
    type that ``ExtractionConfig.token_reduction`` uses. No other class in
    the visible part of this file references ``TokenReductionConfig``.

    All constructor arguments default to ``None``. ``level`` may be passed as
    a ``ReductionLevel`` or a ``str``. ``custom_stopwords`` maps ``str`` keys
    to lists of strings; what the keys denote is not visible in this stub.
    """

    level: ReductionLevel
    language_hint: str | None
    preserve_markdown: bool
    preserve_code: bool
    semantic_threshold: float
    enable_parallel: bool
    use_simd: bool
    custom_stopwords: dict[str, list[str]] | None
    preserve_patterns: list[str]
    target_reduction: float | None
    enable_semantic_clustering: bool
    def __init__(
        self,
        level: ReductionLevel | str | None = None,
        language_hint: str | None = None,
        preserve_markdown: bool | None = None,
        preserve_code: bool | None = None,
        semantic_threshold: float | None = None,
        enable_parallel: bool | None = None,
        use_simd: bool | None = None,
        custom_stopwords: dict[str, list[str]] | None = None,
        preserve_patterns: list[str] | None = None,
        target_reduction: float | None = None,
        enable_semantic_clustering: bool | None = None,
    ) -> None: ...
    # Declared to return a TokenReductionConfig; its values are not visible in
    # this stub.
    @staticmethod
    def default() -> TokenReductionConfig: ...

class PdfAnnotation:
    """One PDF annotation; element type of ``ExtractionResult.annotations``.

    ``annotation_type`` (a ``PdfAnnotationType`` or ``str``) and
    ``page_number`` are required; ``content`` and ``bounding_box`` are
    optional. Whether ``page_number`` is zero- or one-based is not visible in
    this stub.
    """

    annotation_type: PdfAnnotationType
    content: str | None
    page_number: int
    bounding_box: BoundingBox | None
    # Parameter order differs from the attribute order above: ``page_number``
    # comes before ``content`` because required arguments are listed first.
    def __init__(
        self,
        annotation_type: PdfAnnotationType | str,
        page_number: int,
        content: str | None = None,
        bounding_box: BoundingBox | None = None,
    ) -> None: ...

class DjotContent:
    """Plain text plus lists of blocks, tables, images, links and footnotes.

    Used as the optional ``djot_content`` field of ``ExtractionResult``. All
    eight constructor arguments are required. ``attributes`` is a
    ``list[str]``; the format of its entries is not visible in this stub.
    """

    plain_text: str
    blocks: list[FormattedBlock]
    metadata: Metadata
    tables: list[Table]
    images: list[DjotImage]
    links: list[DjotLink]
    footnotes: list[Footnote]
    attributes: list[str]
    def __init__(
        self,
        plain_text: str,
        blocks: list[FormattedBlock],
        metadata: Metadata,
        tables: list[Table],
        images: list[DjotImage],
        links: list[DjotLink],
        footnotes: list[Footnote],
        attributes: list[str],
    ) -> None: ...

class FormattedBlock:
    """Recursive block node whose ``children`` are themselves ``FormattedBlock``.

    Element type of ``DjotContent.blocks`` and ``Footnote.content``.
    ``block_type`` (a ``BlockType`` or ``str``), ``inline_content`` and
    ``children`` are required; the other four fields are optional.
    """

    block_type: BlockType
    level: int | None
    inline_content: list[InlineElement]
    attributes: str | None
    language: str | None
    code: str | None
    children: list[FormattedBlock]
    # Parameter order differs from the attribute order above: the three
    # required arguments come first, followed by the optional ones.
    def __init__(
        self,
        block_type: BlockType | str,
        inline_content: list[InlineElement],
        children: list[FormattedBlock],
        level: int | None = None,
        attributes: str | None = None,
        language: str | None = None,
        code: str | None = None,
    ) -> None: ...

class InlineElement:
    """Inline element; element type of ``FormattedBlock.inline_content``.

    ``element_type`` (an ``InlineType`` or ``str``) and ``content`` are
    required; ``attributes`` and the string-to-string ``metadata`` mapping
    are optional.
    """

    element_type: InlineType
    content: str
    attributes: str | None
    metadata: dict[str, str] | None
    def __init__(
        self,
        element_type: InlineType | str,
        content: str,
        attributes: str | None = None,
        metadata: dict[str, str] | None = None,
    ) -> None: ...

class DjotImage:
    """Image reference; element type of ``DjotContent.images``.

    ``src`` and ``alt`` are required strings; ``title`` and ``attributes``
    are optional.
    """

    src: str
    alt: str
    title: str | None
    attributes: str | None
    def __init__(
        self,
        src: str,
        alt: str,
        title: str | None = None,
        attributes: str | None = None,
    ) -> None: ...

class DjotLink:
    """Link reference; element type of ``DjotContent.links``.

    ``url`` and ``text`` are required strings; ``title`` and ``attributes``
    are optional.
    """

    url: str
    text: str
    title: str | None
    attributes: str | None
    def __init__(
        self,
        url: str,
        text: str,
        title: str | None = None,
        attributes: str | None = None,
    ) -> None: ...

class Footnote:
    """Footnote: a ``label`` string plus a list of ``FormattedBlock`` content.

    Element type of ``DjotContent.footnotes``. Both arguments are required.
    """

    label: str
    content: list[FormattedBlock]
    def __init__(self, label: str, content: list[FormattedBlock]) -> None: ...

class DocumentStructure:
    """Container of ``DocumentNode`` objects and ``DocumentRelationship`` edges.

    Used as the optional ``document`` field of ``ExtractionResult``. All
    constructor arguments default to ``None`` while the three list
    attributes are typed non-optional; presumably an omitted list becomes
    empty -- confirm.
    """

    nodes: list[DocumentNode]
    source_format: str | None
    relationships: list[DocumentRelationship]
    node_types: list[str]
    def __init__(
        self,
        nodes: list[DocumentNode] | None = None,
        source_format: str | None = None,
        relationships: list[DocumentRelationship] | None = None,
        node_types: list[str] | None = None,
    ) -> None: ...
    # Returns None, so any effect is a mutation of the object; presumably it
    # recomputes ``node_types`` from ``nodes`` -- confirm.
    def finalize_node_types(self) -> None: ...
    # Returns a bool; which fields are checked for emptiness is not visible in
    # this stub.
    def is_empty(self) -> bool: ...
    # Declared to return a DocumentStructure; its values are not visible in
    # this stub.
    @staticmethod
    def default() -> DocumentStructure: ...

class DocumentRelationship:
    """Typed edge; element type of ``DocumentStructure.relationships``.

    ``source`` and ``target`` are integers, presumably indices into
    ``DocumentStructure.nodes`` -- confirm. ``kind`` may be passed as a
    ``RelationshipKind`` or a ``str``. All three arguments are required.
    """

    source: int
    target: int
    kind: RelationshipKind
    def __init__(self, source: int, target: int, kind: RelationshipKind | str) -> None: ...

class DocumentNode:
    """One node of a ``DocumentStructure``; element type of its ``nodes`` list.

    ``id`` is a string, while ``parent`` and ``children`` are integers.
    Presumably those integers are indices into ``DocumentStructure.nodes``
    rather than ids -- confirm. ``content_layer`` may be passed as a
    ``ContentLayer`` or a ``str``.
    """

    id: str
    content: NodeContent
    parent: int | None
    children: list[int]
    content_layer: ContentLayer
    page: int | None
    page_end: int | None
    bbox: BoundingBox | None
    annotations: list[TextAnnotation]
    attributes: dict[str, str] | None
    # Parameter order differs from the attribute order above: the five
    # required arguments come first. ``id`` shadows the builtin of the same
    # name, which is why its line carries a ``noqa: A002`` marker.
    def __init__(
        self,
        id: str,  # noqa: A002
        content: NodeContent,
        children: list[int],
        content_layer: ContentLayer | str,
        annotations: list[TextAnnotation],
        parent: int | None = None,
        page: int | None = None,
        page_end: int | None = None,
        bbox: BoundingBox | None = None,
        attributes: dict[str, str] | None = None,
    ) -> None: ...

class TableGrid:
    """Table grid: row and column counts plus a list of ``GridCell``.

    All constructor arguments default to ``None`` while the attributes are
    typed non-optional; presumably omitted values become zero or an empty
    list -- confirm.
    """

    rows: int
    cols: int
    cells: list[GridCell]
    def __init__(
        self,
        rows: int | None = None,
        cols: int | None = None,
        cells: list[GridCell] | None = None,
    ) -> None: ...

class GridCell:
    """One cell of a ``TableGrid``: content, position, spans and a header flag.

    All arguments except ``bbox`` are required. Whether ``row`` and ``col``
    are zero- or one-based is not visible in this stub.
    """

    content: str
    row: int
    col: int
    row_span: int
    col_span: int
    is_header: bool
    bbox: BoundingBox | None
    def __init__(
        self,
        content: str,
        row: int,
        col: int,
        row_span: int,
        col_span: int,
        is_header: bool,
        bbox: BoundingBox | None = None,
    ) -> None: ...

class TextAnnotation:
    """Annotation over a ``start``..``end`` integer range with an ``AnnotationKind``.

    Element type of ``DocumentNode.annotations``. All three arguments are
    required. ``kind`` accepts only ``AnnotationKind``; unlike several other
    kind/type parameters in this file, it has no ``str`` alternative. Whether
    the offsets count bytes or characters, and whether ``end`` is exclusive,
    is not visible in this stub.
    """

    start: int
    end: int
    kind: AnnotationKind
    def __init__(self, start: int, end: int, kind: AnnotationKind) -> None: ...

class ExtractionResult:
    """Extraction output: text content, MIME type, metadata and optional extras.

    The type is recursive through ``children``: each ``ArchiveEntry`` holds
    its own ``ExtractionResult``.

    All constructor arguments default to ``None``, although ``content``,
    ``mime_type``, ``metadata``, ``tables`` and ``processing_warnings`` are
    typed non-optional as attributes. Presumably omitted values become empty
    defaults -- confirm.
    """

    content: str
    mime_type: str
    metadata: Metadata
    extraction_method: ExtractionMethod | None
    tables: list[Table]
    detected_languages: list[str] | None
    chunks: list[Chunk] | None
    images: list[ExtractedImage] | None
    pages: list[PageContent] | None
    elements: list[Element] | None
    djot_content: DjotContent | None
    ocr_elements: list[OcrElement] | None
    document: DocumentStructure | None
    extracted_keywords: list[Keyword] | None
    quality_score: float | None
    processing_warnings: list[ProcessingWarning]
    annotations: list[PdfAnnotation] | None
    children: list[ArchiveEntry] | None
    uris: list[ExtractedUri] | None
    revisions: list[DocumentRevision] | None
    structured_output: dict[str, Any] | None
    code_intelligence: dict[str, Any] | None
    llm_usage: list[LlmUsage] | None
    formatted_content: str | None
    ocr_internal_document: str | None
    # NOTE(review): ``extracted_keywords`` and ``code_intelligence`` are
    # declared above but have no matching constructor parameter. Confirm
    # whether that omission is intentional.
    def __init__(
        self,
        content: str | None = None,
        mime_type: str | None = None,
        metadata: Metadata | None = None,
        extraction_method: ExtractionMethod | str | None = None,
        tables: list[Table] | None = None,
        detected_languages: list[str] | None = None,
        chunks: list[Chunk] | None = None,
        images: list[ExtractedImage] | None = None,
        pages: list[PageContent] | None = None,
        elements: list[Element] | None = None,
        djot_content: DjotContent | None = None,
        ocr_elements: list[OcrElement] | None = None,
        document: DocumentStructure | None = None,
        quality_score: float | None = None,
        processing_warnings: list[ProcessingWarning] | None = None,
        annotations: list[PdfAnnotation] | None = None,
        children: list[ArchiveEntry] | None = None,
        uris: list[ExtractedUri] | None = None,
        revisions: list[DocumentRevision] | None = None,
        structured_output: dict[str, Any] | None = None,
        llm_usage: list[LlmUsage] | None = None,
        formatted_content: str | None = None,
        ocr_internal_document: str | None = None,
    ) -> None: ...
    # Alternate constructor: takes an OcrExtractionResult and returns an
    # ExtractionResult. How fields are mapped is not visible in this stub.
    @staticmethod
    def from_ocr(ocr: OcrExtractionResult) -> ExtractionResult: ...

class ArchiveEntry:
    # Stub for an archive entry: a path string, a MIME type string, and an
    # ExtractionResult. All three constructor arguments are required.
    path: str
    mime_type: str
    result: ExtractionResult
    def __init__(self, path: str, mime_type: str, result: ExtractionResult) -> None: ...

class ProcessingWarning:
    # Stub for a warning record made of two strings, `source` and `message`.
    # Both are required by the constructor.
    source: str
    message: str
    def __init__(self, source: str, message: str) -> None: ...

class LlmUsage:
    # Stub for an LLM usage record: `model` and `source` strings plus optional
    # token counts, estimated cost, and finish reason.
    # NOTE(review): `model` and `source` are typed non-optional, yet the constructor
    # allows omitting them — presumably the native side supplies defaults; confirm.
    model: str
    source: str
    input_tokens: int | None
    output_tokens: int | None
    total_tokens: int | None
    estimated_cost: float | None
    finish_reason: str | None
    def __init__(
        self,
        model: str | None = None,
        source: str | None = None,
        input_tokens: int | None = None,
        output_tokens: int | None = None,
        total_tokens: int | None = None,
        estimated_cost: float | None = None,
        finish_reason: str | None = None,
    ) -> None: ...

class Chunk:
    # Stub for a chunk: string content, a ChunkType, ChunkMetadata, and an
    # optional embedding given as a list of floats.
    # The constructor requires content, chunk_type and metadata; `chunk_type` may be
    # passed as a ChunkType or a plain str, while the attribute is typed ChunkType.
    content: str
    chunk_type: ChunkType
    embedding: list[float] | None
    metadata: ChunkMetadata
    def __init__(
        self,
        content: str,
        chunk_type: ChunkType | str,
        metadata: ChunkMetadata,
        embedding: list[float] | None = None,
    ) -> None: ...

class HeadingContext:
    # Stub wrapping a required list of HeadingLevel items.
    headings: list[HeadingLevel]
    def __init__(self, headings: list[HeadingLevel]) -> None: ...

class HeadingLevel:
    # Stub for a heading described by an integer `level` and its `text`.
    # Both are required by the constructor.
    level: int
    text: str
    def __init__(self, level: int, text: str) -> None: ...

class ChunkMetadata:
    # Stub for per-chunk metadata: byte range, chunk index/total, image indices,
    # and optional token count, page range, and heading context.
    # Constructor: byte_start, byte_end, chunk_index, total_chunks and image_indices
    # are required; the remaining arguments default to None.
    byte_start: int
    byte_end: int
    token_count: int | None
    chunk_index: int
    total_chunks: int
    first_page: int | None
    last_page: int | None
    heading_context: HeadingContext | None
    image_indices: list[int]
    def __init__(
        self,
        byte_start: int,
        byte_end: int,
        chunk_index: int,
        total_chunks: int,
        image_indices: list[int],
        token_count: int | None = None,
        first_page: int | None = None,
        last_page: int | None = None,
        heading_context: HeadingContext | None = None,
    ) -> None: ...

class ExtractedImage:
    # Stub for an extracted image: raw bytes, a format string, an index, a mask
    # flag, and a set of optional descriptive fields.
    # Constructor: data, format, image_index and is_mask are required; everything
    # else defaults to None. `image_kind` may be given as ImageKind or a plain str.
    # `ocr_result` is itself an ExtractionResult, so the type is recursive.
    data: bytes
    format: str
    image_index: int
    page_number: int | None
    width: int | None
    height: int | None
    colorspace: str | None
    bits_per_component: int | None
    is_mask: bool
    description: str | None
    ocr_result: ExtractionResult | None
    bounding_box: BoundingBox | None
    source_path: str | None
    image_kind: ImageKind | None
    kind_confidence: float | None
    cluster_id: int | None
    def __init__(
        self,
        data: bytes,
        # `format` shadows the builtin of the same name, hence the lint suppression.
        format: str,  # noqa: A002
        image_index: int,
        is_mask: bool,
        page_number: int | None = None,
        width: int | None = None,
        height: int | None = None,
        colorspace: str | None = None,
        bits_per_component: int | None = None,
        description: str | None = None,
        ocr_result: ExtractionResult | None = None,
        bounding_box: BoundingBox | None = None,
        source_path: str | None = None,
        image_kind: ImageKind | str | None = None,
        kind_confidence: float | None = None,
        cluster_id: int | None = None,
    ) -> None: ...

class BoundingBox:
    # Stub for a box described by four float attributes: x0, y0, x1, y1.
    # The coordinate system and units are not specified in this stub.
    # NOTE(review): all four attributes are typed non-optional, yet every constructor
    # argument may be omitted — presumably the native side supplies defaults; confirm.
    x0: float
    y0: float
    x1: float
    y1: float
    def __init__(
        self,
        x0: float | None = None,
        y0: float | None = None,
        x1: float | None = None,
        y1: float | None = None,
    ) -> None: ...

class ElementMetadata:
    # Stub for element metadata: optional page number, filename, coordinates and
    # element index, plus a string-to-string `additional` mapping.
    # Constructor: only `additional` is required; the other arguments default to None.
    page_number: int | None
    filename: str | None
    coordinates: BoundingBox | None
    element_index: int | None
    additional: dict[str, str]
    def __init__(
        self,
        additional: dict[str, str],
        page_number: int | None = None,
        filename: str | None = None,
        coordinates: BoundingBox | None = None,
        element_index: int | None = None,
    ) -> None: ...

class Element:
    # Stub for an element: string id, ElementType, text, and ElementMetadata.
    # All four constructor arguments are required; `element_type` may be passed as
    # an ElementType or a plain str, while the attribute is typed ElementType.
    element_id: str
    element_type: ElementType
    text: str
    metadata: ElementMetadata
    def __init__(
        self,
        element_id: str,
        element_type: ElementType | str,
        text: str,
        metadata: ElementMetadata,
    ) -> None: ...

class ExcelWorkbook:
    # Stub for a workbook: a list of ExcelSheet items, a string-to-string metadata
    # mapping, and an optional list of DocumentRevision items.
    # Constructor: sheets and metadata are required; revisions defaults to None.
    sheets: list[ExcelSheet]
    metadata: dict[str, str]
    revisions: list[DocumentRevision] | None
    def __init__(
        self,
        sheets: list[ExcelSheet],
        metadata: dict[str, str],
        revisions: list[DocumentRevision] | None = None,
    ) -> None: ...

class ExcelSheet:
    # Stub for a sheet: name, markdown string, row/column/cell counts, and an
    # optional `table_cells` grid given as a list of lists of strings.
    # Constructor: everything except `table_cells` is required.
    name: str
    markdown: str
    row_count: int
    col_count: int
    cell_count: int
    table_cells: list[list[str]] | None
    def __init__(
        self,
        name: str,
        markdown: str,
        row_count: int,
        col_count: int,
        cell_count: int,
        table_cells: list[list[str]] | None = None,
    ) -> None: ...

class XmlExtractionResult:
    # Stub for an XML extraction result: content string, an element count, and a
    # list of element names in `unique_elements`. All arguments are required.
    content: str
    element_count: int
    unique_elements: list[str]
    def __init__(self, content: str, element_count: int, unique_elements: list[str]) -> None: ...

class TextExtractionResult:
    # Stub for a text extraction result: content plus line/word/character counts,
    # and optional headers, links and code blocks.
    # Constructor: content and the three counts are required; the rest default to None.
    # `links` and `code_blocks` are lists of string lists; the inner layout is not
    # specified in this stub.
    content: str
    line_count: int
    word_count: int
    character_count: int
    headers: list[str] | None
    links: list[list[str]] | None
    code_blocks: list[list[str]] | None
    def __init__(
        self,
        content: str,
        line_count: int,
        word_count: int,
        character_count: int,
        headers: list[str] | None = None,
        links: list[list[str]] | None = None,
        code_blocks: list[list[str]] | None = None,
    ) -> None: ...

class PptxExtractionResult:
    # Stub for a PPTX extraction result: content, PptxMetadata, slide/image/table
    # counts, extracted images, hyperlinks, office metadata, and optional page
    # structure, page contents, document structure and revisions.
    # Constructor: the four `| None` attributes are optional (default None);
    # every other argument is required.
    content: str
    metadata: PptxMetadata
    slide_count: int
    image_count: int
    table_count: int
    images: list[ExtractedImage]
    page_structure: PageStructure | None
    page_contents: list[PageContent] | None
    document: DocumentStructure | None
    hyperlinks: list[str]
    office_metadata: dict[str, str]
    revisions: list[DocumentRevision] | None
    def __init__(
        self,
        content: str,
        metadata: PptxMetadata,
        slide_count: int,
        image_count: int,
        table_count: int,
        images: list[ExtractedImage],
        hyperlinks: list[str],
        office_metadata: dict[str, str],
        page_structure: PageStructure | None = None,
        page_contents: list[PageContent] | None = None,
        document: DocumentStructure | None = None,
        revisions: list[DocumentRevision] | None = None,
    ) -> None: ...

class EmailExtractionResult:
    # Stub for an email extraction result: address lists, content, attachments and
    # a string-to-string metadata mapping, plus optional subject, sender, date,
    # message id, plain-text and HTML bodies.
    # Constructor: the list/dict/content arguments are required and come first;
    # the optional string fields follow and default to None.
    subject: str | None
    from_email: str | None
    to_emails: list[str]
    cc_emails: list[str]
    bcc_emails: list[str]
    date: str | None
    message_id: str | None
    plain_text: str | None
    html_content: str | None
    content: str
    attachments: list[EmailAttachment]
    metadata: dict[str, str]
    def __init__(
        self,
        to_emails: list[str],
        cc_emails: list[str],
        bcc_emails: list[str],
        content: str,
        attachments: list[EmailAttachment],
        metadata: dict[str, str],
        subject: str | None = None,
        from_email: str | None = None,
        date: str | None = None,
        message_id: str | None = None,
        plain_text: str | None = None,
        html_content: str | None = None,
    ) -> None: ...

class EmailAttachment:
    # Stub for an email attachment: optional name, filename, MIME type, size and
    # raw bytes, plus an `is_image` flag.
    # Constructor: only `is_image` is required; the rest default to None.
    name: str | None
    filename: str | None
    mime_type: str | None
    size: int | None
    is_image: bool
    data: bytes | None
    def __init__(
        self,
        is_image: bool,
        name: str | None = None,
        filename: str | None = None,
        mime_type: str | None = None,
        size: int | None = None,
        data: bytes | None = None,
    ) -> None: ...

class OcrExtractionResult:
    # Stub for an OCR extraction result: content, MIME type, a nested metadata
    # mapping (str -> dict[str, Any]), OcrTable items, and optional OCR elements
    # and an `internal_document` string.
    # Constructor: content, mime_type, metadata and tables are required; the two
    # optional fields default to None.
    content: str
    mime_type: str
    metadata: dict[str, dict[str, Any]]
    tables: list[OcrTable]
    ocr_elements: list[OcrElement] | None
    internal_document: str | None
    def __init__(
        self,
        content: str,
        mime_type: str,
        metadata: dict[str, dict[str, Any]],
        tables: list[OcrTable],
        ocr_elements: list[OcrElement] | None = None,
        internal_document: str | None = None,
    ) -> None: ...

class OcrTable:
    # Stub for an OCR table: a grid of string cells, a markdown string, a page
    # number, and an optional OcrTableBoundingBox.
    # Constructor: cells, markdown and page_number are required; bounding_box
    # defaults to None.
    cells: list[list[str]]
    markdown: str
    page_number: int
    bounding_box: OcrTableBoundingBox | None
    def __init__(
        self,
        cells: list[list[str]],
        markdown: str,
        page_number: int,
        bounding_box: OcrTableBoundingBox | None = None,
    ) -> None: ...

class OcrTableBoundingBox:
    # Stub for a box with four integer edges: left, top, right, bottom.
    # All four are required by the constructor.
    left: int
    top: int
    right: int
    bottom: int
    def __init__(self, left: int, top: int, right: int, bottom: int) -> None: ...

class ImagePreprocessingConfig:
    # Stub for image preprocessing settings: target DPI, several boolean toggles,
    # and a string `binarization_method`.
    # NOTE(review): attributes are typed non-optional, yet every constructor argument
    # may be omitted — presumably omitted values take the native defaults; confirm.
    target_dpi: int
    auto_rotate: bool
    deskew: bool
    denoise: bool
    contrast_enhance: bool
    binarization_method: str
    invert_colors: bool
    def __init__(
        self,
        target_dpi: int | None = None,
        auto_rotate: bool | None = None,
        deskew: bool | None = None,
        denoise: bool | None = None,
        contrast_enhance: bool | None = None,
        binarization_method: str | None = None,
        invert_colors: bool | None = None,
    ) -> None: ...
    # Static factory taking no arguments and returning an ImagePreprocessingConfig.
    @staticmethod
    def default() -> ImagePreprocessingConfig: ...

class TesseractConfig:
    # Stub for Tesseract settings: language, psm/oem integers, output format,
    # confidence and table-detection thresholds, a cache flag, optional image
    # preprocessing, and a group of engine-variable style fields.
    # NOTE(review): attributes other than `preprocessing` are typed non-optional, yet
    # every constructor argument may be omitted — presumably omitted values take the
    # native defaults; confirm.
    # NOTE(review): `thresholding_method` is typed bool although its name suggests a
    # choice of method — confirm against the native binding.
    language: str
    psm: int
    output_format: str
    oem: int
    min_confidence: float
    preprocessing: ImagePreprocessingConfig | None
    enable_table_detection: bool
    table_min_confidence: float
    table_column_threshold: int
    table_row_threshold_ratio: float
    use_cache: bool
    classify_use_pre_adapted_templates: bool
    language_model_ngram_on: bool
    tessedit_dont_blkrej_good_wds: bool
    tessedit_dont_rowrej_good_wds: bool
    tessedit_enable_dict_correction: bool
    tessedit_char_whitelist: str
    tessedit_char_blacklist: str
    tessedit_use_primary_params_model: bool
    textord_space_size_is_variable: bool
    thresholding_method: bool
    def __init__(
        self,
        language: str | None = None,
        psm: int | None = None,
        output_format: str | None = None,
        oem: int | None = None,
        min_confidence: float | None = None,
        preprocessing: ImagePreprocessingConfig | None = None,
        enable_table_detection: bool | None = None,
        table_min_confidence: float | None = None,
        table_column_threshold: int | None = None,
        table_row_threshold_ratio: float | None = None,
        use_cache: bool | None = None,
        classify_use_pre_adapted_templates: bool | None = None,
        language_model_ngram_on: bool | None = None,
        tessedit_dont_blkrej_good_wds: bool | None = None,
        tessedit_dont_rowrej_good_wds: bool | None = None,
        tessedit_enable_dict_correction: bool | None = None,
        tessedit_char_whitelist: str | None = None,
        tessedit_char_blacklist: str | None = None,
        tessedit_use_primary_params_model: bool | None = None,
        textord_space_size_is_variable: bool | None = None,
        thresholding_method: bool | None = None,
    ) -> None: ...
    # Static factory taking no arguments and returning a TesseractConfig.
    @staticmethod
    def default() -> TesseractConfig: ...

class ImagePreprocessingMetadata:
    # Stub for a record of image preprocessing: original dimensions/DPI, target and
    # final DPI, scale factor, resample method, several flags, and optional new
    # dimensions, calculated DPI and resize error.
    # Constructor: `new_dimensions`, `calculated_dpi` and `resize_error` default to
    # None; every other argument is required.
    # Dimensions and DPI are plain lists here; their length is not fixed by this stub.
    original_dimensions: list[int]
    original_dpi: list[float]
    target_dpi: int
    scale_factor: float
    auto_adjusted: bool
    final_dpi: int
    new_dimensions: list[int] | None
    resample_method: str
    dimension_clamped: bool
    calculated_dpi: int | None
    skipped_resize: bool
    resize_error: str | None
    def __init__(
        self,
        original_dimensions: list[int],
        original_dpi: list[float],
        target_dpi: int,
        scale_factor: float,
        auto_adjusted: bool,
        final_dpi: int,
        resample_method: str,
        dimension_clamped: bool,
        skipped_resize: bool,
        new_dimensions: list[int] | None = None,
        calculated_dpi: int | None = None,
        resize_error: str | None = None,
    ) -> None: ...

class Metadata:
    # Stub for document metadata: mostly optional descriptive fields (title,
    # subject, authors, dates, ...), optional nested structures (pages, format,
    # image preprocessing, error), an `ocr_used` flag, and an `additional` mapping.
    # Every constructor argument is optional and defaults to None.
    # NOTE(review): `ocr_used` and `additional` are typed non-optional on the class,
    # so omitted values are presumably replaced by native defaults; confirm.
    title: str | None
    subject: str | None
    authors: list[str] | None
    keywords: list[str] | None
    language: str | None
    created_at: str | None
    modified_at: str | None
    created_by: str | None
    modified_by: str | None
    pages: PageStructure | None
    format: FormatMetadata | None
    image_preprocessing: ImagePreprocessingMetadata | None
    json_schema: dict[str, Any] | None
    error: ErrorMetadata | None
    extraction_duration_ms: int | None
    category: str | None
    tags: list[str] | None
    document_version: str | None
    abstract_text: str | None
    output_format: str | None
    ocr_used: bool
    additional: dict[str, dict[str, Any]]
    def __init__(
        self,
        title: str | None = None,
        subject: str | None = None,
        authors: list[str] | None = None,
        keywords: list[str] | None = None,
        language: str | None = None,
        created_at: str | None = None,
        modified_at: str | None = None,
        created_by: str | None = None,
        modified_by: str | None = None,
        pages: PageStructure | None = None,
        # `format` shadows the builtin of the same name, hence the lint suppression.
        format: FormatMetadata | None = None,  # noqa: A002
        image_preprocessing: ImagePreprocessingMetadata | None = None,
        json_schema: dict[str, Any] | None = None,
        error: ErrorMetadata | None = None,
        extraction_duration_ms: int | None = None,
        category: str | None = None,
        tags: list[str] | None = None,
        document_version: str | None = None,
        abstract_text: str | None = None,
        output_format: str | None = None,
        ocr_used: bool | None = None,
        additional: dict[str, dict[str, Any]] | None = None,
    ) -> None: ...
    # Returns a bool; the exact emptiness criterion is defined by the native
    # implementation and is not visible in this stub.
    def is_empty(self) -> bool: ...

class ExcelMetadata:
    # Stub for Excel metadata: optional sheet count and optional list of sheet
    # names. Both constructor arguments default to None.
    sheet_count: int | None
    sheet_names: list[str] | None
    def __init__(
        self,
        sheet_count: int | None = None,
        sheet_names: list[str] | None = None,
    ) -> None: ...

class EmailMetadata:
    # Stub for email metadata: optional sender address/name and message id, plus
    # recipient lists and a list of attachment strings.
    # NOTE(review): the list attributes are typed non-optional, yet every constructor
    # argument may be omitted — presumably they then default to empty lists; confirm.
    from_email: str | None
    from_name: str | None
    to_emails: list[str]
    cc_emails: list[str]
    bcc_emails: list[str]
    message_id: str | None
    attachments: list[str]
    def __init__(
        self,
        from_email: str | None = None,
        from_name: str | None = None,
        to_emails: list[str] | None = None,
        cc_emails: list[str] | None = None,
        bcc_emails: list[str] | None = None,
        message_id: str | None = None,
        attachments: list[str] | None = None,
    ) -> None: ...

class ArchiveMetadata:
    # Stub for archive metadata: format string, file count, file list, total size,
    # and an optional compressed size.
    # NOTE(review): all attributes except `compressed_size` are typed non-optional,
    # yet every constructor argument may be omitted — presumably native defaults
    # apply; confirm.
    format: str
    file_count: int
    file_list: list[str]
    total_size: int
    compressed_size: int | None
    def __init__(
        self,
        # `format` shadows the builtin of the same name, hence the lint suppression.
        format: str | None = None,  # noqa: A002
        file_count: int | None = None,
        file_list: list[str] | None = None,
        total_size: int | None = None,
        compressed_size: int | None = None,
    ) -> None: ...

class ImageMetadata:
    # Stub for image metadata: integer width/height, a format string, and a
    # string-to-string `exif` mapping.
    # NOTE(review): attributes are typed non-optional, yet every constructor argument
    # may be omitted — presumably native defaults apply; confirm.
    width: int
    height: int
    format: str
    exif: dict[str, str]
    def __init__(
        self,
        width: int | None = None,
        height: int | None = None,
        # `format` shadows the builtin of the same name, hence the lint suppression.
        format: str | None = None,  # noqa: A002
        exif: dict[str, str] | None = None,
    ) -> None: ...

class XmlMetadata:
    # Stub for XML metadata: an element count and a list of element names in
    # `unique_elements`.
    # NOTE(review): both attributes are typed non-optional, yet both constructor
    # arguments may be omitted — presumably native defaults apply; confirm.
    element_count: int
    unique_elements: list[str]
    def __init__(
        self,
        element_count: int | None = None,
        unique_elements: list[str] | None = None,
    ) -> None: ...

class TextMetadata:
    # Stub for text metadata: line/word/character counts and optional headers,
    # links and code blocks.
    # NOTE(review): the three counts are typed non-optional, yet every constructor
    # argument may be omitted — presumably native defaults apply; confirm.
    line_count: int
    word_count: int
    character_count: int
    headers: list[str] | None
    links: list[list[str]] | None
    code_blocks: list[list[str]] | None
    def __init__(
        self,
        line_count: int | None = None,
        word_count: int | None = None,
        character_count: int | None = None,
        headers: list[str] | None = None,
        links: list[list[str]] | None = None,
        code_blocks: list[list[str]] | None = None,
    ) -> None: ...

class HeaderMetadata:
    # Stub for a header record: integer level, text, optional id, integer depth,
    # and an integer `html_offset`.
    # Constructor: level, text, depth and html_offset are required; id defaults to None.
    level: int
    text: str
    id: str | None
    depth: int
    html_offset: int
    def __init__(
        self,
        level: int,
        text: str,
        depth: int,
        html_offset: int,
        # `id` shadows the builtin of the same name, hence the lint suppression.
        id: str | None = None,  # noqa: A002
    ) -> None: ...

class LinkMetadata:
    # Stub for a link record: href, text, optional title, LinkType, `rel` strings,
    # and `attributes` as a list of string lists.
    # Constructor: everything except `title` is required; `link_type` may be passed
    # as a LinkType or a plain str, while the attribute is typed LinkType.
    href: str
    text: str
    title: str | None
    link_type: LinkType
    rel: list[str]
    attributes: list[list[str]]
    def __init__(
        self,
        href: str,
        text: str,
        link_type: LinkType | str,
        rel: list[str],
        attributes: list[list[str]],
        title: str | None = None,
    ) -> None: ...

class ImageMetadataType:
    # Stub for an image record: src string, optional alt/title/dimensions, an
    # ImageType, and `attributes` as a list of string lists.
    # Constructor: src, image_type and attributes are required; `image_type` may be
    # passed as an ImageType or a plain str, while the attribute is typed ImageType.
    src: str
    alt: str | None
    title: str | None
    dimensions: list[int] | None
    image_type: ImageType
    attributes: list[list[str]]
    def __init__(
        self,
        src: str,
        image_type: ImageType | str,
        attributes: list[list[str]],
        alt: str | None = None,
        title: str | None = None,
        dimensions: list[int] | None = None,
    ) -> None: ...

class StructuredData:
    # Stub for a structured-data record: a StructuredDataType, a `raw_json` string,
    # and an optional schema type string.
    # Constructor: data_type and raw_json are required; `data_type` may be passed as
    # a StructuredDataType or a plain str.
    data_type: StructuredDataType
    raw_json: str
    schema_type: str | None
    def __init__(
        self,
        data_type: StructuredDataType | str,
        raw_json: str,
        schema_type: str | None = None,
    ) -> None: ...

class HtmlMetadata:
    # Stub for HTML metadata: optional descriptive strings, an optional
    # TextDirection, three string-to-string mappings (open_graph, twitter_card,
    # meta_tags), and lists of headers, links, images and structured data.
    # `text_direction` may be passed as a TextDirection or a plain str.
    # NOTE(review): the list and dict attributes are typed non-optional, yet every
    # constructor argument may be omitted — presumably they default to empty
    # containers; confirm.
    title: str | None
    description: str | None
    keywords: list[str]
    author: str | None
    canonical_url: str | None
    base_href: str | None
    language: str | None
    text_direction: TextDirection | None
    open_graph: dict[str, str]
    twitter_card: dict[str, str]
    meta_tags: dict[str, str]
    headers: list[HeaderMetadata]
    links: list[LinkMetadata]
    images: list[ImageMetadataType]
    structured_data: list[StructuredData]
    def __init__(
        self,
        title: str | None = None,
        description: str | None = None,
        keywords: list[str] | None = None,
        author: str | None = None,
        canonical_url: str | None = None,
        base_href: str | None = None,
        language: str | None = None,
        text_direction: TextDirection | str | None = None,
        open_graph: dict[str, str] | None = None,
        twitter_card: dict[str, str] | None = None,
        meta_tags: dict[str, str] | None = None,
        headers: list[HeaderMetadata] | None = None,
        links: list[LinkMetadata] | None = None,
        images: list[ImageMetadataType] | None = None,
        structured_data: list[StructuredData] | None = None,
    ) -> None: ...

class OcrMetadata:
    # Stub for OCR metadata: language, psm integer, output format, table count, and
    # optional table row/column counts.
    # NOTE(review): the first four attributes are typed non-optional, yet every
    # constructor argument may be omitted — presumably native defaults apply; confirm.
    language: str
    psm: int
    output_format: str
    table_count: int
    table_rows: int | None
    table_cols: int | None
    def __init__(
        self,
        language: str | None = None,
        psm: int | None = None,
        output_format: str | None = None,
        table_count: int | None = None,
        table_rows: int | None = None,
        table_cols: int | None = None,
    ) -> None: ...

class ErrorMetadata:
    # Stub for an error record made of two strings, `error_type` and `message`.
    # Both are required by the constructor.
    error_type: str
    message: str
    def __init__(self, error_type: str, message: str) -> None: ...

class PptxMetadata:
    # Stub for PPTX metadata: slide count, slide names, and optional image and
    # table counts.
    # NOTE(review): `slide_count` and `slide_names` are typed non-optional, yet every
    # constructor argument may be omitted — presumably native defaults apply; confirm.
    slide_count: int
    slide_names: list[str]
    image_count: int | None
    table_count: int | None
    def __init__(
        self,
        slide_count: int | None = None,
        slide_names: list[str] | None = None,
        image_count: int | None = None,
        table_count: int | None = None,
    ) -> None: ...

class DocxMetadata:
    # Stub for DOCX metadata: optional CoreProperties, optional DocxAppProperties,
    # and an optional nested mapping of custom properties.
    # All constructor arguments default to None.
    core_properties: CoreProperties | None
    app_properties: DocxAppProperties | None
    custom_properties: dict[str, dict[str, Any]] | None
    def __init__(
        self,
        core_properties: CoreProperties | None = None,
        app_properties: DocxAppProperties | None = None,
        custom_properties: dict[str, dict[str, Any]] | None = None,
    ) -> None: ...

class CsvMetadata:
    # Stub for CSV metadata: row/column counts, optional delimiter, a `has_header`
    # flag, and optional column type strings.
    # NOTE(review): `row_count`, `column_count` and `has_header` are typed
    # non-optional, yet every constructor argument may be omitted — presumably
    # native defaults apply; confirm.
    row_count: int
    column_count: int
    delimiter: str | None
    has_header: bool
    column_types: list[str] | None
    def __init__(
        self,
        row_count: int | None = None,
        column_count: int | None = None,
        delimiter: str | None = None,
        has_header: bool | None = None,
        column_types: list[str] | None = None,
    ) -> None: ...

class BibtexMetadata:
    # Stub for BibTeX metadata: entry count, citation keys, authors, an optional
    # YearRange, and an optional str-to-int `entry_types` mapping.
    # NOTE(review): `entry_count`, `citation_keys` and `authors` are typed
    # non-optional, yet every constructor argument may be omitted — presumably
    # native defaults apply; confirm.
    entry_count: int
    citation_keys: list[str]
    authors: list[str]
    year_range: YearRange | None
    entry_types: dict[str, int] | None
    def __init__(
        self,
        entry_count: int | None = None,
        citation_keys: list[str] | None = None,
        authors: list[str] | None = None,
        year_range: YearRange | None = None,
        entry_types: dict[str, int] | None = None,
    ) -> None: ...

class CitationMetadata:
    # Stub for citation metadata: citation count, optional format string, authors,
    # an optional YearRange, DOIs and keywords.
    # NOTE(review): the count and list attributes are typed non-optional, yet every
    # constructor argument may be omitted — presumably native defaults apply; confirm.
    citation_count: int
    format: str | None
    authors: list[str]
    year_range: YearRange | None
    dois: list[str]
    keywords: list[str]
    def __init__(
        self,
        citation_count: int | None = None,
        # `format` shadows the builtin of the same name, hence the lint suppression.
        format: str | None = None,  # noqa: A002
        authors: list[str] | None = None,
        year_range: YearRange | None = None,
        dois: list[str] | None = None,
        keywords: list[str] | None = None,
    ) -> None: ...

class YearRange:
    # Stub for a year range: optional `min` and `max` integers and a required list
    # of integer years.
    # Constructor: `years` comes first and is required; min and max default to None.
    min: int | None
    max: int | None
    years: list[int]
    def __init__(
        self,
        years: list[int],
        # `min` and `max` shadow builtins, hence the lint suppressions.
        min: int | None = None,  # noqa: A002
        max: int | None = None,  # noqa: A002
    ) -> None: ...

class FictionBookMetadata:
    # Stub for FictionBook metadata: genre strings, sequence strings, and an
    # optional annotation string.
    # NOTE(review): `genres` and `sequences` are typed non-optional, yet every
    # constructor argument may be omitted — presumably they default to empty lists;
    # confirm.
    genres: list[str]
    sequences: list[str]
    annotation: str | None
    def __init__(
        self,
        genres: list[str] | None = None,
        sequences: list[str] | None = None,
        annotation: str | None = None,
    ) -> None: ...

class DbfMetadata:
    # Stub for DBF metadata: record count, field count, and a list of DbfFieldInfo.
    # NOTE(review): all attributes are typed non-optional, yet every constructor
    # argument may be omitted — presumably native defaults apply; confirm.
    record_count: int
    field_count: int
    fields: list[DbfFieldInfo]
    def __init__(
        self,
        record_count: int | None = None,
        field_count: int | None = None,
        fields: list[DbfFieldInfo] | None = None,
    ) -> None: ...

class DbfFieldInfo:
    # Stub for a field description made of two strings, `name` and `field_type`.
    # Both are required by the constructor.
    name: str
    field_type: str
    def __init__(self, name: str, field_type: str) -> None: ...

class JatsMetadata:
    # Stub for JATS metadata: optional copyright and license strings, a
    # string-to-string `history_dates` mapping, and a list of ContributorRole.
    # NOTE(review): `history_dates` and `contributor_roles` are typed non-optional,
    # yet every constructor argument may be omitted — presumably they default to
    # empty containers; confirm.
    copyright: str | None
    license: str | None
    history_dates: dict[str, str]
    contributor_roles: list[ContributorRole]
    def __init__(
        self,
        # `copyright` and `license` shadow builtins, hence the lint suppressions.
        copyright: str | None = None,  # noqa: A002
        license: str | None = None,  # noqa: A002
        history_dates: dict[str, str] | None = None,
        contributor_roles: list[ContributorRole] | None = None,
    ) -> None: ...

class ContributorRole:
    # Stub for a contributor: required `name` string and optional `role` string
    # (defaults to None).
    name: str
    role: str | None
    def __init__(self, name: str, role: str | None = None) -> None: ...

class EpubMetadata:
    # Stub for EPUB metadata: six optional strings (coverage, dc_format, relation,
    # source, dc_type, cover_image). All constructor arguments default to None.
    coverage: str | None
    dc_format: str | None
    relation: str | None
    source: str | None
    dc_type: str | None
    cover_image: str | None
    def __init__(
        self,
        coverage: str | None = None,
        dc_format: str | None = None,
        relation: str | None = None,
        source: str | None = None,
        dc_type: str | None = None,
        cover_image: str | None = None,
    ) -> None: ...

class PstMetadata:
    # Stub for PST metadata holding a single integer `message_count`.
    # NOTE(review): the attribute is typed non-optional, yet the constructor argument
    # may be omitted — presumably a native default applies; confirm.
    message_count: int
    def __init__(self, message_count: int | None = None) -> None: ...

class OcrConfidence:
    # Stub for OCR confidence values: optional `detection` float and a
    # `recognition` float. The value range is not specified in this stub.
    # NOTE(review): `recognition` is typed non-optional, yet the constructor allows
    # omitting it — presumably a native default applies; confirm.
    detection: float | None
    recognition: float
    def __init__(
        self,
        detection: float | None = None,
        recognition: float | None = None,
    ) -> None: ...

class OcrRotation:
    # Stub for a rotation: required `angle_degrees` float and optional
    # `confidence` float (defaults to None).
    angle_degrees: float
    confidence: float | None
    def __init__(self, angle_degrees: float, confidence: float | None = None) -> None: ...

class OcrElement:
    # Stub for an OCR element: text, OcrBoundingGeometry, OcrConfidence,
    # OcrElementLevel, optional OcrRotation, page number, optional parent id, and a
    # nested `backend_metadata` mapping (str -> dict[str, Any]).
    # `level` may be passed as an OcrElementLevel or a plain str.
    # NOTE(review): most attributes are typed non-optional, yet every constructor
    # argument may be omitted — presumably native defaults apply; confirm.
    text: str
    geometry: OcrBoundingGeometry
    confidence: OcrConfidence
    level: OcrElementLevel
    rotation: OcrRotation | None
    page_number: int
    parent_id: str | None
    backend_metadata: dict[str, dict[str, Any]]
    def __init__(
        self,
        text: str | None = None,
        geometry: OcrBoundingGeometry | None = None,
        confidence: OcrConfidence | None = None,
        level: OcrElementLevel | str | None = None,
        rotation: OcrRotation | None = None,
        page_number: int | None = None,
        parent_id: str | None = None,
        backend_metadata: dict[str, dict[str, Any]] | None = None,
    ) -> None: ...

class OcrElementConfig:
    # Stub for OCR element settings: an include flag, a minimum OcrElementLevel, a
    # minimum confidence float, and a `build_hierarchy` flag.
    # `min_level` may be passed as an OcrElementLevel or a plain str.
    # NOTE(review): attributes are typed non-optional, yet every constructor argument
    # may be omitted — presumably native defaults apply; confirm.
    include_elements: bool
    min_level: OcrElementLevel
    min_confidence: float
    build_hierarchy: bool
    def __init__(
        self,
        include_elements: bool | None = None,
        min_level: OcrElementLevel | str | None = None,
        min_confidence: float | None = None,
        build_hierarchy: bool | None = None,
    ) -> None: ...

class PageStructure:
    # Stub for page structure: total count, a PageUnitType, and optional lists of
    # PageBoundary and PageInfo.
    # Constructor: total_count and unit_type are required; `unit_type` may be passed
    # as a PageUnitType or a plain str. The two lists default to None.
    total_count: int
    unit_type: PageUnitType
    boundaries: list[PageBoundary] | None
    pages: list[PageInfo] | None
    def __init__(
        self,
        total_count: int,
        unit_type: PageUnitType | str,
        boundaries: list[PageBoundary] | None = None,
        pages: list[PageInfo] | None = None,
    ) -> None: ...

class PageBoundary:
    # Stub for a page boundary: integer byte_start, byte_end and page_number.
    # All three are required; whether byte_end is inclusive is not specified here.
    byte_start: int
    byte_end: int
    page_number: int
    def __init__(self, byte_start: int, byte_end: int, page_number: int) -> None: ...

class PageInfo:
    # Stub for per-page information: page number, a `has_vector_graphics` flag, and
    # optional title, dimensions, image/table counts, hidden and blank flags.
    # Constructor: number and has_vector_graphics are required; the rest default
    # to None.
    number: int
    title: str | None
    dimensions: list[float] | None
    image_count: int | None
    table_count: int | None
    hidden: bool | None
    is_blank: bool | None
    has_vector_graphics: bool
    def __init__(
        self,
        number: int,
        has_vector_graphics: bool,
        title: str | None = None,
        dimensions: list[float] | None = None,
        image_count: int | None = None,
        table_count: int | None = None,
        hidden: bool | None = None,
        is_blank: bool | None = None,
    ) -> None: ...

class PageContent:
    # Stub for page content: page number, content string, Table items, image
    # indices, and optional hierarchy, blank flag, layout regions, speaker notes,
    # section name and sheet name.
    # Constructor: page_number, content, tables and image_indices are required; the
    # rest default to None.
    page_number: int
    content: str
    tables: list[Table]
    image_indices: list[int]
    hierarchy: PageHierarchy | None
    is_blank: bool | None
    layout_regions: list[LayoutRegion] | None
    speaker_notes: str | None
    section_name: str | None
    sheet_name: str | None
    def __init__(
        self,
        page_number: int,
        content: str,
        tables: list[Table],
        image_indices: list[int],
        hierarchy: PageHierarchy | None = None,
        is_blank: bool | None = None,
        layout_regions: list[LayoutRegion] | None = None,
        speaker_notes: str | None = None,
        section_name: str | None = None,
        sheet_name: str | None = None,
    ) -> None: ...

class LayoutRegion:
    # Stub for a layout region: class name string, confidence float, BoundingBox,
    # and an `area_fraction` float.
    # NOTE(review): attributes are typed non-optional, yet every constructor argument
    # may be omitted — presumably native defaults apply; confirm.
    class_name: str
    confidence: float
    bounding_box: BoundingBox
    area_fraction: float
    def __init__(
        self,
        class_name: str | None = None,
        confidence: float | None = None,
        bounding_box: BoundingBox | None = None,
        area_fraction: float | None = None,
    ) -> None: ...

class PageHierarchy:
    # Stub for a page hierarchy: an integer `block_count` and a list of
    # HierarchicalBlock items. Both are required by the constructor.
    block_count: int
    blocks: list[HierarchicalBlock]
    def __init__(self, block_count: int, blocks: list[HierarchicalBlock]) -> None: ...

class HierarchicalBlock:
    # Stub for a text block: text, font size, a string `level`, and an optional
    # `bbox` list of floats (its length/order is not specified in this stub).
    # Constructor: text, font_size and level are required; bbox defaults to None.
    text: str
    font_size: float
    level: str
    bbox: list[float] | None
    def __init__(
        self,
        text: str,
        font_size: float,
        level: str,
        bbox: list[float] | None = None,
    ) -> None: ...

class CellChange:
    # Stub for a cell change: integer row/col and two strings, `from_` and `to`.
    # `from` is a reserved word in Python, hence the trailing underscore.
    # All four constructor arguments are required.
    row: int
    col: int
    from_: str
    to: str
    def __init__(self, row: int, col: int, from_: str, to: str) -> None: ...

class DocumentRevision:
    # Stub for a document revision: revision id, optional author and timestamp
    # strings, a RevisionKind, an optional RevisionAnchor, and a RevisionDelta.
    # Constructor: revision_id, kind and delta are required; `kind` may be passed as
    # a RevisionKind or a plain str. The rest default to None.
    revision_id: str
    author: str | None
    timestamp: str | None
    kind: RevisionKind
    anchor: RevisionAnchor | None
    delta: RevisionDelta
    def __init__(
        self,
        revision_id: str,
        kind: RevisionKind | str,
        delta: RevisionDelta,
        author: str | None = None,
        timestamp: str | None = None,
        anchor: RevisionAnchor | None = None,
    ) -> None: ...

class RevisionDelta:
    # Stub for a revision delta: a list of DiffLine items and a list of CellChange
    # items.
    # NOTE(review): both attributes are typed non-optional, yet both constructor
    # arguments may be omitted — presumably they default to empty lists; confirm.
    content: list[DiffLine]
    table_changes: list[CellChange]
    def __init__(
        self,
        content: list[DiffLine] | None = None,
        table_changes: list[CellChange] | None = None,
    ) -> None: ...

class Table:
    # Stub for a table: a grid of string cells, a markdown string, a page number,
    # and an optional BoundingBox.
    # NOTE(review): `cells`, `markdown` and `page_number` are typed non-optional, yet
    # every constructor argument may be omitted — presumably native defaults apply;
    # confirm.
    cells: list[list[str]]
    markdown: str
    page_number: int
    bounding_box: BoundingBox | None
    def __init__(
        self,
        cells: list[list[str]] | None = None,
        markdown: str | None = None,
        page_number: int | None = None,
        bounding_box: BoundingBox | None = None,
    ) -> None: ...

class TableCell:
    # Stub for a table cell: content string, integer row/column spans, and an
    # `is_header` flag.
    # NOTE(review): attributes are typed non-optional, yet every constructor argument
    # may be omitted — presumably native defaults apply; confirm.
    content: str
    row_span: int
    col_span: int
    is_header: bool
    def __init__(
        self,
        content: str | None = None,
        row_span: int | None = None,
        col_span: int | None = None,
        is_header: bool | None = None,
    ) -> None: ...

class ExtractedUri:
    # Stub for an extracted URI: url string, optional label, optional page integer,
    # and a UriKind.
    # Constructor: url and kind are required; `kind` may be passed as a UriKind or a
    # plain str. label and page default to None.
    url: str
    label: str | None
    page: int | None
    kind: UriKind
    def __init__(
        self,
        url: str,
        kind: UriKind | str,
        label: str | None = None,
        page: int | None = None,
    ) -> None: ...

class DetectResponse:
    # Stub for a detection response: required `mime_type` string and optional
    # `filename` string (defaults to None).
    mime_type: str
    filename: str | None
    def __init__(self, mime_type: str, filename: str | None = None) -> None: ...

class DiffOptions:
    # Stub for diff options: two boolean include flags and an optional
    # `max_content_chars` integer.
    # NOTE(review): the two flags are typed non-optional, yet every constructor
    # argument may be omitted — presumably native defaults apply; confirm.
    include_metadata: bool
    include_embedded: bool
    max_content_chars: int | None
    def __init__(
        self,
        include_metadata: bool | None = None,
        include_embedded: bool | None = None,
        max_content_chars: int | None = None,
    ) -> None: ...
    # Static factory taking no arguments and returning a DiffOptions.
    @staticmethod
    def default() -> DiffOptions: ...

class ExtractionDiff:
    # Stub for an extraction diff: DiffHunk items for content, added/removed Table
    # lists, TableDiff items, a `metadata_changed` mapping (str -> Any), and an
    # EmbeddedChanges value. All six constructor arguments are required.
    content_diff: list[DiffHunk]
    tables_added: list[Table]
    tables_removed: list[Table]
    tables_changed: list[TableDiff]
    metadata_changed: dict[str, Any]
    embedded_changes: EmbeddedChanges
    def __init__(
        self,
        content_diff: list[DiffHunk],
        tables_added: list[Table],
        tables_removed: list[Table],
        tables_changed: list[TableDiff],
        metadata_changed: dict[str, Any],
        embedded_changes: EmbeddedChanges,
    ) -> None: ...

class DiffHunk:
    # Stub for a diff hunk: integer from_line/from_count and to_line/to_count, plus
    # a list of DiffLine items. All five constructor arguments are required.
    # Whether line numbers are 0- or 1-based is not specified in this stub.
    from_line: int
    from_count: int
    to_line: int
    to_count: int
    lines: list[DiffLine]
    def __init__(
        self,
        from_line: int,
        from_count: int,
        to_line: int,
        to_count: int,
        lines: list[DiffLine],
    ) -> None: ...

class TableDiff:
    # Stub for a table diff: integer from_index/to_index and a list of CellChange
    # items. All three constructor arguments are required.
    from_index: int
    to_index: int
    cell_changes: list[CellChange]
    def __init__(self, from_index: int, to_index: int, cell_changes: list[CellChange]) -> None: ...

class EmbeddedChanges:
    # Stub for embedded-content changes: added and removed ArchiveEntry lists and a
    # list of EmbeddedDiff items. All three constructor arguments are required.
    added: list[ArchiveEntry]
    removed: list[ArchiveEntry]
    changed: list[EmbeddedDiff]
    def __init__(
        self,
        added: list[ArchiveEntry],
        removed: list[ArchiveEntry],
        changed: list[EmbeddedDiff],
    ) -> None: ...

class EmbeddedDiff:
    # Stub pairing a path string with an ExtractionDiff; both are required.
    # ExtractionDiff in turn holds EmbeddedChanges, so the types are mutually recursive.
    path: str
    diff: ExtractionDiff
    def __init__(self, path: str, diff: ExtractionDiff) -> None: ...

class EmbeddingPreset:
    # Stub for an embedding preset: name, chunk size, overlap, model repo, pooling
    # string, model file, integer dimensions, and a description.
    # All eight constructor arguments are required.
    name: str
    chunk_size: int
    overlap: int
    model_repo: str
    pooling: str
    model_file: str
    dimensions: int
    description: str
    def __init__(
        self,
        name: str,
        chunk_size: int,
        overlap: int,
        model_repo: str,
        pooling: str,
        model_file: str,
        dimensions: int,
        description: str,
    ) -> None: ...

class YakeParams:
    # Stub for YAKE parameters holding a single integer `window_size`.
    # NOTE(review): the attribute is typed non-optional, yet the constructor argument
    # may be omitted — presumably a native default applies; confirm.
    window_size: int
    def __init__(self, window_size: int | None = None) -> None: ...
    # Static factory taking no arguments and returning a YakeParams.
    @staticmethod
    def default() -> YakeParams: ...

class RakeParams:
    # Stub for RAKE parameters: integer `min_word_length` and `max_words_per_phrase`.
    # NOTE(review): both attributes are typed non-optional, yet both constructor
    # arguments may be omitted — presumably native defaults apply; confirm.
    min_word_length: int
    max_words_per_phrase: int
    def __init__(
        self,
        min_word_length: int | None = None,
        max_words_per_phrase: int | None = None,
    ) -> None: ...
    # Static factory taking no arguments and returning a RakeParams.
    @staticmethod
    def default() -> RakeParams: ...

class KeywordConfig:
    # Stub for keyword extraction settings: a KeywordAlgorithm, max keyword count,
    # minimum score, an `ngram_range` integer list, optional language, and optional
    # YakeParams / RakeParams.
    # `algorithm` may be passed as a KeywordAlgorithm or a plain str.
    # NOTE(review): the `yake_params` and `rake_params` attributes have no matching
    # constructor parameter — confirm against the native binding whether they can
    # only be set after construction or the stub is missing them.
    # NOTE(review): non-optional attributes may be omitted in the constructor —
    # presumably native defaults apply; confirm.
    algorithm: KeywordAlgorithm
    max_keywords: int
    min_score: float
    ngram_range: list[int]
    language: str | None
    yake_params: YakeParams | None
    rake_params: RakeParams | None
    def __init__(
        self,
        algorithm: KeywordAlgorithm | str | None = None,
        max_keywords: int | None = None,
        min_score: float | None = None,
        ngram_range: list[int] | None = None,
        language: str | None = None,
    ) -> None: ...
    # Static factory taking no arguments and returning a KeywordConfig.
    @staticmethod
    def default() -> KeywordConfig: ...

# Stub for `Keyword`: `text`, `score` and `algorithm` are required;
# `positions` is an optional list of integers defaulting to `None`.
# The constructor accepts `algorithm` as a `KeywordAlgorithm` or a string.
class Keyword:
    text: str
    score: float
    algorithm: KeywordAlgorithm
    positions: list[int] | None
    def __init__(
        self,
        text: str,
        score: float,
        algorithm: KeywordAlgorithm | str,
        positions: list[int] | None = None,
    ) -> None: ...

# Stub for `PaddleOcrConfig`. Every constructor argument is optional and
# defaults to `None`; of the attributes, only `cache_dir` is typed as nullable.
# The values substituted for omitted arguments are not visible in this stub.
class PaddleOcrConfig:
    language: str
    cache_dir: str | None
    use_angle_cls: bool
    enable_table_detection: bool
    det_db_thresh: float
    det_db_box_thresh: float
    det_db_unclip_ratio: float
    det_limit_side_len: int
    rec_batch_num: int
    padding: int
    drop_score: float
    model_tier: str
    def __init__(
        self,
        language: str | None = None,
        cache_dir: str | None = None,
        use_angle_cls: bool | None = None,
        enable_table_detection: bool | None = None,
        det_db_thresh: float | None = None,
        det_db_box_thresh: float | None = None,
        det_db_unclip_ratio: float | None = None,
        det_limit_side_len: int | None = None,
        rec_batch_num: int | None = None,
        padding: int | None = None,
        drop_score: float | None = None,
        model_tier: str | None = None,
    ) -> None: ...
    # `with_*` methods: each takes one value and is annotated to return a
    # `PaddleOcrConfig`. Whether the receiver is mutated or a new object is
    # returned cannot be determined from the stub.
    def with_cache_dir(self, path: str) -> PaddleOcrConfig: ...
    def with_table_detection(self, enable: bool) -> PaddleOcrConfig: ...
    def with_angle_cls(self, enable: bool) -> PaddleOcrConfig: ...
    def with_det_db_thresh(self, threshold: float) -> PaddleOcrConfig: ...
    def with_det_db_box_thresh(self, threshold: float) -> PaddleOcrConfig: ...
    def with_det_db_unclip_ratio(self, ratio: float) -> PaddleOcrConfig: ...
    def with_det_limit_side_len(self, length: int) -> PaddleOcrConfig: ...
    def with_rec_batch_num(self, batch_size: int) -> PaddleOcrConfig: ...
    def with_drop_score(self, score: float) -> PaddleOcrConfig: ...
    def with_padding(self, padding: int) -> PaddleOcrConfig: ...
    def with_model_tier(self, tier: str) -> PaddleOcrConfig: ...
    # Argument-free static constructor returning a `PaddleOcrConfig`.
    @staticmethod
    def default() -> PaddleOcrConfig: ...

# Stub for `ModelPaths`: four required string fields (`det_model`,
# `cls_model`, `rec_model`, `dict_file`).
class ModelPaths:
    det_model: str
    cls_model: str
    rec_model: str
    dict_file: str
    def __init__(self, det_model: str, cls_model: str, rec_model: str, dict_file: str) -> None: ...

# Stub for `OrientationResult`: an integer `degrees` and a float `confidence`,
# both required constructor arguments.
class OrientationResult:
    degrees: int
    confidence: float
    def __init__(self, degrees: int, confidence: float) -> None: ...

# Stub for `BBox`: four required float coordinates `x1`, `y1`, `x2`, `y2`.
# Units and coordinate origin are not stated in this stub.
class BBox:
    x1: float
    y1: float
    x2: float
    y2: float
    def __init__(self, x1: float, y1: float, x2: float, y2: float) -> None: ...

# Stub for `LayoutDetection`: a `LayoutClass`, a float `confidence` and a
# `BBox`. The constructor also accepts `class_name` as a plain string.
class LayoutDetection:
    class_name: LayoutClass
    confidence: float
    bbox: BBox
    def __init__(self, class_name: LayoutClass | str, confidence: float, bbox: BBox) -> None: ...

# Stub for `RecognizedTable`: a `BBox`, a nested list of strings (`cells`) and
# a `markdown` string. All three are required constructor arguments.
class RecognizedTable:
    detection_bbox: BBox
    cells: list[list[str]]
    markdown: str
    def __init__(self, detection_bbox: BBox, cells: list[list[str]], markdown: str) -> None: ...

# Stub for `DetectionResult`: integer `page_width` / `page_height` and a list
# of `LayoutDetection` items. All three are required constructor arguments.
class DetectionResult:
    page_width: int
    page_height: int
    detections: list[LayoutDetection]
    def __init__(
        self,
        page_width: int,
        page_height: int,
        detections: list[LayoutDetection],
    ) -> None: ...

# Stub for `EmbeddedFile`: `name`, raw `data` bytes and an integer
# `compressed_size` are required; `mime_type` is optional and defaults to `None`.
class EmbeddedFile:
    name: str
    data: bytes
    compressed_size: int
    mime_type: str | None
    def __init__(
        self,
        name: str,
        data: bytes,
        compressed_size: int,
        mime_type: str | None = None,
    ) -> None: ...

# Stub for `PdfMetadata`: every field is nullable and every constructor
# argument defaults to `None`, so an instance can be built with no arguments.
class PdfMetadata:
    pdf_version: str | None
    producer: str | None
    is_encrypted: bool | None
    width: int | None
    height: int | None
    page_count: int | None
    def __init__(
        self,
        pdf_version: str | None = None,
        producer: str | None = None,
        is_encrypted: bool | None = None,
        width: int | None = None,
        height: int | None = None,
        page_count: int | None = None,
    ) -> None: ...

# Enum-style stub: each member is a class attribute typed as
# `ExecutionProviderType`; the constructor takes an `int | str` value.
class ExecutionProviderType:
    auto: ExecutionProviderType = ...
    cpu: ExecutionProviderType = ...
    core_ml: ExecutionProviderType = ...
    cuda: ExecutionProviderType = ...
    tensor_rt: ExecutionProviderType = ...
    def __init__(self, value: int | str) -> None: ...

# TypedDict whose `type` key is pinned to the literal "plain"; no other keys.
class OutputFormatPlainVariant(TypedDict):
    type: Literal["plain"]

# TypedDict whose `type` key is pinned to the literal "markdown"; no other keys.
class OutputFormatMarkdownVariant(TypedDict):
    type: Literal["markdown"]

# TypedDict whose `type` key is pinned to the literal "djot"; no other keys.
class OutputFormatDjotVariant(TypedDict):
    type: Literal["djot"]

# TypedDict whose `type` key is pinned to the literal "html"; no other keys.
class OutputFormatHtmlVariant(TypedDict):
    type: Literal["html"]

# TypedDict whose `type` key is pinned to the literal "json"; no other keys.
class OutputFormatJsonVariant(TypedDict):
    type: Literal["json"]

# TypedDict whose `type` key is pinned to the literal "structured"; no other keys.
class OutputFormatStructuredVariant(TypedDict):
    type: Literal["structured"]

# TypedDict whose `type` key is pinned to the literal "custom"; the string
# payload is stored under the positional-style key `_0`.
class OutputFormatCustomVariant(TypedDict):
    type: Literal["custom"]
    _0: str

# Stub for `OutputFormat`: exposes a string `type` attribute and declares
# `__str__` / `__repr__` explicitly (hence the PYI029 suppressions).
# No constructor is declared here.
class OutputFormat:
    type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# Enum-style stub: each member is a class attribute typed as `HtmlTheme`;
# the constructor takes an `int | str` value.
class HtmlTheme:
    default: HtmlTheme = ...
    git_hub: HtmlTheme = ...
    dark: HtmlTheme = ...
    light: HtmlTheme = ...
    unstyled: HtmlTheme = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `TableModel`;
# the constructor takes an `int | str` value.
class TableModel:
    tatr: TableModel = ...
    slanet_wired: TableModel = ...
    slanet_wireless: TableModel = ...
    slanet_plus: TableModel = ...
    slanet_auto: TableModel = ...
    disabled: TableModel = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `ChunkerType`;
# the constructor takes an `int | str` value.
class ChunkerType:
    text: ChunkerType = ...
    markdown: ChunkerType = ...
    yaml: ChunkerType = ...
    semantic: ChunkerType = ...
    def __init__(self, value: int | str) -> None: ...

# TypedDict whose `type` key is pinned to the literal "characters"; no other keys.
class ChunkSizingCharactersVariant(TypedDict):
    type: Literal["characters"]

# TypedDict whose `type` key is pinned to the literal "tokenizer"; it also
# carries a `model` string and a nullable `cache_dir` string.
class ChunkSizingTokenizerVariant(TypedDict):
    type: Literal["tokenizer"]
    model: str
    cache_dir: str | None

# Stub for `ChunkSizing`: exposes a string `type` attribute and declares
# `__str__` / `__repr__` explicitly (hence the PYI029 suppressions).
# No constructor is declared here.
class ChunkSizing:
    type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# TypedDict whose `type` key is pinned to the literal "preset"; carries a
# `name` string.
class EmbeddingModelTypePresetVariant(TypedDict):
    type: Literal["preset"]
    name: str

# TypedDict whose `type` key is pinned to the literal "custom"; carries a
# `model_id` string and an integer `dimensions`.
class EmbeddingModelTypeCustomVariant(TypedDict):
    type: Literal["custom"]
    model_id: str
    dimensions: int

# TypedDict whose `type` key is pinned to the literal "llm"; carries an
# `LlmConfig` under the `llm` key.
class EmbeddingModelTypeLlmVariant(TypedDict):
    type: Literal["llm"]
    llm: LlmConfig

# TypedDict whose `type` key is pinned to the literal "plugin"; carries a
# `name` string.
class EmbeddingModelTypePluginVariant(TypedDict):
    type: Literal["plugin"]
    name: str

# Stub for `EmbeddingModelType`: exposes a string `type` attribute and declares
# `__str__` / `__repr__` explicitly (hence the PYI029 suppressions).
# No constructor is declared here.
class EmbeddingModelType:
    type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# Enum-style stub: each member is a class attribute typed as
# `CodeContentMode`; the constructor takes an `int | str` value.
class CodeContentMode:
    chunks: CodeContentMode = ...
    raw: CodeContentMode = ...
    structure: CodeContentMode = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `ListType`;
# the constructor takes an `int | str` value.
class ListType:
    bullet: ListType = ...
    numbered: ListType = ...
    lettered: ListType = ...
    indented: ListType = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `OcrBackendType`;
# the constructor takes an `int | str` value.
class OcrBackendType:
    tesseract: OcrBackendType = ...
    easy_ocr: OcrBackendType = ...
    paddle_ocr: OcrBackendType = ...
    custom: OcrBackendType = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as
# `ProcessingStage`; the constructor takes an `int | str` value.
class ProcessingStage:
    early: ProcessingStage = ...
    middle: ProcessingStage = ...
    late: ProcessingStage = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `ReductionLevel`;
# the constructor takes an `int | str` value.
class ReductionLevel:
    off: ReductionLevel = ...
    light: ReductionLevel = ...
    moderate: ReductionLevel = ...
    aggressive: ReductionLevel = ...
    maximum: ReductionLevel = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as
# `PdfAnnotationType`; the constructor takes an `int | str` value.
class PdfAnnotationType:
    text: PdfAnnotationType = ...
    highlight: PdfAnnotationType = ...
    link: PdfAnnotationType = ...
    stamp: PdfAnnotationType = ...
    underline: PdfAnnotationType = ...
    strike_out: PdfAnnotationType = ...
    other: PdfAnnotationType = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `BlockType`;
# the constructor takes an `int | str` value.
class BlockType:
    paragraph: BlockType = ...
    heading: BlockType = ...
    blockquote: BlockType = ...
    code_block: BlockType = ...
    list_item: BlockType = ...
    ordered_list: BlockType = ...
    bullet_list: BlockType = ...
    task_list: BlockType = ...
    definition_list: BlockType = ...
    definition_term: BlockType = ...
    definition_description: BlockType = ...
    div: BlockType = ...
    section: BlockType = ...
    thematic_break: BlockType = ...
    raw_block: BlockType = ...
    math_display: BlockType = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `InlineType`;
# the constructor takes an `int | str` value.
class InlineType:
    text: InlineType = ...
    strong: InlineType = ...
    emphasis: InlineType = ...
    highlight: InlineType = ...
    subscript: InlineType = ...
    superscript: InlineType = ...
    insert: InlineType = ...
    delete: InlineType = ...
    code: InlineType = ...
    link: InlineType = ...
    image: InlineType = ...
    span: InlineType = ...
    math: InlineType = ...
    raw_inline: InlineType = ...
    footnote_ref: InlineType = ...
    symbol: InlineType = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as
# `RelationshipKind`; the constructor takes an `int | str` value.
class RelationshipKind:
    footnote_reference: RelationshipKind = ...
    citation_reference: RelationshipKind = ...
    internal_link: RelationshipKind = ...
    caption: RelationshipKind = ...
    label: RelationshipKind = ...
    toc_entry: RelationshipKind = ...
    cross_reference: RelationshipKind = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `ContentLayer`;
# the constructor takes an `int | str` value.
class ContentLayer:
    body: ContentLayer = ...
    header: ContentLayer = ...
    footer: ContentLayer = ...
    footnote: ContentLayer = ...
    def __init__(self, value: int | str) -> None: ...

# TypedDict with `node_type` pinned to "title"; carries a `text` string.
class NodeContentTitleVariant(TypedDict):
    node_type: Literal["title"]
    text: str

# TypedDict with `node_type` pinned to "heading"; carries an integer `level`
# and a `text` string.
class NodeContentHeadingVariant(TypedDict):
    node_type: Literal["heading"]
    level: int
    text: str

# TypedDict with `node_type` pinned to "paragraph"; carries a `text` string.
class NodeContentParagraphVariant(TypedDict):
    node_type: Literal["paragraph"]
    text: str

# TypedDict with `node_type` pinned to "list"; carries a boolean `ordered`.
class NodeContentListVariant(TypedDict):
    node_type: Literal["list"]
    ordered: bool

# TypedDict with `node_type` pinned to "list_item"; carries a `text` string.
class NodeContentListItemVariant(TypedDict):
    node_type: Literal["list_item"]
    text: str

# TypedDict with `node_type` pinned to "table"; carries a `TableGrid` under `grid`.
class NodeContentTableVariant(TypedDict):
    node_type: Literal["table"]
    grid: TableGrid

# TypedDict with `node_type` pinned to "image"; all three payload keys
# (`description`, `image_index`, `src`) are nullable.
class NodeContentImageVariant(TypedDict):
    node_type: Literal["image"]
    description: str | None
    image_index: int | None
    src: str | None

# TypedDict with `node_type` pinned to "code"; carries a `text` string and a
# nullable `language` string.
class NodeContentCodeVariant(TypedDict):
    node_type: Literal["code"]
    text: str
    language: str | None

# TypedDict with `node_type` pinned to "quote"; no other keys.
class NodeContentQuoteVariant(TypedDict):
    node_type: Literal["quote"]

# TypedDict with `node_type` pinned to "formula"; carries a `text` string.
class NodeContentFormulaVariant(TypedDict):
    node_type: Literal["formula"]
    text: str

# TypedDict with `node_type` pinned to "footnote"; carries a `text` string.
class NodeContentFootnoteVariant(TypedDict):
    node_type: Literal["footnote"]
    text: str

# TypedDict with `node_type` pinned to "group"; all three payload keys
# (`label`, `heading_level`, `heading_text`) are nullable.
class NodeContentGroupVariant(TypedDict):
    node_type: Literal["group"]
    label: str | None
    heading_level: int | None
    heading_text: str | None

# TypedDict with `node_type` pinned to "page_break"; no other keys.
class NodeContentPageBreakVariant(TypedDict):
    node_type: Literal["page_break"]

# TypedDict with `node_type` pinned to "slide"; carries an integer `number`
# and a nullable `title` string.
class NodeContentSlideVariant(TypedDict):
    node_type: Literal["slide"]
    number: int
    title: str | None

# TypedDict with `node_type` pinned to "definition_list"; no other keys.
class NodeContentDefinitionListVariant(TypedDict):
    node_type: Literal["definition_list"]

# TypedDict with `node_type` pinned to "definition_item"; carries `term` and
# `definition` strings.
class NodeContentDefinitionItemVariant(TypedDict):
    node_type: Literal["definition_item"]
    term: str
    definition: str

# TypedDict with `node_type` pinned to "citation"; carries `key` and `text`
# strings.
class NodeContentCitationVariant(TypedDict):
    node_type: Literal["citation"]
    key: str
    text: str

# TypedDict with `node_type` pinned to "admonition"; carries a `kind` string
# and a nullable `title` string.
class NodeContentAdmonitionVariant(TypedDict):
    node_type: Literal["admonition"]
    kind: str
    title: str | None

# TypedDict with `node_type` pinned to "raw_block"; carries `format` and
# `content` strings.
class NodeContentRawBlockVariant(TypedDict):
    node_type: Literal["raw_block"]
    format: str
    content: str

# TypedDict with `node_type` pinned to "metadata_block"; `entries` is a list
# of string lists (the length of each inner list is not constrained by the type).
class NodeContentMetadataBlockVariant(TypedDict):
    node_type: Literal["metadata_block"]
    entries: list[list[str]]

# Stub for `NodeContent`: exposes a string `node_type` attribute and declares
# `__str__` / `__repr__` explicitly (hence the PYI029 suppressions).
# No constructor is declared here.
class NodeContent:
    node_type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# TypedDict with `annotation_type` pinned to "bold"; no other keys.
class AnnotationKindBoldVariant(TypedDict):
    annotation_type: Literal["bold"]

# TypedDict with `annotation_type` pinned to "italic"; no other keys.
class AnnotationKindItalicVariant(TypedDict):
    annotation_type: Literal["italic"]

# TypedDict with `annotation_type` pinned to "underline"; no other keys.
class AnnotationKindUnderlineVariant(TypedDict):
    annotation_type: Literal["underline"]

# TypedDict with `annotation_type` pinned to "strikethrough"; no other keys.
class AnnotationKindStrikethroughVariant(TypedDict):
    annotation_type: Literal["strikethrough"]

# TypedDict with `annotation_type` pinned to "code"; no other keys.
class AnnotationKindCodeVariant(TypedDict):
    annotation_type: Literal["code"]

# TypedDict with `annotation_type` pinned to "subscript"; no other keys.
class AnnotationKindSubscriptVariant(TypedDict):
    annotation_type: Literal["subscript"]

# TypedDict with `annotation_type` pinned to "superscript"; no other keys.
class AnnotationKindSuperscriptVariant(TypedDict):
    annotation_type: Literal["superscript"]

# TypedDict with `annotation_type` pinned to "link"; carries a `url` string
# and a nullable `title` string.
class AnnotationKindLinkVariant(TypedDict):
    annotation_type: Literal["link"]
    url: str
    title: str | None

# TypedDict with `annotation_type` pinned to "highlight"; no other keys.
class AnnotationKindHighlightVariant(TypedDict):
    annotation_type: Literal["highlight"]

# TypedDict with `annotation_type` pinned to "color"; carries a `value`
# string (its format is not specified by the stub).
class AnnotationKindColorVariant(TypedDict):
    annotation_type: Literal["color"]
    value: str

# TypedDict with `annotation_type` pinned to "font_size"; the size is carried
# as a `value` string rather than a number.
class AnnotationKindFontSizeVariant(TypedDict):
    annotation_type: Literal["font_size"]
    value: str

# TypedDict with `annotation_type` pinned to "custom"; carries a `name`
# string and a nullable `value` string.
class AnnotationKindCustomVariant(TypedDict):
    annotation_type: Literal["custom"]
    name: str
    value: str | None

# Stub for `AnnotationKind`: exposes a string `annotation_type` attribute and
# declares `__str__` / `__repr__` explicitly (hence the PYI029 suppressions).
# No constructor is declared here.
class AnnotationKind:
    annotation_type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# Enum-style stub: each member is a class attribute typed as
# `ExtractionMethod`; the constructor takes an `int | str` value.
class ExtractionMethod:
    native: ExtractionMethod = ...
    ocr: ExtractionMethod = ...
    mixed: ExtractionMethod = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `ChunkType`;
# the constructor takes an `int | str` value.
class ChunkType:
    heading: ChunkType = ...
    party_list: ChunkType = ...
    definitions: ChunkType = ...
    operative_clause: ChunkType = ...
    signature_block: ChunkType = ...
    schedule: ChunkType = ...
    table_like: ChunkType = ...
    formula: ChunkType = ...
    code_block: ChunkType = ...
    image: ChunkType = ...
    org_chart: ChunkType = ...
    diagram: ChunkType = ...
    unknown: ChunkType = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `ImageKind`;
# the constructor takes an `int | str` value.
class ImageKind:
    photograph: ImageKind = ...
    diagram: ImageKind = ...
    chart: ImageKind = ...
    drawing: ImageKind = ...
    text_block: ImageKind = ...
    decoration: ImageKind = ...
    logo: ImageKind = ...
    icon: ImageKind = ...
    tile_fragment: ImageKind = ...
    mask: ImageKind = ...
    page_raster: ImageKind = ...
    unknown: ImageKind = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `ResultFormat`;
# the constructor takes an `int | str` value.
class ResultFormat:
    unified: ResultFormat = ...
    element_based: ResultFormat = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `ElementType`;
# the constructor takes an `int | str` value.
# `title_` is the only member spelled with a trailing underscore; the reason
# is not visible in this stub.
class ElementType:
    title_: ElementType = ...
    narrative_text: ElementType = ...
    heading: ElementType = ...
    list_item: ElementType = ...
    table: ElementType = ...
    image: ElementType = ...
    page_break: ElementType = ...
    code_block: ElementType = ...
    block_quote: ElementType = ...
    footer: ElementType = ...
    header: ElementType = ...
    def __init__(self, value: int | str) -> None: ...

# TypedDict with `format_type` pinned to "pdf"; payload under `_0` is a `PdfMetadata`.
class FormatMetadataPdfVariant(TypedDict):
    format_type: Literal["pdf"]
    _0: PdfMetadata

# TypedDict with `format_type` pinned to "docx"; payload under `_0` is a `DocxMetadata`.
class FormatMetadataDocxVariant(TypedDict):
    format_type: Literal["docx"]
    _0: DocxMetadata

# TypedDict with `format_type` pinned to "excel"; payload under `_0` is an `ExcelMetadata`.
class FormatMetadataExcelVariant(TypedDict):
    format_type: Literal["excel"]
    _0: ExcelMetadata

# TypedDict with `format_type` pinned to "email"; payload under `_0` is an `EmailMetadata`.
class FormatMetadataEmailVariant(TypedDict):
    format_type: Literal["email"]
    _0: EmailMetadata

# TypedDict with `format_type` pinned to "pptx"; payload under `_0` is a `PptxMetadata`.
class FormatMetadataPptxVariant(TypedDict):
    format_type: Literal["pptx"]
    _0: PptxMetadata

# TypedDict with `format_type` pinned to "archive"; payload under `_0` is an `ArchiveMetadata`.
class FormatMetadataArchiveVariant(TypedDict):
    format_type: Literal["archive"]
    _0: ArchiveMetadata

# TypedDict with `format_type` pinned to "image"; payload under `_0` is an `ImageMetadata`.
class FormatMetadataImageVariant(TypedDict):
    format_type: Literal["image"]
    _0: ImageMetadata

# TypedDict with `format_type` pinned to "xml"; payload under `_0` is an `XmlMetadata`.
class FormatMetadataXmlVariant(TypedDict):
    format_type: Literal["xml"]
    _0: XmlMetadata

# TypedDict with `format_type` pinned to "text"; payload under `_0` is a `TextMetadata`.
class FormatMetadataTextVariant(TypedDict):
    format_type: Literal["text"]
    _0: TextMetadata

# TypedDict with `format_type` pinned to "html"; payload under `_0` is an `HtmlMetadata`.
class FormatMetadataHtmlVariant(TypedDict):
    format_type: Literal["html"]
    _0: HtmlMetadata

# TypedDict with `format_type` pinned to "ocr"; payload under `_0` is an `OcrMetadata`.
class FormatMetadataOcrVariant(TypedDict):
    format_type: Literal["ocr"]
    _0: OcrMetadata

# TypedDict with `format_type` pinned to "csv"; payload under `_0` is a `CsvMetadata`.
class FormatMetadataCsvVariant(TypedDict):
    format_type: Literal["csv"]
    _0: CsvMetadata

# TypedDict with `format_type` pinned to "bibtex"; payload under `_0` is a `BibtexMetadata`.
class FormatMetadataBibtexVariant(TypedDict):
    format_type: Literal["bibtex"]
    _0: BibtexMetadata

# TypedDict with `format_type` pinned to "citation"; payload under `_0` is a `CitationMetadata`.
class FormatMetadataCitationVariant(TypedDict):
    format_type: Literal["citation"]
    _0: CitationMetadata

# TypedDict with `format_type` pinned to "fiction_book"; payload under `_0` is a `FictionBookMetadata`.
class FormatMetadataFictionBookVariant(TypedDict):
    format_type: Literal["fiction_book"]
    _0: FictionBookMetadata

# TypedDict with `format_type` pinned to "dbf"; payload under `_0` is a `DbfMetadata`.
class FormatMetadataDbfVariant(TypedDict):
    format_type: Literal["dbf"]
    _0: DbfMetadata

# TypedDict with `format_type` pinned to "jats"; payload under `_0` is a `JatsMetadata`.
class FormatMetadataJatsVariant(TypedDict):
    format_type: Literal["jats"]
    _0: JatsMetadata

# TypedDict with `format_type` pinned to "epub"; payload under `_0` is an `EpubMetadata`.
class FormatMetadataEpubVariant(TypedDict):
    format_type: Literal["epub"]
    _0: EpubMetadata

# TypedDict with `format_type` pinned to "pst"; payload under `_0` is a `PstMetadata`.
class FormatMetadataPstVariant(TypedDict):
    format_type: Literal["pst"]
    _0: PstMetadata

# TypedDict with `format_type` pinned to "code". Unlike the sibling
# FormatMetadata variants, the `_0` payload here is a plain string rather than
# a metadata class.
class FormatMetadataCodeVariant(TypedDict):
    format_type: Literal["code"]
    _0: str

# Stub for `FormatMetadata`: exposes a string `format_type` attribute and
# declares `__str__` / `__repr__` explicitly (hence the PYI029 suppressions).
# No constructor is declared here.
class FormatMetadata:
    format_type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# Enum-style stub: each member is a class attribute typed as `TextDirection`;
# the constructor takes an `int | str` value.
class TextDirection:
    left_to_right: TextDirection = ...
    right_to_left: TextDirection = ...
    auto: TextDirection = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `LinkType`;
# the constructor takes an `int | str` value.
class LinkType:
    anchor: LinkType = ...
    internal: LinkType = ...
    external: LinkType = ...
    email: LinkType = ...
    phone: LinkType = ...
    other: LinkType = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `ImageType`;
# the constructor takes an `int | str` value.
class ImageType:
    data_uri: ImageType = ...
    inline_svg: ImageType = ...
    external: ImageType = ...
    relative: ImageType = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as
# `StructuredDataType`; the constructor takes an `int | str` value.
class StructuredDataType:
    json_ld: StructuredDataType = ...
    microdata: StructuredDataType = ...
    rd_fa: StructuredDataType = ...
    def __init__(self, value: int | str) -> None: ...

# TypedDict with `type` pinned to "rectangle"; carries four integer keys:
# `left`, `top`, `width`, `height`.
class OcrBoundingGeometryRectangleVariant(TypedDict):
    type: Literal["rectangle"]
    left: int
    top: int
    width: int
    height: int

# TypedDict with `type` pinned to "quadrilateral".
# NOTE(review): `points` is typed as a plain `str` here, whereas the rectangle
# variant uses integer coordinates — confirm the actual runtime shape of
# `points` and how the string is encoded.
class OcrBoundingGeometryQuadrilateralVariant(TypedDict):
    type: Literal["quadrilateral"]
    points: str

# Stub for `OcrBoundingGeometry`: exposes a string `type` attribute and
# declares `__str__` / `__repr__` explicitly (hence the PYI029 suppressions).
# No constructor is declared here.
class OcrBoundingGeometry:
    type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# Enum-style stub: each member is a class attribute typed as
# `OcrElementLevel`; the constructor takes an `int | str` value.
class OcrElementLevel:
    word: OcrElementLevel = ...
    line: OcrElementLevel = ...
    block: OcrElementLevel = ...
    page: OcrElementLevel = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `PageUnitType`;
# the constructor takes an `int | str` value.
class PageUnitType:
    page: PageUnitType = ...
    slide: PageUnitType = ...
    sheet: PageUnitType = ...
    def __init__(self, value: int | str) -> None: ...

# TypedDict with `kind` pinned to "context"; the string payload is under `_0`.
class DiffLineContextVariant(TypedDict):
    kind: Literal["context"]
    _0: str

# TypedDict with `kind` pinned to "added"; the string payload is under `_0`.
class DiffLineAddedVariant(TypedDict):
    kind: Literal["added"]
    _0: str

# TypedDict with `kind` pinned to "removed"; the string payload is under `_0`.
class DiffLineRemovedVariant(TypedDict):
    kind: Literal["removed"]
    _0: str

# Stub for `DiffLine`: exposes a string `kind` attribute and declares
# `__str__` / `__repr__` explicitly (hence the PYI029 suppressions).
# No constructor is declared here.
class DiffLine:
    kind: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# Enum-style stub: each member is a class attribute typed as `RevisionKind`;
# the constructor takes an `int | str` value.
class RevisionKind:
    insertion: RevisionKind = ...
    deletion: RevisionKind = ...
    format_change: RevisionKind = ...
    comment: RevisionKind = ...
    def __init__(self, value: int | str) -> None: ...

# TypedDict with `type` pinned to "paragraph"; carries an integer `index`
# (whether it is 0- or 1-based is not stated in the stub).
class RevisionAnchorParagraphVariant(TypedDict):
    type: Literal["paragraph"]
    index: int

# TypedDict with `type` pinned to "table_cell"; carries integer `row`, `col`
# and `table_index` keys.
class RevisionAnchorTableCellVariant(TypedDict):
    type: Literal["table_cell"]
    row: int
    col: int
    table_index: int

# TypedDict with `type` pinned to "page"; carries an integer `index`.
class RevisionAnchorPageVariant(TypedDict):
    type: Literal["page"]
    index: int

# TypedDict with `type` pinned to "slide"; carries an integer `index`.
class RevisionAnchorSlideVariant(TypedDict):
    type: Literal["slide"]
    index: int

# TypedDict with `type` pinned to "sheet"; carries an integer `index` and a
# nullable `name` string.
class RevisionAnchorSheetVariant(TypedDict):
    type: Literal["sheet"]
    index: int
    name: str | None

# Stub for `RevisionAnchor`: exposes a string `type` attribute and declares
# `__str__` / `__repr__` explicitly (hence the PYI029 suppressions).
# No constructor is declared here.
class RevisionAnchor:
    type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# Enum-style stub: each member is a class attribute typed as `UriKind`;
# the constructor takes an `int | str` value.
class UriKind:
    hyperlink: UriKind = ...
    image: UriKind = ...
    anchor: UriKind = ...
    citation: UriKind = ...
    reference: UriKind = ...
    email: UriKind = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as
# `KeywordAlgorithm`; the constructor takes an `int | str` value.
class KeywordAlgorithm:
    yake: KeywordAlgorithm = ...
    rake: KeywordAlgorithm = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `PSMMode`;
# the constructor takes an `int | str` value.
class PSMMode:
    osd_only: PSMMode = ...
    auto_osd: PSMMode = ...
    auto_only: PSMMode = ...
    auto: PSMMode = ...
    single_column: PSMMode = ...
    single_block_vertical: PSMMode = ...
    single_block: PSMMode = ...
    single_line: PSMMode = ...
    single_word: PSMMode = ...
    circle_word: PSMMode = ...
    single_char: PSMMode = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `PaddleLanguage`;
# the constructor takes an `int | str` value.
class PaddleLanguage:
    english: PaddleLanguage = ...
    chinese: PaddleLanguage = ...
    japanese: PaddleLanguage = ...
    korean: PaddleLanguage = ...
    german: PaddleLanguage = ...
    french: PaddleLanguage = ...
    latin: PaddleLanguage = ...
    cyrillic: PaddleLanguage = ...
    traditional_chinese: PaddleLanguage = ...
    thai: PaddleLanguage = ...
    greek: PaddleLanguage = ...
    east_slavic: PaddleLanguage = ...
    arabic: PaddleLanguage = ...
    devanagari: PaddleLanguage = ...
    tamil: PaddleLanguage = ...
    telugu: PaddleLanguage = ...
    def __init__(self, value: int | str) -> None: ...

# Enum-style stub: each member is a class attribute typed as `LayoutClass`;
# the constructor takes an `int | str` value.
# `title_` is the only member spelled with a trailing underscore; the reason
# is not visible in this stub.
class LayoutClass:
    caption: LayoutClass = ...
    footnote: LayoutClass = ...
    formula: LayoutClass = ...
    list_item: LayoutClass = ...
    page_footer: LayoutClass = ...
    page_header: LayoutClass = ...
    picture: LayoutClass = ...
    section_header: LayoutClass = ...
    table: LayoutClass = ...
    text: LayoutClass = ...
    title_: LayoutClass = ...
    document_index: LayoutClass = ...
    code: LayoutClass = ...
    checkbox_selected: LayoutClass = ...
    checkbox_unselected: LayoutClass = ...
    form: LayoutClass = ...
    key_value_region: LayoutClass = ...
    def __init__(self, value: int | str) -> None: ...

# Coroutine stub: takes raw `content` bytes, a `mime_type` string and an
# `ExtractionConfig` (all required) and resolves to an `ExtractionResult`.
async def extract_bytes(
    content: bytes,
    mime_type: str,
    config: ExtractionConfig,
) -> ExtractionResult: ...

# Coroutine stub: takes a string `path` and a required `ExtractionConfig`;
# `mime_type` is optional (default `None`). Resolves to an `ExtractionResult`.
async def extract_file(
    path: str,
    config: ExtractionConfig,
    mime_type: str | None = None,
) -> ExtractionResult: ...

# Synchronous stub with the same parameters as `extract_file`: string `path`,
# required `ExtractionConfig`, optional `mime_type`. Returns an `ExtractionResult`.
def extract_file_sync(
    path: str,
    config: ExtractionConfig,
    mime_type: str | None = None,
) -> ExtractionResult: ...

# Synchronous stub with the same parameters as `extract_bytes`: raw `content`
# bytes, `mime_type` string and `ExtractionConfig`. Returns an `ExtractionResult`.
def extract_bytes_sync(
    content: bytes,
    mime_type: str,
    config: ExtractionConfig,
) -> ExtractionResult: ...

# Synchronous stub: takes a list of `BatchFileItem` and one `ExtractionConfig`
# and returns a list of `ExtractionResult`. Result ordering relative to `items`
# is not stated in the stub.
def batch_extract_files_sync(
    items: list[BatchFileItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...

# Synchronous stub: takes a list of `BatchBytesItem` and one `ExtractionConfig`
# and returns a list of `ExtractionResult`. Result ordering relative to `items`
# is not stated in the stub.
def batch_extract_bytes_sync(
    items: list[BatchBytesItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...

# Coroutine stub: takes a list of `BatchFileItem` and one `ExtractionConfig`
# and resolves to a list of `ExtractionResult`.
async def batch_extract_files(
    items: list[BatchFileItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...

# Coroutine stub: takes a list of `BatchBytesItem` and one `ExtractionConfig`
# and resolves to a list of `ExtractionResult`.
async def batch_extract_bytes(
    items: list[BatchBytesItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...

# Takes raw bytes and returns a MIME type as a string.
def detect_mime_type_from_bytes(content: bytes) -> str: ...
# Takes a MIME type string and returns a list of strings.
def get_extensions_for_mime(mime_type: str) -> list[str]: ...
# Takes no arguments and returns nothing.
def clear_embedding_backends() -> None: ...
# Takes no arguments and returns a list of strings.
def list_embedding_backends() -> list[str]: ...
# Takes no arguments and returns a list of strings.
def list_document_extractors() -> list[str]: ...
# Takes no arguments and returns nothing.
def clear_document_extractors() -> None: ...
# Takes no arguments and returns a list of strings.
def list_ocr_backends() -> list[str]: ...
# Takes no arguments and returns nothing.
def clear_ocr_backends() -> None: ...
# Takes no arguments and returns a list of strings.
def list_post_processors() -> list[str]: ...
# Takes no arguments and returns nothing.
def clear_post_processors() -> None: ...
# Takes no arguments and returns a list of strings.
def list_renderers() -> list[str]: ...
# Takes no arguments and returns nothing.
def clear_renderers() -> None: ...
# Takes no arguments and returns a list of strings.
def list_validators() -> list[str]: ...
# Takes no arguments and returns nothing.
def clear_validators() -> None: ...
# Takes two `ExtractionResult` values plus required `DiffOptions` and returns
# an `ExtractionDiff`.
def compare(a: ExtractionResult, b: ExtractionResult, opts: DiffOptions) -> ExtractionDiff: ...
# Coroutine stub: takes a list of strings and an `EmbeddingConfig`; resolves to
# a list of float lists.
async def embed_texts_async(texts: list[str], config: EmbeddingConfig) -> list[list[float]]: ...
# Takes PDF bytes and an integer `page_index` (required) plus optional `dpi`
# and `password` (both default `None`); returns bytes.
# Whether `page_index` is 0- or 1-based is not stated in the stub.
def render_pdf_page_to_png(
    pdf_bytes: bytes,
    page_index: int,
    dpi: int | None = None,
    password: str | None = None,
) -> bytes: ...

# Takes a string `path` and a required boolean `check_exists`; returns a MIME
# type string. The effect of `check_exists` is not visible in the stub.
def detect_mime_type(path: str, check_exists: bool) -> str: ...
# Synchronous stub: takes a list of strings and an `EmbeddingConfig`; returns a
# list of float lists.
def embed_texts(texts: list[str], config: EmbeddingConfig) -> list[list[float]]: ...
# Takes a preset name and returns an `EmbeddingPreset`, or `None`.
def get_embedding_preset(name: str) -> EmbeddingPreset | None: ...
# Takes no arguments and returns a list of strings.
def list_embedding_presets() -> list[str]: ...
# Plugin registry stubs, one register/unregister pair per plugin kind.
# `register_*` takes an arbitrary object (typed `object`, so no structural
# requirements are expressed here); `unregister_*` takes a `name` string.
# Both return nothing.
#
# The matching `clear_*` functions are declared once, earlier in this stub,
# next to their `list_*` counterparts. They are not repeated here: a second
# identical declaration is reported as a redefinition by mypy (`no-redef`) and
# ruff (`F811`).
def register_ocr_backend(backend: object) -> None: ...
def unregister_ocr_backend(name: str) -> None: ...
def register_post_processor(backend: object) -> None: ...
def unregister_post_processor(name: str) -> None: ...
def register_validator(backend: object) -> None: ...
def unregister_validator(name: str) -> None: ...
def register_embedding_backend(backend: object) -> None: ...
def unregister_embedding_backend(name: str) -> None: ...
def register_document_extractor(backend: object) -> None: ...
def unregister_document_extractor(name: str) -> None: ...
def register_renderer(backend: object) -> None: ...
def unregister_renderer(name: str) -> None: ...