# This file is auto-generated by alef — DO NOT EDIT. # alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75 # To regenerate: alef generate # To verify freshness: alef verify --exit-code # Issues & docs: https://github.com/kreuzberg-dev/alef from typing import Any, Literal, TypedDict class CacheStats: total_files: int total_size_mb: float available_space_mb: float oldest_file_age_days: float newest_file_age_days: float def __init__( self, total_files: int, total_size_mb: float, available_space_mb: float, oldest_file_age_days: float, newest_file_age_days: float, ) -> None: ... class AccelerationConfig: provider: ExecutionProviderType device_id: int def __init__( self, provider: ExecutionProviderType | str | None = None, device_id: int | None = None, ) -> None: ... class ContentFilterConfig: include_headers: bool include_footers: bool strip_repeating_text: bool include_watermarks: bool def __init__( self, include_headers: bool | None = None, include_footers: bool | None = None, strip_repeating_text: bool | None = None, include_watermarks: bool | None = None, ) -> None: ... @staticmethod def default() -> ContentFilterConfig: ... class EmailConfig: msg_fallback_codepage: int | None def __init__(self, msg_fallback_codepage: int | None = None) -> None: ... 
# Declarations only: every method body below is `...`.
# NOTE(review): where an attribute is declared as plain `X` but its constructor
# parameter is `X | None = None`, passing None presumably selects a built-in
# default. The default values are not stated in this file — confirm upstream.

# ExtractionConfig: the top-level options object; every constructor argument
# is optional.
# NOTE(review): pdf_options, keywords, html_options, html_output, layout and
# tree_sitter are declared as attributes but have no matching __init__
# parameter here — confirm with the generator whether that is intended.
# `result_format` accepts a str in the constructor; `output_format` does not.
class ExtractionConfig:
    use_cache: bool
    enable_quality_processing: bool
    ocr: OcrConfig | None
    force_ocr: bool
    force_ocr_pages: list[int] | None
    disable_ocr: bool
    chunking: ChunkingConfig | None
    content_filter: ContentFilterConfig | None
    images: ImageExtractionConfig | None
    pdf_options: PdfConfig | None
    token_reduction: TokenReductionOptions | None
    language_detection: LanguageDetectionConfig | None
    pages: PageConfig | None
    keywords: KeywordConfig | None
    postprocessor: PostProcessorConfig | None
    html_options: str | None
    html_output: HtmlOutputConfig | None
    extraction_timeout_secs: int | None
    max_concurrent_extractions: int | None
    result_format: ResultFormat
    security_limits: SecurityLimits | None
    max_embedded_file_bytes: int | None
    output_format: OutputFormat
    layout: LayoutDetectionConfig | None
    use_layout_for_markdown: bool
    include_document_structure: bool
    acceleration: AccelerationConfig | None
    cache_namespace: str | None
    cache_ttl_secs: int | None
    email: EmailConfig | None
    concurrency: str | None
    max_archive_depth: int
    tree_sitter: TreeSitterConfig | None
    structured_extraction: StructuredExtractionConfig | None
    cancel_token: str | None

    def __init__(
        self,
        use_cache: bool | None = None,
        enable_quality_processing: bool | None = None,
        ocr: OcrConfig | None = None,
        force_ocr: bool | None = None,
        force_ocr_pages: list[int] | None = None,
        disable_ocr: bool | None = None,
        chunking: ChunkingConfig | None = None,
        content_filter: ContentFilterConfig | None = None,
        images: ImageExtractionConfig | None = None,
        token_reduction: TokenReductionOptions | None = None,
        language_detection: LanguageDetectionConfig | None = None,
        pages: PageConfig | None = None,
        postprocessor: PostProcessorConfig | None = None,
        extraction_timeout_secs: int | None = None,
        max_concurrent_extractions: int | None = None,
        result_format: ResultFormat | str | None = None,
        security_limits: SecurityLimits | None = None,
        max_embedded_file_bytes: int | None = None,
        output_format: OutputFormat | None = None,
        use_layout_for_markdown: bool | None = None,
        include_document_structure: bool | None = None,
        acceleration: AccelerationConfig | None = None,
        cache_namespace: str | None = None,
        cache_ttl_secs: int | None = None,
        email: EmailConfig | None = None,
        concurrency: str | None = None,
        max_archive_depth: int | None = None,
        structured_extraction: StructuredExtractionConfig | None = None,
        cancel_token: str | None = None,
    ) -> None: ...
    def needs_image_processing(self) -> bool: ...
    @staticmethod
    def default() -> ExtractionConfig: ...

# FileExtractionConfig: every attribute is optional (`| None`). It is the type
# of the `config` field on BatchBytesItem and BatchFileItem, so it is
# presumably a per-item override of ExtractionConfig — confirm.
# NOTE(review): pdf_options, keywords, html_options, layout and tree_sitter
# are declared as attributes but have no matching __init__ parameter here.
class FileExtractionConfig:
    enable_quality_processing: bool | None
    ocr: OcrConfig | None
    force_ocr: bool | None
    force_ocr_pages: list[int] | None
    disable_ocr: bool | None
    chunking: ChunkingConfig | None
    content_filter: ContentFilterConfig | None
    images: ImageExtractionConfig | None
    pdf_options: PdfConfig | None
    token_reduction: TokenReductionOptions | None
    language_detection: LanguageDetectionConfig | None
    pages: PageConfig | None
    keywords: KeywordConfig | None
    postprocessor: PostProcessorConfig | None
    html_options: str | None
    result_format: ResultFormat | None
    output_format: OutputFormat | None
    include_document_structure: bool | None
    layout: LayoutDetectionConfig | None
    timeout_secs: int | None
    tree_sitter: TreeSitterConfig | None
    structured_extraction: StructuredExtractionConfig | None

    def __init__(
        self,
        enable_quality_processing: bool | None = None,
        ocr: OcrConfig | None = None,
        force_ocr: bool | None = None,
        force_ocr_pages: list[int] | None = None,
        disable_ocr: bool | None = None,
        chunking: ChunkingConfig | None = None,
        content_filter: ContentFilterConfig | None = None,
        images: ImageExtractionConfig | None = None,
        token_reduction: TokenReductionOptions | None = None,
        language_detection: LanguageDetectionConfig | None = None,
        pages: PageConfig | None = None,
        postprocessor: PostProcessorConfig | None = None,
        result_format: ResultFormat | str | None = None,
        output_format: OutputFormat | None = None,
        include_document_structure: bool | None = None,
        timeout_secs: int | None = None,
        structured_extraction: StructuredExtractionConfig | None = None,
    ) -> None: ...

# BatchBytesItem: raw bytes plus a MIME type (both required) and an optional
# FileExtractionConfig.
class BatchBytesItem:
    content: bytes
    mime_type: str
    config: FileExtractionConfig | None

    def __init__(
        self,
        content: bytes,
        mime_type: str,
        config: FileExtractionConfig | None = None,
    ) -> None: ...

# BatchFileItem: a path string (required) and an optional FileExtractionConfig.
class BatchFileItem:
    path: str
    config: FileExtractionConfig | None

    def __init__(self, path: str, config: FileExtractionConfig | None = None) -> None: ...

# ImageExtractionConfig: thirteen settings, all optional in the constructor.
# Only max_images_per_page can be stored as None.
class ImageExtractionConfig:
    extract_images: bool
    target_dpi: int
    max_image_dimension: int
    inject_placeholders: bool
    auto_adjust_dpi: bool
    min_dpi: int
    max_dpi: int
    max_images_per_page: int | None
    classify: bool
    include_page_rasters: bool
    run_ocr_on_images: bool
    ocr_text_only: bool
    append_ocr_text: bool

    def __init__(
        self,
        extract_images: bool | None = None,
        target_dpi: int | None = None,
        max_image_dimension: int | None = None,
        inject_placeholders: bool | None = None,
        auto_adjust_dpi: bool | None = None,
        min_dpi: int | None = None,
        max_dpi: int | None = None,
        max_images_per_page: int | None = None,
        classify: bool | None = None,
        include_page_rasters: bool | None = None,
        run_ocr_on_images: bool | None = None,
        ocr_text_only: bool | None = None,
        append_ocr_text: bool | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> ImageExtractionConfig: ...

# TokenReductionOptions: `mode` is a free-form str here; the accepted values
# are not listed in this file.
class TokenReductionOptions:
    mode: str
    preserve_important_words: bool

    def __init__(
        self,
        mode: str | None = None,
        preserve_important_words: bool | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> TokenReductionOptions: ...

# LanguageDetectionConfig: three settings, all optional in the constructor.
class LanguageDetectionConfig:
    enabled: bool
    min_confidence: float
    detect_multiple: bool

    def __init__(
        self,
        enabled: bool | None = None,
        min_confidence: float | None = None,
        detect_multiple: bool | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> LanguageDetectionConfig: ...
# Declarations only: every method body below is `...`.
# NOTE(review): where an attribute is declared as plain `X` but its constructor
# parameter is `X | None = None`, passing None presumably selects a built-in
# default. The default values are not stated in this file — confirm upstream.

# HtmlOutputConfig: all constructor arguments are optional; `theme` may be
# given as an HtmlTheme or a str.
class HtmlOutputConfig:
    css: str | None
    css_file: str | None
    theme: HtmlTheme
    class_prefix: str
    embed_css: bool

    def __init__(
        self,
        css: str | None = None,
        css_file: str | None = None,
        theme: HtmlTheme | str | None = None,
        class_prefix: str | None = None,
        embed_css: bool | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> HtmlOutputConfig: ...

# LayoutDetectionConfig: all constructor arguments are optional; `table_model`
# may be given as a TableModel or a str.
class LayoutDetectionConfig:
    confidence_threshold: float | None
    apply_heuristics: bool
    table_model: TableModel
    acceleration: AccelerationConfig | None

    def __init__(
        self,
        confidence_threshold: float | None = None,
        apply_heuristics: bool | None = None,
        table_model: TableModel | str | None = None,
        acceleration: AccelerationConfig | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> LayoutDetectionConfig: ...

# LlmConfig: `model` is stored as a non-optional str, yet the constructor lets
# it be omitted. No `default()` factory is declared for this class.
class LlmConfig:
    model: str
    api_key: str | None
    base_url: str | None
    timeout_secs: int | None
    max_retries: int | None
    temperature: float | None
    max_tokens: int | None

    def __init__(
        self,
        model: str | None = None,
        api_key: str | None = None,
        base_url: str | None = None,
        timeout_secs: int | None = None,
        max_retries: int | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
    ) -> None: ...

# StructuredExtractionConfig: schema, schema_name, strict and llm are required;
# schema_description and prompt are optional. The constructor lists required
# parameters first, so its order differs from the attribute order.
class StructuredExtractionConfig:
    schema: dict[str, Any]
    schema_name: str
    schema_description: str | None
    strict: bool
    prompt: str | None
    llm: LlmConfig

    def __init__(
        self,
        schema: dict[str, Any],
        schema_name: str,
        strict: bool,
        llm: LlmConfig,
        schema_description: str | None = None,
        prompt: str | None = None,
    ) -> None: ...
# Declarations only: every method body below is `...`.
# NOTE(review): where an attribute is declared as plain `X` but its constructor
# parameter is `X | None = None`, passing None presumably selects a built-in
# default. The default values are not stated in this file — confirm upstream.

# OcrQualityThresholds: sixteen numeric thresholds (int counts and float
# ratios), all optional in the constructor. What each threshold gates is not
# described in this file.
class OcrQualityThresholds:
    min_total_non_whitespace: int
    min_non_whitespace_per_page: float
    min_meaningful_word_len: int
    min_meaningful_words: int
    min_alnum_ratio: float
    min_garbage_chars: int
    max_fragmented_word_ratio: float
    critical_fragmented_word_ratio: float
    min_avg_word_length: float
    min_words_for_avg_length_check: int
    min_consecutive_repeat_ratio: float
    min_words_for_repeat_check: int
    substantive_min_chars: int
    non_text_min_chars: int
    alnum_ws_ratio_threshold: float
    pipeline_min_quality: float

    def __init__(
        self,
        min_total_non_whitespace: int | None = None,
        min_non_whitespace_per_page: float | None = None,
        min_meaningful_word_len: int | None = None,
        min_meaningful_words: int | None = None,
        min_alnum_ratio: float | None = None,
        min_garbage_chars: int | None = None,
        max_fragmented_word_ratio: float | None = None,
        critical_fragmented_word_ratio: float | None = None,
        min_avg_word_length: float | None = None,
        min_words_for_avg_length_check: int | None = None,
        min_consecutive_repeat_ratio: float | None = None,
        min_words_for_repeat_check: int | None = None,
        substantive_min_chars: int | None = None,
        non_text_min_chars: int | None = None,
        alnum_ws_ratio_threshold: float | None = None,
        pipeline_min_quality: float | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> OcrQualityThresholds: ...

# OcrPipelineStage: `backend` and `priority` are required; the remaining
# per-backend settings are optional. How `priority` orders stages is not
# stated here.
class OcrPipelineStage:
    backend: str
    priority: int
    language: str | None
    tesseract_config: TesseractConfig | None
    paddle_ocr_config: dict[str, Any] | None
    vlm_config: LlmConfig | None
    backend_options: dict[str, Any] | None

    def __init__(
        self,
        backend: str,
        priority: int,
        language: str | None = None,
        tesseract_config: TesseractConfig | None = None,
        paddle_ocr_config: dict[str, Any] | None = None,
        vlm_config: LlmConfig | None = None,
        backend_options: dict[str, Any] | None = None,
    ) -> None: ...
# Declarations only: every method body below is `...`.
# NOTE(review): where an attribute is declared as plain `X` but its constructor
# parameter is `X | None = None`, passing None presumably selects a built-in
# default. The default values are not stated in this file — confirm upstream.

# OcrPipelineConfig: a list of OcrPipelineStage plus one OcrQualityThresholds;
# both are required constructor arguments.
class OcrPipelineConfig:
    stages: list[OcrPipelineStage]
    quality_thresholds: OcrQualityThresholds

    def __init__(
        self,
        stages: list[OcrPipelineStage],
        quality_thresholds: OcrQualityThresholds,
    ) -> None: ...

# OcrConfig: fifteen settings, all optional in the constructor. `backend` and
# `language` are plain str; accepted values are not listed in this file.
# tessdata_bytes maps str keys to bytes (presumably language -> traineddata;
# confirm).
class OcrConfig:
    enabled: bool
    backend: str
    language: str
    tesseract_config: TesseractConfig | None
    output_format: OutputFormat | None
    paddle_ocr_config: dict[str, Any] | None
    backend_options: dict[str, Any] | None
    element_config: OcrElementConfig | None
    quality_thresholds: OcrQualityThresholds | None
    pipeline: OcrPipelineConfig | None
    auto_rotate: bool
    vlm_config: LlmConfig | None
    vlm_prompt: str | None
    acceleration: AccelerationConfig | None
    tessdata_bytes: dict[str, bytes] | None

    def __init__(
        self,
        enabled: bool | None = None,
        backend: str | None = None,
        language: str | None = None,
        tesseract_config: TesseractConfig | None = None,
        output_format: OutputFormat | None = None,
        paddle_ocr_config: dict[str, Any] | None = None,
        backend_options: dict[str, Any] | None = None,
        element_config: OcrElementConfig | None = None,
        quality_thresholds: OcrQualityThresholds | None = None,
        pipeline: OcrPipelineConfig | None = None,
        auto_rotate: bool | None = None,
        vlm_config: LlmConfig | None = None,
        vlm_prompt: str | None = None,
        acceleration: AccelerationConfig | None = None,
        tessdata_bytes: dict[str, bytes] | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> OcrConfig: ...

# PageConfig: three settings, all optional in the constructor. The syntax of
# `marker_format` is not described in this file.
class PageConfig:
    extract_pages: bool
    insert_page_markers: bool
    marker_format: str

    def __init__(
        self,
        extract_pages: bool | None = None,
        insert_page_markers: bool | None = None,
        marker_format: str | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> PageConfig: ...
# Declarations only: every method body below is `...`.
# NOTE(review): where an attribute is declared as plain `X` but its constructor
# parameter is `X | None = None`, passing None presumably selects a built-in
# default. The default values are not stated in this file — confirm upstream.

# PdfConfig: ten settings, all optional in the constructor. The margin values
# are floats named "fraction" — presumably a fraction of page height; confirm.
class PdfConfig:
    extract_images: bool
    extract_tables: bool
    passwords: list[str] | None
    extract_metadata: bool
    hierarchy: HierarchyConfig | None
    extract_annotations: bool
    top_margin_fraction: float | None
    bottom_margin_fraction: float | None
    allow_single_column_tables: bool
    ocr_inline_images: bool

    def __init__(
        self,
        extract_images: bool | None = None,
        extract_tables: bool | None = None,
        passwords: list[str] | None = None,
        extract_metadata: bool | None = None,
        hierarchy: HierarchyConfig | None = None,
        extract_annotations: bool | None = None,
        top_margin_fraction: float | None = None,
        bottom_margin_fraction: float | None = None,
        allow_single_column_tables: bool | None = None,
        ocr_inline_images: bool | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> PdfConfig: ...

# HierarchyConfig: the type of PdfConfig.hierarchy; four settings, all optional
# in the constructor.
class HierarchyConfig:
    enabled: bool
    k_clusters: int
    include_bbox: bool
    ocr_coverage_threshold: float | None

    def __init__(
        self,
        enabled: bool | None = None,
        k_clusters: int | None = None,
        include_bbox: bool | None = None,
        ocr_coverage_threshold: float | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> HierarchyConfig: ...

# PostProcessorConfig: an on/off flag plus four optional string lists.
# NOTE(review): enabled_processors/enabled_set and disabled_processors/
# disabled_set are all list[str] | None; how each pair relates is not visible
# here — confirm before relying on either.
class PostProcessorConfig:
    enabled: bool
    enabled_processors: list[str] | None
    disabled_processors: list[str] | None
    enabled_set: list[str] | None
    disabled_set: list[str] | None

    def __init__(
        self,
        enabled: bool | None = None,
        enabled_processors: list[str] | None = None,
        disabled_processors: list[str] | None = None,
        enabled_set: list[str] | None = None,
        disabled_set: list[str] | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> PostProcessorConfig: ...
# Declarations only: every method body below is `...`.
# NOTE(review): where an attribute is declared as plain `X` but its constructor
# parameter is `X | None = None`, passing None presumably selects a built-in
# default. The default values are not stated in this file — confirm upstream.

# ChunkingConfig: nine settings, all optional in the constructor.
# `chunker_type` may be given as a ChunkerType or a str; `sizing` has no str
# alternative.
class ChunkingConfig:
    max_characters: int
    overlap: int
    trim: bool
    chunker_type: ChunkerType
    embedding: EmbeddingConfig | None
    preset: str | None
    sizing: ChunkSizing
    prepend_heading_context: bool
    topic_threshold: float | None

    def __init__(
        self,
        max_characters: int | None = None,
        overlap: int | None = None,
        trim: bool | None = None,
        chunker_type: ChunkerType | str | None = None,
        embedding: EmbeddingConfig | None = None,
        preset: str | None = None,
        sizing: ChunkSizing | None = None,
        prepend_heading_context: bool | None = None,
        topic_threshold: float | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> ChunkingConfig: ...

# EmbeddingConfig: seven settings, all optional in the constructor. `model` is
# an EmbeddingModelType with no str alternative declared.
class EmbeddingConfig:
    model: EmbeddingModelType
    normalize: bool
    batch_size: int
    show_download_progress: bool
    cache_dir: str | None
    acceleration: AccelerationConfig | None
    max_embed_duration_secs: int | None

    def __init__(
        self,
        model: EmbeddingModelType | None = None,
        normalize: bool | None = None,
        batch_size: int | None = None,
        show_download_progress: bool | None = None,
        cache_dir: str | None = None,
        acceleration: AccelerationConfig | None = None,
        max_embed_duration_secs: int | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> EmbeddingConfig: ...

# TreeSitterConfig: five settings, all optional in the constructor. `process`
# is stored as a non-optional TreeSitterProcessConfig.
class TreeSitterConfig:
    enabled: bool
    cache_dir: str | None
    languages: list[str] | None
    groups: list[str] | None
    process: TreeSitterProcessConfig

    def __init__(
        self,
        enabled: bool | None = None,
        cache_dir: str | None = None,
        languages: list[str] | None = None,
        groups: list[str] | None = None,
        process: TreeSitterProcessConfig | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> TreeSitterConfig: ...
# Declarations only: every method body below is `...`.
# NOTE(review): where an attribute is declared as plain `X` but its constructor
# parameter is `X | None = None`, passing None presumably selects a built-in
# default. The default values are not stated in this file — confirm upstream.

# TreeSitterProcessConfig: seven boolean switches, an optional chunk size and a
# content mode (CodeContentMode or str in the constructor).
class TreeSitterProcessConfig:
    structure: bool
    imports: bool
    exports: bool
    comments: bool
    docstrings: bool
    symbols: bool
    diagnostics: bool
    chunk_max_size: int | None
    content_mode: CodeContentMode

    def __init__(
        self,
        structure: bool | None = None,
        imports: bool | None = None,
        exports: bool | None = None,
        comments: bool | None = None,
        docstrings: bool | None = None,
        symbols: bool | None = None,
        diagnostics: bool | None = None,
        chunk_max_size: int | None = None,
        content_mode: CodeContentMode | str | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> TreeSitterProcessConfig: ...

# SupportedFormat: an (extension, mime_type) pair of strings; both required.
class SupportedFormat:
    extension: str
    mime_type: str

    def __init__(self, extension: str, mime_type: str) -> None: ...

# ServerConfig: five settings, all optional in the constructor, plus helper
# methods whose return types are declared here (str / bool / int).
# The *_mb helpers return int; how they derive from the *_bytes fields
# (rounding etc.) is not visible in this file.
class ServerConfig:
    host: str
    port: int
    cors_origins: list[str]
    max_request_body_bytes: int
    max_multipart_field_bytes: int

    def __init__(
        self,
        host: str | None = None,
        port: int | None = None,
        cors_origins: list[str] | None = None,
        max_request_body_bytes: int | None = None,
        max_multipart_field_bytes: int | None = None,
    ) -> None: ...
    def listen_addr(self) -> str: ...
    def cors_allows_all(self) -> bool: ...
    def is_origin_allowed(self, origin: str) -> bool: ...
    def max_request_body_mb(self) -> int: ...
    def max_multipart_field_mb(self) -> int: ...
    @staticmethod
    def default() -> ServerConfig: ...

# StructuredDataResult: four required fields. The `format` parameter shadows
# the builtin of the same name, hence the A002 lint suppression.
class StructuredDataResult:
    content: str
    format: str
    metadata: dict[str, str]
    text_fields: list[str]

    def __init__(
        self,
        content: str,
        format: str,  # noqa: A002
        metadata: dict[str, str],
        text_fields: list[str],
    ) -> None: ...
# Declarations only: every method body below is `...`.

# DocxAppProperties: sixteen fields, every one optional both as an attribute
# and as a constructor argument. The names presumably mirror the OOXML
# extended ("app") properties part — confirm.
class DocxAppProperties:
    application: str | None
    app_version: str | None
    template: str | None
    total_time: int | None
    pages: int | None
    words: int | None
    characters: int | None
    characters_with_spaces: int | None
    lines: int | None
    paragraphs: int | None
    company: str | None
    doc_security: int | None
    scale_crop: bool | None
    links_up_to_date: bool | None
    shared_doc: bool | None
    hyperlinks_changed: bool | None

    def __init__(
        self,
        application: str | None = None,
        app_version: str | None = None,
        template: str | None = None,
        total_time: int | None = None,
        pages: int | None = None,
        words: int | None = None,
        characters: int | None = None,
        characters_with_spaces: int | None = None,
        lines: int | None = None,
        paragraphs: int | None = None,
        company: str | None = None,
        doc_security: int | None = None,
        scale_crop: bool | None = None,
        links_up_to_date: bool | None = None,
        shared_doc: bool | None = None,
        hyperlinks_changed: bool | None = None,
    ) -> None: ...

# XlsxAppProperties: nine fields. `worksheet_names` is stored as a non-optional
# list[str] although the constructor lets it be omitted (presumably it becomes
# an empty list — confirm).
class XlsxAppProperties:
    application: str | None
    app_version: str | None
    doc_security: int | None
    scale_crop: bool | None
    links_up_to_date: bool | None
    shared_doc: bool | None
    hyperlinks_changed: bool | None
    company: str | None
    worksheet_names: list[str]

    def __init__(
        self,
        application: str | None = None,
        app_version: str | None = None,
        doc_security: int | None = None,
        scale_crop: bool | None = None,
        links_up_to_date: bool | None = None,
        shared_doc: bool | None = None,
        hyperlinks_changed: bool | None = None,
        company: str | None = None,
        worksheet_names: list[str] | None = None,
    ) -> None: ...
# Declarations only: every method body below is `...`.

# PptxAppProperties: fifteen fields. `slide_titles` is stored as a non-optional
# list[str] although the constructor lets it be omitted (presumably it becomes
# an empty list — confirm).
class PptxAppProperties:
    application: str | None
    app_version: str | None
    total_time: int | None
    company: str | None
    doc_security: int | None
    scale_crop: bool | None
    links_up_to_date: bool | None
    shared_doc: bool | None
    hyperlinks_changed: bool | None
    slides: int | None
    notes: int | None
    hidden_slides: int | None
    multimedia_clips: int | None
    presentation_format: str | None
    slide_titles: list[str]

    def __init__(
        self,
        application: str | None = None,
        app_version: str | None = None,
        total_time: int | None = None,
        company: str | None = None,
        doc_security: int | None = None,
        scale_crop: bool | None = None,
        links_up_to_date: bool | None = None,
        shared_doc: bool | None = None,
        hyperlinks_changed: bool | None = None,
        slides: int | None = None,
        notes: int | None = None,
        hidden_slides: int | None = None,
        multimedia_clips: int | None = None,
        presentation_format: str | None = None,
        slide_titles: list[str] | None = None,
    ) -> None: ...

# CoreProperties: fifteen optional string fields. Date-like fields (created,
# modified, last_printed) are typed str; their format is not stated here.
class CoreProperties:
    title: str | None
    subject: str | None
    creator: str | None
    keywords: str | None
    description: str | None
    last_modified_by: str | None
    revision: str | None
    created: str | None
    modified: str | None
    category: str | None
    content_status: str | None
    language: str | None
    identifier: str | None
    version: str | None
    last_printed: str | None

    def __init__(
        self,
        title: str | None = None,
        subject: str | None = None,
        creator: str | None = None,
        keywords: str | None = None,
        description: str | None = None,
        last_modified_by: str | None = None,
        revision: str | None = None,
        created: str | None = None,
        modified: str | None = None,
        category: str | None = None,
        content_status: str | None = None,
        language: str | None = None,
        identifier: str | None = None,
        version: str | None = None,
        last_printed: str | None = None,
    ) -> None: ...
# Declarations only: every method body below is `...`.
# NOTE(review): where an attribute is declared as plain `X` but its constructor
# parameter is `X | None = None`, passing None presumably selects a built-in
# default. The default values are not stated in this file — confirm upstream.

# SecurityLimits: nine integer limits, all optional in the constructor. Units
# (bytes, counts, levels) are not stated in this file.
class SecurityLimits:
    max_archive_size: int
    max_compression_ratio: int
    max_files_in_archive: int
    max_nesting_depth: int
    max_entity_length: int
    max_content_size: int
    max_iterations: int
    max_xml_depth: int
    max_table_cells: int

    def __init__(
        self,
        max_archive_size: int | None = None,
        max_compression_ratio: int | None = None,
        max_files_in_archive: int | None = None,
        max_nesting_depth: int | None = None,
        max_entity_length: int | None = None,
        max_content_size: int | None = None,
        max_iterations: int | None = None,
        max_xml_depth: int | None = None,
        max_table_cells: int | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> SecurityLimits: ...

# TokenReductionConfig: eleven settings, all optional in the constructor.
# `level` may be given as a ReductionLevel or a str. custom_stopwords maps str
# keys to word lists (presumably keyed by language — confirm).
class TokenReductionConfig:
    level: ReductionLevel
    language_hint: str | None
    preserve_markdown: bool
    preserve_code: bool
    semantic_threshold: float
    enable_parallel: bool
    use_simd: bool
    custom_stopwords: dict[str, list[str]] | None
    preserve_patterns: list[str]
    target_reduction: float | None
    enable_semantic_clustering: bool

    def __init__(
        self,
        level: ReductionLevel | str | None = None,
        language_hint: str | None = None,
        preserve_markdown: bool | None = None,
        preserve_code: bool | None = None,
        semantic_threshold: float | None = None,
        enable_parallel: bool | None = None,
        use_simd: bool | None = None,
        custom_stopwords: dict[str, list[str]] | None = None,
        preserve_patterns: list[str] | None = None,
        target_reduction: float | None = None,
        enable_semantic_clustering: bool | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> TokenReductionConfig: ...

# PdfAnnotation: annotation_type and page_number are required; content and
# bounding_box are optional. The constructor lists required parameters first,
# so its order differs from the attribute order. Whether page_number is 0- or
# 1-based is not stated here.
class PdfAnnotation:
    annotation_type: PdfAnnotationType
    content: str | None
    page_number: int
    bounding_box: BoundingBox | None

    def __init__(
        self,
        annotation_type: PdfAnnotationType | str,
        page_number: int,
        content: str | None = None,
        bounding_box: BoundingBox | None = None,
    ) -> None: ...
# Declarations only: every method body below is `...`.
# Constructors in this group list required parameters first and optional ones
# (default None) last, so parameter order can differ from attribute order.

# DjotContent: eight required fields covering plain text, blocks, metadata,
# tables, images, links, footnotes and attributes.
class DjotContent:
    plain_text: str
    blocks: list[FormattedBlock]
    metadata: Metadata
    tables: list[Table]
    images: list[DjotImage]
    links: list[DjotLink]
    footnotes: list[Footnote]
    attributes: list[str]

    def __init__(
        self,
        plain_text: str,
        blocks: list[FormattedBlock],
        metadata: Metadata,
        tables: list[Table],
        images: list[DjotImage],
        links: list[DjotLink],
        footnotes: list[Footnote],
        attributes: list[str],
    ) -> None: ...

# FormattedBlock: a recursive node — `children` is a list of FormattedBlock.
# block_type, inline_content and children are required.
class FormattedBlock:
    block_type: BlockType
    level: int | None
    inline_content: list[InlineElement]
    attributes: str | None
    language: str | None
    code: str | None
    children: list[FormattedBlock]

    def __init__(
        self,
        block_type: BlockType | str,
        inline_content: list[InlineElement],
        children: list[FormattedBlock],
        level: int | None = None,
        attributes: str | None = None,
        language: str | None = None,
        code: str | None = None,
    ) -> None: ...

# InlineElement: element_type (InlineType or str in the constructor) and
# content are required; attributes and metadata are optional.
class InlineElement:
    element_type: InlineType
    content: str
    attributes: str | None
    metadata: dict[str, str] | None

    def __init__(
        self,
        element_type: InlineType | str,
        content: str,
        attributes: str | None = None,
        metadata: dict[str, str] | None = None,
    ) -> None: ...

# DjotImage: src and alt are required; title and attributes are optional.
class DjotImage:
    src: str
    alt: str
    title: str | None
    attributes: str | None

    def __init__(
        self,
        src: str,
        alt: str,
        title: str | None = None,
        attributes: str | None = None,
    ) -> None: ...

# DjotLink: url and text are required; title and attributes are optional.
class DjotLink:
    url: str
    text: str
    title: str | None
    attributes: str | None

    def __init__(
        self,
        url: str,
        text: str,
        title: str | None = None,
        attributes: str | None = None,
    ) -> None: ...

# Footnote: a label plus a list of FormattedBlock; both required.
class Footnote:
    label: str
    content: list[FormattedBlock]

    def __init__(self, label: str, content: list[FormattedBlock]) -> None: ...

# DocumentStructure: three lists and an optional source_format. The lists are
# stored non-optional, but every constructor argument may be omitted
# (presumably yielding empty lists — confirm).
# finalize_node_types() returns None, so any effect is on the instance itself;
# what it computes is not visible in this file.
class DocumentStructure:
    nodes: list[DocumentNode]
    source_format: str | None
    relationships: list[DocumentRelationship]
    node_types: list[str]

    def __init__(
        self,
        nodes: list[DocumentNode] | None = None,
        source_format: str | None = None,
        relationships: list[DocumentRelationship] | None = None,
        node_types: list[str] | None = None,
    ) -> None: ...
    def finalize_node_types(self) -> None: ...
    def is_empty(self) -> bool: ...
    @staticmethod
    def default() -> DocumentStructure: ...

# DocumentRelationship: source and target are ints (presumably indices into
# DocumentStructure.nodes — confirm); `kind` accepts RelationshipKind or str.
class DocumentRelationship:
    source: int
    target: int
    kind: RelationshipKind

    def __init__(self, source: int, target: int, kind: RelationshipKind | str) -> None: ...

# DocumentNode: `id` is a str, while `parent` and `children` are ints
# (presumably node indices rather than ids — confirm). The `id` parameter
# shadows the builtin of the same name, hence the A002 lint suppression.
class DocumentNode:
    id: str
    content: NodeContent
    parent: int | None
    children: list[int]
    content_layer: ContentLayer
    page: int | None
    page_end: int | None
    bbox: BoundingBox | None
    annotations: list[TextAnnotation]
    attributes: dict[str, str] | None

    def __init__(
        self,
        id: str,  # noqa: A002
        content: NodeContent,
        children: list[int],
        content_layer: ContentLayer | str,
        annotations: list[TextAnnotation],
        parent: int | None = None,
        page: int | None = None,
        page_end: int | None = None,
        bbox: BoundingBox | None = None,
        attributes: dict[str, str] | None = None,
    ) -> None: ...

# TableGrid: row and column counts plus a flat list of GridCell. The attributes
# are non-optional, but every constructor argument may be omitted.
class TableGrid:
    rows: int
    cols: int
    cells: list[GridCell]

    def __init__(
        self,
        rows: int | None = None,
        cols: int | None = None,
        cells: list[GridCell] | None = None,
    ) -> None: ...

# GridCell: content, position, spans and header flag are required; bbox is
# optional. Whether row/col are 0- or 1-based is not stated here.
class GridCell:
    content: str
    row: int
    col: int
    row_span: int
    col_span: int
    is_header: bool
    bbox: BoundingBox | None

    def __init__(
        self,
        content: str,
        row: int,
        col: int,
        row_span: int,
        col_span: int,
        is_header: bool,
        bbox: BoundingBox | None = None,
    ) -> None: ...

# TextAnnotation: a start/end int pair plus an AnnotationKind; all required.
# Unlike DocumentRelationship.kind, no `| str` alternative is declared for
# `kind`. Offset units (bytes vs characters) are not stated here.
class TextAnnotation:
    start: int
    end: int
    kind: AnnotationKind

    def __init__(self, start: int, end: int, kind: AnnotationKind) -> None: ...
# ExtractionResult: the aggregate result type; declarations only (`...`).
# content, mime_type, metadata, tables and processing_warnings are stored
# non-optional, yet every constructor argument may be omitted (presumably
# falling back to empty values — confirm upstream).
# NOTE(review): extracted_keywords and code_intelligence are declared as
# attributes but have no matching __init__ parameter here.
# `from_ocr` is a static factory taking an OcrExtractionResult.
class ExtractionResult:
    content: str
    mime_type: str
    metadata: Metadata
    extraction_method: ExtractionMethod | None
    tables: list[Table]
    detected_languages: list[str] | None
    chunks: list[Chunk] | None
    images: list[ExtractedImage] | None
    pages: list[PageContent] | None
    elements: list[Element] | None
    djot_content: DjotContent | None
    ocr_elements: list[OcrElement] | None
    document: DocumentStructure | None
    extracted_keywords: list[Keyword] | None
    quality_score: float | None
    processing_warnings: list[ProcessingWarning]
    annotations: list[PdfAnnotation] | None
    children: list[ArchiveEntry] | None
    uris: list[ExtractedUri] | None
    revisions: list[DocumentRevision] | None
    structured_output: dict[str, Any] | None
    code_intelligence: dict[str, Any] | None
    llm_usage: list[LlmUsage] | None
    formatted_content: str | None
    ocr_internal_document: str | None

    def __init__(
        self,
        content: str | None = None,
        mime_type: str | None = None,
        metadata: Metadata | None = None,
        extraction_method: ExtractionMethod | str | None = None,
        tables: list[Table] | None = None,
        detected_languages: list[str] | None = None,
        chunks: list[Chunk] | None = None,
        images: list[ExtractedImage] | None = None,
        pages: list[PageContent] | None = None,
        elements: list[Element] | None = None,
        djot_content: DjotContent | None = None,
        ocr_elements: list[OcrElement] | None = None,
        document: DocumentStructure | None = None,
        quality_score: float | None = None,
        processing_warnings: list[ProcessingWarning] | None = None,
        annotations: list[PdfAnnotation] | None = None,
        children: list[ArchiveEntry] | None = None,
        uris: list[ExtractedUri] | None = None,
        revisions: list[DocumentRevision] | None = None,
        structured_output: dict[str, Any] | None = None,
        llm_usage: list[LlmUsage] | None = None,
        formatted_content: str | None = None,
        ocr_internal_document: str | None = None,
    ) -> None: ...
    @staticmethod
    def from_ocr(ocr: OcrExtractionResult) -> ExtractionResult: ...
# Declarations only: every method body below is `...`.
# Constructors in this group list required parameters first and optional ones
# (default None) last, so parameter order can differ from attribute order.

# ArchiveEntry: a path, a MIME type and a nested ExtractionResult; all required.
class ArchiveEntry:
    path: str
    mime_type: str
    result: ExtractionResult

    def __init__(self, path: str, mime_type: str, result: ExtractionResult) -> None: ...

# ProcessingWarning: a (source, message) pair of strings; both required.
class ProcessingWarning:
    source: str
    message: str

    def __init__(self, source: str, message: str) -> None: ...

# LlmUsage: `model` and `source` are stored as non-optional str, yet the
# constructor lets both be omitted (presumably defaulting to "" — confirm).
# The currency of estimated_cost is not stated here.
class LlmUsage:
    model: str
    source: str
    input_tokens: int | None
    output_tokens: int | None
    total_tokens: int | None
    estimated_cost: float | None
    finish_reason: str | None

    def __init__(
        self,
        model: str | None = None,
        source: str | None = None,
        input_tokens: int | None = None,
        output_tokens: int | None = None,
        total_tokens: int | None = None,
        estimated_cost: float | None = None,
        finish_reason: str | None = None,
    ) -> None: ...

# Chunk: content, chunk_type (ChunkType or str in the constructor) and
# metadata are required; the embedding vector is optional.
class Chunk:
    content: str
    chunk_type: ChunkType
    embedding: list[float] | None
    metadata: ChunkMetadata

    def __init__(
        self,
        content: str,
        chunk_type: ChunkType | str,
        metadata: ChunkMetadata,
        embedding: list[float] | None = None,
    ) -> None: ...

# HeadingContext: wraps a list of HeadingLevel (ordering is not stated here).
class HeadingContext:
    headings: list[HeadingLevel]

    def __init__(self, headings: list[HeadingLevel]) -> None: ...

# HeadingLevel: an int level plus the heading text; both required.
class HeadingLevel:
    level: int
    text: str

    def __init__(self, level: int, text: str) -> None: ...

# ChunkMetadata: byte_start/byte_end, chunk_index/total_chunks and
# image_indices are required; the rest are optional. Whether byte_end is
# inclusive or exclusive is not stated in this file.
class ChunkMetadata:
    byte_start: int
    byte_end: int
    token_count: int | None
    chunk_index: int
    total_chunks: int
    first_page: int | None
    last_page: int | None
    heading_context: HeadingContext | None
    image_indices: list[int]

    def __init__(
        self,
        byte_start: int,
        byte_end: int,
        chunk_index: int,
        total_chunks: int,
        image_indices: list[int],
        token_count: int | None = None,
        first_page: int | None = None,
        last_page: int | None = None,
        heading_context: HeadingContext | None = None,
    ) -> None: ...
# Declarations only: every method body below is `...`.
# Constructors in this group list required parameters first and optional ones
# (default None) last, so parameter order can differ from attribute order.

# ExtractedImage: data, format, image_index and is_mask are required; the other
# twelve fields are optional. `ocr_result` is a nested ExtractionResult.
# The `format` parameter shadows the builtin, hence the A002 lint suppression.
class ExtractedImage:
    data: bytes
    format: str
    image_index: int
    page_number: int | None
    width: int | None
    height: int | None
    colorspace: str | None
    bits_per_component: int | None
    is_mask: bool
    description: str | None
    ocr_result: ExtractionResult | None
    bounding_box: BoundingBox | None
    source_path: str | None
    image_kind: ImageKind | None
    kind_confidence: float | None
    cluster_id: int | None

    def __init__(
        self,
        data: bytes,
        format: str,  # noqa: A002
        image_index: int,
        is_mask: bool,
        page_number: int | None = None,
        width: int | None = None,
        height: int | None = None,
        colorspace: str | None = None,
        bits_per_component: int | None = None,
        description: str | None = None,
        ocr_result: ExtractionResult | None = None,
        bounding_box: BoundingBox | None = None,
        source_path: str | None = None,
        image_kind: ImageKind | str | None = None,
        kind_confidence: float | None = None,
        cluster_id: int | None = None,
    ) -> None: ...

# BoundingBox: four float coordinates stored non-optional, each omittable in
# the constructor (presumably defaulting to 0.0 — confirm). Units and axis
# origin are not stated in this file.
class BoundingBox:
    x0: float
    y0: float
    x1: float
    y1: float

    def __init__(
        self,
        x0: float | None = None,
        y0: float | None = None,
        x1: float | None = None,
        y1: float | None = None,
    ) -> None: ...

# ElementMetadata: only `additional` (a str -> str mapping) is required.
class ElementMetadata:
    page_number: int | None
    filename: str | None
    coordinates: BoundingBox | None
    element_index: int | None
    additional: dict[str, str]

    def __init__(
        self,
        additional: dict[str, str],
        page_number: int | None = None,
        filename: str | None = None,
        coordinates: BoundingBox | None = None,
        element_index: int | None = None,
    ) -> None: ...

# Element: four required fields; element_type may be an ElementType or a str.
class Element:
    element_id: str
    element_type: ElementType
    text: str
    metadata: ElementMetadata

    def __init__(
        self,
        element_id: str,
        element_type: ElementType | str,
        text: str,
        metadata: ElementMetadata,
    ) -> None: ...

# ExcelWorkbook: sheets and metadata are required; revisions is optional.
class ExcelWorkbook:
    sheets: list[ExcelSheet]
    metadata: dict[str, str]
    revisions: list[DocumentRevision] | None

    def __init__(
        self,
        sheets: list[ExcelSheet],
        metadata: dict[str, str],
        revisions: list[DocumentRevision] | None = None,
    ) -> None: ...
# Declarations only: every method body below is `...`.
# Constructors in this group list required parameters first and optional ones
# (default None) last, so parameter order can differ from attribute order.

# ExcelSheet: name, markdown rendering and three counts are required;
# table_cells (a list of string rows) is optional.
class ExcelSheet:
    name: str
    markdown: str
    row_count: int
    col_count: int
    cell_count: int
    table_cells: list[list[str]] | None

    def __init__(
        self,
        name: str,
        markdown: str,
        row_count: int,
        col_count: int,
        cell_count: int,
        table_cells: list[list[str]] | None = None,
    ) -> None: ...

# XmlExtractionResult: content, an element count and the list of unique
# element names; all required.
class XmlExtractionResult:
    content: str
    element_count: int
    unique_elements: list[str]

    def __init__(self, content: str, element_count: int, unique_elements: list[str]) -> None: ...

# TextExtractionResult: content and three counts are required; headers, links
# and code_blocks are optional. links and code_blocks are lists of string
# lists — the meaning of each inner position is not stated here.
class TextExtractionResult:
    content: str
    line_count: int
    word_count: int
    character_count: int
    headers: list[str] | None
    links: list[list[str]] | None
    code_blocks: list[list[str]] | None

    def __init__(
        self,
        content: str,
        line_count: int,
        word_count: int,
        character_count: int,
        headers: list[str] | None = None,
        links: list[list[str]] | None = None,
        code_blocks: list[list[str]] | None = None,
    ) -> None: ...

# PptxExtractionResult: eight required fields (content through images, plus
# hyperlinks and office_metadata) and four optional ones.
class PptxExtractionResult:
    content: str
    metadata: PptxMetadata
    slide_count: int
    image_count: int
    table_count: int
    images: list[ExtractedImage]
    page_structure: PageStructure | None
    page_contents: list[PageContent] | None
    document: DocumentStructure | None
    hyperlinks: list[str]
    office_metadata: dict[str, str]
    revisions: list[DocumentRevision] | None

    def __init__(
        self,
        content: str,
        metadata: PptxMetadata,
        slide_count: int,
        image_count: int,
        table_count: int,
        images: list[ExtractedImage],
        hyperlinks: list[str],
        office_metadata: dict[str, str],
        page_structure: PageStructure | None = None,
        page_contents: list[PageContent] | None = None,
        document: DocumentStructure | None = None,
        revisions: list[DocumentRevision] | None = None,
    ) -> None: ...
# Email extraction output: optional header fields (subject, sender, date,
# message id, plain/HTML bodies) plus required recipient lists, content,
# attachments and a str-to-str metadata mapping.
class EmailExtractionResult:
    subject: str | None
    from_email: str | None
    to_emails: list[str]
    cc_emails: list[str]
    bcc_emails: list[str]
    date: str | None
    message_id: str | None
    plain_text: str | None
    html_content: str | None
    content: str
    attachments: list[EmailAttachment]
    metadata: dict[str, str]

    # Constructor order differs from attribute order: the six required
    # arguments come first, followed by the optional ones.
    def __init__(
        self,
        to_emails: list[str],
        cc_emails: list[str],
        bcc_emails: list[str],
        content: str,
        attachments: list[EmailAttachment],
        metadata: dict[str, str],
        subject: str | None = None,
        from_email: str | None = None,
        date: str | None = None,
        message_id: str | None = None,
        plain_text: str | None = None,
        html_content: str | None = None,
    ) -> None: ...

# One email attachment; only `is_image` is required, every other field
# (names, MIME type, size, raw bytes) is optional.
class EmailAttachment:
    name: str | None
    filename: str | None
    mime_type: str | None
    size: int | None
    is_image: bool
    data: bytes | None

    def __init__(
        self,
        is_image: bool,
        name: str | None = None,
        filename: str | None = None,
        mime_type: str | None = None,
        size: int | None = None,
        data: bytes | None = None,
    ) -> None: ...

# OCR extraction output: content, MIME type, a nested metadata mapping and
# OcrTable list, with optional OcrElement list and `internal_document` str.
class OcrExtractionResult:
    content: str
    mime_type: str
    metadata: dict[str, dict[str, Any]]
    tables: list[OcrTable]
    ocr_elements: list[OcrElement] | None
    internal_document: str | None

    def __init__(
        self,
        content: str,
        mime_type: str,
        metadata: dict[str, dict[str, Any]],
        tables: list[OcrTable],
        ocr_elements: list[OcrElement] | None = None,
        internal_document: str | None = None,
    ) -> None: ...

# A table produced by OCR: grid of cell strings, markdown string, page
# number and an optional OcrTableBoundingBox.
class OcrTable:
    cells: list[list[str]]
    markdown: str
    page_number: int
    bounding_box: OcrTableBoundingBox | None

    def __init__(
        self,
        cells: list[list[str]],
        markdown: str,
        page_number: int,
        bounding_box: OcrTableBoundingBox | None = None,
    ) -> None: ...

# Integer box described by its left/top/right/bottom values; all four are
# required by the constructor.
class OcrTableBoundingBox:
    left: int
    top: int
    right: int
    bottom: int

    def __init__(self, left: int, top: int, right: int, bottom: int) -> None: ...
# Image preprocessing options: target DPI, several boolean toggles and a
# binarization method name.
# NOTE(review): attributes are non-optional but every constructor argument
# accepts None — presumably the native side supplies defaults; confirm.
class ImagePreprocessingConfig:
    target_dpi: int
    auto_rotate: bool
    deskew: bool
    denoise: bool
    contrast_enhance: bool
    binarization_method: str
    invert_colors: bool

    def __init__(
        self,
        target_dpi: int | None = None,
        auto_rotate: bool | None = None,
        deskew: bool | None = None,
        denoise: bool | None = None,
        contrast_enhance: bool | None = None,
        binarization_method: str | None = None,
        invert_colors: bool | None = None,
    ) -> None: ...

    # Argument-less factory returning an ImagePreprocessingConfig.
    @staticmethod
    def default() -> ImagePreprocessingConfig: ...

# Tesseract options: language/psm/oem/output settings, an optional nested
# ImagePreprocessingConfig, table-detection thresholds, a cache flag and a
# set of pass-through engine switches. Every constructor argument is
# optional and listed in the same order as the attributes.
class TesseractConfig:
    language: str
    psm: int
    output_format: str
    oem: int
    min_confidence: float
    preprocessing: ImagePreprocessingConfig | None
    enable_table_detection: bool
    table_min_confidence: float
    table_column_threshold: int
    table_row_threshold_ratio: float
    use_cache: bool
    classify_use_pre_adapted_templates: bool
    language_model_ngram_on: bool
    tessedit_dont_blkrej_good_wds: bool
    tessedit_dont_rowrej_good_wds: bool
    tessedit_enable_dict_correction: bool
    tessedit_char_whitelist: str
    tessedit_char_blacklist: str
    tessedit_use_primary_params_model: bool
    textord_space_size_is_variable: bool
    thresholding_method: bool

    def __init__(
        self,
        language: str | None = None,
        psm: int | None = None,
        output_format: str | None = None,
        oem: int | None = None,
        min_confidence: float | None = None,
        preprocessing: ImagePreprocessingConfig | None = None,
        enable_table_detection: bool | None = None,
        table_min_confidence: float | None = None,
        table_column_threshold: int | None = None,
        table_row_threshold_ratio: float | None = None,
        use_cache: bool | None = None,
        classify_use_pre_adapted_templates: bool | None = None,
        language_model_ngram_on: bool | None = None,
        tessedit_dont_blkrej_good_wds: bool | None = None,
        tessedit_dont_rowrej_good_wds: bool | None = None,
        tessedit_enable_dict_correction: bool | None = None,
        tessedit_char_whitelist: str | None = None,
        tessedit_char_blacklist: str | None = None,
        tessedit_use_primary_params_model: bool | None = None,
        textord_space_size_is_variable: bool | None = None,
        thresholding_method: bool | None = None,
    ) -> None: ...

    # Argument-less factory returning a TesseractConfig.
    @staticmethod
    def default() -> TesseractConfig: ...

# Record of an image preprocessing run: original and resulting dimensions
# and DPI values, scale factor, resample method and status flags, plus an
# optional resize error string.
class ImagePreprocessingMetadata:
    original_dimensions: list[int]
    original_dpi: list[float]
    target_dpi: int
    scale_factor: float
    auto_adjusted: bool
    final_dpi: int
    new_dimensions: list[int] | None
    resample_method: str
    dimension_clamped: bool
    calculated_dpi: int | None
    skipped_resize: bool
    resize_error: str | None

    # Constructor order differs from attribute order: the three optional
    # arguments are moved to the end.
    def __init__(
        self,
        original_dimensions: list[int],
        original_dpi: list[float],
        target_dpi: int,
        scale_factor: float,
        auto_adjusted: bool,
        final_dpi: int,
        resample_method: str,
        dimension_clamped: bool,
        skipped_resize: bool,
        new_dimensions: list[int] | None = None,
        calculated_dpi: int | None = None,
        resize_error: str | None = None,
    ) -> None: ...

# Document-level metadata. Every attribute is optional except `ocr_used`
# and `additional`; every constructor argument is optional.
class Metadata:
    title: str | None
    subject: str | None
    authors: list[str] | None
    keywords: list[str] | None
    language: str | None
    created_at: str | None
    modified_at: str | None
    created_by: str | None
    modified_by: str | None
    pages: PageStructure | None
    format: FormatMetadata | None
    image_preprocessing: ImagePreprocessingMetadata | None
    json_schema: dict[str, Any] | None
    error: ErrorMetadata | None
    extraction_duration_ms: int | None
    category: str | None
    tags: list[str] | None
    document_version: str | None
    abstract_text: str | None
    output_format: str | None
    ocr_used: bool
    additional: dict[str, dict[str, Any]]

    def __init__(
        self,
        title: str | None = None,
        subject: str | None = None,
        authors: list[str] | None = None,
        keywords: list[str] | None = None,
        language: str | None = None,
        created_at: str | None = None,
        modified_at: str | None = None,
        created_by: str | None = None,
        modified_by: str | None = None,
        pages: PageStructure | None = None,
        format: FormatMetadata | None = None,  # noqa: A002
        image_preprocessing: ImagePreprocessingMetadata | None = None,
        json_schema: dict[str, Any] | None = None,
        error: ErrorMetadata | None = None,
        extraction_duration_ms: int | None = None,
        category: str | None = None,
        tags: list[str] | None = None,
        document_version: str | None = None,
        abstract_text: str | None = None,
        output_format: str | None = None,
        ocr_used: bool | None = None,
        additional: dict[str, dict[str, Any]] | None = None,
    ) -> None: ...

    # Returns a bool; the emptiness criterion is not visible in this stub.
    def is_empty(self) -> bool: ...

# Excel metadata: optional sheet count and optional sheet-name list.
class ExcelMetadata:
    sheet_count: int | None
    sheet_names: list[str] | None

    def __init__(
        self,
        sheet_count: int | None = None,
        sheet_names: list[str] | None = None,
    ) -> None: ...

# Email metadata: optional sender fields and message id, plus recipient and
# attachment string lists. Every constructor argument is optional.
class EmailMetadata:
    from_email: str | None
    from_name: str | None
    to_emails: list[str]
    cc_emails: list[str]
    bcc_emails: list[str]
    message_id: str | None
    attachments: list[str]

    def __init__(
        self,
        from_email: str | None = None,
        from_name: str | None = None,
        to_emails: list[str] | None = None,
        cc_emails: list[str] | None = None,
        bcc_emails: list[str] | None = None,
        message_id: str | None = None,
        attachments: list[str] | None = None,
    ) -> None: ...

# Archive metadata: format name, file count and list, total size and an
# optional compressed size. Every constructor argument is optional.
class ArchiveMetadata:
    format: str
    file_count: int
    file_list: list[str]
    total_size: int
    compressed_size: int | None

    def __init__(
        self,
        format: str | None = None,  # noqa: A002
        file_count: int | None = None,
        file_list: list[str] | None = None,
        total_size: int | None = None,
        compressed_size: int | None = None,
    ) -> None: ...

# Image metadata: width, height, format name and a str-to-str `exif`
# mapping. Every constructor argument is optional.
class ImageMetadata:
    width: int
    height: int
    format: str
    exif: dict[str, str]

    def __init__(
        self,
        width: int | None = None,
        height: int | None = None,
        format: str | None = None,  # noqa: A002
        exif: dict[str, str] | None = None,
    ) -> None: ...

# XML metadata: an element count and a list of element names. Both
# constructor arguments are optional.
class XmlMetadata:
    element_count: int
    unique_elements: list[str]

    def __init__(
        self,
        element_count: int | None = None,
        unique_elements: list[str] | None = None,
    ) -> None: ...
# Text metadata: line/word/character counts with optional header, link and
# code-block lists. Every constructor argument is optional.
class TextMetadata:
    line_count: int
    word_count: int
    character_count: int
    headers: list[str] | None
    links: list[list[str]] | None
    code_blocks: list[list[str]] | None

    def __init__(
        self,
        line_count: int | None = None,
        word_count: int | None = None,
        character_count: int | None = None,
        headers: list[str] | None = None,
        links: list[list[str]] | None = None,
        code_blocks: list[list[str]] | None = None,
    ) -> None: ...

# One header entry: level, text, optional id, depth and an `html_offset`
# integer.
class HeaderMetadata:
    level: int
    text: str
    id: str | None
    depth: int
    html_offset: int

    # `id` is the only optional argument, so it is listed last.
    def __init__(
        self,
        level: int,
        text: str,
        depth: int,
        html_offset: int,
        id: str | None = None,  # noqa: A002
    ) -> None: ...

# One link entry: href, text, optional title, a LinkType, a `rel` string
# list and a list of string lists in `attributes`.
class LinkMetadata:
    href: str
    text: str
    title: str | None
    link_type: LinkType
    rel: list[str]
    attributes: list[list[str]]

    # `link_type` may be passed either as a LinkType or as a str.
    def __init__(
        self,
        href: str,
        text: str,
        link_type: LinkType | str,
        rel: list[str],
        attributes: list[list[str]],
        title: str | None = None,
    ) -> None: ...

# One image entry: src, optional alt/title/dimensions, an ImageType and a
# list of string lists in `attributes`.
class ImageMetadataType:
    src: str
    alt: str | None
    title: str | None
    dimensions: list[int] | None
    image_type: ImageType
    attributes: list[list[str]]

    # `image_type` may be passed either as an ImageType or as a str.
    def __init__(
        self,
        src: str,
        image_type: ImageType | str,
        attributes: list[list[str]],
        alt: str | None = None,
        title: str | None = None,
        dimensions: list[int] | None = None,
    ) -> None: ...

# One structured-data entry: a StructuredDataType, the raw JSON string and
# an optional schema type string.
class StructuredData:
    data_type: StructuredDataType
    raw_json: str
    schema_type: str | None

    # `data_type` may be passed either as a StructuredDataType or as a str.
    def __init__(
        self,
        data_type: StructuredDataType | str,
        raw_json: str,
        schema_type: str | None = None,
    ) -> None: ...
# HTML metadata: optional scalar page fields, three str-to-str mappings
# (`open_graph`, `twitter_card`, `meta_tags`) and lists of header, link,
# image and structured-data entries. Every constructor argument is optional.
class HtmlMetadata:
    title: str | None
    description: str | None
    keywords: list[str]
    author: str | None
    canonical_url: str | None
    base_href: str | None
    language: str | None
    text_direction: TextDirection | None
    open_graph: dict[str, str]
    twitter_card: dict[str, str]
    meta_tags: dict[str, str]
    headers: list[HeaderMetadata]
    links: list[LinkMetadata]
    images: list[ImageMetadataType]
    structured_data: list[StructuredData]

    # `text_direction` may be passed as a TextDirection, a str or None.
    def __init__(
        self,
        title: str | None = None,
        description: str | None = None,
        keywords: list[str] | None = None,
        author: str | None = None,
        canonical_url: str | None = None,
        base_href: str | None = None,
        language: str | None = None,
        text_direction: TextDirection | str | None = None,
        open_graph: dict[str, str] | None = None,
        twitter_card: dict[str, str] | None = None,
        meta_tags: dict[str, str] | None = None,
        headers: list[HeaderMetadata] | None = None,
        links: list[LinkMetadata] | None = None,
        images: list[ImageMetadataType] | None = None,
        structured_data: list[StructuredData] | None = None,
    ) -> None: ...

# OCR metadata: language, psm, output format and table count, with optional
# table row/column counts. Every constructor argument is optional.
class OcrMetadata:
    language: str
    psm: int
    output_format: str
    table_count: int
    table_rows: int | None
    table_cols: int | None

    def __init__(
        self,
        language: str | None = None,
        psm: int | None = None,
        output_format: str | None = None,
        table_count: int | None = None,
        table_rows: int | None = None,
        table_cols: int | None = None,
    ) -> None: ...

# Error description: an error type string and a message string, both
# required.
class ErrorMetadata:
    error_type: str
    message: str

    def __init__(self, error_type: str, message: str) -> None: ...

# PPTX metadata: slide count and slide names, with optional image and table
# counts. Every constructor argument is optional.
class PptxMetadata:
    slide_count: int
    slide_names: list[str]
    image_count: int | None
    table_count: int | None

    def __init__(
        self,
        slide_count: int | None = None,
        slide_names: list[str] | None = None,
        image_count: int | None = None,
        table_count: int | None = None,
    ) -> None: ...
# DOCX metadata: optional core properties, optional app properties and an
# optional nested custom-properties mapping.
class DocxMetadata:
    core_properties: CoreProperties | None
    app_properties: DocxAppProperties | None
    custom_properties: dict[str, dict[str, Any]] | None

    def __init__(
        self,
        core_properties: CoreProperties | None = None,
        app_properties: DocxAppProperties | None = None,
        custom_properties: dict[str, dict[str, Any]] | None = None,
    ) -> None: ...

# CSV metadata: row and column counts, optional delimiter, a header flag
# and optional column type names. Every constructor argument is optional.
class CsvMetadata:
    row_count: int
    column_count: int
    delimiter: str | None
    has_header: bool
    column_types: list[str] | None

    def __init__(
        self,
        row_count: int | None = None,
        column_count: int | None = None,
        delimiter: str | None = None,
        has_header: bool | None = None,
        column_types: list[str] | None = None,
    ) -> None: ...

# BibTeX metadata: entry count, citation keys, authors, an optional
# YearRange and an optional str-to-int `entry_types` mapping.
class BibtexMetadata:
    entry_count: int
    citation_keys: list[str]
    authors: list[str]
    year_range: YearRange | None
    entry_types: dict[str, int] | None

    def __init__(
        self,
        entry_count: int | None = None,
        citation_keys: list[str] | None = None,
        authors: list[str] | None = None,
        year_range: YearRange | None = None,
        entry_types: dict[str, int] | None = None,
    ) -> None: ...

# Citation metadata: citation count, optional format name, authors, an
# optional YearRange, DOIs and keywords.
class CitationMetadata:
    citation_count: int
    format: str | None
    authors: list[str]
    year_range: YearRange | None
    dois: list[str]
    keywords: list[str]

    def __init__(
        self,
        citation_count: int | None = None,
        format: str | None = None,  # noqa: A002
        authors: list[str] | None = None,
        year_range: YearRange | None = None,
        dois: list[str] | None = None,
        keywords: list[str] | None = None,
    ) -> None: ...

# Year range: optional `min` and `max` integers plus a required list of
# years.
class YearRange:
    min: int | None
    max: int | None
    years: list[int]

    # `years` is required and therefore listed before the optional bounds.
    def __init__(
        self,
        years: list[int],
        min: int | None = None,  # noqa: A002
        max: int | None = None,  # noqa: A002
    ) -> None: ...

# FictionBook metadata: genre and sequence string lists plus an optional
# annotation string. Every constructor argument is optional.
class FictionBookMetadata:
    genres: list[str]
    sequences: list[str]
    annotation: str | None

    def __init__(
        self,
        genres: list[str] | None = None,
        sequences: list[str] | None = None,
        annotation: str | None = None,
    ) -> None: ...
# DBF metadata: record count, field count and a list of DbfFieldInfo.
# Every constructor argument is optional.
class DbfMetadata:
    record_count: int
    field_count: int
    fields: list[DbfFieldInfo]

    def __init__(
        self,
        record_count: int | None = None,
        field_count: int | None = None,
        fields: list[DbfFieldInfo] | None = None,
    ) -> None: ...

# One DBF field: a name and a field type string, both required.
class DbfFieldInfo:
    name: str
    field_type: str

    def __init__(self, name: str, field_type: str) -> None: ...

# JATS metadata: optional copyright and license strings, a str-to-str
# `history_dates` mapping and a list of ContributorRole.
class JatsMetadata:
    copyright: str | None
    license: str | None
    history_dates: dict[str, str]
    contributor_roles: list[ContributorRole]

    def __init__(
        self,
        copyright: str | None = None,  # noqa: A002
        license: str | None = None,  # noqa: A002
        history_dates: dict[str, str] | None = None,
        contributor_roles: list[ContributorRole] | None = None,
    ) -> None: ...

# One contributor: a required name and an optional role string.
class ContributorRole:
    name: str
    role: str | None

    def __init__(self, name: str, role: str | None = None) -> None: ...

# EPUB metadata: six optional string fields.
class EpubMetadata:
    coverage: str | None
    dc_format: str | None
    relation: str | None
    source: str | None
    dc_type: str | None
    cover_image: str | None

    def __init__(
        self,
        coverage: str | None = None,
        dc_format: str | None = None,
        relation: str | None = None,
        source: str | None = None,
        dc_type: str | None = None,
        cover_image: str | None = None,
    ) -> None: ...

# PST metadata: a single message count; the constructor argument is
# optional.
class PstMetadata:
    message_count: int

    def __init__(self, message_count: int | None = None) -> None: ...

# OCR confidence values: optional `detection` and non-optional
# `recognition`. Both constructor arguments are optional.
class OcrConfidence:
    detection: float | None
    recognition: float

    def __init__(
        self,
        detection: float | None = None,
        recognition: float | None = None,
    ) -> None: ...

# OCR rotation: a required angle in degrees (per the attribute name) and an
# optional confidence value.
class OcrRotation:
    angle_degrees: float
    confidence: float | None

    def __init__(self, angle_degrees: float, confidence: float | None = None) -> None: ...
# One OCR element: text, geometry, confidence, level, optional rotation,
# page number, optional parent id and a nested backend metadata mapping.
# NOTE(review): most attributes are non-optional but every constructor
# argument accepts None — presumably the native side supplies defaults;
# confirm.
class OcrElement:
    text: str
    geometry: OcrBoundingGeometry
    confidence: OcrConfidence
    level: OcrElementLevel
    rotation: OcrRotation | None
    page_number: int
    parent_id: str | None
    backend_metadata: dict[str, dict[str, Any]]

    # `level` may be passed as an OcrElementLevel, a str or None.
    def __init__(
        self,
        text: str | None = None,
        geometry: OcrBoundingGeometry | None = None,
        confidence: OcrConfidence | None = None,
        level: OcrElementLevel | str | None = None,
        rotation: OcrRotation | None = None,
        page_number: int | None = None,
        parent_id: str | None = None,
        backend_metadata: dict[str, dict[str, Any]] | None = None,
    ) -> None: ...

# OCR element options: an include flag, a minimum level, a minimum
# confidence and a hierarchy flag. Every constructor argument is optional.
class OcrElementConfig:
    include_elements: bool
    min_level: OcrElementLevel
    min_confidence: float
    build_hierarchy: bool

    # `min_level` may be passed as an OcrElementLevel, a str or None.
    def __init__(
        self,
        include_elements: bool | None = None,
        min_level: OcrElementLevel | str | None = None,
        min_confidence: float | None = None,
        build_hierarchy: bool | None = None,
    ) -> None: ...

# Page structure: a total count, a PageUnitType and optional lists of
# PageBoundary and PageInfo.
class PageStructure:
    total_count: int
    unit_type: PageUnitType
    boundaries: list[PageBoundary] | None
    pages: list[PageInfo] | None

    # `unit_type` may be passed either as a PageUnitType or as a str.
    def __init__(
        self,
        total_count: int,
        unit_type: PageUnitType | str,
        boundaries: list[PageBoundary] | None = None,
        pages: list[PageInfo] | None = None,
    ) -> None: ...

# One page boundary: `byte_start`, `byte_end` and a page number, all
# required integers.
class PageBoundary:
    byte_start: int
    byte_end: int
    page_number: int

    def __init__(self, byte_start: int, byte_end: int, page_number: int) -> None: ...

# Per-page information: a required number and `has_vector_graphics` flag,
# with optional title, dimensions, counts and hidden/blank flags.
class PageInfo:
    number: int
    title: str | None
    dimensions: list[float] | None
    image_count: int | None
    table_count: int | None
    hidden: bool | None
    is_blank: bool | None
    has_vector_graphics: bool

    # Constructor order differs from attribute order: the two required
    # arguments come first.
    def __init__(
        self,
        number: int,
        has_vector_graphics: bool,
        title: str | None = None,
        dimensions: list[float] | None = None,
        image_count: int | None = None,
        table_count: int | None = None,
        hidden: bool | None = None,
        is_blank: bool | None = None,
    ) -> None: ...
# Content of one page: page number, content string, tables and image
# indices, with optional hierarchy, blank flag, layout regions, speaker
# notes, section name and sheet name.
class PageContent:
    page_number: int
    content: str
    tables: list[Table]
    image_indices: list[int]
    hierarchy: PageHierarchy | None
    is_blank: bool | None
    layout_regions: list[LayoutRegion] | None
    speaker_notes: str | None
    section_name: str | None
    sheet_name: str | None

    def __init__(
        self,
        page_number: int,
        content: str,
        tables: list[Table],
        image_indices: list[int],
        hierarchy: PageHierarchy | None = None,
        is_blank: bool | None = None,
        layout_regions: list[LayoutRegion] | None = None,
        speaker_notes: str | None = None,
        section_name: str | None = None,
        sheet_name: str | None = None,
    ) -> None: ...

# One layout region: a class name, a confidence, a BoundingBox and an
# `area_fraction` float.
# NOTE(review): attributes are non-optional but every constructor argument
# accepts None — presumably the native side supplies defaults; confirm.
class LayoutRegion:
    class_name: str
    confidence: float
    bounding_box: BoundingBox
    area_fraction: float

    def __init__(
        self,
        class_name: str | None = None,
        confidence: float | None = None,
        bounding_box: BoundingBox | None = None,
        area_fraction: float | None = None,
    ) -> None: ...

# Page hierarchy: a block count and a list of HierarchicalBlock, both
# required.
class PageHierarchy:
    block_count: int
    blocks: list[HierarchicalBlock]

    def __init__(self, block_count: int, blocks: list[HierarchicalBlock]) -> None: ...

# One hierarchy block: text, font size, a level string and an optional
# `bbox` float list.
class HierarchicalBlock:
    text: str
    font_size: float
    level: str
    bbox: list[float] | None

    def __init__(
        self,
        text: str,
        font_size: float,
        level: str,
        bbox: list[float] | None = None,
    ) -> None: ...

# One cell change: row and column integers plus `from_` and `to` strings.
# The trailing underscore on `from_` avoids the reserved word `from`.
class CellChange:
    row: int
    col: int
    from_: str
    to: str

    def __init__(self, row: int, col: int, from_: str, to: str) -> None: ...

# One document revision: id, optional author and timestamp, a RevisionKind,
# an optional RevisionAnchor and a RevisionDelta.
class DocumentRevision:
    revision_id: str
    author: str | None
    timestamp: str | None
    kind: RevisionKind
    anchor: RevisionAnchor | None
    delta: RevisionDelta

    # Required arguments come first; `kind` may also be given as a str.
    def __init__(
        self,
        revision_id: str,
        kind: RevisionKind | str,
        delta: RevisionDelta,
        author: str | None = None,
        timestamp: str | None = None,
        anchor: RevisionAnchor | None = None,
    ) -> None: ...

# Revision delta: a list of DiffLine and a list of CellChange. Both
# constructor arguments are optional.
class RevisionDelta:
    content: list[DiffLine]
    table_changes: list[CellChange]

    def __init__(
        self,
        content: list[DiffLine] | None = None,
        table_changes: list[CellChange] | None = None,
    ) -> None: ...
# A table: grid of cell strings, markdown string, page number and an
# optional BoundingBox.
# NOTE(review): `cells`, `markdown` and `page_number` are non-optional
# attributes but the constructor accepts None for them — presumably the
# native side supplies defaults; confirm.
class Table:
    cells: list[list[str]]
    markdown: str
    page_number: int
    bounding_box: BoundingBox | None

    def __init__(
        self,
        cells: list[list[str]] | None = None,
        markdown: str | None = None,
        page_number: int | None = None,
        bounding_box: BoundingBox | None = None,
    ) -> None: ...

# One table cell: content string, row and column spans and a header flag.
# Every constructor argument is optional.
class TableCell:
    content: str
    row_span: int
    col_span: int
    is_header: bool

    def __init__(
        self,
        content: str | None = None,
        row_span: int | None = None,
        col_span: int | None = None,
        is_header: bool | None = None,
    ) -> None: ...

# One extracted URI: url, optional label, optional page and a UriKind.
class ExtractedUri:
    url: str
    label: str | None
    page: int | None
    kind: UriKind

    # Required arguments come first; `kind` may also be given as a str.
    def __init__(
        self,
        url: str,
        kind: UriKind | str,
        label: str | None = None,
        page: int | None = None,
    ) -> None: ...

# Detection response: a required MIME type and an optional filename.
class DetectResponse:
    mime_type: str
    filename: str | None

    def __init__(self, mime_type: str, filename: str | None = None) -> None: ...

# Diff options: two include flags and an optional character limit. Every
# constructor argument is optional.
class DiffOptions:
    include_metadata: bool
    include_embedded: bool
    max_content_chars: int | None

    def __init__(
        self,
        include_metadata: bool | None = None,
        include_embedded: bool | None = None,
        max_content_chars: int | None = None,
    ) -> None: ...

    # Argument-less factory returning a DiffOptions.
    @staticmethod
    def default() -> DiffOptions: ...

# Diff between two extractions: content hunks, tables added/removed/changed,
# a metadata change mapping and EmbeddedChanges. All arguments are required.
class ExtractionDiff:
    content_diff: list[DiffHunk]
    tables_added: list[Table]
    tables_removed: list[Table]
    tables_changed: list[TableDiff]
    metadata_changed: dict[str, Any]
    embedded_changes: EmbeddedChanges

    def __init__(
        self,
        content_diff: list[DiffHunk],
        tables_added: list[Table],
        tables_removed: list[Table],
        tables_changed: list[TableDiff],
        metadata_changed: dict[str, Any],
        embedded_changes: EmbeddedChanges,
    ) -> None: ...

# One diff hunk: start line and count for the "from" and "to" sides plus a
# list of DiffLine.
class DiffHunk:
    from_line: int
    from_count: int
    to_line: int
    to_count: int
    lines: list[DiffLine]

    def __init__(
        self,
        from_line: int,
        from_count: int,
        to_line: int,
        to_count: int,
        lines: list[DiffLine],
    ) -> None: ...

# Diff of one table: "from" and "to" indices plus a list of CellChange.
class TableDiff:
    from_index: int
    to_index: int
    cell_changes: list[CellChange]

    def __init__(self, from_index: int, to_index: int, cell_changes: list[CellChange]) -> None: ...
# Changes to embedded entries: added and removed ArchiveEntry lists and a
# list of EmbeddedDiff for changed ones. All arguments are required.
class EmbeddedChanges:
    added: list[ArchiveEntry]
    removed: list[ArchiveEntry]
    changed: list[EmbeddedDiff]

    def __init__(
        self,
        added: list[ArchiveEntry],
        removed: list[ArchiveEntry],
        changed: list[EmbeddedDiff],
    ) -> None: ...

# Diff of one embedded entry: its path and a nested ExtractionDiff.
class EmbeddedDiff:
    path: str
    diff: ExtractionDiff

    def __init__(self, path: str, diff: ExtractionDiff) -> None: ...

# Embedding preset: name, chunk size, overlap, model repository, pooling,
# model file, dimensions and description. All arguments are required.
class EmbeddingPreset:
    name: str
    chunk_size: int
    overlap: int
    model_repo: str
    pooling: str
    model_file: str
    dimensions: int
    description: str

    def __init__(
        self,
        name: str,
        chunk_size: int,
        overlap: int,
        model_repo: str,
        pooling: str,
        model_file: str,
        dimensions: int,
        description: str,
    ) -> None: ...

# YAKE parameters: a single window size; the constructor argument is
# optional.
class YakeParams:
    window_size: int

    def __init__(self, window_size: int | None = None) -> None: ...

    # Argument-less factory returning a YakeParams.
    @staticmethod
    def default() -> YakeParams: ...

# RAKE parameters: minimum word length and maximum words per phrase. Both
# constructor arguments are optional.
class RakeParams:
    min_word_length: int
    max_words_per_phrase: int

    def __init__(
        self,
        min_word_length: int | None = None,
        max_words_per_phrase: int | None = None,
    ) -> None: ...

    # Argument-less factory returning a RakeParams.
    @staticmethod
    def default() -> RakeParams: ...

# Keyword extraction options: algorithm, keyword limit, minimum score,
# n-gram range, optional language and optional YAKE/RAKE parameter objects.
class KeywordConfig:
    algorithm: KeywordAlgorithm
    max_keywords: int
    min_score: float
    ngram_range: list[int]
    language: str | None
    yake_params: YakeParams | None
    rake_params: RakeParams | None

    # NOTE(review): `yake_params` and `rake_params` are declared as
    # attributes but are not constructor arguments in this stub — confirm
    # this matches the native signature.
    def __init__(
        self,
        algorithm: KeywordAlgorithm | str | None = None,
        max_keywords: int | None = None,
        min_score: float | None = None,
        ngram_range: list[int] | None = None,
        language: str | None = None,
    ) -> None: ...

    # Argument-less factory returning a KeywordConfig.
    @staticmethod
    def default() -> KeywordConfig: ...

# One keyword: text, score, the KeywordAlgorithm it is associated with and
# an optional list of integer positions.
class Keyword:
    text: str
    score: float
    algorithm: KeywordAlgorithm
    positions: list[int] | None

    # `algorithm` may be passed either as a KeywordAlgorithm or as a str.
    def __init__(
        self,
        text: str,
        score: float,
        algorithm: KeywordAlgorithm | str,
        positions: list[int] | None = None,
    ) -> None: ...
# PaddleOCR options: language, optional cache directory, angle-classifier
# and table-detection flags, detection thresholds, batch size, padding,
# drop score and model tier. Every constructor argument is optional.
class PaddleOcrConfig:
    language: str
    cache_dir: str | None
    use_angle_cls: bool
    enable_table_detection: bool
    det_db_thresh: float
    det_db_box_thresh: float
    det_db_unclip_ratio: float
    det_limit_side_len: int
    rec_batch_num: int
    padding: int
    drop_score: float
    model_tier: str

    def __init__(
        self,
        language: str | None = None,
        cache_dir: str | None = None,
        use_angle_cls: bool | None = None,
        enable_table_detection: bool | None = None,
        det_db_thresh: float | None = None,
        det_db_box_thresh: float | None = None,
        det_db_unclip_ratio: float | None = None,
        det_limit_side_len: int | None = None,
        rec_batch_num: int | None = None,
        padding: int | None = None,
        drop_score: float | None = None,
        model_tier: str | None = None,
    ) -> None: ...

    # `with_*` methods each take one value and return a PaddleOcrConfig.
    # NOTE(review): whether they mutate `self` or return a modified copy is
    # not visible in this stub — confirm before relying on either.
    def with_cache_dir(self, path: str) -> PaddleOcrConfig: ...
    def with_table_detection(self, enable: bool) -> PaddleOcrConfig: ...
    def with_angle_cls(self, enable: bool) -> PaddleOcrConfig: ...
    def with_det_db_thresh(self, threshold: float) -> PaddleOcrConfig: ...
    def with_det_db_box_thresh(self, threshold: float) -> PaddleOcrConfig: ...
    def with_det_db_unclip_ratio(self, ratio: float) -> PaddleOcrConfig: ...
    def with_det_limit_side_len(self, length: int) -> PaddleOcrConfig: ...
    def with_rec_batch_num(self, batch_size: int) -> PaddleOcrConfig: ...
    def with_drop_score(self, score: float) -> PaddleOcrConfig: ...
    def with_padding(self, padding: int) -> PaddleOcrConfig: ...
    def with_model_tier(self, tier: str) -> PaddleOcrConfig: ...

    # Argument-less factory returning a PaddleOcrConfig.
    @staticmethod
    def default() -> PaddleOcrConfig: ...

# Four required strings naming the detection, classification and
# recognition models and the dictionary file.
class ModelPaths:
    det_model: str
    cls_model: str
    rec_model: str
    dict_file: str

    def __init__(self, det_model: str, cls_model: str, rec_model: str, dict_file: str) -> None: ...

# Orientation result: an integer number of degrees and a confidence value.
class OrientationResult:
    degrees: int
    confidence: float

    def __init__(self, degrees: int, confidence: float) -> None: ...

# Box with four required float coordinates (x1, y1, x2, y2).
class BBox:
    x1: float
    y1: float
    x2: float
    y2: float

    def __init__(self, x1: float, y1: float, x2: float, y2: float) -> None: ...
# One layout detection: a LayoutClass, a confidence and a BBox.
# `class_name` may be passed either as a LayoutClass or as a str.
class LayoutDetection:
    class_name: LayoutClass
    confidence: float
    bbox: BBox

    def __init__(self, class_name: LayoutClass | str, confidence: float, bbox: BBox) -> None: ...

# One recognized table: its detection BBox, a grid of cell strings and a
# markdown string.
class RecognizedTable:
    detection_bbox: BBox
    cells: list[list[str]]
    markdown: str

    def __init__(self, detection_bbox: BBox, cells: list[list[str]], markdown: str) -> None: ...

# Detection result for one page: page width and height plus the list of
# LayoutDetection.
class DetectionResult:
    page_width: int
    page_height: int
    detections: list[LayoutDetection]

    def __init__(
        self,
        page_width: int,
        page_height: int,
        detections: list[LayoutDetection],
    ) -> None: ...

# One embedded file: name, raw bytes, compressed size and an optional MIME
# type.
class EmbeddedFile:
    name: str
    data: bytes
    compressed_size: int
    mime_type: str | None

    def __init__(
        self,
        name: str,
        data: bytes,
        compressed_size: int,
        mime_type: str | None = None,
    ) -> None: ...

# PDF metadata: six optional fields (version, producer, encryption flag,
# width, height, page count).
class PdfMetadata:
    pdf_version: str | None
    producer: str | None
    is_encrypted: bool | None
    width: int | None
    height: int | None
    page_count: int | None

    def __init__(
        self,
        pdf_version: str | None = None,
        producer: str | None = None,
        is_encrypted: bool | None = None,
        width: int | None = None,
        height: int | None = None,
        page_count: int | None = None,
    ) -> None: ...

# Enum-like stub: members are class attributes typed as the class itself;
# the constructor accepts an int or a str.
class ExecutionProviderType:
    auto: ExecutionProviderType = ...
    cpu: ExecutionProviderType = ...
    core_ml: ExecutionProviderType = ...
    cuda: ExecutionProviderType = ...
    tensor_rt: ExecutionProviderType = ...

    def __init__(self, value: int | str) -> None: ...

# TypedDict shapes for the OutputFormat variants; each fixes the `type` key
# to a distinct literal. Only the "custom" variant has a payload (`_0`).
class OutputFormatPlainVariant(TypedDict):
    type: Literal["plain"]

class OutputFormatMarkdownVariant(TypedDict):
    type: Literal["markdown"]

class OutputFormatDjotVariant(TypedDict):
    type: Literal["djot"]

class OutputFormatHtmlVariant(TypedDict):
    type: Literal["html"]

class OutputFormatJsonVariant(TypedDict):
    type: Literal["json"]

class OutputFormatStructuredVariant(TypedDict):
    type: Literal["structured"]

class OutputFormatCustomVariant(TypedDict):
    type: Literal["custom"]
    _0: str

# Class declared alongside the variant shapes above: exposes a `type`
# string plus `__str__`/`__repr__`.
class OutputFormat:
    type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# Enum-like stub of HTML themes; constructible from an int or a str.
class HtmlTheme:
    default: HtmlTheme = ...
    git_hub: HtmlTheme = ...
    dark: HtmlTheme = ...
    light: HtmlTheme = ...
    unstyled: HtmlTheme = ...

    def __init__(self, value: int | str) -> None: ...

# Enum-like stub of table models; constructible from an int or a str.
class TableModel:
    tatr: TableModel = ...
    slanet_wired: TableModel = ...
    slanet_wireless: TableModel = ...
    slanet_plus: TableModel = ...
    slanet_auto: TableModel = ...
    disabled: TableModel = ...

    def __init__(self, value: int | str) -> None: ...

# Enum-like stub of chunker types; constructible from an int or a str.
class ChunkerType:
    text: ChunkerType = ...
    markdown: ChunkerType = ...
    yaml: ChunkerType = ...
    semantic: ChunkerType = ...

    def __init__(self, value: int | str) -> None: ...

# TypedDict shapes for the ChunkSizing variants, keyed by the `type`
# literal. The "tokenizer" variant carries a model name and an optional
# cache directory.
class ChunkSizingCharactersVariant(TypedDict):
    type: Literal["characters"]

class ChunkSizingTokenizerVariant(TypedDict):
    type: Literal["tokenizer"]
    model: str
    cache_dir: str | None

# Class declared alongside the variant shapes above: exposes a `type`
# string plus `__str__`/`__repr__`.
class ChunkSizing:
    type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# TypedDict shapes for the EmbeddingModelType variants, keyed by the `type`
# literal; each carries its own payload keys.
class EmbeddingModelTypePresetVariant(TypedDict):
    type: Literal["preset"]
    name: str

class EmbeddingModelTypeCustomVariant(TypedDict):
    type: Literal["custom"]
    model_id: str
    dimensions: int

class EmbeddingModelTypeLlmVariant(TypedDict):
    type: Literal["llm"]
    llm: LlmConfig

class EmbeddingModelTypePluginVariant(TypedDict):
    type: Literal["plugin"]
    name: str

# Class declared alongside the variant shapes above: exposes a `type`
# string plus `__str__`/`__repr__`.
class EmbeddingModelType:
    type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# Enum-like stub of code content modes; constructible from an int or a str.
class CodeContentMode:
    chunks: CodeContentMode = ...
    raw: CodeContentMode = ...
    structure: CodeContentMode = ...

    def __init__(self, value: int | str) -> None: ...

# Enum-like stub of list types; constructible from an int or a str.
class ListType:
    bullet: ListType = ...
    numbered: ListType = ...
    lettered: ListType = ...
    indented: ListType = ...

    def __init__(self, value: int | str) -> None: ...

# Enum-like stub of OCR backends; constructible from an int or a str.
class OcrBackendType:
    tesseract: OcrBackendType = ...
    easy_ocr: OcrBackendType = ...
    paddle_ocr: OcrBackendType = ...
    custom: OcrBackendType = ...

    def __init__(self, value: int | str) -> None: ...

# Enum-like stub of processing stages; constructible from an int or a str.
class ProcessingStage:
    early: ProcessingStage = ...
    middle: ProcessingStage = ...
    late: ProcessingStage = ...

    def __init__(self, value: int | str) -> None: ...

# Enum-like stub of reduction levels; constructible from an int or a str.
class ReductionLevel:
    off: ReductionLevel = ...
    light: ReductionLevel = ...
    moderate: ReductionLevel = ...
    aggressive: ReductionLevel = ...
    maximum: ReductionLevel = ...

    def __init__(self, value: int | str) -> None: ...

# Enum-like stub of PDF annotation types; constructible from an int or a
# str.
class PdfAnnotationType:
    text: PdfAnnotationType = ...
    highlight: PdfAnnotationType = ...
    link: PdfAnnotationType = ...
    stamp: PdfAnnotationType = ...
    underline: PdfAnnotationType = ...
    strike_out: PdfAnnotationType = ...
    other: PdfAnnotationType = ...

    def __init__(self, value: int | str) -> None: ...

# Enum-like stub of block types; constructible from an int or a str.
class BlockType:
    paragraph: BlockType = ...
    heading: BlockType = ...
    blockquote: BlockType = ...
    code_block: BlockType = ...
    list_item: BlockType = ...
    ordered_list: BlockType = ...
    bullet_list: BlockType = ...
    task_list: BlockType = ...
    definition_list: BlockType = ...
    definition_term: BlockType = ...
    definition_description: BlockType = ...
    div: BlockType = ...
    section: BlockType = ...
    thematic_break: BlockType = ...
    raw_block: BlockType = ...
    math_display: BlockType = ...

    def __init__(self, value: int | str) -> None: ...

# Enum-like stub of inline types; constructible from an int or a str.
class InlineType:
    text: InlineType = ...
    strong: InlineType = ...
    emphasis: InlineType = ...
    highlight: InlineType = ...
    subscript: InlineType = ...
    superscript: InlineType = ...
    insert: InlineType = ...
    delete: InlineType = ...
    code: InlineType = ...
    link: InlineType = ...
    image: InlineType = ...
    span: InlineType = ...
    math: InlineType = ...
    raw_inline: InlineType = ...
    footnote_ref: InlineType = ...
    symbol: InlineType = ...

    def __init__(self, value: int | str) -> None: ...

# Enum-like stub of relationship kinds; constructible from an int or a str.
class RelationshipKind:
    footnote_reference: RelationshipKind = ...
    citation_reference: RelationshipKind = ...
    internal_link: RelationshipKind = ...
    caption: RelationshipKind = ...
    label: RelationshipKind = ...
    toc_entry: RelationshipKind = ...
    cross_reference: RelationshipKind = ...

    def __init__(self, value: int | str) -> None: ...

# Enum-like stub of content layers; constructible from an int or a str.
class ContentLayer:
    body: ContentLayer = ...
    header: ContentLayer = ...
    footer: ContentLayer = ...
    footnote: ContentLayer = ...

    def __init__(self, value: int | str) -> None: ...

# TypedDict shapes for the NodeContent variants. Each fixes the `node_type`
# key to a distinct literal and adds that variant's payload keys (some
# variants have none).
class NodeContentTitleVariant(TypedDict):
    node_type: Literal["title"]
    text: str

class NodeContentHeadingVariant(TypedDict):
    node_type: Literal["heading"]
    level: int
    text: str

class NodeContentParagraphVariant(TypedDict):
    node_type: Literal["paragraph"]
    text: str

class NodeContentListVariant(TypedDict):
    node_type: Literal["list"]
    ordered: bool

class NodeContentListItemVariant(TypedDict):
    node_type: Literal["list_item"]
    text: str

class NodeContentTableVariant(TypedDict):
    node_type: Literal["table"]
    grid: TableGrid

class NodeContentImageVariant(TypedDict):
    node_type: Literal["image"]
    description: str | None
    image_index: int | None
    src: str | None

class NodeContentCodeVariant(TypedDict):
    node_type: Literal["code"]
    text: str
    language: str | None

class NodeContentQuoteVariant(TypedDict):
    node_type: Literal["quote"]

class NodeContentFormulaVariant(TypedDict):
    node_type: Literal["formula"]
    text: str

class NodeContentFootnoteVariant(TypedDict):
    node_type: Literal["footnote"]
    text: str

class NodeContentGroupVariant(TypedDict):
    node_type: Literal["group"]
    label: str | None
    heading_level: int | None
    heading_text: str | None

class NodeContentPageBreakVariant(TypedDict):
    node_type: Literal["page_break"]

class NodeContentSlideVariant(TypedDict):
    node_type: Literal["slide"]
    number: int
    title: str | None

class NodeContentDefinitionListVariant(TypedDict):
    node_type: Literal["definition_list"]

class NodeContentDefinitionItemVariant(TypedDict):
    node_type: Literal["definition_item"]
    term: str
    definition: str

class NodeContentCitationVariant(TypedDict):
    node_type: Literal["citation"]
    key: str
    text: str

class NodeContentAdmonitionVariant(TypedDict):
    node_type: Literal["admonition"]
    kind: str
    title: str | None

class NodeContentRawBlockVariant(TypedDict):
    node_type: Literal["raw_block"]
    format: str
    content: str

class NodeContentMetadataBlockVariant(TypedDict):
    node_type: Literal["metadata_block"]
    entries: list[list[str]]

# Class declared alongside the variant shapes above: exposes a `node_type`
# string plus `__str__`/`__repr__`.
class NodeContent:
    node_type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# TypedDict shapes for the AnnotationKind variants. Each fixes the
# `annotation_type` key to a distinct literal; only link, color, font_size
# and custom carry payload keys.
class AnnotationKindBoldVariant(TypedDict):
    annotation_type: Literal["bold"]

class AnnotationKindItalicVariant(TypedDict):
    annotation_type: Literal["italic"]

class AnnotationKindUnderlineVariant(TypedDict):
    annotation_type: Literal["underline"]

class AnnotationKindStrikethroughVariant(TypedDict):
    annotation_type: Literal["strikethrough"]

class AnnotationKindCodeVariant(TypedDict):
    annotation_type: Literal["code"]

class AnnotationKindSubscriptVariant(TypedDict):
    annotation_type: Literal["subscript"]

class AnnotationKindSuperscriptVariant(TypedDict):
    annotation_type: Literal["superscript"]

class AnnotationKindLinkVariant(TypedDict):
    annotation_type: Literal["link"]
    url: str
    title: str | None

class AnnotationKindHighlightVariant(TypedDict):
    annotation_type: Literal["highlight"]

class AnnotationKindColorVariant(TypedDict):
    annotation_type: Literal["color"]
    value: str

class AnnotationKindFontSizeVariant(TypedDict):
    annotation_type: Literal["font_size"]
    value: str

class AnnotationKindCustomVariant(TypedDict):
    annotation_type: Literal["custom"]
    name: str
    value: str | None

# Class declared alongside the variant shapes above: exposes an
# `annotation_type` string plus `__str__`/`__repr__`.
class AnnotationKind:
    annotation_type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# Enum-like stub of extraction methods; constructible from an int or a str.
class ExtractionMethod:
    native: ExtractionMethod = ...
    ocr: ExtractionMethod = ...
    mixed: ExtractionMethod = ...

    def __init__(self, value: int | str) -> None: ...

# Enum-like stub of chunk types; constructible from an int or a str.
class ChunkType:
    heading: ChunkType = ...
    party_list: ChunkType = ...
    definitions: ChunkType = ...
    operative_clause: ChunkType = ...
    signature_block: ChunkType = ...
    schedule: ChunkType = ...
    table_like: ChunkType = ...
    formula: ChunkType = ...
    code_block: ChunkType = ...
    image: ChunkType = ...
    org_chart: ChunkType = ...
    diagram: ChunkType = ...
    unknown: ChunkType = ...

    def __init__(self, value: int | str) -> None: ...
# --- Enum-style classes -----------------------------------------------------
# Pattern used throughout: every member is a class attribute typed as the class
# itself, and the constructor accepts either an int or a str value.

class ImageKind:
    photograph: ImageKind = ...
    diagram: ImageKind = ...
    chart: ImageKind = ...
    drawing: ImageKind = ...
    text_block: ImageKind = ...
    decoration: ImageKind = ...
    logo: ImageKind = ...
    icon: ImageKind = ...
    tile_fragment: ImageKind = ...
    mask: ImageKind = ...
    page_raster: ImageKind = ...
    unknown: ImageKind = ...
    def __init__(self, value: int | str) -> None: ...

class ResultFormat:
    unified: ResultFormat = ...
    element_based: ResultFormat = ...
    def __init__(self, value: int | str) -> None: ...

# The first member is spelled `title_` with a trailing underscore.
class ElementType:
    title_: ElementType = ...
    narrative_text: ElementType = ...
    heading: ElementType = ...
    list_item: ElementType = ...
    table: ElementType = ...
    image: ElementType = ...
    page_break: ElementType = ...
    code_block: ElementType = ...
    block_quote: ElementType = ...
    footer: ElementType = ...
    header: ElementType = ...
    def __init__(self, value: int | str) -> None: ...

# --- FormatMetadata variant shapes ------------------------------------------
# Each TypedDict pins `format_type` to one literal string and carries a single
# payload under the key `_0`.
# NOTE(review): `_0` looks like the positional field of a tuple-style variant
# in the generator's source language — confirm.

class FormatMetadataPdfVariant(TypedDict):
    format_type: Literal["pdf"]
    _0: PdfMetadata

class FormatMetadataDocxVariant(TypedDict):
    format_type: Literal["docx"]
    _0: DocxMetadata

class FormatMetadataExcelVariant(TypedDict):
    format_type: Literal["excel"]
    _0: ExcelMetadata

class FormatMetadataEmailVariant(TypedDict):
    format_type: Literal["email"]
    _0: EmailMetadata

class FormatMetadataPptxVariant(TypedDict):
    format_type: Literal["pptx"]
    _0: PptxMetadata

class FormatMetadataArchiveVariant(TypedDict):
    format_type: Literal["archive"]
    _0: ArchiveMetadata

class FormatMetadataImageVariant(TypedDict):
    format_type: Literal["image"]
    _0: ImageMetadata

class FormatMetadataXmlVariant(TypedDict):
    format_type: Literal["xml"]
    _0: XmlMetadata

class FormatMetadataTextVariant(TypedDict):
    format_type: Literal["text"]
    _0: TextMetadata

class FormatMetadataHtmlVariant(TypedDict):
    format_type: Literal["html"]
    _0: HtmlMetadata

class FormatMetadataOcrVariant(TypedDict):
    format_type: Literal["ocr"]
    _0: OcrMetadata

class FormatMetadataCsvVariant(TypedDict):
    format_type: Literal["csv"]
    _0: CsvMetadata

class FormatMetadataBibtexVariant(TypedDict):
    format_type: Literal["bibtex"]
    _0: BibtexMetadata

class FormatMetadataCitationVariant(TypedDict):
    format_type: Literal["citation"]
    _0: CitationMetadata

class FormatMetadataFictionBookVariant(TypedDict):
    format_type: Literal["fiction_book"]
    _0: FictionBookMetadata

class FormatMetadataDbfVariant(TypedDict):
    format_type: Literal["dbf"]
    _0: DbfMetadata

class FormatMetadataJatsVariant(TypedDict):
    format_type: Literal["jats"]
    _0: JatsMetadata

class FormatMetadataEpubVariant(TypedDict):
    format_type: Literal["epub"]
    _0: EpubMetadata

class FormatMetadataPstVariant(TypedDict):
    format_type: Literal["pst"]
    _0: PstMetadata

# Unlike the other variants, the payload here is a plain string.
class FormatMetadataCodeVariant(TypedDict):
    format_type: Literal["code"]
    _0: str

# Class form: exposes only the `format_type` tag as a plain string, plus
# explicit __str__/__repr__ declarations (hence the PYI029 suppressions).
class FormatMetadata:
    format_type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

# --- More enum-style classes (same member/constructor pattern) --------------

class TextDirection:
    left_to_right: TextDirection = ...
    right_to_left: TextDirection = ...
    auto: TextDirection = ...
    def __init__(self, value: int | str) -> None: ...

class LinkType:
    anchor: LinkType = ...
    internal: LinkType = ...
    external: LinkType = ...
    email: LinkType = ...
    phone: LinkType = ...
    other: LinkType = ...
    def __init__(self, value: int | str) -> None: ...

class ImageType:
    data_uri: ImageType = ...
    inline_svg: ImageType = ...
    external: ImageType = ...
    relative: ImageType = ...
    def __init__(self, value: int | str) -> None: ...

class StructuredDataType:
    json_ld: StructuredDataType = ...
    microdata: StructuredDataType = ...
    rd_fa: StructuredDataType = ...
    def __init__(self, value: int | str) -> None: ...

# --- OcrBoundingGeometry variant shapes -------------------------------------
# Tagged by the `type` key.

class OcrBoundingGeometryRectangleVariant(TypedDict):
    type: Literal["rectangle"]
    left: int
    top: int
    width: int
    height: int

# NOTE(review): `points` is typed as a plain str here; the encoding of the
# points inside that string is not visible from this stub — confirm.
class OcrBoundingGeometryQuadrilateralVariant(TypedDict):
    type: Literal["quadrilateral"]
    points: str

# Class form: exposes only the `type` tag as a plain string.
class OcrBoundingGeometry:
    type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

class OcrElementLevel:
    word: OcrElementLevel = ...
    line: OcrElementLevel = ...
    block: OcrElementLevel = ...
    page: OcrElementLevel = ...
    def __init__(self, value: int | str) -> None: ...

class PageUnitType:
    page: PageUnitType = ...
    slide: PageUnitType = ...
    sheet: PageUnitType = ...
    def __init__(self, value: int | str) -> None: ...

# --- DiffLine variant shapes --------------------------------------------------
# Tagged by `kind`; each variant carries one string payload under `_0`.

class DiffLineContextVariant(TypedDict):
    kind: Literal["context"]
    _0: str

class DiffLineAddedVariant(TypedDict):
    kind: Literal["added"]
    _0: str

class DiffLineRemovedVariant(TypedDict):
    kind: Literal["removed"]
    _0: str

# Class form: exposes only the `kind` tag as a plain string.
class DiffLine:
    kind: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

class RevisionKind:
    insertion: RevisionKind = ...
    deletion: RevisionKind = ...
    format_change: RevisionKind = ...
    comment: RevisionKind = ...
    def __init__(self, value: int | str) -> None: ...

# --- RevisionAnchor variant shapes --------------------------------------------
# Tagged by the `type` key; every variant except table_cell has an int `index`.

class RevisionAnchorParagraphVariant(TypedDict):
    type: Literal["paragraph"]
    index: int

class RevisionAnchorTableCellVariant(TypedDict):
    type: Literal["table_cell"]
    row: int
    col: int
    table_index: int

class RevisionAnchorPageVariant(TypedDict):
    type: Literal["page"]
    index: int

class RevisionAnchorSlideVariant(TypedDict):
    type: Literal["slide"]
    index: int

class RevisionAnchorSheetVariant(TypedDict):
    type: Literal["sheet"]
    index: int
    name: str | None

# Class form: exposes only the `type` tag as a plain string.
class RevisionAnchor:
    type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029

class UriKind:
    hyperlink: UriKind = ...
    image: UriKind = ...
    anchor: UriKind = ...
    citation: UriKind = ...
    reference: UriKind = ...
    email: UriKind = ...
    def __init__(self, value: int | str) -> None: ...

class KeywordAlgorithm:
    yake: KeywordAlgorithm = ...
    rake: KeywordAlgorithm = ...
    def __init__(self, value: int | str) -> None: ...

# NOTE(review): the member names resemble OCR page-segmentation modes; the
# numeric value of each member is not visible from this stub — confirm.
class PSMMode:
    osd_only: PSMMode = ...
    auto_osd: PSMMode = ...
    auto_only: PSMMode = ...
    auto: PSMMode = ...
    single_column: PSMMode = ...
    single_block_vertical: PSMMode = ...
    single_block: PSMMode = ...
    single_line: PSMMode = ...
    single_word: PSMMode = ...
    circle_word: PSMMode = ...
    single_char: PSMMode = ...
    def __init__(self, value: int | str) -> None: ...

class PaddleLanguage:
    english: PaddleLanguage = ...
    chinese: PaddleLanguage = ...
    japanese: PaddleLanguage = ...
    korean: PaddleLanguage = ...
    german: PaddleLanguage = ...
    french: PaddleLanguage = ...
    latin: PaddleLanguage = ...
    cyrillic: PaddleLanguage = ...
    traditional_chinese: PaddleLanguage = ...
    thai: PaddleLanguage = ...
    greek: PaddleLanguage = ...
    east_slavic: PaddleLanguage = ...
    arabic: PaddleLanguage = ...
    devanagari: PaddleLanguage = ...
    tamil: PaddleLanguage = ...
    telugu: PaddleLanguage = ...
    def __init__(self, value: int | str) -> None: ...

# As in ElementType, the title member is spelled `title_`.
class LayoutClass:
    caption: LayoutClass = ...
    footnote: LayoutClass = ...
    formula: LayoutClass = ...
    list_item: LayoutClass = ...
    page_footer: LayoutClass = ...
    page_header: LayoutClass = ...
    picture: LayoutClass = ...
    section_header: LayoutClass = ...
    table: LayoutClass = ...
    text: LayoutClass = ...
    title_: LayoutClass = ...
    document_index: LayoutClass = ...
    code: LayoutClass = ...
    checkbox_selected: LayoutClass = ...
    checkbox_unselected: LayoutClass = ...
    form: LayoutClass = ...
    key_value_region: LayoutClass = ...
    def __init__(self, value: int | str) -> None: ...

# --- Extraction entry points --------------------------------------------------
# Coroutine and *_sync variants share parameter lists. Note the parameter
# order: bytes-based functions take (content, mime_type, config) with a required
# MIME type, file-based ones take (path, config, mime_type=None).

async def extract_bytes(
    content: bytes,
    mime_type: str,
    config: ExtractionConfig,
) -> ExtractionResult: ...

async def extract_file(
    path: str,
    config: ExtractionConfig,
    mime_type: str | None = None,
) -> ExtractionResult: ...

def extract_file_sync(
    path: str,
    config: ExtractionConfig,
    mime_type: str | None = None,
) -> ExtractionResult: ...

def extract_bytes_sync(
    content: bytes,
    mime_type: str,
    config: ExtractionConfig,
) -> ExtractionResult: ...

# Batch variants take a list of items plus one config and return a list of
# results.
def batch_extract_files_sync(
    items: list[BatchFileItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...

def batch_extract_bytes_sync(
    items: list[BatchBytesItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...
# Coroutine batch entry points: a list of items plus one config in, a list of
# results out.
async def batch_extract_files(
    items: list[BatchFileItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...

async def batch_extract_bytes(
    items: list[BatchBytesItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...

# MIME helpers.
def detect_mime_type_from_bytes(content: bytes) -> str: ...
def get_extensions_for_mime(mime_type: str) -> list[str]: ...

# list_*() returns a list of strings; clear_*() takes no arguments and returns
# None.
def clear_embedding_backends() -> None: ...
def list_embedding_backends() -> list[str]: ...
def list_document_extractors() -> list[str]: ...
def clear_document_extractors() -> None: ...
def list_ocr_backends() -> list[str]: ...
def clear_ocr_backends() -> None: ...
def list_post_processors() -> list[str]: ...
def clear_post_processors() -> None: ...
def list_renderers() -> list[str]: ...
def clear_renderers() -> None: ...
def list_validators() -> list[str]: ...
def clear_validators() -> None: ...

def compare(a: ExtractionResult, b: ExtractionResult, opts: DiffOptions) -> ExtractionDiff: ...

# Coroutine counterpart of embed_texts() (identical parameters and return type).
async def embed_texts_async(texts: list[str], config: EmbeddingConfig) -> list[list[float]]: ...

# `dpi` and `password` are optional and default to None.
def render_pdf_page_to_png(
    pdf_bytes: bytes,
    page_index: int,
    dpi: int | None = None,
    password: str | None = None,
) -> bytes: ...

# `check_exists` is a required positional bool (no default).
def detect_mime_type(path: str, check_exists: bool) -> str: ...
def embed_texts(texts: list[str], config: EmbeddingConfig) -> list[list[float]]: ...

# Returns None rather than raising when no preset is returned for `name`
# (per the `EmbeddingPreset | None` return annotation).
def get_embedding_preset(name: str) -> EmbeddingPreset | None: ...
def list_embedding_presets() -> list[str]: ...

# register_*() takes an arbitrary object; unregister_*() takes a name string.
# clear_ocr_backends, clear_post_processors and clear_validators are declared
# above next to their list_* counterparts and are deliberately not repeated
# here: a second declaration of the same name is reported as a redefinition by
# type checkers and linters.
def register_ocr_backend(backend: object) -> None: ...
def unregister_ocr_backend(name: str) -> None: ...
def register_post_processor(backend: object) -> None: ...
def unregister_post_processor(name: str) -> None: ...
def register_validator(backend: object) -> None: ...
def unregister_validator(name: str) -> None: ...
def register_embedding_backend(backend: object) -> None: ...
# register_*() takes an arbitrary object; unregister_*() takes a name string.
# All of them return None.
# clear_embedding_backends, clear_document_extractors and clear_renderers are
# already declared earlier in this stub (next to the list_* functions), so they
# are not declared a second time here: repeating a name is reported as a
# redefinition by type checkers and linters.
def unregister_embedding_backend(name: str) -> None: ...
def register_document_extractor(backend: object) -> None: ...
def unregister_document_extractor(name: str) -> None: ...
def register_renderer(backend: object) -> None: ...
def unregister_renderer(name: str) -> None: ...