Files
fil/packages/python/kreuzberg/_kreuzberg.pyi
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

3058 lines
87 KiB
Python
Generated

# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
from typing import Any, Literal, TypedDict
class CacheStats:
    # Type stub for `CacheStats`.
    # All five fields are required constructor arguments (no defaults),
    # accepted in the same order in which the attributes are declared.
    total_files: int
    total_size_mb: float
    available_space_mb: float
    oldest_file_age_days: float
    newest_file_age_days: float

    def __init__(
        self,
        total_files: int,
        total_size_mb: float,
        available_space_mb: float,
        oldest_file_age_days: float,
        newest_file_age_days: float,
    ) -> None: ...
class AccelerationConfig:
    # Type stub for `AccelerationConfig`.
    # Both constructor arguments are optional (default `None`) even though the
    # attributes are typed as non-optional. `provider` may also be passed as a
    # plain `str` in place of an `ExecutionProviderType`.
    provider: ExecutionProviderType
    device_id: int

    def __init__(
        self,
        provider: ExecutionProviderType | str | None = None,
        device_id: int | None = None,
    ) -> None: ...
class ContentFilterConfig:
    # Type stub for `ContentFilterConfig`: four boolean switches.
    # Every constructor argument is optional (default `None`) although the
    # attributes themselves are typed as plain `bool`.
    include_headers: bool
    include_footers: bool
    strip_repeating_text: bool
    include_watermarks: bool

    def __init__(
        self,
        include_headers: bool | None = None,
        include_footers: bool | None = None,
        strip_repeating_text: bool | None = None,
        include_watermarks: bool | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `ContentFilterConfig`.
    @staticmethod
    def default() -> ContentFilterConfig: ...
class EmailConfig:
    # Type stub for `EmailConfig`: a single optional integer field that may be
    # omitted from the constructor (default `None`).
    msg_fallback_codepage: int | None

    def __init__(self, msg_fallback_codepage: int | None = None) -> None: ...
class ExtractionConfig:
    # Type stub for `ExtractionConfig`.
    # Every constructor argument is optional (default `None`).
    # NOTE(review): `pdf_options`, `keywords`, `html_options`, `html_output`,
    # `layout` and `tree_sitter` are declared as attributes but are NOT
    # accepted by `__init__`; presumably they must be assigned after
    # construction - confirm against the generator before relying on this.
    use_cache: bool
    enable_quality_processing: bool
    ocr: OcrConfig | None
    force_ocr: bool
    force_ocr_pages: list[int] | None
    disable_ocr: bool
    chunking: ChunkingConfig | None
    content_filter: ContentFilterConfig | None
    images: ImageExtractionConfig | None
    pdf_options: PdfConfig | None
    token_reduction: TokenReductionOptions | None
    language_detection: LanguageDetectionConfig | None
    pages: PageConfig | None
    keywords: KeywordConfig | None
    postprocessor: PostProcessorConfig | None
    html_options: str | None
    html_output: HtmlOutputConfig | None
    extraction_timeout_secs: int | None
    max_concurrent_extractions: int | None
    result_format: ResultFormat
    security_limits: SecurityLimits | None
    max_embedded_file_bytes: int | None
    output_format: OutputFormat
    layout: LayoutDetectionConfig | None
    use_layout_for_markdown: bool
    include_document_structure: bool
    acceleration: AccelerationConfig | None
    cache_namespace: str | None
    cache_ttl_secs: int | None
    email: EmailConfig | None
    concurrency: str | None
    max_archive_depth: int
    tree_sitter: TreeSitterConfig | None
    structured_extraction: StructuredExtractionConfig | None
    cancel_token: str | None

    def __init__(
        self,
        use_cache: bool | None = None,
        enable_quality_processing: bool | None = None,
        ocr: OcrConfig | None = None,
        force_ocr: bool | None = None,
        force_ocr_pages: list[int] | None = None,
        disable_ocr: bool | None = None,
        chunking: ChunkingConfig | None = None,
        content_filter: ContentFilterConfig | None = None,
        images: ImageExtractionConfig | None = None,
        token_reduction: TokenReductionOptions | None = None,
        language_detection: LanguageDetectionConfig | None = None,
        pages: PageConfig | None = None,
        postprocessor: PostProcessorConfig | None = None,
        extraction_timeout_secs: int | None = None,
        max_concurrent_extractions: int | None = None,
        # `result_format` also accepts a plain `str`; `output_format` does not.
        result_format: ResultFormat | str | None = None,
        security_limits: SecurityLimits | None = None,
        max_embedded_file_bytes: int | None = None,
        output_format: OutputFormat | None = None,
        use_layout_for_markdown: bool | None = None,
        include_document_structure: bool | None = None,
        acceleration: AccelerationConfig | None = None,
        cache_namespace: str | None = None,
        cache_ttl_secs: int | None = None,
        email: EmailConfig | None = None,
        concurrency: str | None = None,
        max_archive_depth: int | None = None,
        structured_extraction: StructuredExtractionConfig | None = None,
        cancel_token: str | None = None,
    ) -> None: ...

    # Instance predicate; takes no arguments and returns a `bool`.
    def needs_image_processing(self) -> bool: ...

    # Static factory; takes no arguments and returns an `ExtractionConfig`.
    @staticmethod
    def default() -> ExtractionConfig: ...
class FileExtractionConfig:
    # Type stub for `FileExtractionConfig`.
    # Every attribute is optional and every constructor argument defaults to
    # `None`.
    # NOTE(review): `pdf_options`, `keywords`, `html_options`, `layout` and
    # `tree_sitter` are declared as attributes but are NOT accepted by
    # `__init__` - confirm against the generator before relying on this.
    enable_quality_processing: bool | None
    ocr: OcrConfig | None
    force_ocr: bool | None
    force_ocr_pages: list[int] | None
    disable_ocr: bool | None
    chunking: ChunkingConfig | None
    content_filter: ContentFilterConfig | None
    images: ImageExtractionConfig | None
    pdf_options: PdfConfig | None
    token_reduction: TokenReductionOptions | None
    language_detection: LanguageDetectionConfig | None
    pages: PageConfig | None
    keywords: KeywordConfig | None
    postprocessor: PostProcessorConfig | None
    html_options: str | None
    result_format: ResultFormat | None
    output_format: OutputFormat | None
    include_document_structure: bool | None
    layout: LayoutDetectionConfig | None
    timeout_secs: int | None
    tree_sitter: TreeSitterConfig | None
    structured_extraction: StructuredExtractionConfig | None

    def __init__(
        self,
        enable_quality_processing: bool | None = None,
        ocr: OcrConfig | None = None,
        force_ocr: bool | None = None,
        force_ocr_pages: list[int] | None = None,
        disable_ocr: bool | None = None,
        chunking: ChunkingConfig | None = None,
        content_filter: ContentFilterConfig | None = None,
        images: ImageExtractionConfig | None = None,
        token_reduction: TokenReductionOptions | None = None,
        language_detection: LanguageDetectionConfig | None = None,
        pages: PageConfig | None = None,
        postprocessor: PostProcessorConfig | None = None,
        # `result_format` also accepts a plain `str`; `output_format` does not.
        result_format: ResultFormat | str | None = None,
        output_format: OutputFormat | None = None,
        include_document_structure: bool | None = None,
        timeout_secs: int | None = None,
        structured_extraction: StructuredExtractionConfig | None = None,
    ) -> None: ...
class BatchBytesItem:
    # Type stub for `BatchBytesItem`.
    # `content` and `mime_type` are required; `config` is an optional
    # `FileExtractionConfig` that defaults to `None`.
    content: bytes
    mime_type: str
    config: FileExtractionConfig | None

    def __init__(
        self,
        content: bytes,
        mime_type: str,
        config: FileExtractionConfig | None = None,
    ) -> None: ...
class BatchFileItem:
    # Type stub for `BatchFileItem`.
    # `path` is required; `config` is an optional `FileExtractionConfig`
    # that defaults to `None`.
    path: str
    config: FileExtractionConfig | None

    def __init__(self, path: str, config: FileExtractionConfig | None = None) -> None: ...
class ImageExtractionConfig:
    # Type stub for `ImageExtractionConfig`.
    # Every constructor argument is optional (default `None`) and is accepted
    # in attribute-declaration order. Only `max_images_per_page` is typed as
    # optional on the attribute side.
    extract_images: bool
    target_dpi: int
    max_image_dimension: int
    inject_placeholders: bool
    auto_adjust_dpi: bool
    min_dpi: int
    max_dpi: int
    max_images_per_page: int | None
    classify: bool
    include_page_rasters: bool
    run_ocr_on_images: bool
    ocr_text_only: bool
    append_ocr_text: bool

    def __init__(
        self,
        extract_images: bool | None = None,
        target_dpi: int | None = None,
        max_image_dimension: int | None = None,
        inject_placeholders: bool | None = None,
        auto_adjust_dpi: bool | None = None,
        min_dpi: int | None = None,
        max_dpi: int | None = None,
        max_images_per_page: int | None = None,
        classify: bool | None = None,
        include_page_rasters: bool | None = None,
        run_ocr_on_images: bool | None = None,
        ocr_text_only: bool | None = None,
        append_ocr_text: bool | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns an `ImageExtractionConfig`.
    @staticmethod
    def default() -> ImageExtractionConfig: ...
class TokenReductionOptions:
    # Type stub for `TokenReductionOptions`.
    # Both constructor arguments are optional (default `None`) although the
    # attributes are typed as non-optional.
    mode: str
    preserve_important_words: bool

    def __init__(
        self,
        mode: str | None = None,
        preserve_important_words: bool | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `TokenReductionOptions`.
    @staticmethod
    def default() -> TokenReductionOptions: ...
class LanguageDetectionConfig:
    # Type stub for `LanguageDetectionConfig`.
    # All three constructor arguments are optional (default `None`) although
    # the attributes are typed as non-optional.
    enabled: bool
    min_confidence: float
    detect_multiple: bool

    def __init__(
        self,
        enabled: bool | None = None,
        min_confidence: float | None = None,
        detect_multiple: bool | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `LanguageDetectionConfig`.
    @staticmethod
    def default() -> LanguageDetectionConfig: ...
class HtmlOutputConfig:
    # Type stub for `HtmlOutputConfig`.
    # Every constructor argument is optional (default `None`). `theme` may
    # also be passed as a plain `str` in place of an `HtmlTheme`.
    css: str | None
    css_file: str | None
    theme: HtmlTheme
    class_prefix: str
    embed_css: bool

    def __init__(
        self,
        css: str | None = None,
        css_file: str | None = None,
        theme: HtmlTheme | str | None = None,
        class_prefix: str | None = None,
        embed_css: bool | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns an `HtmlOutputConfig`.
    @staticmethod
    def default() -> HtmlOutputConfig: ...
class LayoutDetectionConfig:
    # Type stub for `LayoutDetectionConfig`.
    # Every constructor argument is optional (default `None`). `table_model`
    # may also be passed as a plain `str` in place of a `TableModel`.
    confidence_threshold: float | None
    apply_heuristics: bool
    table_model: TableModel
    acceleration: AccelerationConfig | None

    def __init__(
        self,
        confidence_threshold: float | None = None,
        apply_heuristics: bool | None = None,
        table_model: TableModel | str | None = None,
        acceleration: AccelerationConfig | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `LayoutDetectionConfig`.
    @staticmethod
    def default() -> LayoutDetectionConfig: ...
class LlmConfig:
    # Type stub for `LlmConfig`.
    # Every constructor argument is optional (default `None`), including
    # `model`, whose attribute is typed as a non-optional `str`.
    model: str
    api_key: str | None
    base_url: str | None
    timeout_secs: int | None
    max_retries: int | None
    temperature: float | None
    max_tokens: int | None

    def __init__(
        self,
        model: str | None = None,
        api_key: str | None = None,
        base_url: str | None = None,
        timeout_secs: int | None = None,
        max_retries: int | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
    ) -> None: ...
class StructuredExtractionConfig:
    # Type stub for `StructuredExtractionConfig`.
    # `schema`, `schema_name`, `strict` and `llm` are required; the optional
    # `schema_description` and `prompt` follow them, so the constructor order
    # differs from the attribute-declaration order.
    schema: dict[str, Any]
    schema_name: str
    schema_description: str | None
    strict: bool
    prompt: str | None
    llm: LlmConfig

    def __init__(
        self,
        schema: dict[str, Any],
        schema_name: str,
        strict: bool,
        llm: LlmConfig,
        schema_description: str | None = None,
        prompt: str | None = None,
    ) -> None: ...
class OcrQualityThresholds:
    # Type stub for `OcrQualityThresholds`: sixteen numeric threshold fields.
    # Every constructor argument is optional (default `None`) and is accepted
    # in attribute-declaration order, although the attributes are typed as
    # non-optional.
    min_total_non_whitespace: int
    min_non_whitespace_per_page: float
    min_meaningful_word_len: int
    min_meaningful_words: int
    min_alnum_ratio: float
    min_garbage_chars: int
    max_fragmented_word_ratio: float
    critical_fragmented_word_ratio: float
    min_avg_word_length: float
    min_words_for_avg_length_check: int
    min_consecutive_repeat_ratio: float
    min_words_for_repeat_check: int
    substantive_min_chars: int
    non_text_min_chars: int
    alnum_ws_ratio_threshold: float
    pipeline_min_quality: float

    def __init__(
        self,
        min_total_non_whitespace: int | None = None,
        min_non_whitespace_per_page: float | None = None,
        min_meaningful_word_len: int | None = None,
        min_meaningful_words: int | None = None,
        min_alnum_ratio: float | None = None,
        min_garbage_chars: int | None = None,
        max_fragmented_word_ratio: float | None = None,
        critical_fragmented_word_ratio: float | None = None,
        min_avg_word_length: float | None = None,
        min_words_for_avg_length_check: int | None = None,
        min_consecutive_repeat_ratio: float | None = None,
        min_words_for_repeat_check: int | None = None,
        substantive_min_chars: int | None = None,
        non_text_min_chars: int | None = None,
        alnum_ws_ratio_threshold: float | None = None,
        pipeline_min_quality: float | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns an `OcrQualityThresholds`.
    @staticmethod
    def default() -> OcrQualityThresholds: ...
class OcrPipelineStage:
    # Type stub for `OcrPipelineStage`.
    # `backend` and `priority` are required; the remaining five arguments are
    # optional and default to `None`.
    backend: str
    priority: int
    language: str | None
    tesseract_config: TesseractConfig | None
    paddle_ocr_config: dict[str, Any] | None
    vlm_config: LlmConfig | None
    backend_options: dict[str, Any] | None

    def __init__(
        self,
        backend: str,
        priority: int,
        language: str | None = None,
        tesseract_config: TesseractConfig | None = None,
        paddle_ocr_config: dict[str, Any] | None = None,
        vlm_config: LlmConfig | None = None,
        backend_options: dict[str, Any] | None = None,
    ) -> None: ...
class OcrPipelineConfig:
    # Type stub for `OcrPipelineConfig`.
    # Both `stages` (a list of `OcrPipelineStage`) and `quality_thresholds`
    # are required constructor arguments.
    stages: list[OcrPipelineStage]
    quality_thresholds: OcrQualityThresholds

    def __init__(
        self,
        stages: list[OcrPipelineStage],
        quality_thresholds: OcrQualityThresholds,
    ) -> None: ...
class OcrConfig:
    # Type stub for `OcrConfig`.
    # Every constructor argument is optional (default `None`) and is accepted
    # in attribute-declaration order.
    enabled: bool
    backend: str
    language: str
    tesseract_config: TesseractConfig | None
    output_format: OutputFormat | None
    paddle_ocr_config: dict[str, Any] | None
    backend_options: dict[str, Any] | None
    element_config: OcrElementConfig | None
    quality_thresholds: OcrQualityThresholds | None
    pipeline: OcrPipelineConfig | None
    auto_rotate: bool
    vlm_config: LlmConfig | None
    vlm_prompt: str | None
    acceleration: AccelerationConfig | None
    tessdata_bytes: dict[str, bytes] | None

    def __init__(
        self,
        enabled: bool | None = None,
        backend: str | None = None,
        language: str | None = None,
        tesseract_config: TesseractConfig | None = None,
        output_format: OutputFormat | None = None,
        paddle_ocr_config: dict[str, Any] | None = None,
        backend_options: dict[str, Any] | None = None,
        element_config: OcrElementConfig | None = None,
        quality_thresholds: OcrQualityThresholds | None = None,
        pipeline: OcrPipelineConfig | None = None,
        auto_rotate: bool | None = None,
        vlm_config: LlmConfig | None = None,
        vlm_prompt: str | None = None,
        acceleration: AccelerationConfig | None = None,
        tessdata_bytes: dict[str, bytes] | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns an `OcrConfig`.
    @staticmethod
    def default() -> OcrConfig: ...
class PageConfig:
    # Type stub for `PageConfig`.
    # All three constructor arguments are optional (default `None`) although
    # the attributes are typed as non-optional.
    extract_pages: bool
    insert_page_markers: bool
    marker_format: str

    def __init__(
        self,
        extract_pages: bool | None = None,
        insert_page_markers: bool | None = None,
        marker_format: str | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `PageConfig`.
    @staticmethod
    def default() -> PageConfig: ...
class PdfConfig:
    # Type stub for `PdfConfig`.
    # Every constructor argument is optional (default `None`) and is accepted
    # in attribute-declaration order.
    extract_images: bool
    extract_tables: bool
    passwords: list[str] | None
    extract_metadata: bool
    hierarchy: HierarchyConfig | None
    extract_annotations: bool
    top_margin_fraction: float | None
    bottom_margin_fraction: float | None
    allow_single_column_tables: bool
    ocr_inline_images: bool

    def __init__(
        self,
        extract_images: bool | None = None,
        extract_tables: bool | None = None,
        passwords: list[str] | None = None,
        extract_metadata: bool | None = None,
        hierarchy: HierarchyConfig | None = None,
        extract_annotations: bool | None = None,
        top_margin_fraction: float | None = None,
        bottom_margin_fraction: float | None = None,
        allow_single_column_tables: bool | None = None,
        ocr_inline_images: bool | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `PdfConfig`.
    @staticmethod
    def default() -> PdfConfig: ...
class HierarchyConfig:
    # Type stub for `HierarchyConfig`.
    # All four constructor arguments are optional (default `None`).
    enabled: bool
    k_clusters: int
    include_bbox: bool
    ocr_coverage_threshold: float | None

    def __init__(
        self,
        enabled: bool | None = None,
        k_clusters: int | None = None,
        include_bbox: bool | None = None,
        ocr_coverage_threshold: float | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `HierarchyConfig`.
    @staticmethod
    def default() -> HierarchyConfig: ...
class PostProcessorConfig:
    # Type stub for `PostProcessorConfig`.
    # All five constructor arguments are optional (default `None`).
    # NOTE(review): how `enabled_set`/`disabled_set` relate to
    # `enabled_processors`/`disabled_processors` is not visible from this
    # stub - confirm before documenting further.
    enabled: bool
    enabled_processors: list[str] | None
    disabled_processors: list[str] | None
    enabled_set: list[str] | None
    disabled_set: list[str] | None

    def __init__(
        self,
        enabled: bool | None = None,
        enabled_processors: list[str] | None = None,
        disabled_processors: list[str] | None = None,
        enabled_set: list[str] | None = None,
        disabled_set: list[str] | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `PostProcessorConfig`.
    @staticmethod
    def default() -> PostProcessorConfig: ...
class ChunkingConfig:
    # Type stub for `ChunkingConfig`.
    # Every constructor argument is optional (default `None`). `chunker_type`
    # may also be passed as a plain `str`; `sizing` accepts only a
    # `ChunkSizing` (or `None`).
    max_characters: int
    overlap: int
    trim: bool
    chunker_type: ChunkerType
    embedding: EmbeddingConfig | None
    preset: str | None
    sizing: ChunkSizing
    prepend_heading_context: bool
    topic_threshold: float | None

    def __init__(
        self,
        max_characters: int | None = None,
        overlap: int | None = None,
        trim: bool | None = None,
        chunker_type: ChunkerType | str | None = None,
        embedding: EmbeddingConfig | None = None,
        preset: str | None = None,
        sizing: ChunkSizing | None = None,
        prepend_heading_context: bool | None = None,
        topic_threshold: float | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `ChunkingConfig`.
    @staticmethod
    def default() -> ChunkingConfig: ...
class EmbeddingConfig:
    # Type stub for `EmbeddingConfig`.
    # Every constructor argument is optional (default `None`). Unlike several
    # sibling configs, `model` does not accept a plain `str` here.
    model: EmbeddingModelType
    normalize: bool
    batch_size: int
    show_download_progress: bool
    cache_dir: str | None
    acceleration: AccelerationConfig | None
    max_embed_duration_secs: int | None

    def __init__(
        self,
        model: EmbeddingModelType | None = None,
        normalize: bool | None = None,
        batch_size: int | None = None,
        show_download_progress: bool | None = None,
        cache_dir: str | None = None,
        acceleration: AccelerationConfig | None = None,
        max_embed_duration_secs: int | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns an `EmbeddingConfig`.
    @staticmethod
    def default() -> EmbeddingConfig: ...
class TreeSitterConfig:
    # Type stub for `TreeSitterConfig`.
    # All five constructor arguments are optional (default `None`); `process`
    # is a nested `TreeSitterProcessConfig`.
    enabled: bool
    cache_dir: str | None
    languages: list[str] | None
    groups: list[str] | None
    process: TreeSitterProcessConfig

    def __init__(
        self,
        enabled: bool | None = None,
        cache_dir: str | None = None,
        languages: list[str] | None = None,
        groups: list[str] | None = None,
        process: TreeSitterProcessConfig | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `TreeSitterConfig`.
    @staticmethod
    def default() -> TreeSitterConfig: ...
class TreeSitterProcessConfig:
    # Type stub for `TreeSitterProcessConfig`.
    # Every constructor argument is optional (default `None`). `content_mode`
    # may also be passed as a plain `str` in place of a `CodeContentMode`.
    structure: bool
    imports: bool
    exports: bool
    comments: bool
    docstrings: bool
    symbols: bool
    diagnostics: bool
    chunk_max_size: int | None
    content_mode: CodeContentMode

    def __init__(
        self,
        structure: bool | None = None,
        imports: bool | None = None,
        exports: bool | None = None,
        comments: bool | None = None,
        docstrings: bool | None = None,
        symbols: bool | None = None,
        diagnostics: bool | None = None,
        chunk_max_size: int | None = None,
        content_mode: CodeContentMode | str | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `TreeSitterProcessConfig`.
    @staticmethod
    def default() -> TreeSitterProcessConfig: ...
class SupportedFormat:
    # Type stub for `SupportedFormat`: a pair of required strings,
    # `extension` and `mime_type`.
    extension: str
    mime_type: str

    def __init__(self, extension: str, mime_type: str) -> None: ...
class ServerConfig:
    # Type stub for `ServerConfig`.
    # All five constructor arguments are optional (default `None`) although
    # the attributes are typed as non-optional.
    host: str
    port: int
    cors_origins: list[str]
    max_request_body_bytes: int
    max_multipart_field_bytes: int

    def __init__(
        self,
        host: str | None = None,
        port: int | None = None,
        cors_origins: list[str] | None = None,
        max_request_body_bytes: int | None = None,
        max_multipart_field_bytes: int | None = None,
    ) -> None: ...

    # Argument-free accessors; the return types are all that this stub shows.
    def listen_addr(self) -> str: ...
    def cors_allows_all(self) -> bool: ...

    # Takes one origin string and returns a `bool`.
    def is_origin_allowed(self, origin: str) -> bool: ...

    # NOTE(review): presumably the `*_bytes` limits expressed in megabytes -
    # confirm the unit and rounding with the implementation.
    def max_request_body_mb(self) -> int: ...
    def max_multipart_field_mb(self) -> int: ...

    # Static factory; takes no arguments and returns a `ServerConfig`.
    @staticmethod
    def default() -> ServerConfig: ...
class StructuredDataResult:
    # Type stub for `StructuredDataResult`.
    # All four fields are required constructor arguments. The `format`
    # parameter shadows the builtin of the same name, hence the `noqa`.
    content: str
    format: str
    metadata: dict[str, str]
    text_fields: list[str]

    def __init__(
        self,
        content: str,
        format: str,  # noqa: A002
        metadata: dict[str, str],
        text_fields: list[str],
    ) -> None: ...
class DocxAppProperties:
    # Type stub for `DocxAppProperties`.
    # Every field is optional and every constructor argument defaults to
    # `None`, in attribute-declaration order.
    application: str | None
    app_version: str | None
    template: str | None
    total_time: int | None
    pages: int | None
    words: int | None
    characters: int | None
    characters_with_spaces: int | None
    lines: int | None
    paragraphs: int | None
    company: str | None
    doc_security: int | None
    scale_crop: bool | None
    links_up_to_date: bool | None
    shared_doc: bool | None
    hyperlinks_changed: bool | None

    def __init__(
        self,
        application: str | None = None,
        app_version: str | None = None,
        template: str | None = None,
        total_time: int | None = None,
        pages: int | None = None,
        words: int | None = None,
        characters: int | None = None,
        characters_with_spaces: int | None = None,
        lines: int | None = None,
        paragraphs: int | None = None,
        company: str | None = None,
        doc_security: int | None = None,
        scale_crop: bool | None = None,
        links_up_to_date: bool | None = None,
        shared_doc: bool | None = None,
        hyperlinks_changed: bool | None = None,
    ) -> None: ...
class XlsxAppProperties:
    # Type stub for `XlsxAppProperties`.
    # Every constructor argument is optional (default `None`), including
    # `worksheet_names`, whose attribute is a non-optional `list[str]`.
    application: str | None
    app_version: str | None
    doc_security: int | None
    scale_crop: bool | None
    links_up_to_date: bool | None
    shared_doc: bool | None
    hyperlinks_changed: bool | None
    company: str | None
    worksheet_names: list[str]

    def __init__(
        self,
        application: str | None = None,
        app_version: str | None = None,
        doc_security: int | None = None,
        scale_crop: bool | None = None,
        links_up_to_date: bool | None = None,
        shared_doc: bool | None = None,
        hyperlinks_changed: bool | None = None,
        company: str | None = None,
        worksheet_names: list[str] | None = None,
    ) -> None: ...
class PptxAppProperties:
    # Type stub for `PptxAppProperties`.
    # Every constructor argument is optional (default `None`), including
    # `slide_titles`, whose attribute is a non-optional `list[str]`.
    application: str | None
    app_version: str | None
    total_time: int | None
    company: str | None
    doc_security: int | None
    scale_crop: bool | None
    links_up_to_date: bool | None
    shared_doc: bool | None
    hyperlinks_changed: bool | None
    slides: int | None
    notes: int | None
    hidden_slides: int | None
    multimedia_clips: int | None
    presentation_format: str | None
    slide_titles: list[str]

    def __init__(
        self,
        application: str | None = None,
        app_version: str | None = None,
        total_time: int | None = None,
        company: str | None = None,
        doc_security: int | None = None,
        scale_crop: bool | None = None,
        links_up_to_date: bool | None = None,
        shared_doc: bool | None = None,
        hyperlinks_changed: bool | None = None,
        slides: int | None = None,
        notes: int | None = None,
        hidden_slides: int | None = None,
        multimedia_clips: int | None = None,
        presentation_format: str | None = None,
        slide_titles: list[str] | None = None,
    ) -> None: ...
class CoreProperties:
    # Type stub for `CoreProperties`: fifteen optional string fields.
    # Every constructor argument defaults to `None` and follows the
    # attribute-declaration order.
    title: str | None
    subject: str | None
    creator: str | None
    keywords: str | None
    description: str | None
    last_modified_by: str | None
    revision: str | None
    created: str | None
    modified: str | None
    category: str | None
    content_status: str | None
    language: str | None
    identifier: str | None
    version: str | None
    last_printed: str | None

    def __init__(
        self,
        title: str | None = None,
        subject: str | None = None,
        creator: str | None = None,
        keywords: str | None = None,
        description: str | None = None,
        last_modified_by: str | None = None,
        revision: str | None = None,
        created: str | None = None,
        modified: str | None = None,
        category: str | None = None,
        content_status: str | None = None,
        language: str | None = None,
        identifier: str | None = None,
        version: str | None = None,
        last_printed: str | None = None,
    ) -> None: ...
class SecurityLimits:
    # Type stub for `SecurityLimits`: nine integer limits.
    # Every constructor argument is optional (default `None`) although the
    # attributes are typed as plain `int`.
    max_archive_size: int
    max_compression_ratio: int
    max_files_in_archive: int
    max_nesting_depth: int
    max_entity_length: int
    max_content_size: int
    max_iterations: int
    max_xml_depth: int
    max_table_cells: int

    def __init__(
        self,
        max_archive_size: int | None = None,
        max_compression_ratio: int | None = None,
        max_files_in_archive: int | None = None,
        max_nesting_depth: int | None = None,
        max_entity_length: int | None = None,
        max_content_size: int | None = None,
        max_iterations: int | None = None,
        max_xml_depth: int | None = None,
        max_table_cells: int | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `SecurityLimits`.
    @staticmethod
    def default() -> SecurityLimits: ...
class TokenReductionConfig:
    # Type stub for `TokenReductionConfig`.
    # Every constructor argument is optional (default `None`). `level` may
    # also be passed as a plain `str` in place of a `ReductionLevel`.
    level: ReductionLevel
    language_hint: str | None
    preserve_markdown: bool
    preserve_code: bool
    semantic_threshold: float
    enable_parallel: bool
    use_simd: bool
    custom_stopwords: dict[str, list[str]] | None
    preserve_patterns: list[str]
    target_reduction: float | None
    enable_semantic_clustering: bool

    def __init__(
        self,
        level: ReductionLevel | str | None = None,
        language_hint: str | None = None,
        preserve_markdown: bool | None = None,
        preserve_code: bool | None = None,
        semantic_threshold: float | None = None,
        enable_parallel: bool | None = None,
        use_simd: bool | None = None,
        custom_stopwords: dict[str, list[str]] | None = None,
        preserve_patterns: list[str] | None = None,
        target_reduction: float | None = None,
        enable_semantic_clustering: bool | None = None,
    ) -> None: ...

    # Static factory; takes no arguments and returns a `TokenReductionConfig`.
    @staticmethod
    def default() -> TokenReductionConfig: ...
class PdfAnnotation:
    # Type stub for `PdfAnnotation`.
    # `annotation_type` (enum or plain `str`) and `page_number` are required;
    # `content` and `bounding_box` are optional. Required parameters come
    # first, so the constructor order differs from the attribute order.
    annotation_type: PdfAnnotationType
    content: str | None
    page_number: int
    bounding_box: BoundingBox | None

    def __init__(
        self,
        annotation_type: PdfAnnotationType | str,
        page_number: int,
        content: str | None = None,
        bounding_box: BoundingBox | None = None,
    ) -> None: ...
class DjotContent:
    # Type stub for `DjotContent`.
    # All eight fields are required constructor arguments, accepted in
    # attribute-declaration order.
    plain_text: str
    blocks: list[FormattedBlock]
    metadata: Metadata
    tables: list[Table]
    images: list[DjotImage]
    links: list[DjotLink]
    footnotes: list[Footnote]
    attributes: list[str]

    def __init__(
        self,
        plain_text: str,
        blocks: list[FormattedBlock],
        metadata: Metadata,
        tables: list[Table],
        images: list[DjotImage],
        links: list[DjotLink],
        footnotes: list[Footnote],
        attributes: list[str],
    ) -> None: ...
class FormattedBlock:
    # Type stub for `FormattedBlock`, a recursive structure: `children` is a
    # list of further `FormattedBlock` objects.
    # `block_type` (enum or plain `str`), `inline_content` and `children` are
    # required; `level`, `attributes`, `language` and `code` are optional.
    # Required parameters come first, so the constructor order differs from
    # the attribute order.
    block_type: BlockType
    level: int | None
    inline_content: list[InlineElement]
    attributes: str | None
    language: str | None
    code: str | None
    children: list[FormattedBlock]

    def __init__(
        self,
        block_type: BlockType | str,
        inline_content: list[InlineElement],
        children: list[FormattedBlock],
        level: int | None = None,
        attributes: str | None = None,
        language: str | None = None,
        code: str | None = None,
    ) -> None: ...
class InlineElement:
    # Type stub for `InlineElement`.
    # `element_type` (enum or plain `str`) and `content` are required;
    # `attributes` and `metadata` are optional and default to `None`.
    element_type: InlineType
    content: str
    attributes: str | None
    metadata: dict[str, str] | None

    def __init__(
        self,
        element_type: InlineType | str,
        content: str,
        attributes: str | None = None,
        metadata: dict[str, str] | None = None,
    ) -> None: ...
class DjotImage:
    # Type stub for `DjotImage`.
    # `src` and `alt` are required; `title` and `attributes` are optional
    # and default to `None`.
    src: str
    alt: str
    title: str | None
    attributes: str | None

    def __init__(
        self,
        src: str,
        alt: str,
        title: str | None = None,
        attributes: str | None = None,
    ) -> None: ...
class DjotLink:
    # Type stub for `DjotLink`.
    # `url` and `text` are required; `title` and `attributes` are optional
    # and default to `None`.
    url: str
    text: str
    title: str | None
    attributes: str | None

    def __init__(
        self,
        url: str,
        text: str,
        title: str | None = None,
        attributes: str | None = None,
    ) -> None: ...
class Footnote:
    # Type stub for `Footnote`: a required `label` string plus a required
    # list of `FormattedBlock` objects as `content`.
    label: str
    content: list[FormattedBlock]

    def __init__(self, label: str, content: list[FormattedBlock]) -> None: ...
class DocumentStructure:
    # Type stub for `DocumentStructure`.
    # All four constructor arguments are optional (default `None`) although
    # the list attributes are typed as non-optional.
    nodes: list[DocumentNode]
    source_format: str | None
    relationships: list[DocumentRelationship]
    node_types: list[str]

    def __init__(
        self,
        nodes: list[DocumentNode] | None = None,
        source_format: str | None = None,
        relationships: list[DocumentRelationship] | None = None,
        node_types: list[str] | None = None,
    ) -> None: ...

    # Returns `None`.
    # NOTE(review): presumably mutates the instance in place (likely
    # populating `node_types`) - confirm with the implementation.
    def finalize_node_types(self) -> None: ...

    # Instance predicate; takes no arguments and returns a `bool`.
    def is_empty(self) -> bool: ...

    # Static factory; takes no arguments and returns a `DocumentStructure`.
    @staticmethod
    def default() -> DocumentStructure: ...
class DocumentRelationship:
    # Type stub for `DocumentRelationship`.
    # `source`, `target` and `kind` are all required; `kind` may be passed as
    # a plain `str` in place of a `RelationshipKind`.
    # NOTE(review): `source`/`target` are presumably indices into
    # `DocumentStructure.nodes` - confirm.
    source: int
    target: int
    kind: RelationshipKind

    def __init__(self, source: int, target: int, kind: RelationshipKind | str) -> None: ...
class DocumentNode:
    # Type stub for `DocumentNode`.
    # `id`, `content`, `children`, `content_layer` (enum or plain `str`) and
    # `annotations` are required; `parent`, `page`, `page_end`, `bbox` and
    # `attributes` are optional. Required parameters come first, so the
    # constructor order differs from the attribute order. The `id` parameter
    # shadows the builtin of the same name, hence the `noqa`.
    id: str
    content: NodeContent
    parent: int | None
    children: list[int]
    content_layer: ContentLayer
    page: int | None
    page_end: int | None
    bbox: BoundingBox | None
    annotations: list[TextAnnotation]
    attributes: dict[str, str] | None

    def __init__(
        self,
        id: str,  # noqa: A002
        content: NodeContent,
        children: list[int],
        content_layer: ContentLayer | str,
        annotations: list[TextAnnotation],
        parent: int | None = None,
        page: int | None = None,
        page_end: int | None = None,
        bbox: BoundingBox | None = None,
        attributes: dict[str, str] | None = None,
    ) -> None: ...
class TableGrid:
    # Type stub for `TableGrid`.
    # All three constructor arguments are optional (default `None`) although
    # the attributes are typed as non-optional.
    rows: int
    cols: int
    cells: list[GridCell]

    def __init__(
        self,
        rows: int | None = None,
        cols: int | None = None,
        cells: list[GridCell] | None = None,
    ) -> None: ...
class GridCell:
    # Type stub for `GridCell`.
    # The first six fields are required constructor arguments; only `bbox`
    # is optional and defaults to `None`.
    content: str
    row: int
    col: int
    row_span: int
    col_span: int
    is_header: bool
    bbox: BoundingBox | None

    def __init__(
        self,
        content: str,
        row: int,
        col: int,
        row_span: int,
        col_span: int,
        is_header: bool,
        bbox: BoundingBox | None = None,
    ) -> None: ...
class TextAnnotation:
    # Type stub for `TextAnnotation`.
    # `start`, `end` and `kind` are all required. Unlike several sibling
    # classes, `kind` does not accept a plain `str` here.
    # NOTE(review): `start`/`end` are presumably offsets into the annotated
    # text; the unit (bytes vs. characters) is not visible here - confirm.
    start: int
    end: int
    kind: AnnotationKind

    def __init__(self, start: int, end: int, kind: AnnotationKind) -> None: ...
class ExtractionResult:
    # Type stub for `ExtractionResult`.
    # Every constructor argument is optional (default `None`).
    # `extraction_method` may also be passed as a plain `str`.
    # NOTE(review): `extracted_keywords` and `code_intelligence` are declared
    # as attributes but are NOT accepted by `__init__` - confirm against the
    # generator before relying on this.
    content: str
    mime_type: str
    metadata: Metadata
    extraction_method: ExtractionMethod | None
    tables: list[Table]
    detected_languages: list[str] | None
    chunks: list[Chunk] | None
    images: list[ExtractedImage] | None
    pages: list[PageContent] | None
    elements: list[Element] | None
    djot_content: DjotContent | None
    ocr_elements: list[OcrElement] | None
    document: DocumentStructure | None
    extracted_keywords: list[Keyword] | None
    quality_score: float | None
    processing_warnings: list[ProcessingWarning]
    annotations: list[PdfAnnotation] | None
    children: list[ArchiveEntry] | None
    uris: list[ExtractedUri] | None
    revisions: list[DocumentRevision] | None
    structured_output: dict[str, Any] | None
    code_intelligence: dict[str, Any] | None
    llm_usage: list[LlmUsage] | None
    formatted_content: str | None
    ocr_internal_document: str | None

    def __init__(
        self,
        content: str | None = None,
        mime_type: str | None = None,
        metadata: Metadata | None = None,
        extraction_method: ExtractionMethod | str | None = None,
        tables: list[Table] | None = None,
        detected_languages: list[str] | None = None,
        chunks: list[Chunk] | None = None,
        images: list[ExtractedImage] | None = None,
        pages: list[PageContent] | None = None,
        elements: list[Element] | None = None,
        djot_content: DjotContent | None = None,
        ocr_elements: list[OcrElement] | None = None,
        document: DocumentStructure | None = None,
        quality_score: float | None = None,
        processing_warnings: list[ProcessingWarning] | None = None,
        annotations: list[PdfAnnotation] | None = None,
        children: list[ArchiveEntry] | None = None,
        uris: list[ExtractedUri] | None = None,
        revisions: list[DocumentRevision] | None = None,
        structured_output: dict[str, Any] | None = None,
        llm_usage: list[LlmUsage] | None = None,
        formatted_content: str | None = None,
        ocr_internal_document: str | None = None,
    ) -> None: ...

    # Static alternate constructor: takes an `OcrExtractionResult` and
    # returns an `ExtractionResult`.
    @staticmethod
    def from_ocr(ocr: OcrExtractionResult) -> ExtractionResult: ...
class ArchiveEntry:
    # Type stub for `ArchiveEntry`: a required `path`, `mime_type` and a
    # nested `ExtractionResult` as `result`.
    path: str
    mime_type: str
    result: ExtractionResult

    def __init__(self, path: str, mime_type: str, result: ExtractionResult) -> None: ...
class ProcessingWarning:
    # Type stub for `ProcessingWarning`: two required strings, `source` and
    # `message`.
    source: str
    message: str

    def __init__(self, source: str, message: str) -> None: ...
class LlmUsage:
    # Type stub for `LlmUsage`.
    # Every constructor argument is optional (default `None`), including
    # `model` and `source`, whose attributes are typed as non-optional `str`.
    model: str
    source: str
    input_tokens: int | None
    output_tokens: int | None
    total_tokens: int | None
    estimated_cost: float | None
    finish_reason: str | None

    def __init__(
        self,
        model: str | None = None,
        source: str | None = None,
        input_tokens: int | None = None,
        output_tokens: int | None = None,
        total_tokens: int | None = None,
        estimated_cost: float | None = None,
        finish_reason: str | None = None,
    ) -> None: ...
class Chunk:
    # Type stub for `Chunk`.
    # `content`, `chunk_type` (enum or plain `str`) and `metadata` are
    # required; `embedding` is optional and comes last in the constructor,
    # so the constructor order differs from the attribute order.
    content: str
    chunk_type: ChunkType
    embedding: list[float] | None
    metadata: ChunkMetadata

    def __init__(
        self,
        content: str,
        chunk_type: ChunkType | str,
        metadata: ChunkMetadata,
        embedding: list[float] | None = None,
    ) -> None: ...
class HeadingContext:
    # Type stub for `HeadingContext`: wraps a required list of
    # `HeadingLevel` objects.
    headings: list[HeadingLevel]

    def __init__(self, headings: list[HeadingLevel]) -> None: ...
class HeadingLevel:
    # Type stub for `HeadingLevel`: a required integer `level` and a
    # required `text` string.
    level: int
    text: str

    def __init__(self, level: int, text: str) -> None: ...
class ChunkMetadata:
    # Type stub for `ChunkMetadata`.
    # `byte_start`, `byte_end`, `chunk_index`, `total_chunks` and
    # `image_indices` are required; `token_count`, `first_page`, `last_page`
    # and `heading_context` are optional. Required parameters come first, so
    # the constructor order differs from the attribute order.
    byte_start: int
    byte_end: int
    token_count: int | None
    chunk_index: int
    total_chunks: int
    first_page: int | None
    last_page: int | None
    heading_context: HeadingContext | None
    image_indices: list[int]

    def __init__(
        self,
        byte_start: int,
        byte_end: int,
        chunk_index: int,
        total_chunks: int,
        image_indices: list[int],
        token_count: int | None = None,
        first_page: int | None = None,
        last_page: int | None = None,
        heading_context: HeadingContext | None = None,
    ) -> None: ...
class ExtractedImage:
    # Type stub for `ExtractedImage`.
    # `data`, `format`, `image_index` and `is_mask` are required; the other
    # twelve arguments are optional and default to `None`. Required
    # parameters come first, so the constructor order differs from the
    # attribute order. `image_kind` may also be passed as a plain `str`.
    # The `format` parameter shadows the builtin of the same name, hence the
    # `noqa`.
    data: bytes
    format: str
    image_index: int
    page_number: int | None
    width: int | None
    height: int | None
    colorspace: str | None
    bits_per_component: int | None
    is_mask: bool
    description: str | None
    ocr_result: ExtractionResult | None
    bounding_box: BoundingBox | None
    source_path: str | None
    image_kind: ImageKind | None
    kind_confidence: float | None
    cluster_id: int | None

    def __init__(
        self,
        data: bytes,
        format: str,  # noqa: A002
        image_index: int,
        is_mask: bool,
        page_number: int | None = None,
        width: int | None = None,
        height: int | None = None,
        colorspace: str | None = None,
        bits_per_component: int | None = None,
        description: str | None = None,
        ocr_result: ExtractionResult | None = None,
        bounding_box: BoundingBox | None = None,
        source_path: str | None = None,
        image_kind: ImageKind | str | None = None,
        kind_confidence: float | None = None,
        cluster_id: int | None = None,
    ) -> None: ...
class BoundingBox:
    # Type stub for `BoundingBox`: four float coordinates.
    # All four constructor arguments are optional (default `None`) although
    # the attributes are typed as plain `float`.
    # NOTE(review): presumably (x0, y0) and (x1, y1) are opposite corners;
    # origin and units are not visible here - confirm.
    x0: float
    y0: float
    x1: float
    y1: float

    def __init__(
        self,
        x0: float | None = None,
        y0: float | None = None,
        x1: float | None = None,
        y1: float | None = None,
    ) -> None: ...
class ElementMetadata:
    # Type stub for `ElementMetadata`.
    # `additional` is the only required constructor argument and therefore
    # comes first, although it is declared last among the attributes. The
    # other four arguments are optional and default to `None`.
    page_number: int | None
    filename: str | None
    coordinates: BoundingBox | None
    element_index: int | None
    additional: dict[str, str]

    def __init__(
        self,
        additional: dict[str, str],
        page_number: int | None = None,
        filename: str | None = None,
        coordinates: BoundingBox | None = None,
        element_index: int | None = None,
    ) -> None: ...
class Element:
    # Type stub for `Element`.
    # All four fields are required constructor arguments; `element_type` may
    # be passed as a plain `str` in place of an `ElementType`.
    element_id: str
    element_type: ElementType
    text: str
    metadata: ElementMetadata

    def __init__(
        self,
        element_id: str,
        element_type: ElementType | str,
        text: str,
        metadata: ElementMetadata,
    ) -> None: ...
class ExcelWorkbook:
    # Type stub for `ExcelWorkbook`.
    # `sheets` and `metadata` are required; `revisions` is optional and
    # defaults to `None`.
    sheets: list[ExcelSheet]
    metadata: dict[str, str]
    revisions: list[DocumentRevision] | None

    def __init__(
        self,
        sheets: list[ExcelSheet],
        metadata: dict[str, str],
        revisions: list[DocumentRevision] | None = None,
    ) -> None: ...
class ExcelSheet:
    # Type stub for `ExcelSheet`.
    # The first five fields are required constructor arguments;
    # `table_cells` (a list of string lists) is optional and defaults to
    # `None`.
    name: str
    markdown: str
    row_count: int
    col_count: int
    cell_count: int
    table_cells: list[list[str]] | None

    def __init__(
        self,
        name: str,
        markdown: str,
        row_count: int,
        col_count: int,
        cell_count: int,
        table_cells: list[list[str]] | None = None,
    ) -> None: ...
class XmlExtractionResult:
    # Type stub for `XmlExtractionResult`: three required fields, `content`,
    # `element_count` and `unique_elements`.
    content: str
    element_count: int
    unique_elements: list[str]

    def __init__(self, content: str, element_count: int, unique_elements: list[str]) -> None: ...
class TextExtractionResult:
    # Type stub for `TextExtractionResult`.
    # `content` and the three counters are required; `headers`, `links` and
    # `code_blocks` are optional and default to `None`.
    # NOTE(review): `links` and `code_blocks` are lists of string lists; the
    # layout of each inner list is not visible in this stub - confirm before
    # indexing into it.
    content: str
    line_count: int
    word_count: int
    character_count: int
    headers: list[str] | None
    links: list[list[str]] | None
    code_blocks: list[list[str]] | None

    def __init__(
        self,
        content: str,
        line_count: int,
        word_count: int,
        character_count: int,
        headers: list[str] | None = None,
        links: list[list[str]] | None = None,
        code_blocks: list[list[str]] | None = None,
    ) -> None: ...
class PptxExtractionResult:
    # Stub for a PPTX extraction result. Required constructor arguments:
    # content, metadata, the three counts, images, hyperlinks and
    # office_metadata. page_structure, page_contents, document and revisions
    # are optional and listed last in the signature.
    content: str
    metadata: PptxMetadata
    slide_count: int
    image_count: int
    table_count: int
    images: list[ExtractedImage]
    page_structure: PageStructure | None
    page_contents: list[PageContent] | None
    document: DocumentStructure | None
    hyperlinks: list[str]
    office_metadata: dict[str, str]
    revisions: list[DocumentRevision] | None

    def __init__(
        self,
        content: str,
        metadata: PptxMetadata,
        slide_count: int,
        image_count: int,
        table_count: int,
        images: list[ExtractedImage],
        hyperlinks: list[str],
        office_metadata: dict[str, str],
        page_structure: PageStructure | None = None,
        page_contents: list[PageContent] | None = None,
        document: DocumentStructure | None = None,
        revisions: list[DocumentRevision] | None = None,
    ) -> None: ...
class EmailExtractionResult:
    # Stub for an email extraction result. The recipient lists, content,
    # attachments and metadata are required constructor arguments; subject,
    # from_email, date, message_id, plain_text and html_content are optional.
    subject: str | None
    from_email: str | None
    to_emails: list[str]
    cc_emails: list[str]
    bcc_emails: list[str]
    date: str | None
    message_id: str | None
    plain_text: str | None
    html_content: str | None
    content: str
    attachments: list[EmailAttachment]
    metadata: dict[str, str]

    def __init__(
        self,
        to_emails: list[str],
        cc_emails: list[str],
        bcc_emails: list[str],
        content: str,
        attachments: list[EmailAttachment],
        metadata: dict[str, str],
        subject: str | None = None,
        from_email: str | None = None,
        date: str | None = None,
        message_id: str | None = None,
        plain_text: str | None = None,
        html_content: str | None = None,
    ) -> None: ...
class EmailAttachment:
name: str | None
filename: str | None
mime_type: str | None
size: int | None
is_image: bool
data: bytes | None
def __init__(
self,
is_image: bool,
name: str | None = None,
filename: str | None = None,
mime_type: str | None = None,
size: int | None = None,
data: bytes | None = None,
) -> None: ...
class OcrExtractionResult:
    # Stub for an OCR extraction result: content, mime type, a nested
    # metadata mapping and a list of OcrTable (all required), with optional
    # ocr_elements and internal_document.
    content: str
    mime_type: str
    metadata: dict[str, dict[str, Any]]
    tables: list[OcrTable]
    ocr_elements: list[OcrElement] | None
    internal_document: str | None

    def __init__(
        self,
        content: str,
        mime_type: str,
        metadata: dict[str, dict[str, Any]],
        tables: list[OcrTable],
        ocr_elements: list[OcrElement] | None = None,
        internal_document: str | None = None,
    ) -> None: ...
class OcrTable:
    # Stub for an OCR table: nested list of cell strings, markdown text and
    # page number (required), plus an optional OcrTableBoundingBox.
    cells: list[list[str]]
    markdown: str
    page_number: int
    bounding_box: OcrTableBoundingBox | None

    def __init__(
        self,
        cells: list[list[str]],
        markdown: str,
        page_number: int,
        bounding_box: OcrTableBoundingBox | None = None,
    ) -> None: ...
class OcrTableBoundingBox:
    # Stub for a box with four required int fields: left, top, right, bottom.
    left: int
    top: int
    right: int
    bottom: int

    def __init__(self, left: int, top: int, right: int, bottom: int) -> None: ...
class ImagePreprocessingConfig:
    # Stub for image preprocessing settings. Every constructor argument is
    # optional (None default); the value used when one is omitted is not
    # visible in this stub. `default()` is a static factory typed to return
    # an ImagePreprocessingConfig.
    target_dpi: int
    auto_rotate: bool
    deskew: bool
    denoise: bool
    contrast_enhance: bool
    binarization_method: str
    invert_colors: bool

    def __init__(
        self,
        target_dpi: int | None = None,
        auto_rotate: bool | None = None,
        deskew: bool | None = None,
        denoise: bool | None = None,
        contrast_enhance: bool | None = None,
        binarization_method: str | None = None,
        invert_colors: bool | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> ImagePreprocessingConfig: ...
class TesseractConfig:
    # Stub for Tesseract settings. Every constructor argument is optional
    # (None default); the value used when one is omitted is not visible in
    # this stub. `default()` is a static factory typed to return a
    # TesseractConfig.
    # NOTE(review): `thresholding_method` is declared as bool, unlike the
    # str-typed `binarization_method` elsewhere in this file — confirm
    # against the generator input.
    language: str
    psm: int
    output_format: str
    oem: int
    min_confidence: float
    preprocessing: ImagePreprocessingConfig | None
    enable_table_detection: bool
    table_min_confidence: float
    table_column_threshold: int
    table_row_threshold_ratio: float
    use_cache: bool
    classify_use_pre_adapted_templates: bool
    language_model_ngram_on: bool
    tessedit_dont_blkrej_good_wds: bool
    tessedit_dont_rowrej_good_wds: bool
    tessedit_enable_dict_correction: bool
    tessedit_char_whitelist: str
    tessedit_char_blacklist: str
    tessedit_use_primary_params_model: bool
    textord_space_size_is_variable: bool
    thresholding_method: bool

    def __init__(
        self,
        language: str | None = None,
        psm: int | None = None,
        output_format: str | None = None,
        oem: int | None = None,
        min_confidence: float | None = None,
        preprocessing: ImagePreprocessingConfig | None = None,
        enable_table_detection: bool | None = None,
        table_min_confidence: float | None = None,
        table_column_threshold: int | None = None,
        table_row_threshold_ratio: float | None = None,
        use_cache: bool | None = None,
        classify_use_pre_adapted_templates: bool | None = None,
        language_model_ngram_on: bool | None = None,
        tessedit_dont_blkrej_good_wds: bool | None = None,
        tessedit_dont_rowrej_good_wds: bool | None = None,
        tessedit_enable_dict_correction: bool | None = None,
        tessedit_char_whitelist: str | None = None,
        tessedit_char_blacklist: str | None = None,
        tessedit_use_primary_params_model: bool | None = None,
        textord_space_size_is_variable: bool | None = None,
        thresholding_method: bool | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> TesseractConfig: ...
class ImagePreprocessingMetadata:
original_dimensions: list[int]
original_dpi: list[float]
target_dpi: int
scale_factor: float
auto_adjusted: bool
final_dpi: int
new_dimensions: list[int] | None
resample_method: str
dimension_clamped: bool
calculated_dpi: int | None
skipped_resize: bool
resize_error: str | None
def __init__(
self,
original_dimensions: list[int],
original_dpi: list[float],
target_dpi: int,
scale_factor: float,
auto_adjusted: bool,
final_dpi: int,
resample_method: str,
dimension_clamped: bool,
skipped_resize: bool,
new_dimensions: list[int] | None = None,
calculated_dpi: int | None = None,
resize_error: str | None = None,
) -> None: ...
class Metadata:
    # Stub for document metadata. Every constructor argument is optional
    # (None default), including `ocr_used` and `additional`, whose attributes
    # are declared non-optional. `is_empty()` is typed to return a bool.
    title: str | None
    subject: str | None
    authors: list[str] | None
    keywords: list[str] | None
    language: str | None
    created_at: str | None
    modified_at: str | None
    created_by: str | None
    modified_by: str | None
    pages: PageStructure | None
    format: FormatMetadata | None
    image_preprocessing: ImagePreprocessingMetadata | None
    json_schema: dict[str, Any] | None
    error: ErrorMetadata | None
    extraction_duration_ms: int | None
    category: str | None
    tags: list[str] | None
    document_version: str | None
    abstract_text: str | None
    output_format: str | None
    ocr_used: bool
    additional: dict[str, dict[str, Any]]

    def __init__(
        self,
        title: str | None = None,
        subject: str | None = None,
        authors: list[str] | None = None,
        keywords: list[str] | None = None,
        language: str | None = None,
        created_at: str | None = None,
        modified_at: str | None = None,
        created_by: str | None = None,
        modified_by: str | None = None,
        pages: PageStructure | None = None,
        format: FormatMetadata | None = None,  # noqa: A002
        image_preprocessing: ImagePreprocessingMetadata | None = None,
        json_schema: dict[str, Any] | None = None,
        error: ErrorMetadata | None = None,
        extraction_duration_ms: int | None = None,
        category: str | None = None,
        tags: list[str] | None = None,
        document_version: str | None = None,
        abstract_text: str | None = None,
        output_format: str | None = None,
        ocr_used: bool | None = None,
        additional: dict[str, dict[str, Any]] | None = None,
    ) -> None: ...
    def is_empty(self) -> bool: ...
class ExcelMetadata:
sheet_count: int | None
sheet_names: list[str] | None
def __init__(
self,
sheet_count: int | None = None,
sheet_names: list[str] | None = None,
) -> None: ...
class EmailMetadata:
from_email: str | None
from_name: str | None
to_emails: list[str]
cc_emails: list[str]
bcc_emails: list[str]
message_id: str | None
attachments: list[str]
def __init__(
self,
from_email: str | None = None,
from_name: str | None = None,
to_emails: list[str] | None = None,
cc_emails: list[str] | None = None,
bcc_emails: list[str] | None = None,
message_id: str | None = None,
attachments: list[str] | None = None,
) -> None: ...
class ArchiveMetadata:
format: str
file_count: int
file_list: list[str]
total_size: int
compressed_size: int | None
def __init__(
self,
format: str | None = None, # noqa: A002
file_count: int | None = None,
file_list: list[str] | None = None,
total_size: int | None = None,
compressed_size: int | None = None,
) -> None: ...
class ImageMetadata:
width: int
height: int
format: str
exif: dict[str, str]
def __init__(
self,
width: int | None = None,
height: int | None = None,
format: str | None = None, # noqa: A002
exif: dict[str, str] | None = None,
) -> None: ...
class XmlMetadata:
element_count: int
unique_elements: list[str]
def __init__(
self,
element_count: int | None = None,
unique_elements: list[str] | None = None,
) -> None: ...
class TextMetadata:
line_count: int
word_count: int
character_count: int
headers: list[str] | None
links: list[list[str]] | None
code_blocks: list[list[str]] | None
def __init__(
self,
line_count: int | None = None,
word_count: int | None = None,
character_count: int | None = None,
headers: list[str] | None = None,
links: list[list[str]] | None = None,
code_blocks: list[list[str]] | None = None,
) -> None: ...
class HeaderMetadata:
level: int
text: str
id: str | None
depth: int
html_offset: int
def __init__(
self,
level: int,
text: str,
depth: int,
html_offset: int,
id: str | None = None, # noqa: A002
) -> None: ...
class LinkMetadata:
    # Stub for a link record: href, text, LinkType, rel list and a nested
    # list of attribute strings are required; title is optional.
    # `link_type` may also be passed as a str.
    href: str
    text: str
    title: str | None
    link_type: LinkType
    rel: list[str]
    attributes: list[list[str]]

    def __init__(
        self,
        href: str,
        text: str,
        link_type: LinkType | str,
        rel: list[str],
        attributes: list[list[str]],
        title: str | None = None,
    ) -> None: ...
class ImageMetadataType:
    # Stub for an image record: src, ImageType and a nested list of
    # attribute strings are required; alt, title and dimensions are optional.
    # `image_type` may also be passed as a str.
    src: str
    alt: str | None
    title: str | None
    dimensions: list[int] | None
    image_type: ImageType
    attributes: list[list[str]]

    def __init__(
        self,
        src: str,
        image_type: ImageType | str,
        attributes: list[list[str]],
        alt: str | None = None,
        title: str | None = None,
        dimensions: list[int] | None = None,
    ) -> None: ...
class StructuredData:
    # Stub for a structured-data record: a StructuredDataType (or str in the
    # constructor), a raw JSON string and an optional schema type.
    data_type: StructuredDataType
    raw_json: str
    schema_type: str | None

    def __init__(
        self,
        data_type: StructuredDataType | str,
        raw_json: str,
        schema_type: str | None = None,
    ) -> None: ...
class HtmlMetadata:
    # Stub for HTML metadata: scalar fields, three str -> str mappings
    # (open_graph, twitter_card, meta_tags) and lists of HeaderMetadata,
    # LinkMetadata, ImageMetadataType and StructuredData. Every constructor
    # argument is optional; `text_direction` may also be passed as a str.
    title: str | None
    description: str | None
    keywords: list[str]
    author: str | None
    canonical_url: str | None
    base_href: str | None
    language: str | None
    text_direction: TextDirection | None
    open_graph: dict[str, str]
    twitter_card: dict[str, str]
    meta_tags: dict[str, str]
    headers: list[HeaderMetadata]
    links: list[LinkMetadata]
    images: list[ImageMetadataType]
    structured_data: list[StructuredData]

    def __init__(
        self,
        title: str | None = None,
        description: str | None = None,
        keywords: list[str] | None = None,
        author: str | None = None,
        canonical_url: str | None = None,
        base_href: str | None = None,
        language: str | None = None,
        text_direction: TextDirection | str | None = None,
        open_graph: dict[str, str] | None = None,
        twitter_card: dict[str, str] | None = None,
        meta_tags: dict[str, str] | None = None,
        headers: list[HeaderMetadata] | None = None,
        links: list[LinkMetadata] | None = None,
        images: list[ImageMetadataType] | None = None,
        structured_data: list[StructuredData] | None = None,
    ) -> None: ...
class OcrMetadata:
language: str
psm: int
output_format: str
table_count: int
table_rows: int | None
table_cols: int | None
def __init__(
self,
language: str | None = None,
psm: int | None = None,
output_format: str | None = None,
table_count: int | None = None,
table_rows: int | None = None,
table_cols: int | None = None,
) -> None: ...
class ErrorMetadata:
    # Stub for an error record: two required strings, error_type and message.
    error_type: str
    message: str

    def __init__(self, error_type: str, message: str) -> None: ...
class PptxMetadata:
slide_count: int
slide_names: list[str]
image_count: int | None
table_count: int | None
def __init__(
self,
slide_count: int | None = None,
slide_names: list[str] | None = None,
image_count: int | None = None,
table_count: int | None = None,
) -> None: ...
class DocxMetadata:
    # Stub for DOCX metadata: optional CoreProperties, DocxAppProperties and
    # a nested custom-properties mapping. All arguments are optional.
    core_properties: CoreProperties | None
    app_properties: DocxAppProperties | None
    custom_properties: dict[str, dict[str, Any]] | None

    def __init__(
        self,
        core_properties: CoreProperties | None = None,
        app_properties: DocxAppProperties | None = None,
        custom_properties: dict[str, dict[str, Any]] | None = None,
    ) -> None: ...
class CsvMetadata:
row_count: int
column_count: int
delimiter: str | None
has_header: bool
column_types: list[str] | None
def __init__(
self,
row_count: int | None = None,
column_count: int | None = None,
delimiter: str | None = None,
has_header: bool | None = None,
column_types: list[str] | None = None,
) -> None: ...
class BibtexMetadata:
    # Stub for BibTeX metadata: entry count, citation keys, authors, an
    # optional YearRange and an optional str -> int entry_types mapping.
    # Every constructor argument is optional.
    entry_count: int
    citation_keys: list[str]
    authors: list[str]
    year_range: YearRange | None
    entry_types: dict[str, int] | None

    def __init__(
        self,
        entry_count: int | None = None,
        citation_keys: list[str] | None = None,
        authors: list[str] | None = None,
        year_range: YearRange | None = None,
        entry_types: dict[str, int] | None = None,
    ) -> None: ...
class CitationMetadata:
    # Stub for citation metadata: citation count, optional format string,
    # authors, an optional YearRange, DOIs and keywords. Every constructor
    # argument is optional.
    citation_count: int
    format: str | None
    authors: list[str]
    year_range: YearRange | None
    dois: list[str]
    keywords: list[str]

    def __init__(
        self,
        citation_count: int | None = None,
        format: str | None = None,  # noqa: A002
        authors: list[str] | None = None,
        year_range: YearRange | None = None,
        dois: list[str] | None = None,
        keywords: list[str] | None = None,
    ) -> None: ...
class YearRange:
min: int | None
max: int | None
years: list[int]
def __init__(
self,
years: list[int],
min: int | None = None, # noqa: A002
max: int | None = None, # noqa: A002
) -> None: ...
class FictionBookMetadata:
genres: list[str]
sequences: list[str]
annotation: str | None
def __init__(
self,
genres: list[str] | None = None,
sequences: list[str] | None = None,
annotation: str | None = None,
) -> None: ...
class DbfMetadata:
    # Stub for DBF metadata: record count, field count and a list of
    # DbfFieldInfo. Every constructor argument is optional.
    record_count: int
    field_count: int
    fields: list[DbfFieldInfo]

    def __init__(
        self,
        record_count: int | None = None,
        field_count: int | None = None,
        fields: list[DbfFieldInfo] | None = None,
    ) -> None: ...
class DbfFieldInfo:
    # Stub for a DBF field description: two required strings, name and
    # field_type.
    name: str
    field_type: str

    def __init__(self, name: str, field_type: str) -> None: ...
class JatsMetadata:
    # Stub for JATS metadata: optional copyright and license strings, a
    # str -> str history_dates mapping and a list of ContributorRole.
    # Every constructor argument is optional.
    copyright: str | None
    license: str | None
    history_dates: dict[str, str]
    contributor_roles: list[ContributorRole]

    def __init__(
        self,
        copyright: str | None = None,  # noqa: A002
        license: str | None = None,  # noqa: A002
        history_dates: dict[str, str] | None = None,
        contributor_roles: list[ContributorRole] | None = None,
    ) -> None: ...
class ContributorRole:
name: str
role: str | None
def __init__(self, name: str, role: str | None = None) -> None: ...
class EpubMetadata:
coverage: str | None
dc_format: str | None
relation: str | None
source: str | None
dc_type: str | None
cover_image: str | None
def __init__(
self,
coverage: str | None = None,
dc_format: str | None = None,
relation: str | None = None,
source: str | None = None,
dc_type: str | None = None,
cover_image: str | None = None,
) -> None: ...
class PstMetadata:
message_count: int
def __init__(self, message_count: int | None = None) -> None: ...
class OcrConfidence:
detection: float | None
recognition: float
def __init__(
self,
detection: float | None = None,
recognition: float | None = None,
) -> None: ...
class OcrRotation:
angle_degrees: float
confidence: float | None
def __init__(self, angle_degrees: float, confidence: float | None = None) -> None: ...
class OcrElement:
    # Stub for an OCR element: text, OcrBoundingGeometry, OcrConfidence,
    # OcrElementLevel, optional OcrRotation, page number, optional parent id
    # and a nested backend_metadata mapping. Every constructor argument is
    # optional; `level` may also be passed as a str.
    text: str
    geometry: OcrBoundingGeometry
    confidence: OcrConfidence
    level: OcrElementLevel
    rotation: OcrRotation | None
    page_number: int
    parent_id: str | None
    backend_metadata: dict[str, dict[str, Any]]

    def __init__(
        self,
        text: str | None = None,
        geometry: OcrBoundingGeometry | None = None,
        confidence: OcrConfidence | None = None,
        level: OcrElementLevel | str | None = None,
        rotation: OcrRotation | None = None,
        page_number: int | None = None,
        parent_id: str | None = None,
        backend_metadata: dict[str, dict[str, Any]] | None = None,
    ) -> None: ...
class OcrElementConfig:
    # Stub for OCR element settings: include_elements flag, minimum
    # OcrElementLevel, minimum confidence and build_hierarchy flag. Every
    # constructor argument is optional; `min_level` may be passed as a str.
    include_elements: bool
    min_level: OcrElementLevel
    min_confidence: float
    build_hierarchy: bool

    def __init__(
        self,
        include_elements: bool | None = None,
        min_level: OcrElementLevel | str | None = None,
        min_confidence: float | None = None,
        build_hierarchy: bool | None = None,
    ) -> None: ...
class PageStructure:
    # Stub for page structure: required total_count and PageUnitType (or
    # str in the constructor), with optional lists of PageBoundary and
    # PageInfo.
    total_count: int
    unit_type: PageUnitType
    boundaries: list[PageBoundary] | None
    pages: list[PageInfo] | None

    def __init__(
        self,
        total_count: int,
        unit_type: PageUnitType | str,
        boundaries: list[PageBoundary] | None = None,
        pages: list[PageInfo] | None = None,
    ) -> None: ...
class PageBoundary:
    # Stub for a page boundary: three required ints, byte_start, byte_end
    # and page_number.
    byte_start: int
    byte_end: int
    page_number: int

    def __init__(self, byte_start: int, byte_end: int, page_number: int) -> None: ...
class PageInfo:
number: int
title: str | None
dimensions: list[float] | None
image_count: int | None
table_count: int | None
hidden: bool | None
is_blank: bool | None
has_vector_graphics: bool
def __init__(
self,
number: int,
has_vector_graphics: bool,
title: str | None = None,
dimensions: list[float] | None = None,
image_count: int | None = None,
table_count: int | None = None,
hidden: bool | None = None,
is_blank: bool | None = None,
) -> None: ...
class PageContent:
    # Stub for one page's content: page number, content string, a list of
    # Table and a list of image indices are required; hierarchy, is_blank,
    # layout_regions, speaker_notes, section_name and sheet_name are optional.
    page_number: int
    content: str
    tables: list[Table]
    image_indices: list[int]
    hierarchy: PageHierarchy | None
    is_blank: bool | None
    layout_regions: list[LayoutRegion] | None
    speaker_notes: str | None
    section_name: str | None
    sheet_name: str | None

    def __init__(
        self,
        page_number: int,
        content: str,
        tables: list[Table],
        image_indices: list[int],
        hierarchy: PageHierarchy | None = None,
        is_blank: bool | None = None,
        layout_regions: list[LayoutRegion] | None = None,
        speaker_notes: str | None = None,
        section_name: str | None = None,
        sheet_name: str | None = None,
    ) -> None: ...
class LayoutRegion:
    # Stub for a layout region: class name string, confidence float, a
    # BoundingBox and an area_fraction float. Every constructor argument is
    # optional.
    class_name: str
    confidence: float
    bounding_box: BoundingBox
    area_fraction: float

    def __init__(
        self,
        class_name: str | None = None,
        confidence: float | None = None,
        bounding_box: BoundingBox | None = None,
        area_fraction: float | None = None,
    ) -> None: ...
class PageHierarchy:
    # Stub for a page hierarchy: a block count and a list of
    # HierarchicalBlock, both required.
    block_count: int
    blocks: list[HierarchicalBlock]

    def __init__(self, block_count: int, blocks: list[HierarchicalBlock]) -> None: ...
class HierarchicalBlock:
text: str
font_size: float
level: str
bbox: list[float] | None
def __init__(
self,
text: str,
font_size: float,
level: str,
bbox: list[float] | None = None,
) -> None: ...
class CellChange:
    # Stub for a single cell change: row and column ints plus `from_` and
    # `to` strings. The trailing underscore avoids the `from` keyword.
    row: int
    col: int
    from_: str
    to: str

    def __init__(self, row: int, col: int, from_: str, to: str) -> None: ...
class DocumentRevision:
    # Stub for a document revision: revision id, RevisionKind (or str in the
    # constructor) and RevisionDelta are required; author, timestamp and
    # RevisionAnchor are optional.
    revision_id: str
    author: str | None
    timestamp: str | None
    kind: RevisionKind
    anchor: RevisionAnchor | None
    delta: RevisionDelta

    def __init__(
        self,
        revision_id: str,
        kind: RevisionKind | str,
        delta: RevisionDelta,
        author: str | None = None,
        timestamp: str | None = None,
        anchor: RevisionAnchor | None = None,
    ) -> None: ...
class RevisionDelta:
    # Stub for a revision delta: a list of DiffLine and a list of
    # CellChange. Both constructor arguments are optional.
    content: list[DiffLine]
    table_changes: list[CellChange]

    def __init__(
        self,
        content: list[DiffLine] | None = None,
        table_changes: list[CellChange] | None = None,
    ) -> None: ...
class Table:
    # Stub for a table: nested list of cell strings, markdown text, page
    # number and an optional BoundingBox. Every constructor argument is
    # optional.
    cells: list[list[str]]
    markdown: str
    page_number: int
    bounding_box: BoundingBox | None

    def __init__(
        self,
        cells: list[list[str]] | None = None,
        markdown: str | None = None,
        page_number: int | None = None,
        bounding_box: BoundingBox | None = None,
    ) -> None: ...
class TableCell:
content: str
row_span: int
col_span: int
is_header: bool
def __init__(
self,
content: str | None = None,
row_span: int | None = None,
col_span: int | None = None,
is_header: bool | None = None,
) -> None: ...
class ExtractedUri:
    # Stub for an extracted URI: required url and UriKind (or str in the
    # constructor), with optional label and page.
    url: str
    label: str | None
    page: int | None
    kind: UriKind

    def __init__(
        self,
        url: str,
        kind: UriKind | str,
        label: str | None = None,
        page: int | None = None,
    ) -> None: ...
class DetectResponse:
mime_type: str
filename: str | None
def __init__(self, mime_type: str, filename: str | None = None) -> None: ...
class DiffOptions:
    # Stub for diff options: include_metadata and include_embedded flags and
    # an optional max_content_chars. Every constructor argument is optional.
    # `default()` is a static factory typed to return a DiffOptions.
    include_metadata: bool
    include_embedded: bool
    max_content_chars: int | None

    def __init__(
        self,
        include_metadata: bool | None = None,
        include_embedded: bool | None = None,
        max_content_chars: int | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> DiffOptions: ...
class ExtractionDiff:
    # Stub for the difference between two extractions: DiffHunk list,
    # added/removed Table lists, TableDiff list, a metadata_changed mapping
    # and an EmbeddedChanges value. All six constructor arguments are required.
    content_diff: list[DiffHunk]
    tables_added: list[Table]
    tables_removed: list[Table]
    tables_changed: list[TableDiff]
    metadata_changed: dict[str, Any]
    embedded_changes: EmbeddedChanges

    def __init__(
        self,
        content_diff: list[DiffHunk],
        tables_added: list[Table],
        tables_removed: list[Table],
        tables_changed: list[TableDiff],
        metadata_changed: dict[str, Any],
        embedded_changes: EmbeddedChanges,
    ) -> None: ...
class DiffHunk:
    # Stub for a diff hunk: from/to line numbers and counts plus a list of
    # DiffLine. All five constructor arguments are required.
    from_line: int
    from_count: int
    to_line: int
    to_count: int
    lines: list[DiffLine]

    def __init__(
        self,
        from_line: int,
        from_count: int,
        to_line: int,
        to_count: int,
        lines: list[DiffLine],
    ) -> None: ...
class TableDiff:
    # Stub for a table diff: from/to indices and a list of CellChange, all
    # required.
    from_index: int
    to_index: int
    cell_changes: list[CellChange]

    def __init__(self, from_index: int, to_index: int, cell_changes: list[CellChange]) -> None: ...
class EmbeddedChanges:
    # Stub for embedded-content changes: added and removed ArchiveEntry
    # lists and a list of EmbeddedDiff, all required.
    added: list[ArchiveEntry]
    removed: list[ArchiveEntry]
    changed: list[EmbeddedDiff]

    def __init__(
        self,
        added: list[ArchiveEntry],
        removed: list[ArchiveEntry],
        changed: list[EmbeddedDiff],
    ) -> None: ...
class EmbeddedDiff:
    # Stub pairing a path string with an ExtractionDiff; both are required.
    path: str
    diff: ExtractionDiff

    def __init__(self, path: str, diff: ExtractionDiff) -> None: ...
class EmbeddingPreset:
    # Stub for an embedding preset: name, chunk size, overlap, model repo,
    # pooling, model file, dimensions and description. All eight constructor
    # arguments are required.
    name: str
    chunk_size: int
    overlap: int
    model_repo: str
    pooling: str
    model_file: str
    dimensions: int
    description: str

    def __init__(
        self,
        name: str,
        chunk_size: int,
        overlap: int,
        model_repo: str,
        pooling: str,
        model_file: str,
        dimensions: int,
        description: str,
    ) -> None: ...
class YakeParams:
    # Stub for YAKE parameters: a single window_size int, optional in the
    # constructor. `default()` is a static factory typed to return a
    # YakeParams.
    window_size: int

    def __init__(self, window_size: int | None = None) -> None: ...
    @staticmethod
    def default() -> YakeParams: ...
class RakeParams:
    # Stub for RAKE parameters: min_word_length and max_words_per_phrase
    # ints, both optional in the constructor. `default()` is a static
    # factory typed to return a RakeParams.
    min_word_length: int
    max_words_per_phrase: int

    def __init__(
        self,
        min_word_length: int | None = None,
        max_words_per_phrase: int | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> RakeParams: ...
class KeywordConfig:
    # Stub for keyword-extraction settings: algorithm (KeywordAlgorithm, or
    # str in the constructor), max_keywords, min_score, ngram_range and an
    # optional language. `default()` is a static factory typed to return a
    # KeywordConfig.
    # NOTE(review): `yake_params` and `rake_params` are declared as
    # attributes but the constructor has no matching parameters — confirm
    # whether that is intentional in the generator input.
    algorithm: KeywordAlgorithm
    max_keywords: int
    min_score: float
    ngram_range: list[int]
    language: str | None
    yake_params: YakeParams | None
    rake_params: RakeParams | None

    def __init__(
        self,
        algorithm: KeywordAlgorithm | str | None = None,
        max_keywords: int | None = None,
        min_score: float | None = None,
        ngram_range: list[int] | None = None,
        language: str | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> KeywordConfig: ...
class Keyword:
    # Stub for an extracted keyword: required text, score and
    # KeywordAlgorithm (or str in the constructor), with optional positions.
    text: str
    score: float
    algorithm: KeywordAlgorithm
    positions: list[int] | None

    def __init__(
        self,
        text: str,
        score: float,
        algorithm: KeywordAlgorithm | str,
        positions: list[int] | None = None,
    ) -> None: ...
class PaddleOcrConfig:
    # Stub for PaddleOCR settings. Every constructor argument is optional
    # (None default). Each `with_*` method takes one value and is typed to
    # return a PaddleOcrConfig; whether it mutates or copies is not visible
    # in this stub. `default()` is a static factory.
    language: str
    cache_dir: str | None
    use_angle_cls: bool
    enable_table_detection: bool
    det_db_thresh: float
    det_db_box_thresh: float
    det_db_unclip_ratio: float
    det_limit_side_len: int
    rec_batch_num: int
    padding: int
    drop_score: float
    model_tier: str

    def __init__(
        self,
        language: str | None = None,
        cache_dir: str | None = None,
        use_angle_cls: bool | None = None,
        enable_table_detection: bool | None = None,
        det_db_thresh: float | None = None,
        det_db_box_thresh: float | None = None,
        det_db_unclip_ratio: float | None = None,
        det_limit_side_len: int | None = None,
        rec_batch_num: int | None = None,
        padding: int | None = None,
        drop_score: float | None = None,
        model_tier: str | None = None,
    ) -> None: ...
    def with_cache_dir(self, path: str) -> PaddleOcrConfig: ...
    def with_table_detection(self, enable: bool) -> PaddleOcrConfig: ...
    def with_angle_cls(self, enable: bool) -> PaddleOcrConfig: ...
    def with_det_db_thresh(self, threshold: float) -> PaddleOcrConfig: ...
    def with_det_db_box_thresh(self, threshold: float) -> PaddleOcrConfig: ...
    def with_det_db_unclip_ratio(self, ratio: float) -> PaddleOcrConfig: ...
    def with_det_limit_side_len(self, length: int) -> PaddleOcrConfig: ...
    def with_rec_batch_num(self, batch_size: int) -> PaddleOcrConfig: ...
    def with_drop_score(self, score: float) -> PaddleOcrConfig: ...
    def with_padding(self, padding: int) -> PaddleOcrConfig: ...
    def with_model_tier(self, tier: str) -> PaddleOcrConfig: ...
    @staticmethod
    def default() -> PaddleOcrConfig: ...
class ModelPaths:
    # Stub holding four required strings: det_model, cls_model, rec_model
    # and dict_file.
    det_model: str
    cls_model: str
    rec_model: str
    dict_file: str

    def __init__(self, det_model: str, cls_model: str, rec_model: str, dict_file: str) -> None: ...
class OrientationResult:
    # Stub for an orientation result: required int degrees and float
    # confidence.
    degrees: int
    confidence: float

    def __init__(self, degrees: int, confidence: float) -> None: ...
class BBox:
    # Stub for a box with four required float fields: x1, y1, x2, y2.
    x1: float
    y1: float
    x2: float
    y2: float

    def __init__(self, x1: float, y1: float, x2: float, y2: float) -> None: ...
class LayoutDetection:
    # Stub for one layout detection: a LayoutClass (or str in the
    # constructor), a confidence float and a BBox, all required.
    class_name: LayoutClass
    confidence: float
    bbox: BBox

    def __init__(self, class_name: LayoutClass | str, confidence: float, bbox: BBox) -> None: ...
class RecognizedTable:
    # Stub for a recognized table: a detection BBox, a nested list of cell
    # strings and markdown text, all required.
    detection_bbox: BBox
    cells: list[list[str]]
    markdown: str

    def __init__(self, detection_bbox: BBox, cells: list[list[str]], markdown: str) -> None: ...
class DetectionResult:
    # Stub for a detection result: page width and height ints and a list of
    # LayoutDetection, all required.
    page_width: int
    page_height: int
    detections: list[LayoutDetection]

    def __init__(
        self,
        page_width: int,
        page_height: int,
        detections: list[LayoutDetection],
    ) -> None: ...
class EmbeddedFile:
name: str
data: bytes
compressed_size: int
mime_type: str | None
def __init__(
self,
name: str,
data: bytes,
compressed_size: int,
mime_type: str | None = None,
) -> None: ...
class PdfMetadata:
pdf_version: str | None
producer: str | None
is_encrypted: bool | None
width: int | None
height: int | None
page_count: int | None
def __init__(
self,
pdf_version: str | None = None,
producer: str | None = None,
is_encrypted: bool | None = None,
width: int | None = None,
height: int | None = None,
page_count: int | None = None,
) -> None: ...
class ExecutionProviderType:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    auto: ExecutionProviderType = ...
    cpu: ExecutionProviderType = ...
    core_ml: ExecutionProviderType = ...
    cuda: ExecutionProviderType = ...
    tensor_rt: ExecutionProviderType = ...

    def __init__(self, value: int | str) -> None: ...
class OutputFormatPlainVariant(TypedDict):
    # TypedDict shape whose only key, `type`, is the literal "plain".
    type: Literal["plain"]
class OutputFormatMarkdownVariant(TypedDict):
    # TypedDict shape whose only key, `type`, is the literal "markdown".
    type: Literal["markdown"]
class OutputFormatDjotVariant(TypedDict):
    # TypedDict shape whose only key, `type`, is the literal "djot".
    type: Literal["djot"]
class OutputFormatHtmlVariant(TypedDict):
    # TypedDict shape whose only key, `type`, is the literal "html".
    type: Literal["html"]
class OutputFormatJsonVariant(TypedDict):
    # TypedDict shape whose only key, `type`, is the literal "json".
    type: Literal["json"]
class OutputFormatStructuredVariant(TypedDict):
    # TypedDict shape whose only key, `type`, is the literal "structured".
    type: Literal["structured"]
class OutputFormatCustomVariant(TypedDict):
    # TypedDict shape tagged with type == "custom" and carrying one string
    # payload under the key `_0`.
    type: Literal["custom"]
    _0: str
class OutputFormat:
    # Stub exposing a `type` string plus explicit __str__/__repr__.
    # Presumably the object counterpart of the OutputFormat*Variant
    # TypedDicts declared before it — confirm against the generator.
    type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
class HtmlTheme:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    default: HtmlTheme = ...
    git_hub: HtmlTheme = ...
    dark: HtmlTheme = ...
    light: HtmlTheme = ...
    unstyled: HtmlTheme = ...

    def __init__(self, value: int | str) -> None: ...
class TableModel:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    tatr: TableModel = ...
    slanet_wired: TableModel = ...
    slanet_wireless: TableModel = ...
    slanet_plus: TableModel = ...
    slanet_auto: TableModel = ...
    disabled: TableModel = ...

    def __init__(self, value: int | str) -> None: ...
class ChunkerType:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    text: ChunkerType = ...
    markdown: ChunkerType = ...
    yaml: ChunkerType = ...
    semantic: ChunkerType = ...

    def __init__(self, value: int | str) -> None: ...
class ChunkSizingCharactersVariant(TypedDict):
    # TypedDict shape whose only key, `type`, is the literal "characters".
    type: Literal["characters"]
class ChunkSizingTokenizerVariant(TypedDict):
type: Literal["tokenizer"]
model: str
cache_dir: str | None
class ChunkSizing:
    # Stub exposing a `type` string plus explicit __str__/__repr__.
    # Presumably the object counterpart of the ChunkSizing*Variant
    # TypedDicts declared before it — confirm against the generator.
    type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
class EmbeddingModelTypePresetVariant(TypedDict):
    # TypedDict shape tagged with type == "preset" and carrying a name string.
    type: Literal["preset"]
    name: str
class EmbeddingModelTypeCustomVariant(TypedDict):
    # TypedDict shape tagged with type == "custom": a model_id string and an
    # int dimensions value.
    type: Literal["custom"]
    model_id: str
    dimensions: int
class EmbeddingModelTypeLlmVariant(TypedDict):
    # TypedDict shape tagged with type == "llm" and carrying an LlmConfig
    # under the key `llm`.
    type: Literal["llm"]
    llm: LlmConfig
class EmbeddingModelTypePluginVariant(TypedDict):
    # TypedDict shape tagged with type == "plugin" and carrying a name string.
    type: Literal["plugin"]
    name: str
class EmbeddingModelType:
    # Stub exposing a `type` string plus explicit __str__/__repr__.
    # Presumably the object counterpart of the EmbeddingModelType*Variant
    # TypedDicts declared before it — confirm against the generator.
    type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
class CodeContentMode:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    chunks: CodeContentMode = ...
    raw: CodeContentMode = ...
    structure: CodeContentMode = ...

    def __init__(self, value: int | str) -> None: ...
class ListType:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    bullet: ListType = ...
    numbered: ListType = ...
    lettered: ListType = ...
    indented: ListType = ...

    def __init__(self, value: int | str) -> None: ...
class OcrBackendType:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    tesseract: OcrBackendType = ...
    easy_ocr: OcrBackendType = ...
    paddle_ocr: OcrBackendType = ...
    custom: OcrBackendType = ...

    def __init__(self, value: int | str) -> None: ...
class ProcessingStage:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    early: ProcessingStage = ...
    middle: ProcessingStage = ...
    late: ProcessingStage = ...

    def __init__(self, value: int | str) -> None: ...
class ReductionLevel:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    off: ReductionLevel = ...
    light: ReductionLevel = ...
    moderate: ReductionLevel = ...
    aggressive: ReductionLevel = ...
    maximum: ReductionLevel = ...

    def __init__(self, value: int | str) -> None: ...
class PdfAnnotationType:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    text: PdfAnnotationType = ...
    highlight: PdfAnnotationType = ...
    link: PdfAnnotationType = ...
    stamp: PdfAnnotationType = ...
    underline: PdfAnnotationType = ...
    strike_out: PdfAnnotationType = ...
    other: PdfAnnotationType = ...

    def __init__(self, value: int | str) -> None: ...
class BlockType:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    paragraph: BlockType = ...
    heading: BlockType = ...
    blockquote: BlockType = ...
    code_block: BlockType = ...
    list_item: BlockType = ...
    ordered_list: BlockType = ...
    bullet_list: BlockType = ...
    task_list: BlockType = ...
    definition_list: BlockType = ...
    definition_term: BlockType = ...
    definition_description: BlockType = ...
    div: BlockType = ...
    section: BlockType = ...
    thematic_break: BlockType = ...
    raw_block: BlockType = ...
    math_display: BlockType = ...

    def __init__(self, value: int | str) -> None: ...
class InlineType:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    text: InlineType = ...
    strong: InlineType = ...
    emphasis: InlineType = ...
    highlight: InlineType = ...
    subscript: InlineType = ...
    superscript: InlineType = ...
    insert: InlineType = ...
    delete: InlineType = ...
    code: InlineType = ...
    link: InlineType = ...
    image: InlineType = ...
    span: InlineType = ...
    math: InlineType = ...
    raw_inline: InlineType = ...
    footnote_ref: InlineType = ...
    symbol: InlineType = ...

    def __init__(self, value: int | str) -> None: ...
class RelationshipKind:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    footnote_reference: RelationshipKind = ...
    citation_reference: RelationshipKind = ...
    internal_link: RelationshipKind = ...
    caption: RelationshipKind = ...
    label: RelationshipKind = ...
    toc_entry: RelationshipKind = ...
    cross_reference: RelationshipKind = ...

    def __init__(self, value: int | str) -> None: ...
class ContentLayer:
    # Enum-like stub: each member is a class attribute typed as the class
    # itself (value elided with `...`). The constructor takes an int or str.
    body: ContentLayer = ...
    header: ContentLayer = ...
    footer: ContentLayer = ...
    footnote: ContentLayer = ...

    def __init__(self, value: int | str) -> None: ...
class NodeContentTitleVariant(TypedDict):
    # TypedDict shape tagged with node_type == "title" and carrying text.
    node_type: Literal["title"]
    text: str
class NodeContentHeadingVariant(TypedDict):
    # TypedDict shape tagged with node_type == "heading": an int level and
    # the heading text.
    node_type: Literal["heading"]
    level: int
    text: str
class NodeContentParagraphVariant(TypedDict):
    # TypedDict shape tagged with node_type == "paragraph" and carrying text.
    node_type: Literal["paragraph"]
    text: str
class NodeContentListVariant(TypedDict):
    # TypedDict shape tagged with node_type == "list" and carrying an
    # `ordered` flag.
    node_type: Literal["list"]
    ordered: bool
class NodeContentListItemVariant(TypedDict):
    # TypedDict shape tagged with node_type == "list_item" and carrying text.
    node_type: Literal["list_item"]
    text: str
class NodeContentTableVariant(TypedDict):
    # TypedDict shape tagged with node_type == "table" and carrying a
    # TableGrid under the key `grid`.
    node_type: Literal["table"]
    grid: TableGrid
class NodeContentImageVariant(TypedDict):
node_type: Literal["image"]
description: str | None
image_index: int | None
src: str | None
class NodeContentCodeVariant(TypedDict):
node_type: Literal["code"]
text: str
language: str | None
class NodeContentQuoteVariant(TypedDict):
    # TypedDict shape whose only key, `node_type`, is the literal "quote".
    node_type: Literal["quote"]
class NodeContentFormulaVariant(TypedDict):
    # TypedDict shape tagged with node_type == "formula" and carrying text.
    node_type: Literal["formula"]
    text: str
class NodeContentFootnoteVariant(TypedDict):
    # TypedDict shape tagged with node_type == "footnote" and carrying text.
    node_type: Literal["footnote"]
    text: str
class NodeContentGroupVariant(TypedDict):
node_type: Literal["group"]
label: str | None
heading_level: int | None
heading_text: str | None
class NodeContentPageBreakVariant(TypedDict):
    # TypedDict shape whose only key, `node_type`, is the literal
    # "page_break".
    node_type: Literal["page_break"]
class NodeContentSlideVariant(TypedDict):
node_type: Literal["slide"]
number: int
title: str | None
class NodeContentDefinitionListVariant(TypedDict):
    # TypedDict shape whose only key, `node_type`, is the literal
    # "definition_list".
    node_type: Literal["definition_list"]
class NodeContentDefinitionItemVariant(TypedDict):
    # TypedDict shape tagged with node_type == "definition_item": a term
    # string and a definition string.
    node_type: Literal["definition_item"]
    term: str
    definition: str
class NodeContentCitationVariant(TypedDict):
    # TypedDict shape tagged with node_type == "citation": a key string and
    # a text string.
    node_type: Literal["citation"]
    key: str
    text: str
class NodeContentAdmonitionVariant(TypedDict):
node_type: Literal["admonition"]
kind: str
title: str | None
class NodeContentRawBlockVariant(TypedDict):
    # TypedDict shape tagged with node_type == "raw_block": a format string
    # and a content string.
    node_type: Literal["raw_block"]
    format: str
    content: str
class NodeContentMetadataBlockVariant(TypedDict):
    # TypedDict shape tagged with node_type == "metadata_block" and carrying
    # `entries` as a nested list of strings.
    node_type: Literal["metadata_block"]
    entries: list[list[str]]
# Class exposing a ``node_type`` string attribute plus explicit ``__str__`` and
# ``__repr__`` declarations (hence the PYI029 suppressions).
class NodeContent:
    node_type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# TypedDict tagged ``annotation_type == "bold"``; it has no payload keys besides the tag.
class AnnotationKindBoldVariant(TypedDict):
    annotation_type: Literal["bold"]  # discriminator tag
# TypedDict tagged ``annotation_type == "italic"``; it has no payload keys besides the tag.
class AnnotationKindItalicVariant(TypedDict):
    annotation_type: Literal["italic"]  # discriminator tag
# TypedDict tagged ``annotation_type == "underline"``; it has no payload keys besides the tag.
class AnnotationKindUnderlineVariant(TypedDict):
    annotation_type: Literal["underline"]  # discriminator tag
# TypedDict tagged ``annotation_type == "strikethrough"``; it has no payload keys besides the tag.
class AnnotationKindStrikethroughVariant(TypedDict):
    annotation_type: Literal["strikethrough"]  # discriminator tag
# TypedDict tagged ``annotation_type == "code"``; it has no payload keys besides the tag.
class AnnotationKindCodeVariant(TypedDict):
    annotation_type: Literal["code"]  # discriminator tag
# TypedDict tagged ``annotation_type == "subscript"``; it has no payload keys besides the tag.
class AnnotationKindSubscriptVariant(TypedDict):
    annotation_type: Literal["subscript"]  # discriminator tag
# TypedDict tagged ``annotation_type == "superscript"``; it has no payload keys besides the tag.
class AnnotationKindSuperscriptVariant(TypedDict):
    annotation_type: Literal["superscript"]  # discriminator tag
class AnnotationKindLinkVariant(TypedDict):
annotation_type: Literal["link"]
url: str
title: str | None
# TypedDict tagged ``annotation_type == "highlight"``; it has no payload keys besides the tag.
class AnnotationKindHighlightVariant(TypedDict):
    annotation_type: Literal["highlight"]  # discriminator tag
# TypedDict tagged ``annotation_type == "color"``; ``value`` is a string.
class AnnotationKindColorVariant(TypedDict):
    annotation_type: Literal["color"]  # discriminator tag
    value: str
# TypedDict tagged ``annotation_type == "font_size"``; note that ``value`` is typed
# as a string, not a number.
class AnnotationKindFontSizeVariant(TypedDict):
    annotation_type: Literal["font_size"]  # discriminator tag
    value: str
class AnnotationKindCustomVariant(TypedDict):
annotation_type: Literal["custom"]
name: str
value: str | None
# Class exposing an ``annotation_type`` string attribute plus explicit ``__str__``
# and ``__repr__`` declarations (hence the PYI029 suppressions).
class AnnotationKind:
    annotation_type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
class ExtractionMethod:
    native: ExtractionMethod = ...
    ocr: ExtractionMethod = ...
    mixed: ExtractionMethod = ...

    def __init__(self, value: int | str) -> None: ...
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
class ChunkType:
    heading: ChunkType = ...
    party_list: ChunkType = ...
    definitions: ChunkType = ...
    operative_clause: ChunkType = ...
    signature_block: ChunkType = ...
    schedule: ChunkType = ...
    table_like: ChunkType = ...
    formula: ChunkType = ...
    code_block: ChunkType = ...
    image: ChunkType = ...
    org_chart: ChunkType = ...
    diagram: ChunkType = ...
    unknown: ChunkType = ...

    def __init__(self, value: int | str) -> None: ...
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
class ImageKind:
    photograph: ImageKind = ...
    diagram: ImageKind = ...
    chart: ImageKind = ...
    drawing: ImageKind = ...
    text_block: ImageKind = ...
    decoration: ImageKind = ...
    logo: ImageKind = ...
    icon: ImageKind = ...
    tile_fragment: ImageKind = ...
    mask: ImageKind = ...
    page_raster: ImageKind = ...
    unknown: ImageKind = ...

    def __init__(self, value: int | str) -> None: ...
# Enum-style class with two members, each typed as the class itself; the
# constructor accepts an int or str value.
class ResultFormat:
    unified: ResultFormat = ...
    element_based: ResultFormat = ...

    def __init__(self, value: int | str) -> None: ...
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value. The first member is spelled
# ``title_`` (trailing underscore) in this stub.
class ElementType:
    title_: ElementType = ...
    narrative_text: ElementType = ...
    heading: ElementType = ...
    list_item: ElementType = ...
    table: ElementType = ...
    image: ElementType = ...
    page_break: ElementType = ...
    code_block: ElementType = ...
    block_quote: ElementType = ...
    footer: ElementType = ...
    header: ElementType = ...

    def __init__(self, value: int | str) -> None: ...
# TypedDict tagged ``format_type == "pdf"``; the payload sits under the key ``_0``.
class FormatMetadataPdfVariant(TypedDict):
    format_type: Literal["pdf"]  # discriminator tag
    _0: PdfMetadata
# TypedDict tagged ``format_type == "docx"``; the payload sits under the key ``_0``.
class FormatMetadataDocxVariant(TypedDict):
    format_type: Literal["docx"]  # discriminator tag
    _0: DocxMetadata
# TypedDict tagged ``format_type == "excel"``; the payload sits under the key ``_0``.
class FormatMetadataExcelVariant(TypedDict):
    format_type: Literal["excel"]  # discriminator tag
    _0: ExcelMetadata
# TypedDict tagged ``format_type == "email"``; the payload sits under the key ``_0``.
class FormatMetadataEmailVariant(TypedDict):
    format_type: Literal["email"]  # discriminator tag
    _0: EmailMetadata
# TypedDict tagged ``format_type == "pptx"``; the payload sits under the key ``_0``.
class FormatMetadataPptxVariant(TypedDict):
    format_type: Literal["pptx"]  # discriminator tag
    _0: PptxMetadata
# TypedDict tagged ``format_type == "archive"``; the payload sits under the key ``_0``.
class FormatMetadataArchiveVariant(TypedDict):
    format_type: Literal["archive"]  # discriminator tag
    _0: ArchiveMetadata
# TypedDict tagged ``format_type == "image"``; the payload sits under the key ``_0``.
class FormatMetadataImageVariant(TypedDict):
    format_type: Literal["image"]  # discriminator tag
    _0: ImageMetadata
# TypedDict tagged ``format_type == "xml"``; the payload sits under the key ``_0``.
class FormatMetadataXmlVariant(TypedDict):
    format_type: Literal["xml"]  # discriminator tag
    _0: XmlMetadata
# TypedDict tagged ``format_type == "text"``; the payload sits under the key ``_0``.
class FormatMetadataTextVariant(TypedDict):
    format_type: Literal["text"]  # discriminator tag
    _0: TextMetadata
# TypedDict tagged ``format_type == "html"``; the payload sits under the key ``_0``.
class FormatMetadataHtmlVariant(TypedDict):
    format_type: Literal["html"]  # discriminator tag
    _0: HtmlMetadata
# TypedDict tagged ``format_type == "ocr"``; the payload sits under the key ``_0``.
class FormatMetadataOcrVariant(TypedDict):
    format_type: Literal["ocr"]  # discriminator tag
    _0: OcrMetadata
# TypedDict tagged ``format_type == "csv"``; the payload sits under the key ``_0``.
class FormatMetadataCsvVariant(TypedDict):
    format_type: Literal["csv"]  # discriminator tag
    _0: CsvMetadata
# TypedDict tagged ``format_type == "bibtex"``; the payload sits under the key ``_0``.
class FormatMetadataBibtexVariant(TypedDict):
    format_type: Literal["bibtex"]  # discriminator tag
    _0: BibtexMetadata
# TypedDict tagged ``format_type == "citation"``; the payload sits under the key ``_0``.
class FormatMetadataCitationVariant(TypedDict):
    format_type: Literal["citation"]  # discriminator tag
    _0: CitationMetadata
# TypedDict tagged ``format_type == "fiction_book"``; the payload sits under the key ``_0``.
class FormatMetadataFictionBookVariant(TypedDict):
    format_type: Literal["fiction_book"]  # discriminator tag
    _0: FictionBookMetadata
# TypedDict tagged ``format_type == "dbf"``; the payload sits under the key ``_0``.
class FormatMetadataDbfVariant(TypedDict):
    format_type: Literal["dbf"]  # discriminator tag
    _0: DbfMetadata
# TypedDict tagged ``format_type == "jats"``; the payload sits under the key ``_0``.
class FormatMetadataJatsVariant(TypedDict):
    format_type: Literal["jats"]  # discriminator tag
    _0: JatsMetadata
# TypedDict tagged ``format_type == "epub"``; the payload sits under the key ``_0``.
class FormatMetadataEpubVariant(TypedDict):
    format_type: Literal["epub"]  # discriminator tag
    _0: EpubMetadata
# TypedDict tagged ``format_type == "pst"``; the payload sits under the key ``_0``.
class FormatMetadataPstVariant(TypedDict):
    format_type: Literal["pst"]  # discriminator tag
    _0: PstMetadata
# TypedDict tagged ``format_type == "code"``. In this stub the payload under
# ``_0`` is typed as a plain string rather than a metadata class.
class FormatMetadataCodeVariant(TypedDict):
    format_type: Literal["code"]  # discriminator tag
    _0: str
# Class exposing a ``format_type`` string attribute plus explicit ``__str__`` and
# ``__repr__`` declarations (hence the PYI029 suppressions).
class FormatMetadata:
    format_type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
class TextDirection:
    left_to_right: TextDirection = ...
    right_to_left: TextDirection = ...
    auto: TextDirection = ...

    def __init__(self, value: int | str) -> None: ...
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
class LinkType:
    anchor: LinkType = ...
    internal: LinkType = ...
    external: LinkType = ...
    email: LinkType = ...
    phone: LinkType = ...
    other: LinkType = ...

    def __init__(self, value: int | str) -> None: ...
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
class ImageType:
    data_uri: ImageType = ...
    inline_svg: ImageType = ...
    external: ImageType = ...
    relative: ImageType = ...

    def __init__(self, value: int | str) -> None: ...
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
class StructuredDataType:
    json_ld: StructuredDataType = ...
    microdata: StructuredDataType = ...
    rd_fa: StructuredDataType = ...  # NOTE(review): presumably "RDFa" after snake_casing -- confirm with the generator

    def __init__(self, value: int | str) -> None: ...
# TypedDict tagged ``type == "rectangle"``; left/top/width/height are all ints.
class OcrBoundingGeometryRectangleVariant(TypedDict):
    type: Literal["rectangle"]  # discriminator tag (the key name is part of the data shape)
    left: int
    top: int
    width: int
    height: int
# TypedDict tagged ``type == "quadrilateral"``.
class OcrBoundingGeometryQuadrilateralVariant(TypedDict):
    type: Literal["quadrilateral"]  # discriminator tag (the key name is part of the data shape)
    # NOTE(review): typed as ``str`` by the generator; a quadrilateral would normally
    # be a list of corner points -- confirm what the native side actually returns.
    points: str
# Class exposing a ``type`` string attribute plus explicit ``__str__`` and
# ``__repr__`` declarations (hence the PYI029 suppressions).
class OcrBoundingGeometry:
    type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
class OcrElementLevel:
    word: OcrElementLevel = ...
    line: OcrElementLevel = ...
    block: OcrElementLevel = ...
    page: OcrElementLevel = ...

    def __init__(self, value: int | str) -> None: ...
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
class PageUnitType:
    page: PageUnitType = ...
    slide: PageUnitType = ...
    sheet: PageUnitType = ...

    def __init__(self, value: int | str) -> None: ...
# TypedDict tagged ``kind == "context"``; the string payload sits under the key ``_0``.
class DiffLineContextVariant(TypedDict):
    kind: Literal["context"]  # discriminator tag
    _0: str
# TypedDict tagged ``kind == "added"``; the string payload sits under the key ``_0``.
class DiffLineAddedVariant(TypedDict):
    kind: Literal["added"]  # discriminator tag
    _0: str
# TypedDict tagged ``kind == "removed"``; the string payload sits under the key ``_0``.
class DiffLineRemovedVariant(TypedDict):
    kind: Literal["removed"]  # discriminator tag
    _0: str
# Class exposing a ``kind`` string attribute plus explicit ``__str__`` and
# ``__repr__`` declarations (hence the PYI029 suppressions).
class DiffLine:
    kind: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
class RevisionKind:
    insertion: RevisionKind = ...
    deletion: RevisionKind = ...
    format_change: RevisionKind = ...
    comment: RevisionKind = ...

    def __init__(self, value: int | str) -> None: ...
# TypedDict tagged ``type == "paragraph"``; ``index`` is an int.
class RevisionAnchorParagraphVariant(TypedDict):
    type: Literal["paragraph"]  # discriminator tag (the key name is part of the data shape)
    index: int
# TypedDict tagged ``type == "table_cell"``; row, col and table_index are all ints.
class RevisionAnchorTableCellVariant(TypedDict):
    type: Literal["table_cell"]  # discriminator tag (the key name is part of the data shape)
    row: int
    col: int
    table_index: int
# TypedDict tagged ``type == "page"``; ``index`` is an int.
class RevisionAnchorPageVariant(TypedDict):
    type: Literal["page"]  # discriminator tag (the key name is part of the data shape)
    index: int
# TypedDict tagged ``type == "slide"``; ``index`` is an int.
class RevisionAnchorSlideVariant(TypedDict):
    type: Literal["slide"]  # discriminator tag (the key name is part of the data shape)
    index: int
class RevisionAnchorSheetVariant(TypedDict):
type: Literal["sheet"]
index: int
name: str | None
# Class exposing a ``type`` string attribute plus explicit ``__str__`` and
# ``__repr__`` declarations (hence the PYI029 suppressions).
class RevisionAnchor:
    type: str

    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
class UriKind:
    hyperlink: UriKind = ...
    image: UriKind = ...
    anchor: UriKind = ...
    citation: UriKind = ...
    reference: UriKind = ...
    email: UriKind = ...

    def __init__(self, value: int | str) -> None: ...
# Enum-style class with two members, each typed as the class itself; the
# constructor accepts an int or str value.
class KeywordAlgorithm:
    yake: KeywordAlgorithm = ...
    rake: KeywordAlgorithm = ...

    def __init__(self, value: int | str) -> None: ...
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
# NOTE(review): the member names look like OCR page-segmentation modes -- confirm.
class PSMMode:
    osd_only: PSMMode = ...
    auto_osd: PSMMode = ...
    auto_only: PSMMode = ...
    auto: PSMMode = ...
    single_column: PSMMode = ...
    single_block_vertical: PSMMode = ...
    single_block: PSMMode = ...
    single_line: PSMMode = ...
    single_word: PSMMode = ...
    circle_word: PSMMode = ...
    single_char: PSMMode = ...

    def __init__(self, value: int | str) -> None: ...
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value.
class PaddleLanguage:
    english: PaddleLanguage = ...
    chinese: PaddleLanguage = ...
    japanese: PaddleLanguage = ...
    korean: PaddleLanguage = ...
    german: PaddleLanguage = ...
    french: PaddleLanguage = ...
    latin: PaddleLanguage = ...
    cyrillic: PaddleLanguage = ...
    traditional_chinese: PaddleLanguage = ...
    thai: PaddleLanguage = ...
    greek: PaddleLanguage = ...
    east_slavic: PaddleLanguage = ...
    arabic: PaddleLanguage = ...
    devanagari: PaddleLanguage = ...
    tamil: PaddleLanguage = ...
    telugu: PaddleLanguage = ...

    def __init__(self, value: int | str) -> None: ...
# Enum-style class: each member is a class attribute typed as the class itself,
# and the constructor accepts an int or str value. One member is spelled
# ``title_`` (trailing underscore) in this stub.
class LayoutClass:
    caption: LayoutClass = ...
    footnote: LayoutClass = ...
    formula: LayoutClass = ...
    list_item: LayoutClass = ...
    page_footer: LayoutClass = ...
    page_header: LayoutClass = ...
    picture: LayoutClass = ...
    section_header: LayoutClass = ...
    table: LayoutClass = ...
    text: LayoutClass = ...
    title_: LayoutClass = ...
    document_index: LayoutClass = ...
    code: LayoutClass = ...
    checkbox_selected: LayoutClass = ...
    checkbox_unselected: LayoutClass = ...
    form: LayoutClass = ...
    key_value_region: LayoutClass = ...

    def __init__(self, value: int | str) -> None: ...
# Coroutine stub: takes raw ``bytes``, a MIME type string and an
# ``ExtractionConfig`` (all required) and resolves to an ``ExtractionResult``.
async def extract_bytes(
    content: bytes,
    mime_type: str,
    config: ExtractionConfig,
) -> ExtractionResult: ...
# Coroutine stub: takes a path string and an ``ExtractionConfig``; ``mime_type``
# is optional and defaults to ``None``. Resolves to an ``ExtractionResult``.
async def extract_file(
    path: str,
    config: ExtractionConfig,
    mime_type: str | None = None,
) -> ExtractionResult: ...
# Synchronous stub: takes a path string and an ``ExtractionConfig``; ``mime_type``
# is optional and defaults to ``None``. Returns an ``ExtractionResult``.
def extract_file_sync(
    path: str,
    config: ExtractionConfig,
    mime_type: str | None = None,
) -> ExtractionResult: ...
# Synchronous stub: takes raw ``bytes``, a MIME type string and an
# ``ExtractionConfig`` (all required) and returns an ``ExtractionResult``.
def extract_bytes_sync(
    content: bytes,
    mime_type: str,
    config: ExtractionConfig,
) -> ExtractionResult: ...
# Synchronous stub: takes a list of ``BatchFileItem`` and one ``ExtractionConfig``
# and returns a list of ``ExtractionResult``.
def batch_extract_files_sync(
    items: list[BatchFileItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...
# Synchronous stub: takes a list of ``BatchBytesItem`` and one ``ExtractionConfig``
# and returns a list of ``ExtractionResult``.
def batch_extract_bytes_sync(
    items: list[BatchBytesItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...
# Coroutine stub: takes a list of ``BatchFileItem`` and one ``ExtractionConfig``
# and resolves to a list of ``ExtractionResult``.
async def batch_extract_files(
    items: list[BatchFileItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...
# Coroutine stub: takes a list of ``BatchBytesItem`` and one ``ExtractionConfig``
# and resolves to a list of ``ExtractionResult``.
async def batch_extract_bytes(
    items: list[BatchBytesItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...
# Synchronous stub: takes raw ``bytes`` and returns a string
# (presumably the detected MIME type -- confirm against the native implementation).
def detect_mime_type_from_bytes(content: bytes) -> str: ...
# Synchronous stub: takes a MIME type string and returns a list of strings
# (presumably file extensions -- confirm against the native implementation).
def get_extensions_for_mime(mime_type: str) -> list[str]: ...
# Takes no arguments and returns ``None``. Presumably empties the embedding-backend
# registry -- confirm against the native implementation.
def clear_embedding_backends() -> None: ...
# Takes no arguments and returns a list of strings (presumably the names of the
# registered embedding backends -- confirm against the native implementation).
def list_embedding_backends() -> list[str]: ...
# Takes no arguments and returns a list of strings (presumably the names of the
# registered document extractors -- confirm against the native implementation).
def list_document_extractors() -> list[str]: ...
# Takes no arguments and returns ``None``. Presumably empties the document-extractor
# registry -- confirm against the native implementation.
def clear_document_extractors() -> None: ...
# Takes no arguments and returns a list of strings (presumably the names of the
# registered OCR backends -- confirm against the native implementation).
def list_ocr_backends() -> list[str]: ...
# Takes no arguments and returns ``None``. Presumably empties the OCR-backend
# registry -- confirm against the native implementation.
def clear_ocr_backends() -> None: ...
# Takes no arguments and returns a list of strings (presumably the names of the
# registered post-processors -- confirm against the native implementation).
def list_post_processors() -> list[str]: ...
# Takes no arguments and returns ``None``. Presumably empties the post-processor
# registry -- confirm against the native implementation.
def clear_post_processors() -> None: ...
# Takes no arguments and returns a list of strings (presumably the names of the
# registered renderers -- confirm against the native implementation).
def list_renderers() -> list[str]: ...
# Takes no arguments and returns ``None``. Presumably empties the renderer
# registry -- confirm against the native implementation.
def clear_renderers() -> None: ...
# Takes no arguments and returns a list of strings (presumably the names of the
# registered validators -- confirm against the native implementation).
def list_validators() -> list[str]: ...
# Takes no arguments and returns ``None``. Presumably empties the validator
# registry -- confirm against the native implementation.
def clear_validators() -> None: ...
# Synchronous stub: takes two ``ExtractionResult`` values and a ``DiffOptions``
# (all required) and returns an ``ExtractionDiff``.
def compare(
    a: ExtractionResult,
    b: ExtractionResult,
    opts: DiffOptions,
) -> ExtractionDiff: ...
# Coroutine stub: takes a list of strings and an ``EmbeddingConfig`` and resolves
# to a list of float lists (presumably one vector per input text -- confirm).
async def embed_texts_async(
    texts: list[str],
    config: EmbeddingConfig,
) -> list[list[float]]: ...
def render_pdf_page_to_png(
pdf_bytes: bytes,
page_index: int,
dpi: int | None = None,
password: str | None = None,
) -> bytes: ...
# Synchronous stub: takes a path string and a required ``check_exists`` flag and
# returns a string (presumably the detected MIME type -- confirm).
def detect_mime_type(
    path: str,
    check_exists: bool,
) -> str: ...
# Synchronous stub: takes a list of strings and an ``EmbeddingConfig`` and returns
# a list of float lists (presumably one vector per input text -- confirm).
def embed_texts(
    texts: list[str],
    config: EmbeddingConfig,
) -> list[list[float]]: ...
# Synchronous stub: takes a preset name and returns an ``EmbeddingPreset`` or
# ``None`` (presumably ``None`` when the name is unknown -- confirm).
def get_embedding_preset(name: str) -> EmbeddingPreset | None: ...
# Takes no arguments and returns a list of strings (presumably the available
# embedding preset names -- confirm against the native implementation).
def list_embedding_presets() -> list[str]: ...
# Takes a single ``backend`` typed as plain ``object`` and returns ``None``.
# NOTE(review): the methods the backend must provide are not visible in this stub.
def register_ocr_backend(backend: object) -> None: ...
# Takes a backend name string and returns ``None``.
def unregister_ocr_backend(name: str) -> None: ...

# ``clear_ocr_backends`` is already declared once earlier in this stub. The repeated
# declaration that used to follow here was dropped: redefining a name in a stub is
# reported by type checkers and linters (mypy ``no-redef``, ruff F811).
# Takes a single ``backend`` typed as plain ``object`` and returns ``None``.
# NOTE(review): the methods the backend must provide are not visible in this stub.
def register_post_processor(backend: object) -> None: ...
# Takes a post-processor name string and returns ``None``.
def unregister_post_processor(name: str) -> None: ...

# ``clear_post_processors`` is already declared once earlier in this stub. The repeated
# declaration that used to follow here was dropped: redefining a name in a stub is
# reported by type checkers and linters (mypy ``no-redef``, ruff F811).
# Takes a single ``backend`` typed as plain ``object`` and returns ``None``.
# NOTE(review): the methods the backend must provide are not visible in this stub.
def register_validator(backend: object) -> None: ...
# Takes a validator name string and returns ``None``.
def unregister_validator(name: str) -> None: ...

# ``clear_validators`` is already declared once earlier in this stub. The repeated
# declaration that used to follow here was dropped: redefining a name in a stub is
# reported by type checkers and linters (mypy ``no-redef``, ruff F811).
# Takes a single ``backend`` typed as plain ``object`` and returns ``None``.
# NOTE(review): the methods the backend must provide are not visible in this stub.
def register_embedding_backend(backend: object) -> None: ...
# Takes a backend name string and returns ``None``.
def unregister_embedding_backend(name: str) -> None: ...

# ``clear_embedding_backends`` is already declared once earlier in this stub. The repeated
# declaration that used to follow here was dropped: redefining a name in a stub is
# reported by type checkers and linters (mypy ``no-redef``, ruff F811).
# Takes a single ``backend`` typed as plain ``object`` and returns ``None``.
# NOTE(review): the methods the backend must provide are not visible in this stub.
def register_document_extractor(backend: object) -> None: ...
# Takes an extractor name string and returns ``None``.
def unregister_document_extractor(name: str) -> None: ...

# ``clear_document_extractors`` is already declared once earlier in this stub. The repeated
# declaration that used to follow here was dropped: redefining a name in a stub is
# reported by type checkers and linters (mypy ``no-redef``, ruff F811).
# Takes a single ``backend`` typed as plain ``object`` and returns ``None``.
# NOTE(review): the methods the backend must provide are not visible in this stub.
def register_renderer(backend: object) -> None: ...
# Takes a renderer name string and returns ``None``.
def unregister_renderer(name: str) -> None: ...

# ``clear_renderers`` is already declared once earlier in this stub. The repeated
# declaration that used to follow here was dropped: redefining a name in a stub is
# reported by type checkers and linters (mypy ``no-redef``, ruff F811).