# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef

from typing import Any, Literal, TypedDict
|
||
|
|
|
||
|
|
class CacheStats:
    """Type stub for ``CacheStats``.

    Declares five numeric attributes and a constructor that takes all of
    them as required arguments, in declaration order. The ``...`` body is a
    placeholder, so this block defines no runtime behavior.
    """

    total_files: int
    total_size_mb: float
    available_space_mb: float
    oldest_file_age_days: float
    newest_file_age_days: float

    def __init__(
        self,
        total_files: int,
        total_size_mb: float,
        available_space_mb: float,
        oldest_file_age_days: float,
        newest_file_age_days: float,
    ) -> None: ...
|
||
|
|
|
||
|
|
class AccelerationConfig:
    """Type stub for ``AccelerationConfig``.

    Declares ``provider`` and ``device_id`` and a constructor in which both
    are optional. ``provider`` may be passed as an ``ExecutionProviderType``
    or as a plain string.
    """

    provider: ExecutionProviderType
    device_id: int

    # NOTE(review): attributes are typed non-optional while the constructor
    # accepts None — presumably None selects a built-in default; confirm
    # against the native binding.
    def __init__(
        self,
        provider: ExecutionProviderType | str | None = None,
        device_id: int | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class ContentFilterConfig:
    """Type stub for ``ContentFilterConfig``.

    Declares four boolean attributes, a constructor in which each of them is
    optional, and a static ``default()`` factory returning a
    ``ContentFilterConfig``.
    """

    include_headers: bool
    include_footers: bool
    strip_repeating_text: bool
    include_watermarks: bool

    # NOTE(review): attributes are typed ``bool`` while the constructor
    # accepts None — presumably None selects a built-in default; confirm
    # against the native binding.
    def __init__(
        self,
        include_headers: bool | None = None,
        include_footers: bool | None = None,
        strip_repeating_text: bool | None = None,
        include_watermarks: bool | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> ContentFilterConfig: ...
|
||
|
|
|
||
|
|
class EmailConfig:
|
||
|
|
msg_fallback_codepage: int | None
|
||
|
|
def __init__(self, msg_fallback_codepage: int | None = None) -> None: ...
|
||
|
|
|
||
|
|
class ExtractionConfig:
    """Type stub for ``ExtractionConfig``.

    Declares the typed attributes below, a constructor in which every
    argument is optional, a ``needs_image_processing()`` method returning
    ``bool`` and a static ``default()`` factory. Bodies are ``...``
    placeholders, so this block defines no runtime behavior.
    """

    use_cache: bool
    enable_quality_processing: bool
    ocr: OcrConfig | None
    force_ocr: bool
    force_ocr_pages: list[int] | None
    disable_ocr: bool
    chunking: ChunkingConfig | None
    content_filter: ContentFilterConfig | None
    images: ImageExtractionConfig | None
    pdf_options: PdfConfig | None
    token_reduction: TokenReductionOptions | None
    language_detection: LanguageDetectionConfig | None
    pages: PageConfig | None
    keywords: KeywordConfig | None
    postprocessor: PostProcessorConfig | None
    html_options: str | None
    html_output: HtmlOutputConfig | None
    extraction_timeout_secs: int | None
    max_concurrent_extractions: int | None
    result_format: ResultFormat
    security_limits: SecurityLimits | None
    max_embedded_file_bytes: int | None
    output_format: OutputFormat
    layout: LayoutDetectionConfig | None
    use_layout_for_markdown: bool
    include_document_structure: bool
    acceleration: AccelerationConfig | None
    cache_namespace: str | None
    cache_ttl_secs: int | None
    email: EmailConfig | None
    concurrency: str | None
    max_archive_depth: int
    tree_sitter: TreeSitterConfig | None
    structured_extraction: StructuredExtractionConfig | None
    cancel_token: str | None

    # NOTE(review): the attributes pdf_options, keywords, html_options,
    # html_output, layout and tree_sitter have no matching constructor
    # parameter. The signature is kept exactly as generated; confirm against
    # the native binding whether that omission is intentional.
    def __init__(
        self,
        use_cache: bool | None = None,
        enable_quality_processing: bool | None = None,
        ocr: OcrConfig | None = None,
        force_ocr: bool | None = None,
        force_ocr_pages: list[int] | None = None,
        disable_ocr: bool | None = None,
        chunking: ChunkingConfig | None = None,
        content_filter: ContentFilterConfig | None = None,
        images: ImageExtractionConfig | None = None,
        token_reduction: TokenReductionOptions | None = None,
        language_detection: LanguageDetectionConfig | None = None,
        pages: PageConfig | None = None,
        postprocessor: PostProcessorConfig | None = None,
        extraction_timeout_secs: int | None = None,
        max_concurrent_extractions: int | None = None,
        result_format: ResultFormat | str | None = None,
        security_limits: SecurityLimits | None = None,
        max_embedded_file_bytes: int | None = None,
        output_format: OutputFormat | None = None,
        use_layout_for_markdown: bool | None = None,
        include_document_structure: bool | None = None,
        acceleration: AccelerationConfig | None = None,
        cache_namespace: str | None = None,
        cache_ttl_secs: int | None = None,
        email: EmailConfig | None = None,
        concurrency: str | None = None,
        max_archive_depth: int | None = None,
        structured_extraction: StructuredExtractionConfig | None = None,
        cancel_token: str | None = None,
    ) -> None: ...

    def needs_image_processing(self) -> bool: ...

    @staticmethod
    def default() -> ExtractionConfig: ...
|
||
|
|
|
||
|
|
class FileExtractionConfig:
    """Type stub for ``FileExtractionConfig``.

    Every declared attribute is optional (``... | None``) and every
    constructor argument defaults to ``None``. The ``...`` body is a
    placeholder, so this block defines no runtime behavior.
    """

    enable_quality_processing: bool | None
    ocr: OcrConfig | None
    force_ocr: bool | None
    force_ocr_pages: list[int] | None
    disable_ocr: bool | None
    chunking: ChunkingConfig | None
    content_filter: ContentFilterConfig | None
    images: ImageExtractionConfig | None
    pdf_options: PdfConfig | None
    token_reduction: TokenReductionOptions | None
    language_detection: LanguageDetectionConfig | None
    pages: PageConfig | None
    keywords: KeywordConfig | None
    postprocessor: PostProcessorConfig | None
    html_options: str | None
    result_format: ResultFormat | None
    output_format: OutputFormat | None
    include_document_structure: bool | None
    layout: LayoutDetectionConfig | None
    timeout_secs: int | None
    tree_sitter: TreeSitterConfig | None
    structured_extraction: StructuredExtractionConfig | None

    # NOTE(review): the attributes pdf_options, keywords, html_options,
    # layout and tree_sitter have no matching constructor parameter. The
    # signature is kept exactly as generated; confirm against the native
    # binding whether that omission is intentional.
    def __init__(
        self,
        enable_quality_processing: bool | None = None,
        ocr: OcrConfig | None = None,
        force_ocr: bool | None = None,
        force_ocr_pages: list[int] | None = None,
        disable_ocr: bool | None = None,
        chunking: ChunkingConfig | None = None,
        content_filter: ContentFilterConfig | None = None,
        images: ImageExtractionConfig | None = None,
        token_reduction: TokenReductionOptions | None = None,
        language_detection: LanguageDetectionConfig | None = None,
        pages: PageConfig | None = None,
        postprocessor: PostProcessorConfig | None = None,
        result_format: ResultFormat | str | None = None,
        output_format: OutputFormat | None = None,
        include_document_structure: bool | None = None,
        timeout_secs: int | None = None,
        structured_extraction: StructuredExtractionConfig | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class BatchBytesItem:
    """Type stub for ``BatchBytesItem``.

    Declares ``content`` (bytes), ``mime_type`` (str) and an optional
    ``FileExtractionConfig``. The constructor requires ``content`` and
    ``mime_type``; ``config`` defaults to ``None``.
    """

    content: bytes
    mime_type: str
    config: FileExtractionConfig | None

    def __init__(
        self,
        content: bytes,
        mime_type: str,
        config: FileExtractionConfig | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class BatchFileItem:
    """Type stub for ``BatchFileItem``.

    Declares ``path`` (str) and an optional ``FileExtractionConfig``. The
    constructor requires ``path``; ``config`` defaults to ``None``.
    """

    path: str
    config: FileExtractionConfig | None

    def __init__(self, path: str, config: FileExtractionConfig | None = None) -> None: ...
|
||
|
|
|
||
|
|
class ImageExtractionConfig:
    """Type stub for ``ImageExtractionConfig``.

    Declares the typed attributes below, a constructor in which every
    argument is optional, and a static ``default()`` factory returning an
    ``ImageExtractionConfig``.
    """

    extract_images: bool
    target_dpi: int
    max_image_dimension: int
    inject_placeholders: bool
    auto_adjust_dpi: bool
    min_dpi: int
    max_dpi: int
    max_images_per_page: int | None
    classify: bool
    include_page_rasters: bool
    run_ocr_on_images: bool
    ocr_text_only: bool
    append_ocr_text: bool

    # NOTE(review): most attributes are typed non-optional while the
    # constructor accepts None — presumably None selects a built-in default;
    # confirm against the native binding.
    def __init__(
        self,
        extract_images: bool | None = None,
        target_dpi: int | None = None,
        max_image_dimension: int | None = None,
        inject_placeholders: bool | None = None,
        auto_adjust_dpi: bool | None = None,
        min_dpi: int | None = None,
        max_dpi: int | None = None,
        max_images_per_page: int | None = None,
        classify: bool | None = None,
        include_page_rasters: bool | None = None,
        run_ocr_on_images: bool | None = None,
        ocr_text_only: bool | None = None,
        append_ocr_text: bool | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> ImageExtractionConfig: ...
|
||
|
|
|
||
|
|
class TokenReductionOptions:
    """Type stub for ``TokenReductionOptions``.

    Declares ``mode`` (str) and ``preserve_important_words`` (bool), a
    constructor in which both are optional, and a static ``default()``
    factory returning a ``TokenReductionOptions``.
    """

    mode: str
    preserve_important_words: bool

    # NOTE(review): attributes are typed non-optional while the constructor
    # accepts None — presumably None selects a built-in default; confirm
    # against the native binding.
    def __init__(
        self,
        mode: str | None = None,
        preserve_important_words: bool | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> TokenReductionOptions: ...
|
||
|
|
|
||
|
|
class LanguageDetectionConfig:
    """Type stub for ``LanguageDetectionConfig``.

    Declares ``enabled``, ``min_confidence`` and ``detect_multiple``, a
    constructor in which each is optional, and a static ``default()``
    factory returning a ``LanguageDetectionConfig``.
    """

    enabled: bool
    min_confidence: float
    detect_multiple: bool

    # NOTE(review): attributes are typed non-optional while the constructor
    # accepts None — presumably None selects a built-in default; confirm
    # against the native binding.
    def __init__(
        self,
        enabled: bool | None = None,
        min_confidence: float | None = None,
        detect_multiple: bool | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> LanguageDetectionConfig: ...
|
||
|
|
|
||
|
|
class HtmlOutputConfig:
    """Type stub for ``HtmlOutputConfig``.

    Declares the typed attributes below, a constructor in which every
    argument is optional (``theme`` may be an ``HtmlTheme`` or a plain
    string), and a static ``default()`` factory.
    """

    css: str | None
    css_file: str | None
    theme: HtmlTheme
    class_prefix: str
    embed_css: bool

    def __init__(
        self,
        css: str | None = None,
        css_file: str | None = None,
        theme: HtmlTheme | str | None = None,
        class_prefix: str | None = None,
        embed_css: bool | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> HtmlOutputConfig: ...
|
||
|
|
|
||
|
|
class LayoutDetectionConfig:
    """Type stub for ``LayoutDetectionConfig``.

    Declares the typed attributes below, a constructor in which every
    argument is optional (``table_model`` may be a ``TableModel`` or a plain
    string), and a static ``default()`` factory.
    """

    confidence_threshold: float | None
    apply_heuristics: bool
    table_model: TableModel
    acceleration: AccelerationConfig | None

    def __init__(
        self,
        confidence_threshold: float | None = None,
        apply_heuristics: bool | None = None,
        table_model: TableModel | str | None = None,
        acceleration: AccelerationConfig | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> LayoutDetectionConfig: ...
|
||
|
|
|
||
|
|
class LlmConfig:
|
||
|
|
model: str
|
||
|
|
api_key: str | None
|
||
|
|
base_url: str | None
|
||
|
|
timeout_secs: int | None
|
||
|
|
max_retries: int | None
|
||
|
|
temperature: float | None
|
||
|
|
max_tokens: int | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
model: str | None = None,
|
||
|
|
api_key: str | None = None,
|
||
|
|
base_url: str | None = None,
|
||
|
|
timeout_secs: int | None = None,
|
||
|
|
max_retries: int | None = None,
|
||
|
|
temperature: float | None = None,
|
||
|
|
max_tokens: int | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class StructuredExtractionConfig:
    """Type stub for ``StructuredExtractionConfig``.

    The constructor requires ``schema``, ``schema_name``, ``strict`` and
    ``llm``; ``schema_description`` and ``prompt`` default to ``None``.
    Required parameters come first, so the parameter order differs from the
    attribute declaration order.
    """

    schema: dict[str, Any]
    schema_name: str
    schema_description: str | None
    strict: bool
    prompt: str | None
    llm: LlmConfig

    def __init__(
        self,
        schema: dict[str, Any],
        schema_name: str,
        strict: bool,
        llm: LlmConfig,
        schema_description: str | None = None,
        prompt: str | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class OcrQualityThresholds:
    """Type stub for ``OcrQualityThresholds``.

    Declares sixteen numeric attributes, a constructor in which each of them
    is optional, and a static ``default()`` factory returning an
    ``OcrQualityThresholds``.
    """

    min_total_non_whitespace: int
    min_non_whitespace_per_page: float
    min_meaningful_word_len: int
    min_meaningful_words: int
    min_alnum_ratio: float
    min_garbage_chars: int
    max_fragmented_word_ratio: float
    critical_fragmented_word_ratio: float
    min_avg_word_length: float
    min_words_for_avg_length_check: int
    min_consecutive_repeat_ratio: float
    min_words_for_repeat_check: int
    substantive_min_chars: int
    non_text_min_chars: int
    alnum_ws_ratio_threshold: float
    pipeline_min_quality: float

    # NOTE(review): attributes are typed non-optional while the constructor
    # accepts None — presumably None selects a built-in default; confirm
    # against the native binding.
    def __init__(
        self,
        min_total_non_whitespace: int | None = None,
        min_non_whitespace_per_page: float | None = None,
        min_meaningful_word_len: int | None = None,
        min_meaningful_words: int | None = None,
        min_alnum_ratio: float | None = None,
        min_garbage_chars: int | None = None,
        max_fragmented_word_ratio: float | None = None,
        critical_fragmented_word_ratio: float | None = None,
        min_avg_word_length: float | None = None,
        min_words_for_avg_length_check: int | None = None,
        min_consecutive_repeat_ratio: float | None = None,
        min_words_for_repeat_check: int | None = None,
        substantive_min_chars: int | None = None,
        non_text_min_chars: int | None = None,
        alnum_ws_ratio_threshold: float | None = None,
        pipeline_min_quality: float | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> OcrQualityThresholds: ...
|
||
|
|
|
||
|
|
class OcrPipelineStage:
    """Type stub for ``OcrPipelineStage``.

    The constructor requires ``backend`` (str) and ``priority`` (int); the
    remaining five arguments are optional and default to ``None``.
    """

    backend: str
    priority: int
    language: str | None
    tesseract_config: TesseractConfig | None
    paddle_ocr_config: dict[str, Any] | None
    vlm_config: LlmConfig | None
    backend_options: dict[str, Any] | None

    def __init__(
        self,
        backend: str,
        priority: int,
        language: str | None = None,
        tesseract_config: TesseractConfig | None = None,
        paddle_ocr_config: dict[str, Any] | None = None,
        vlm_config: LlmConfig | None = None,
        backend_options: dict[str, Any] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class OcrPipelineConfig:
    """Type stub for ``OcrPipelineConfig``.

    Declares ``stages`` (a list of ``OcrPipelineStage``) and
    ``quality_thresholds`` (an ``OcrQualityThresholds``); the constructor
    requires both.
    """

    stages: list[OcrPipelineStage]
    quality_thresholds: OcrQualityThresholds

    def __init__(
        self,
        stages: list[OcrPipelineStage],
        quality_thresholds: OcrQualityThresholds,
    ) -> None: ...
|
||
|
|
|
||
|
|
class OcrConfig:
    """Type stub for ``OcrConfig``.

    Declares the typed attributes below, a constructor in which every
    argument is optional, and a static ``default()`` factory returning an
    ``OcrConfig``.
    """

    enabled: bool
    backend: str
    language: str
    tesseract_config: TesseractConfig | None
    output_format: OutputFormat | None
    paddle_ocr_config: dict[str, Any] | None
    backend_options: dict[str, Any] | None
    element_config: OcrElementConfig | None
    quality_thresholds: OcrQualityThresholds | None
    pipeline: OcrPipelineConfig | None
    auto_rotate: bool
    vlm_config: LlmConfig | None
    vlm_prompt: str | None
    acceleration: AccelerationConfig | None
    tessdata_bytes: dict[str, bytes] | None

    # NOTE(review): enabled/backend/language/auto_rotate are typed
    # non-optional while the constructor accepts None — presumably None
    # selects a built-in default; confirm against the native binding.
    def __init__(
        self,
        enabled: bool | None = None,
        backend: str | None = None,
        language: str | None = None,
        tesseract_config: TesseractConfig | None = None,
        output_format: OutputFormat | None = None,
        paddle_ocr_config: dict[str, Any] | None = None,
        backend_options: dict[str, Any] | None = None,
        element_config: OcrElementConfig | None = None,
        quality_thresholds: OcrQualityThresholds | None = None,
        pipeline: OcrPipelineConfig | None = None,
        auto_rotate: bool | None = None,
        vlm_config: LlmConfig | None = None,
        vlm_prompt: str | None = None,
        acceleration: AccelerationConfig | None = None,
        tessdata_bytes: dict[str, bytes] | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> OcrConfig: ...
|
||
|
|
|
||
|
|
class PageConfig:
    """Type stub for ``PageConfig``.

    Declares ``extract_pages``, ``insert_page_markers`` and
    ``marker_format``, a constructor in which each is optional, and a static
    ``default()`` factory returning a ``PageConfig``.
    """

    extract_pages: bool
    insert_page_markers: bool
    marker_format: str

    # NOTE(review): attributes are typed non-optional while the constructor
    # accepts None — presumably None selects a built-in default; confirm
    # against the native binding.
    def __init__(
        self,
        extract_pages: bool | None = None,
        insert_page_markers: bool | None = None,
        marker_format: str | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> PageConfig: ...
|
||
|
|
|
||
|
|
class PdfConfig:
    """Type stub for ``PdfConfig``.

    Declares the typed attributes below, a constructor in which every
    argument is optional, and a static ``default()`` factory returning a
    ``PdfConfig``.
    """

    extract_images: bool
    extract_tables: bool
    passwords: list[str] | None
    extract_metadata: bool
    hierarchy: HierarchyConfig | None
    extract_annotations: bool
    top_margin_fraction: float | None
    bottom_margin_fraction: float | None
    allow_single_column_tables: bool
    ocr_inline_images: bool

    # NOTE(review): the boolean attributes are typed non-optional while the
    # constructor accepts None — presumably None selects a built-in default;
    # confirm against the native binding.
    def __init__(
        self,
        extract_images: bool | None = None,
        extract_tables: bool | None = None,
        passwords: list[str] | None = None,
        extract_metadata: bool | None = None,
        hierarchy: HierarchyConfig | None = None,
        extract_annotations: bool | None = None,
        top_margin_fraction: float | None = None,
        bottom_margin_fraction: float | None = None,
        allow_single_column_tables: bool | None = None,
        ocr_inline_images: bool | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> PdfConfig: ...
|
||
|
|
|
||
|
|
class HierarchyConfig:
    """Type stub for ``HierarchyConfig``.

    Declares the typed attributes below, a constructor in which every
    argument is optional, and a static ``default()`` factory returning a
    ``HierarchyConfig``.
    """

    enabled: bool
    k_clusters: int
    include_bbox: bool
    ocr_coverage_threshold: float | None

    # NOTE(review): enabled/k_clusters/include_bbox are typed non-optional
    # while the constructor accepts None — presumably None selects a
    # built-in default; confirm against the native binding.
    def __init__(
        self,
        enabled: bool | None = None,
        k_clusters: int | None = None,
        include_bbox: bool | None = None,
        ocr_coverage_threshold: float | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> HierarchyConfig: ...
|
||
|
|
|
||
|
|
class PostProcessorConfig:
    """Type stub for ``PostProcessorConfig``.

    Declares ``enabled`` plus four optional string-list attributes, a
    constructor in which every argument is optional, and a static
    ``default()`` factory returning a ``PostProcessorConfig``.
    """

    enabled: bool
    enabled_processors: list[str] | None
    disabled_processors: list[str] | None
    enabled_set: list[str] | None
    disabled_set: list[str] | None

    def __init__(
        self,
        enabled: bool | None = None,
        enabled_processors: list[str] | None = None,
        disabled_processors: list[str] | None = None,
        enabled_set: list[str] | None = None,
        disabled_set: list[str] | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> PostProcessorConfig: ...
|
||
|
|
|
||
|
|
class ChunkingConfig:
    """Type stub for ``ChunkingConfig``.

    Declares the typed attributes below, a constructor in which every
    argument is optional (``chunker_type`` may be a ``ChunkerType`` or a
    plain string), and a static ``default()`` factory.
    """

    max_characters: int
    overlap: int
    trim: bool
    chunker_type: ChunkerType
    embedding: EmbeddingConfig | None
    preset: str | None
    sizing: ChunkSizing
    prepend_heading_context: bool
    topic_threshold: float | None

    def __init__(
        self,
        max_characters: int | None = None,
        overlap: int | None = None,
        trim: bool | None = None,
        chunker_type: ChunkerType | str | None = None,
        embedding: EmbeddingConfig | None = None,
        preset: str | None = None,
        sizing: ChunkSizing | None = None,
        prepend_heading_context: bool | None = None,
        topic_threshold: float | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> ChunkingConfig: ...
|
||
|
|
|
||
|
|
class EmbeddingConfig:
    """Type stub for ``EmbeddingConfig``.

    Declares the typed attributes below, a constructor in which every
    argument is optional, and a static ``default()`` factory returning an
    ``EmbeddingConfig``.
    """

    model: EmbeddingModelType
    normalize: bool
    batch_size: int
    show_download_progress: bool
    cache_dir: str | None
    acceleration: AccelerationConfig | None
    max_embed_duration_secs: int | None

    def __init__(
        self,
        model: EmbeddingModelType | None = None,
        normalize: bool | None = None,
        batch_size: int | None = None,
        show_download_progress: bool | None = None,
        cache_dir: str | None = None,
        acceleration: AccelerationConfig | None = None,
        max_embed_duration_secs: int | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> EmbeddingConfig: ...
|
||
|
|
|
||
|
|
class TreeSitterConfig:
    """Type stub for ``TreeSitterConfig``.

    Declares the typed attributes below, a constructor in which every
    argument is optional, and a static ``default()`` factory returning a
    ``TreeSitterConfig``.
    """

    enabled: bool
    cache_dir: str | None
    languages: list[str] | None
    groups: list[str] | None
    process: TreeSitterProcessConfig

    def __init__(
        self,
        enabled: bool | None = None,
        cache_dir: str | None = None,
        languages: list[str] | None = None,
        groups: list[str] | None = None,
        process: TreeSitterProcessConfig | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> TreeSitterConfig: ...
|
||
|
|
|
||
|
|
class TreeSitterProcessConfig:
    """Type stub for ``TreeSitterProcessConfig``.

    Declares seven boolean attributes plus ``chunk_max_size`` and
    ``content_mode``, a constructor in which every argument is optional
    (``content_mode`` may be a ``CodeContentMode`` or a plain string), and a
    static ``default()`` factory.
    """

    structure: bool
    imports: bool
    exports: bool
    comments: bool
    docstrings: bool
    symbols: bool
    diagnostics: bool
    chunk_max_size: int | None
    content_mode: CodeContentMode

    def __init__(
        self,
        structure: bool | None = None,
        imports: bool | None = None,
        exports: bool | None = None,
        comments: bool | None = None,
        docstrings: bool | None = None,
        symbols: bool | None = None,
        diagnostics: bool | None = None,
        chunk_max_size: int | None = None,
        content_mode: CodeContentMode | str | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> TreeSitterProcessConfig: ...
|
||
|
|
|
||
|
|
class SupportedFormat:
    """Type stub for ``SupportedFormat``.

    Declares two string attributes, ``extension`` and ``mime_type``; the
    constructor requires both.
    """

    extension: str
    mime_type: str

    def __init__(self, extension: str, mime_type: str) -> None: ...
|
||
|
|
|
||
|
|
class ServerConfig:
    """Type stub for ``ServerConfig``.

    Declares the typed attributes below, a constructor in which every
    argument is optional, five instance methods and a static ``default()``
    factory. Bodies are ``...`` placeholders; only the signatures are
    defined in this block.
    """

    host: str
    port: int
    cors_origins: list[str]
    max_request_body_bytes: int
    max_multipart_field_bytes: int

    # NOTE(review): attributes are typed non-optional while the constructor
    # accepts None — presumably None selects a built-in default; confirm
    # against the native binding.
    def __init__(
        self,
        host: str | None = None,
        port: int | None = None,
        cors_origins: list[str] | None = None,
        max_request_body_bytes: int | None = None,
        max_multipart_field_bytes: int | None = None,
    ) -> None: ...

    def listen_addr(self) -> str: ...

    def cors_allows_all(self) -> bool: ...

    def is_origin_allowed(self, origin: str) -> bool: ...

    def max_request_body_mb(self) -> int: ...

    def max_multipart_field_mb(self) -> int: ...

    @staticmethod
    def default() -> ServerConfig: ...
|
||
|
|
|
||
|
|
class StructuredDataResult:
    """Type stub for ``StructuredDataResult``.

    Declares ``content``, ``format``, ``metadata`` and ``text_fields``; the
    constructor requires all four. The ``format`` parameter shadows the
    builtin of the same name, hence the ``noqa: A002`` marker.
    """

    content: str
    format: str
    metadata: dict[str, str]
    text_fields: list[str]

    def __init__(
        self,
        content: str,
        format: str,  # noqa: A002
        metadata: dict[str, str],
        text_fields: list[str],
    ) -> None: ...
|
||
|
|
|
||
|
|
class DocxAppProperties:
|
||
|
|
application: str | None
|
||
|
|
app_version: str | None
|
||
|
|
template: str | None
|
||
|
|
total_time: int | None
|
||
|
|
pages: int | None
|
||
|
|
words: int | None
|
||
|
|
characters: int | None
|
||
|
|
characters_with_spaces: int | None
|
||
|
|
lines: int | None
|
||
|
|
paragraphs: int | None
|
||
|
|
company: str | None
|
||
|
|
doc_security: int | None
|
||
|
|
scale_crop: bool | None
|
||
|
|
links_up_to_date: bool | None
|
||
|
|
shared_doc: bool | None
|
||
|
|
hyperlinks_changed: bool | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
application: str | None = None,
|
||
|
|
app_version: str | None = None,
|
||
|
|
template: str | None = None,
|
||
|
|
total_time: int | None = None,
|
||
|
|
pages: int | None = None,
|
||
|
|
words: int | None = None,
|
||
|
|
characters: int | None = None,
|
||
|
|
characters_with_spaces: int | None = None,
|
||
|
|
lines: int | None = None,
|
||
|
|
paragraphs: int | None = None,
|
||
|
|
company: str | None = None,
|
||
|
|
doc_security: int | None = None,
|
||
|
|
scale_crop: bool | None = None,
|
||
|
|
links_up_to_date: bool | None = None,
|
||
|
|
shared_doc: bool | None = None,
|
||
|
|
hyperlinks_changed: bool | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class XlsxAppProperties:
|
||
|
|
application: str | None
|
||
|
|
app_version: str | None
|
||
|
|
doc_security: int | None
|
||
|
|
scale_crop: bool | None
|
||
|
|
links_up_to_date: bool | None
|
||
|
|
shared_doc: bool | None
|
||
|
|
hyperlinks_changed: bool | None
|
||
|
|
company: str | None
|
||
|
|
worksheet_names: list[str]
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
application: str | None = None,
|
||
|
|
app_version: str | None = None,
|
||
|
|
doc_security: int | None = None,
|
||
|
|
scale_crop: bool | None = None,
|
||
|
|
links_up_to_date: bool | None = None,
|
||
|
|
shared_doc: bool | None = None,
|
||
|
|
hyperlinks_changed: bool | None = None,
|
||
|
|
company: str | None = None,
|
||
|
|
worksheet_names: list[str] | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class PptxAppProperties:
|
||
|
|
application: str | None
|
||
|
|
app_version: str | None
|
||
|
|
total_time: int | None
|
||
|
|
company: str | None
|
||
|
|
doc_security: int | None
|
||
|
|
scale_crop: bool | None
|
||
|
|
links_up_to_date: bool | None
|
||
|
|
shared_doc: bool | None
|
||
|
|
hyperlinks_changed: bool | None
|
||
|
|
slides: int | None
|
||
|
|
notes: int | None
|
||
|
|
hidden_slides: int | None
|
||
|
|
multimedia_clips: int | None
|
||
|
|
presentation_format: str | None
|
||
|
|
slide_titles: list[str]
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
application: str | None = None,
|
||
|
|
app_version: str | None = None,
|
||
|
|
total_time: int | None = None,
|
||
|
|
company: str | None = None,
|
||
|
|
doc_security: int | None = None,
|
||
|
|
scale_crop: bool | None = None,
|
||
|
|
links_up_to_date: bool | None = None,
|
||
|
|
shared_doc: bool | None = None,
|
||
|
|
hyperlinks_changed: bool | None = None,
|
||
|
|
slides: int | None = None,
|
||
|
|
notes: int | None = None,
|
||
|
|
hidden_slides: int | None = None,
|
||
|
|
multimedia_clips: int | None = None,
|
||
|
|
presentation_format: str | None = None,
|
||
|
|
slide_titles: list[str] | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class CoreProperties:
|
||
|
|
title: str | None
|
||
|
|
subject: str | None
|
||
|
|
creator: str | None
|
||
|
|
keywords: str | None
|
||
|
|
description: str | None
|
||
|
|
last_modified_by: str | None
|
||
|
|
revision: str | None
|
||
|
|
created: str | None
|
||
|
|
modified: str | None
|
||
|
|
category: str | None
|
||
|
|
content_status: str | None
|
||
|
|
language: str | None
|
||
|
|
identifier: str | None
|
||
|
|
version: str | None
|
||
|
|
last_printed: str | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
title: str | None = None,
|
||
|
|
subject: str | None = None,
|
||
|
|
creator: str | None = None,
|
||
|
|
keywords: str | None = None,
|
||
|
|
description: str | None = None,
|
||
|
|
last_modified_by: str | None = None,
|
||
|
|
revision: str | None = None,
|
||
|
|
created: str | None = None,
|
||
|
|
modified: str | None = None,
|
||
|
|
category: str | None = None,
|
||
|
|
content_status: str | None = None,
|
||
|
|
language: str | None = None,
|
||
|
|
identifier: str | None = None,
|
||
|
|
version: str | None = None,
|
||
|
|
last_printed: str | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class SecurityLimits:
    """Type stub for ``SecurityLimits``.

    Declares nine integer attributes, a constructor in which each of them is
    optional, and a static ``default()`` factory returning a
    ``SecurityLimits``.
    """

    max_archive_size: int
    max_compression_ratio: int
    max_files_in_archive: int
    max_nesting_depth: int
    max_entity_length: int
    max_content_size: int
    max_iterations: int
    max_xml_depth: int
    max_table_cells: int

    # NOTE(review): attributes are typed ``int`` while the constructor
    # accepts None — presumably None selects a built-in default; confirm
    # against the native binding.
    def __init__(
        self,
        max_archive_size: int | None = None,
        max_compression_ratio: int | None = None,
        max_files_in_archive: int | None = None,
        max_nesting_depth: int | None = None,
        max_entity_length: int | None = None,
        max_content_size: int | None = None,
        max_iterations: int | None = None,
        max_xml_depth: int | None = None,
        max_table_cells: int | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> SecurityLimits: ...
|
||
|
|
|
||
|
|
class TokenReductionConfig:
    """Type stub for ``TokenReductionConfig``.

    Declares the typed attributes below, a constructor in which every
    argument is optional (``level`` may be a ``ReductionLevel`` or a plain
    string), and a static ``default()`` factory.
    """

    level: ReductionLevel
    language_hint: str | None
    preserve_markdown: bool
    preserve_code: bool
    semantic_threshold: float
    enable_parallel: bool
    use_simd: bool
    custom_stopwords: dict[str, list[str]] | None
    preserve_patterns: list[str]
    target_reduction: float | None
    enable_semantic_clustering: bool

    def __init__(
        self,
        level: ReductionLevel | str | None = None,
        language_hint: str | None = None,
        preserve_markdown: bool | None = None,
        preserve_code: bool | None = None,
        semantic_threshold: float | None = None,
        enable_parallel: bool | None = None,
        use_simd: bool | None = None,
        custom_stopwords: dict[str, list[str]] | None = None,
        preserve_patterns: list[str] | None = None,
        target_reduction: float | None = None,
        enable_semantic_clustering: bool | None = None,
    ) -> None: ...

    @staticmethod
    def default() -> TokenReductionConfig: ...
|
||
|
|
|
||
|
|
class PdfAnnotation:
    """Type stub for ``PdfAnnotation``.

    The constructor requires ``annotation_type`` (a ``PdfAnnotationType`` or
    a plain string) and ``page_number``; ``content`` and ``bounding_box``
    default to ``None``. Required parameters come first, so the parameter
    order differs from the attribute declaration order.
    """

    annotation_type: PdfAnnotationType
    content: str | None
    page_number: int
    bounding_box: BoundingBox | None

    def __init__(
        self,
        annotation_type: PdfAnnotationType | str,
        page_number: int,
        content: str | None = None,
        bounding_box: BoundingBox | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class DjotContent:
    """Type stub for ``DjotContent``.

    Declares eight attributes and a constructor that requires all of them,
    in declaration order. The ``...`` body is a placeholder, so this block
    defines no runtime behavior.
    """

    plain_text: str
    blocks: list[FormattedBlock]
    metadata: Metadata
    tables: list[Table]
    images: list[DjotImage]
    links: list[DjotLink]
    footnotes: list[Footnote]
    attributes: list[str]

    def __init__(
        self,
        plain_text: str,
        blocks: list[FormattedBlock],
        metadata: Metadata,
        tables: list[Table],
        images: list[DjotImage],
        links: list[DjotLink],
        footnotes: list[Footnote],
        attributes: list[str],
    ) -> None: ...
|
||
|
|
|
||
|
|
class FormattedBlock:
    """Type stub for ``FormattedBlock``.

    The constructor requires ``block_type`` (a ``BlockType`` or a plain
    string), ``inline_content`` and ``children``; ``level``, ``attributes``,
    ``language`` and ``code`` default to ``None``. ``children`` is a list of
    ``FormattedBlock``, so the type is recursive.
    """

    block_type: BlockType
    level: int | None
    inline_content: list[InlineElement]
    attributes: str | None
    language: str | None
    code: str | None
    children: list[FormattedBlock]

    def __init__(
        self,
        block_type: BlockType | str,
        inline_content: list[InlineElement],
        children: list[FormattedBlock],
        level: int | None = None,
        attributes: str | None = None,
        language: str | None = None,
        code: str | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class InlineElement:
    """Type stub for ``InlineElement``.

    The constructor requires ``element_type`` (an ``InlineType`` or a plain
    string) and ``content``; ``attributes`` and ``metadata`` default to
    ``None``.
    """

    element_type: InlineType
    content: str
    attributes: str | None
    metadata: dict[str, str] | None

    def __init__(
        self,
        element_type: InlineType | str,
        content: str,
        attributes: str | None = None,
        metadata: dict[str, str] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class DjotImage:
|
||
|
|
src: str
|
||
|
|
alt: str
|
||
|
|
title: str | None
|
||
|
|
attributes: str | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
src: str,
|
||
|
|
alt: str,
|
||
|
|
title: str | None = None,
|
||
|
|
attributes: str | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class DjotLink:
|
||
|
|
url: str
|
||
|
|
text: str
|
||
|
|
title: str | None
|
||
|
|
attributes: str | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
url: str,
|
||
|
|
text: str,
|
||
|
|
title: str | None = None,
|
||
|
|
attributes: str | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class Footnote:
    """Type stub for ``Footnote``.

    Declares ``label`` (str) and ``content`` (a list of ``FormattedBlock``);
    the constructor requires both.
    """

    label: str
    content: list[FormattedBlock]

    def __init__(self, label: str, content: list[FormattedBlock]) -> None: ...
|
||
|
|
|
||
|
|
class DocumentStructure:
    """Type stub for ``DocumentStructure``.

    Declares the typed attributes below, a constructor in which every
    argument is optional, the methods ``finalize_node_types()`` (returns
    ``None``) and ``is_empty()`` (returns ``bool``), and a static
    ``default()`` factory. Bodies are ``...`` placeholders.
    """

    nodes: list[DocumentNode]
    source_format: str | None
    relationships: list[DocumentRelationship]
    node_types: list[str]

    # NOTE(review): the list attributes are typed non-optional while the
    # constructor accepts None — presumably None becomes an empty list;
    # confirm against the native binding.
    def __init__(
        self,
        nodes: list[DocumentNode] | None = None,
        source_format: str | None = None,
        relationships: list[DocumentRelationship] | None = None,
        node_types: list[str] | None = None,
    ) -> None: ...

    def finalize_node_types(self) -> None: ...

    def is_empty(self) -> bool: ...

    @staticmethod
    def default() -> DocumentStructure: ...
|
||
|
|
|
||
|
|
class DocumentRelationship:
    """Stub for a relationship between two integer-identified endpoints.

    ``source`` and ``target`` are ints; ``kind`` may be given as the enum or a
    plain string.
    """

    source: int
    target: int
    kind: RelationshipKind
    def __init__(self, source: int, target: int, kind: RelationshipKind | str) -> None: ...
|
||
|
|
|
||
|
|
class DocumentNode:
    """Stub for a single node of a document structure.

    ``id``, ``content``, ``children``, ``content_layer`` and ``annotations``
    are required; ``parent``, ``page``, ``page_end``, ``bbox`` and
    ``attributes`` are optional and default to ``None``.
    """

    id: str
    content: NodeContent
    parent: int | None
    children: list[int]
    content_layer: ContentLayer
    page: int | None
    page_end: int | None
    bbox: BoundingBox | None
    annotations: list[TextAnnotation]
    attributes: dict[str, str] | None
    def __init__(
        self,
        id: str,  # noqa: A002
        content: NodeContent,
        children: list[int],
        content_layer: ContentLayer | str,  # enum member or plain string
        annotations: list[TextAnnotation],
        parent: int | None = None,
        page: int | None = None,
        page_end: int | None = None,
        bbox: BoundingBox | None = None,
        attributes: dict[str, str] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class TableGrid:
    """Stub for a table grid: row and column counts plus a list of cells.

    Every constructor argument is optional and defaults to ``None``.
    """

    rows: int
    cols: int
    cells: list[GridCell]
    def __init__(
        self,
        rows: int | None = None,
        cols: int | None = None,
        cells: list[GridCell] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class GridCell:
    """Stub for one table grid cell with position, span and header flag.

    All fields except ``bbox`` are required by the constructor; ``bbox``
    defaults to ``None``.
    """

    content: str
    row: int
    col: int
    row_span: int
    col_span: int
    is_header: bool
    bbox: BoundingBox | None
    def __init__(
        self,
        content: str,
        row: int,
        col: int,
        row_span: int,
        col_span: int,
        is_header: bool,
        bbox: BoundingBox | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class TextAnnotation:
    """Stub for an annotation over a ``start``..``end`` integer range."""

    start: int
    end: int
    kind: AnnotationKind
    # NOTE(review): unlike other enum-typed parameters in this file, ``kind``
    # does not accept ``str`` here — confirm that is intentional.
    def __init__(self, start: int, end: int, kind: AnnotationKind) -> None: ...
|
||
|
|
|
||
|
|
class ExtractionResult:
    """Stub for the result of an extraction: text content, MIME type and metadata,
    plus optional tables, chunks, images, pages and other per-feature outputs.

    Every constructor argument is optional and defaults to ``None``.
    """

    content: str
    mime_type: str
    metadata: Metadata
    extraction_method: ExtractionMethod | None
    tables: list[Table]
    detected_languages: list[str] | None
    chunks: list[Chunk] | None
    images: list[ExtractedImage] | None
    pages: list[PageContent] | None
    elements: list[Element] | None
    djot_content: DjotContent | None
    ocr_elements: list[OcrElement] | None
    document: DocumentStructure | None
    extracted_keywords: list[Keyword] | None
    quality_score: float | None
    processing_warnings: list[ProcessingWarning]
    annotations: list[PdfAnnotation] | None
    children: list[ArchiveEntry] | None
    uris: list[ExtractedUri] | None
    revisions: list[DocumentRevision] | None
    structured_output: dict[str, Any] | None
    code_intelligence: dict[str, Any] | None
    llm_usage: list[LlmUsage] | None
    formatted_content: str | None
    ocr_internal_document: str | None
    # NOTE(review): ``extracted_keywords`` and ``code_intelligence`` are declared
    # as attributes above but are not constructor parameters — confirm this
    # omission is intentional in the generator.
    def __init__(
        self,
        content: str | None = None,
        mime_type: str | None = None,
        metadata: Metadata | None = None,
        extraction_method: ExtractionMethod | str | None = None,
        tables: list[Table] | None = None,
        detected_languages: list[str] | None = None,
        chunks: list[Chunk] | None = None,
        images: list[ExtractedImage] | None = None,
        pages: list[PageContent] | None = None,
        elements: list[Element] | None = None,
        djot_content: DjotContent | None = None,
        ocr_elements: list[OcrElement] | None = None,
        document: DocumentStructure | None = None,
        quality_score: float | None = None,
        processing_warnings: list[ProcessingWarning] | None = None,
        annotations: list[PdfAnnotation] | None = None,
        children: list[ArchiveEntry] | None = None,
        uris: list[ExtractedUri] | None = None,
        revisions: list[DocumentRevision] | None = None,
        structured_output: dict[str, Any] | None = None,
        llm_usage: list[LlmUsage] | None = None,
        formatted_content: str | None = None,
        ocr_internal_document: str | None = None,
    ) -> None: ...
    # Alternate constructor: builds a result from an OCR extraction result.
    @staticmethod
    def from_ocr(ocr: OcrExtractionResult) -> ExtractionResult: ...
|
||
|
|
|
||
|
|
class ArchiveEntry:
    """Stub for an archive entry: a path, a MIME type and its extraction result."""

    path: str
    mime_type: str
    result: ExtractionResult
    def __init__(self, path: str, mime_type: str, result: ExtractionResult) -> None: ...
|
||
|
|
|
||
|
|
class ProcessingWarning:
    """Stub for a processing warning: a ``source`` string and a ``message`` string."""

    source: str
    message: str
    def __init__(self, source: str, message: str) -> None: ...
|
||
|
|
|
||
|
|
class LlmUsage:
|
||
|
|
model: str
|
||
|
|
source: str
|
||
|
|
input_tokens: int | None
|
||
|
|
output_tokens: int | None
|
||
|
|
total_tokens: int | None
|
||
|
|
estimated_cost: float | None
|
||
|
|
finish_reason: str | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
model: str | None = None,
|
||
|
|
source: str | None = None,
|
||
|
|
input_tokens: int | None = None,
|
||
|
|
output_tokens: int | None = None,
|
||
|
|
total_tokens: int | None = None,
|
||
|
|
estimated_cost: float | None = None,
|
||
|
|
finish_reason: str | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class Chunk:
    """Stub for a content chunk with a chunk type, metadata and optional embedding.

    ``content``, ``chunk_type`` and ``metadata`` are required; ``embedding``
    defaults to ``None``.
    """

    content: str
    chunk_type: ChunkType
    embedding: list[float] | None
    metadata: ChunkMetadata
    def __init__(
        self,
        content: str,
        chunk_type: ChunkType | str,  # enum member or plain string
        metadata: ChunkMetadata,
        embedding: list[float] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class HeadingContext:
    """Stub for a heading context: a list of ``HeadingLevel`` entries."""

    headings: list[HeadingLevel]
    def __init__(self, headings: list[HeadingLevel]) -> None: ...
|
||
|
|
|
||
|
|
class HeadingLevel:
    """Stub for a heading entry: an integer ``level`` and its ``text``."""

    level: int
    text: str
    def __init__(self, level: int, text: str) -> None: ...
|
||
|
|
|
||
|
|
class ChunkMetadata:
    """Stub for chunk metadata: byte range, chunk position, pages and image indices.

    ``byte_start``, ``byte_end``, ``chunk_index``, ``total_chunks`` and
    ``image_indices`` are required; the remaining arguments default to ``None``.
    """

    byte_start: int
    byte_end: int
    token_count: int | None
    chunk_index: int
    total_chunks: int
    first_page: int | None
    last_page: int | None
    heading_context: HeadingContext | None
    image_indices: list[int]
    def __init__(
        self,
        byte_start: int,
        byte_end: int,
        chunk_index: int,
        total_chunks: int,
        image_indices: list[int],
        token_count: int | None = None,
        first_page: int | None = None,
        last_page: int | None = None,
        heading_context: HeadingContext | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class ExtractedImage:
    """Stub for an extracted image: raw bytes, format, index and optional details.

    ``data``, ``format``, ``image_index`` and ``is_mask`` are required; every
    other constructor argument defaults to ``None``.
    """

    data: bytes
    format: str
    image_index: int
    page_number: int | None
    width: int | None
    height: int | None
    colorspace: str | None
    bits_per_component: int | None
    is_mask: bool
    description: str | None
    ocr_result: ExtractionResult | None
    bounding_box: BoundingBox | None
    source_path: str | None
    image_kind: ImageKind | None
    kind_confidence: float | None
    cluster_id: int | None
    def __init__(
        self,
        data: bytes,
        format: str,  # noqa: A002
        image_index: int,
        is_mask: bool,
        page_number: int | None = None,
        width: int | None = None,
        height: int | None = None,
        colorspace: str | None = None,
        bits_per_component: int | None = None,
        description: str | None = None,
        ocr_result: ExtractionResult | None = None,
        bounding_box: BoundingBox | None = None,
        source_path: str | None = None,
        image_kind: ImageKind | str | None = None,  # enum member or plain string
        kind_confidence: float | None = None,
        cluster_id: int | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class BoundingBox:
|
||
|
|
x0: float
|
||
|
|
y0: float
|
||
|
|
x1: float
|
||
|
|
y1: float
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
x0: float | None = None,
|
||
|
|
y0: float | None = None,
|
||
|
|
x1: float | None = None,
|
||
|
|
y1: float | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class ElementMetadata:
    """Stub for element metadata: page, filename, coordinates, index and extras.

    Only ``additional`` is required; every other argument defaults to ``None``.
    """

    page_number: int | None
    filename: str | None
    coordinates: BoundingBox | None
    element_index: int | None
    additional: dict[str, str]
    def __init__(
        self,
        additional: dict[str, str],
        page_number: int | None = None,
        filename: str | None = None,
        coordinates: BoundingBox | None = None,
        element_index: int | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class Element:
    """Stub for an element: an id, an element type, its text and its metadata.

    All four constructor arguments are required.
    """

    element_id: str
    element_type: ElementType
    text: str
    metadata: ElementMetadata
    def __init__(
        self,
        element_id: str,
        element_type: ElementType | str,  # enum member or plain string
        text: str,
        metadata: ElementMetadata,
    ) -> None: ...
|
||
|
|
|
||
|
|
class ExcelWorkbook:
    """Stub for an Excel workbook: its sheets, string metadata and optional revisions.

    ``sheets`` and ``metadata`` are required; ``revisions`` defaults to ``None``.
    """

    sheets: list[ExcelSheet]
    metadata: dict[str, str]
    revisions: list[DocumentRevision] | None
    def __init__(
        self,
        sheets: list[ExcelSheet],
        metadata: dict[str, str],
        revisions: list[DocumentRevision] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class ExcelSheet:
|
||
|
|
name: str
|
||
|
|
markdown: str
|
||
|
|
row_count: int
|
||
|
|
col_count: int
|
||
|
|
cell_count: int
|
||
|
|
table_cells: list[list[str]] | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
name: str,
|
||
|
|
markdown: str,
|
||
|
|
row_count: int,
|
||
|
|
col_count: int,
|
||
|
|
cell_count: int,
|
||
|
|
table_cells: list[list[str]] | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class XmlExtractionResult:
    """Stub for an XML extraction result: content, element count and element names."""

    content: str
    element_count: int
    unique_elements: list[str]
    def __init__(self, content: str, element_count: int, unique_elements: list[str]) -> None: ...
|
||
|
|
|
||
|
|
class TextExtractionResult:
|
||
|
|
content: str
|
||
|
|
line_count: int
|
||
|
|
word_count: int
|
||
|
|
character_count: int
|
||
|
|
headers: list[str] | None
|
||
|
|
links: list[list[str]] | None
|
||
|
|
code_blocks: list[list[str]] | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
content: str,
|
||
|
|
line_count: int,
|
||
|
|
word_count: int,
|
||
|
|
character_count: int,
|
||
|
|
headers: list[str] | None = None,
|
||
|
|
links: list[list[str]] | None = None,
|
||
|
|
code_blocks: list[list[str]] | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class PptxExtractionResult:
    """Stub for a PPTX extraction result: content, metadata, counts, images and links.

    ``page_structure``, ``page_contents``, ``document`` and ``revisions`` are
    optional and default to ``None``; the other arguments are required.
    """

    content: str
    metadata: PptxMetadata
    slide_count: int
    image_count: int
    table_count: int
    images: list[ExtractedImage]
    page_structure: PageStructure | None
    page_contents: list[PageContent] | None
    document: DocumentStructure | None
    hyperlinks: list[str]
    office_metadata: dict[str, str]
    revisions: list[DocumentRevision] | None
    def __init__(
        self,
        content: str,
        metadata: PptxMetadata,
        slide_count: int,
        image_count: int,
        table_count: int,
        images: list[ExtractedImage],
        hyperlinks: list[str],
        office_metadata: dict[str, str],
        page_structure: PageStructure | None = None,
        page_contents: list[PageContent] | None = None,
        document: DocumentStructure | None = None,
        revisions: list[DocumentRevision] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class EmailExtractionResult:
    """Stub for an email extraction result: addresses, bodies, attachments, metadata.

    The recipient lists, ``content``, ``attachments`` and ``metadata`` are
    required; ``subject``, ``from_email``, ``date``, ``message_id``,
    ``plain_text`` and ``html_content`` default to ``None``.
    """

    subject: str | None
    from_email: str | None
    to_emails: list[str]
    cc_emails: list[str]
    bcc_emails: list[str]
    date: str | None
    message_id: str | None
    plain_text: str | None
    html_content: str | None
    content: str
    attachments: list[EmailAttachment]
    metadata: dict[str, str]
    def __init__(
        self,
        to_emails: list[str],
        cc_emails: list[str],
        bcc_emails: list[str],
        content: str,
        attachments: list[EmailAttachment],
        metadata: dict[str, str],
        subject: str | None = None,
        from_email: str | None = None,
        date: str | None = None,
        message_id: str | None = None,
        plain_text: str | None = None,
        html_content: str | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class EmailAttachment:
|
||
|
|
name: str | None
|
||
|
|
filename: str | None
|
||
|
|
mime_type: str | None
|
||
|
|
size: int | None
|
||
|
|
is_image: bool
|
||
|
|
data: bytes | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
is_image: bool,
|
||
|
|
name: str | None = None,
|
||
|
|
filename: str | None = None,
|
||
|
|
mime_type: str | None = None,
|
||
|
|
size: int | None = None,
|
||
|
|
data: bytes | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class OcrExtractionResult:
    """Stub for an OCR extraction result: content, MIME type, metadata and tables.

    ``ocr_elements`` and ``internal_document`` are optional and default to
    ``None``; the other arguments are required.
    """

    content: str
    mime_type: str
    metadata: dict[str, dict[str, Any]]
    tables: list[OcrTable]
    ocr_elements: list[OcrElement] | None
    internal_document: str | None
    def __init__(
        self,
        content: str,
        mime_type: str,
        metadata: dict[str, dict[str, Any]],
        tables: list[OcrTable],
        ocr_elements: list[OcrElement] | None = None,
        internal_document: str | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class OcrTable:
    """Stub for an OCR table: cell strings, markdown, page number and optional box.

    ``bounding_box`` defaults to ``None``; the other arguments are required.
    """

    cells: list[list[str]]
    markdown: str
    page_number: int
    bounding_box: OcrTableBoundingBox | None
    def __init__(
        self,
        cells: list[list[str]],
        markdown: str,
        page_number: int,
        bounding_box: OcrTableBoundingBox | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class OcrTableBoundingBox:
    """Stub for an OCR table bounding box with integer ``left``/``top``/``right``/``bottom``."""

    left: int
    top: int
    right: int
    bottom: int
    def __init__(self, left: int, top: int, right: int, bottom: int) -> None: ...
|
||
|
|
|
||
|
|
class ImagePreprocessingConfig:
    """Stub for image preprocessing settings (DPI target and boolean toggles).

    Every constructor argument is optional and defaults to ``None``.
    """

    target_dpi: int
    auto_rotate: bool
    deskew: bool
    denoise: bool
    contrast_enhance: bool
    binarization_method: str
    invert_colors: bool
    # NOTE(review): attributes are non-optional while the parameters accept
    # None — presumably None selects a built-in default; confirm.
    def __init__(
        self,
        target_dpi: int | None = None,
        auto_rotate: bool | None = None,
        deskew: bool | None = None,
        denoise: bool | None = None,
        contrast_enhance: bool | None = None,
        binarization_method: str | None = None,
        invert_colors: bool | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> ImagePreprocessingConfig: ...
|
||
|
|
|
||
|
|
class TesseractConfig:
    """Stub for Tesseract OCR settings: language, modes, table detection and
    engine toggles.

    Every constructor argument is optional and defaults to ``None``.
    """

    language: str
    psm: int
    output_format: str
    oem: int
    min_confidence: float
    preprocessing: ImagePreprocessingConfig | None
    enable_table_detection: bool
    table_min_confidence: float
    table_column_threshold: int
    table_row_threshold_ratio: float
    use_cache: bool
    classify_use_pre_adapted_templates: bool
    language_model_ngram_on: bool
    tessedit_dont_blkrej_good_wds: bool
    tessedit_dont_rowrej_good_wds: bool
    tessedit_enable_dict_correction: bool
    tessedit_char_whitelist: str
    tessedit_char_blacklist: str
    tessedit_use_primary_params_model: bool
    textord_space_size_is_variable: bool
    # NOTE(review): typed as bool although the name reads like a method
    # selector — confirm against the underlying configuration type.
    thresholding_method: bool
    def __init__(
        self,
        language: str | None = None,
        psm: int | None = None,
        output_format: str | None = None,
        oem: int | None = None,
        min_confidence: float | None = None,
        preprocessing: ImagePreprocessingConfig | None = None,
        enable_table_detection: bool | None = None,
        table_min_confidence: float | None = None,
        table_column_threshold: int | None = None,
        table_row_threshold_ratio: float | None = None,
        use_cache: bool | None = None,
        classify_use_pre_adapted_templates: bool | None = None,
        language_model_ngram_on: bool | None = None,
        tessedit_dont_blkrej_good_wds: bool | None = None,
        tessedit_dont_rowrej_good_wds: bool | None = None,
        tessedit_enable_dict_correction: bool | None = None,
        tessedit_char_whitelist: str | None = None,
        tessedit_char_blacklist: str | None = None,
        tessedit_use_primary_params_model: bool | None = None,
        textord_space_size_is_variable: bool | None = None,
        thresholding_method: bool | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> TesseractConfig: ...
|
||
|
|
|
||
|
|
class ImagePreprocessingMetadata:
|
||
|
|
original_dimensions: list[int]
|
||
|
|
original_dpi: list[float]
|
||
|
|
target_dpi: int
|
||
|
|
scale_factor: float
|
||
|
|
auto_adjusted: bool
|
||
|
|
final_dpi: int
|
||
|
|
new_dimensions: list[int] | None
|
||
|
|
resample_method: str
|
||
|
|
dimension_clamped: bool
|
||
|
|
calculated_dpi: int | None
|
||
|
|
skipped_resize: bool
|
||
|
|
resize_error: str | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
original_dimensions: list[int],
|
||
|
|
original_dpi: list[float],
|
||
|
|
target_dpi: int,
|
||
|
|
scale_factor: float,
|
||
|
|
auto_adjusted: bool,
|
||
|
|
final_dpi: int,
|
||
|
|
resample_method: str,
|
||
|
|
dimension_clamped: bool,
|
||
|
|
skipped_resize: bool,
|
||
|
|
new_dimensions: list[int] | None = None,
|
||
|
|
calculated_dpi: int | None = None,
|
||
|
|
resize_error: str | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class Metadata:
    """Stub for document metadata: descriptive fields, timestamps, page structure,
    format-specific metadata and free-form ``additional`` values.

    Every constructor argument is optional and defaults to ``None``.
    """

    title: str | None
    subject: str | None
    authors: list[str] | None
    keywords: list[str] | None
    language: str | None
    created_at: str | None
    modified_at: str | None
    created_by: str | None
    modified_by: str | None
    pages: PageStructure | None
    format: FormatMetadata | None
    image_preprocessing: ImagePreprocessingMetadata | None
    json_schema: dict[str, Any] | None
    error: ErrorMetadata | None
    extraction_duration_ms: int | None
    category: str | None
    tags: list[str] | None
    document_version: str | None
    abstract_text: str | None
    output_format: str | None
    ocr_used: bool
    additional: dict[str, dict[str, Any]]
    def __init__(
        self,
        title: str | None = None,
        subject: str | None = None,
        authors: list[str] | None = None,
        keywords: list[str] | None = None,
        language: str | None = None,
        created_at: str | None = None,
        modified_at: str | None = None,
        created_by: str | None = None,
        modified_by: str | None = None,
        pages: PageStructure | None = None,
        format: FormatMetadata | None = None,  # noqa: A002
        image_preprocessing: ImagePreprocessingMetadata | None = None,
        json_schema: dict[str, Any] | None = None,
        error: ErrorMetadata | None = None,
        extraction_duration_ms: int | None = None,
        category: str | None = None,
        tags: list[str] | None = None,
        document_version: str | None = None,
        abstract_text: str | None = None,
        output_format: str | None = None,
        ocr_used: bool | None = None,
        additional: dict[str, dict[str, Any]] | None = None,
    ) -> None: ...
    def is_empty(self) -> bool: ...
|
||
|
|
|
||
|
|
class ExcelMetadata:
|
||
|
|
sheet_count: int | None
|
||
|
|
sheet_names: list[str] | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
sheet_count: int | None = None,
|
||
|
|
sheet_names: list[str] | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class EmailMetadata:
|
||
|
|
from_email: str | None
|
||
|
|
from_name: str | None
|
||
|
|
to_emails: list[str]
|
||
|
|
cc_emails: list[str]
|
||
|
|
bcc_emails: list[str]
|
||
|
|
message_id: str | None
|
||
|
|
attachments: list[str]
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
from_email: str | None = None,
|
||
|
|
from_name: str | None = None,
|
||
|
|
to_emails: list[str] | None = None,
|
||
|
|
cc_emails: list[str] | None = None,
|
||
|
|
bcc_emails: list[str] | None = None,
|
||
|
|
message_id: str | None = None,
|
||
|
|
attachments: list[str] | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class ArchiveMetadata:
|
||
|
|
format: str
|
||
|
|
file_count: int
|
||
|
|
file_list: list[str]
|
||
|
|
total_size: int
|
||
|
|
compressed_size: int | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
format: str | None = None, # noqa: A002
|
||
|
|
file_count: int | None = None,
|
||
|
|
file_list: list[str] | None = None,
|
||
|
|
total_size: int | None = None,
|
||
|
|
compressed_size: int | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class ImageMetadata:
|
||
|
|
width: int
|
||
|
|
height: int
|
||
|
|
format: str
|
||
|
|
exif: dict[str, str]
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
width: int | None = None,
|
||
|
|
height: int | None = None,
|
||
|
|
format: str | None = None, # noqa: A002
|
||
|
|
exif: dict[str, str] | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class XmlMetadata:
|
||
|
|
element_count: int
|
||
|
|
unique_elements: list[str]
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
element_count: int | None = None,
|
||
|
|
unique_elements: list[str] | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class TextMetadata:
|
||
|
|
line_count: int
|
||
|
|
word_count: int
|
||
|
|
character_count: int
|
||
|
|
headers: list[str] | None
|
||
|
|
links: list[list[str]] | None
|
||
|
|
code_blocks: list[list[str]] | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
line_count: int | None = None,
|
||
|
|
word_count: int | None = None,
|
||
|
|
character_count: int | None = None,
|
||
|
|
headers: list[str] | None = None,
|
||
|
|
links: list[list[str]] | None = None,
|
||
|
|
code_blocks: list[list[str]] | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class HeaderMetadata:
|
||
|
|
level: int
|
||
|
|
text: str
|
||
|
|
id: str | None
|
||
|
|
depth: int
|
||
|
|
html_offset: int
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
level: int,
|
||
|
|
text: str,
|
||
|
|
depth: int,
|
||
|
|
html_offset: int,
|
||
|
|
id: str | None = None, # noqa: A002
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class LinkMetadata:
    """Stub for link metadata: href, text, link type, ``rel`` values and attributes.

    ``title`` defaults to ``None``; the other arguments are required.
    """

    href: str
    text: str
    title: str | None
    link_type: LinkType
    rel: list[str]
    attributes: list[list[str]]
    def __init__(
        self,
        href: str,
        text: str,
        link_type: LinkType | str,  # enum member or plain string
        rel: list[str],
        attributes: list[list[str]],
        title: str | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class ImageMetadataType:
    """Stub for per-image metadata: source, image type, attributes and optional
    alt text, title and dimensions.

    ``src``, ``image_type`` and ``attributes`` are required; ``alt``, ``title``
    and ``dimensions`` default to ``None``.
    """

    src: str
    alt: str | None
    title: str | None
    dimensions: list[int] | None
    image_type: ImageType
    attributes: list[list[str]]
    def __init__(
        self,
        src: str,
        image_type: ImageType | str,  # enum member or plain string
        attributes: list[list[str]],
        alt: str | None = None,
        title: str | None = None,
        dimensions: list[int] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class StructuredData:
    """Stub for a structured-data record: data type, raw JSON and optional schema type.

    ``schema_type`` defaults to ``None``; the other arguments are required.
    """

    data_type: StructuredDataType
    raw_json: str
    schema_type: str | None
    def __init__(
        self,
        data_type: StructuredDataType | str,  # enum member or plain string
        raw_json: str,
        schema_type: str | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class HtmlMetadata:
    """Stub for HTML metadata: page-level fields, tag dictionaries and lists of
    headers, links, images and structured data.

    Every constructor argument is optional and defaults to ``None``.
    """

    title: str | None
    description: str | None
    keywords: list[str]
    author: str | None
    canonical_url: str | None
    base_href: str | None
    language: str | None
    text_direction: TextDirection | None
    open_graph: dict[str, str]
    twitter_card: dict[str, str]
    meta_tags: dict[str, str]
    headers: list[HeaderMetadata]
    links: list[LinkMetadata]
    images: list[ImageMetadataType]
    structured_data: list[StructuredData]
    def __init__(
        self,
        title: str | None = None,
        description: str | None = None,
        keywords: list[str] | None = None,
        author: str | None = None,
        canonical_url: str | None = None,
        base_href: str | None = None,
        language: str | None = None,
        text_direction: TextDirection | str | None = None,  # enum member or plain string
        open_graph: dict[str, str] | None = None,
        twitter_card: dict[str, str] | None = None,
        meta_tags: dict[str, str] | None = None,
        headers: list[HeaderMetadata] | None = None,
        links: list[LinkMetadata] | None = None,
        images: list[ImageMetadataType] | None = None,
        structured_data: list[StructuredData] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class OcrMetadata:
|
||
|
|
language: str
|
||
|
|
psm: int
|
||
|
|
output_format: str
|
||
|
|
table_count: int
|
||
|
|
table_rows: int | None
|
||
|
|
table_cols: int | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
language: str | None = None,
|
||
|
|
psm: int | None = None,
|
||
|
|
output_format: str | None = None,
|
||
|
|
table_count: int | None = None,
|
||
|
|
table_rows: int | None = None,
|
||
|
|
table_cols: int | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class ErrorMetadata:
    """Stub for error metadata: an ``error_type`` string and a ``message`` string."""

    error_type: str
    message: str
    def __init__(self, error_type: str, message: str) -> None: ...
|
||
|
|
|
||
|
|
class PptxMetadata:
|
||
|
|
slide_count: int
|
||
|
|
slide_names: list[str]
|
||
|
|
image_count: int | None
|
||
|
|
table_count: int | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
slide_count: int | None = None,
|
||
|
|
slide_names: list[str] | None = None,
|
||
|
|
image_count: int | None = None,
|
||
|
|
table_count: int | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class DocxMetadata:
    """Stub for DOCX metadata: optional core, app and custom property groups.

    Every constructor argument is optional and defaults to ``None``.
    """

    core_properties: CoreProperties | None
    app_properties: DocxAppProperties | None
    custom_properties: dict[str, dict[str, Any]] | None
    def __init__(
        self,
        core_properties: CoreProperties | None = None,
        app_properties: DocxAppProperties | None = None,
        custom_properties: dict[str, dict[str, Any]] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class CsvMetadata:
|
||
|
|
row_count: int
|
||
|
|
column_count: int
|
||
|
|
delimiter: str | None
|
||
|
|
has_header: bool
|
||
|
|
column_types: list[str] | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
row_count: int | None = None,
|
||
|
|
column_count: int | None = None,
|
||
|
|
delimiter: str | None = None,
|
||
|
|
has_header: bool | None = None,
|
||
|
|
column_types: list[str] | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class BibtexMetadata:
    """Stub for BibTeX metadata: entry count, citation keys, authors, year range
    and per-type entry counts.

    Every constructor argument is optional and defaults to ``None``.
    """

    entry_count: int
    citation_keys: list[str]
    authors: list[str]
    year_range: YearRange | None
    entry_types: dict[str, int] | None
    def __init__(
        self,
        entry_count: int | None = None,
        citation_keys: list[str] | None = None,
        authors: list[str] | None = None,
        year_range: YearRange | None = None,
        entry_types: dict[str, int] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class CitationMetadata:
    """Stub for citation metadata: count, format, authors, year range, DOIs, keywords.

    Every constructor argument is optional and defaults to ``None``.
    """

    citation_count: int
    format: str | None
    authors: list[str]
    year_range: YearRange | None
    dois: list[str]
    keywords: list[str]
    def __init__(
        self,
        citation_count: int | None = None,
        format: str | None = None,  # noqa: A002
        authors: list[str] | None = None,
        year_range: YearRange | None = None,
        dois: list[str] | None = None,
        keywords: list[str] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class YearRange:
|
||
|
|
min: int | None
|
||
|
|
max: int | None
|
||
|
|
years: list[int]
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
years: list[int],
|
||
|
|
min: int | None = None, # noqa: A002
|
||
|
|
max: int | None = None, # noqa: A002
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class FictionBookMetadata:
|
||
|
|
genres: list[str]
|
||
|
|
sequences: list[str]
|
||
|
|
annotation: str | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
genres: list[str] | None = None,
|
||
|
|
sequences: list[str] | None = None,
|
||
|
|
annotation: str | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class DbfMetadata:
    """Stub for DBF metadata: record count, field count and field descriptions.

    Every constructor argument is optional and defaults to ``None``.
    """

    record_count: int
    field_count: int
    fields: list[DbfFieldInfo]
    def __init__(
        self,
        record_count: int | None = None,
        field_count: int | None = None,
        fields: list[DbfFieldInfo] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class DbfFieldInfo:
    """Stub for a DBF field description: a ``name`` and a ``field_type`` string."""

    name: str
    field_type: str
    def __init__(self, name: str, field_type: str) -> None: ...
|
||
|
|
|
||
|
|
class JatsMetadata:
    """Stub for JATS metadata: copyright, license, history dates, contributor roles.

    Every constructor argument is optional and defaults to ``None``.
    """

    copyright: str | None
    license: str | None
    history_dates: dict[str, str]
    contributor_roles: list[ContributorRole]
    def __init__(
        self,
        copyright: str | None = None,  # noqa: A002
        license: str | None = None,  # noqa: A002
        history_dates: dict[str, str] | None = None,
        contributor_roles: list[ContributorRole] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
class ContributorRole:
|
||
|
|
name: str
|
||
|
|
role: str | None
|
||
|
|
def __init__(self, name: str, role: str | None = None) -> None: ...
|
||
|
|
|
||
|
|
class EpubMetadata:
|
||
|
|
coverage: str | None
|
||
|
|
dc_format: str | None
|
||
|
|
relation: str | None
|
||
|
|
source: str | None
|
||
|
|
dc_type: str | None
|
||
|
|
cover_image: str | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
coverage: str | None = None,
|
||
|
|
dc_format: str | None = None,
|
||
|
|
relation: str | None = None,
|
||
|
|
source: str | None = None,
|
||
|
|
dc_type: str | None = None,
|
||
|
|
cover_image: str | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class PstMetadata:
|
||
|
|
message_count: int
|
||
|
|
def __init__(self, message_count: int | None = None) -> None: ...
|
||
|
|
|
||
|
|
class OcrConfidence:
|
||
|
|
detection: float | None
|
||
|
|
recognition: float
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
detection: float | None = None,
|
||
|
|
recognition: float | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class OcrRotation:
|
||
|
|
angle_degrees: float
|
||
|
|
confidence: float | None
|
||
|
|
def __init__(self, angle_degrees: float, confidence: float | None = None) -> None: ...
|
||
|
|
|
||
|
|
class OcrElement:
    """Stub for an OCR element: text, geometry, confidence, level, rotation, page,
    parent id and backend-specific metadata.

    Every constructor argument is optional and defaults to ``None``.
    """

    text: str
    geometry: OcrBoundingGeometry
    confidence: OcrConfidence
    level: OcrElementLevel
    rotation: OcrRotation | None
    page_number: int
    parent_id: str | None
    backend_metadata: dict[str, dict[str, Any]]
    def __init__(
        self,
        text: str | None = None,
        geometry: OcrBoundingGeometry | None = None,
        confidence: OcrConfidence | None = None,
        level: OcrElementLevel | str | None = None,  # enum member or plain string
        rotation: OcrRotation | None = None,
        page_number: int | None = None,
        parent_id: str | None = None,
        backend_metadata: dict[str, dict[str, Any]] | None = None,
    ) -> None: ...
|
||
|
|
|
||
|
|
# Stub for OcrElementConfig: four settings, all optional in the constructor;
# `min_level` may be given as an OcrElementLevel or a str.
class OcrElementConfig:
    include_elements: bool
    min_level: OcrElementLevel
    min_confidence: float
    build_hierarchy: bool
    def __init__(
        self,
        include_elements: bool | None = None,
        min_level: OcrElementLevel | str | None = None,
        min_confidence: float | None = None,
        build_hierarchy: bool | None = None,
    ) -> None: ...
# Stub for PageStructure: required `total_count` and `unit_type` (enum or
# str in the constructor), plus optional `boundaries` and `pages` lists.
class PageStructure:
    total_count: int
    unit_type: PageUnitType
    boundaries: list[PageBoundary] | None
    pages: list[PageInfo] | None
    def __init__(
        self,
        total_count: int,
        unit_type: PageUnitType | str,
        boundaries: list[PageBoundary] | None = None,
        pages: list[PageInfo] | None = None,
    ) -> None: ...
# Stub for PageBoundary: three required integers (byte_start, byte_end,
# page_number).
class PageBoundary:
    byte_start: int
    byte_end: int
    page_number: int
    def __init__(self, byte_start: int, byte_end: int, page_number: int) -> None: ...
class PageInfo:
|
||
|
|
number: int
|
||
|
|
title: str | None
|
||
|
|
dimensions: list[float] | None
|
||
|
|
image_count: int | None
|
||
|
|
table_count: int | None
|
||
|
|
hidden: bool | None
|
||
|
|
is_blank: bool | None
|
||
|
|
has_vector_graphics: bool
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
number: int,
|
||
|
|
has_vector_graphics: bool,
|
||
|
|
title: str | None = None,
|
||
|
|
dimensions: list[float] | None = None,
|
||
|
|
image_count: int | None = None,
|
||
|
|
table_count: int | None = None,
|
||
|
|
hidden: bool | None = None,
|
||
|
|
is_blank: bool | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
# Stub for PageContent: four required fields (page_number, content, tables,
# image_indices) followed by six optional ones.
class PageContent:
    page_number: int
    content: str
    tables: list[Table]
    image_indices: list[int]
    hierarchy: PageHierarchy | None
    is_blank: bool | None
    layout_regions: list[LayoutRegion] | None
    speaker_notes: str | None
    section_name: str | None
    sheet_name: str | None
    def __init__(
        self,
        page_number: int,
        content: str,
        tables: list[Table],
        image_indices: list[int],
        hierarchy: PageHierarchy | None = None,
        is_blank: bool | None = None,
        layout_regions: list[LayoutRegion] | None = None,
        speaker_notes: str | None = None,
        section_name: str | None = None,
        sheet_name: str | None = None,
    ) -> None: ...
# Stub for LayoutRegion: class name, confidence, bounding box and area
# fraction. All constructor arguments default to None although the
# attributes are non-optional (defaults presumably applied natively — confirm).
class LayoutRegion:
    class_name: str
    confidence: float
    bounding_box: BoundingBox
    area_fraction: float
    def __init__(
        self,
        class_name: str | None = None,
        confidence: float | None = None,
        bounding_box: BoundingBox | None = None,
        area_fraction: float | None = None,
    ) -> None: ...
# Stub for PageHierarchy: a block count and the list of HierarchicalBlock
# items, both required.
class PageHierarchy:
    block_count: int
    blocks: list[HierarchicalBlock]
    def __init__(self, block_count: int, blocks: list[HierarchicalBlock]) -> None: ...
class HierarchicalBlock:
|
||
|
|
text: str
|
||
|
|
font_size: float
|
||
|
|
level: str
|
||
|
|
bbox: list[float] | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
text: str,
|
||
|
|
font_size: float,
|
||
|
|
level: str,
|
||
|
|
bbox: list[float] | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
# Stub for CellChange: a (row, col) position with `from_` and `to` strings.
# `from_` carries a trailing underscore because `from` is a reserved word.
class CellChange:
    row: int
    col: int
    from_: str
    to: str
    def __init__(self, row: int, col: int, from_: str, to: str) -> None: ...
# Stub for DocumentRevision.
# Constructor order puts the required arguments (revision_id, kind, delta)
# ahead of the optional ones; `kind` may be a RevisionKind or a str.
class DocumentRevision:
    revision_id: str
    author: str | None
    timestamp: str | None
    kind: RevisionKind
    anchor: RevisionAnchor | None
    delta: RevisionDelta
    def __init__(
        self,
        revision_id: str,
        kind: RevisionKind | str,
        delta: RevisionDelta,
        author: str | None = None,
        timestamp: str | None = None,
        anchor: RevisionAnchor | None = None,
    ) -> None: ...
# Stub for RevisionDelta: a list of DiffLine entries and a list of
# CellChange entries; both constructor arguments default to None.
class RevisionDelta:
    content: list[DiffLine]
    table_changes: list[CellChange]
    def __init__(
        self,
        content: list[DiffLine] | None = None,
        table_changes: list[CellChange] | None = None,
    ) -> None: ...
# Stub for Table: cell grid, markdown rendering, page number and an optional
# bounding box. All constructor arguments default to None.
class Table:
    cells: list[list[str]]
    markdown: str
    page_number: int
    bounding_box: BoundingBox | None
    def __init__(
        self,
        cells: list[list[str]] | None = None,
        markdown: str | None = None,
        page_number: int | None = None,
        bounding_box: BoundingBox | None = None,
    ) -> None: ...
class TableCell:
|
||
|
|
content: str
|
||
|
|
row_span: int
|
||
|
|
col_span: int
|
||
|
|
is_header: bool
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
content: str | None = None,
|
||
|
|
row_span: int | None = None,
|
||
|
|
col_span: int | None = None,
|
||
|
|
is_header: bool | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
# Stub for ExtractedUri: required `url` and `kind` (UriKind or str in the
# constructor), optional `label` and `page`.
class ExtractedUri:
    url: str
    label: str | None
    page: int | None
    kind: UriKind
    def __init__(
        self,
        url: str,
        kind: UriKind | str,
        label: str | None = None,
        page: int | None = None,
    ) -> None: ...
class DetectResponse:
|
||
|
|
mime_type: str
|
||
|
|
filename: str | None
|
||
|
|
def __init__(self, mime_type: str, filename: str | None = None) -> None: ...
|
||
|
|
|
||
|
|
# Stub for DiffOptions: two include flags and an optional character limit.
# All constructor arguments default to None; `default()` is a static factory
# returning a DiffOptions.
class DiffOptions:
    include_metadata: bool
    include_embedded: bool
    max_content_chars: int | None
    def __init__(
        self,
        include_metadata: bool | None = None,
        include_embedded: bool | None = None,
        max_content_chars: int | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> DiffOptions: ...
# Stub for ExtractionDiff: content hunks, added/removed/changed tables,
# changed metadata and embedded changes. Every constructor argument is required.
class ExtractionDiff:
    content_diff: list[DiffHunk]
    tables_added: list[Table]
    tables_removed: list[Table]
    tables_changed: list[TableDiff]
    metadata_changed: dict[str, Any]
    embedded_changes: EmbeddedChanges
    def __init__(
        self,
        content_diff: list[DiffHunk],
        tables_added: list[Table],
        tables_removed: list[Table],
        tables_changed: list[TableDiff],
        metadata_changed: dict[str, Any],
        embedded_changes: EmbeddedChanges,
    ) -> None: ...
# Stub for DiffHunk: from/to line and count integers plus the DiffLine list.
# Every constructor argument is required.
class DiffHunk:
    from_line: int
    from_count: int
    to_line: int
    to_count: int
    lines: list[DiffLine]
    def __init__(
        self,
        from_line: int,
        from_count: int,
        to_line: int,
        to_count: int,
        lines: list[DiffLine],
    ) -> None: ...
# Stub for TableDiff: source/target table indices and the CellChange list.
class TableDiff:
    from_index: int
    to_index: int
    cell_changes: list[CellChange]
    def __init__(self, from_index: int, to_index: int, cell_changes: list[CellChange]) -> None: ...
# Stub for EmbeddedChanges: added and removed ArchiveEntry lists plus the
# list of EmbeddedDiff items for changed entries. All arguments are required.
class EmbeddedChanges:
    added: list[ArchiveEntry]
    removed: list[ArchiveEntry]
    changed: list[EmbeddedDiff]
    def __init__(
        self,
        added: list[ArchiveEntry],
        removed: list[ArchiveEntry],
        changed: list[EmbeddedDiff],
    ) -> None: ...
# Stub for EmbeddedDiff: a `path` string paired with an ExtractionDiff.
class EmbeddedDiff:
    path: str
    diff: ExtractionDiff
    def __init__(self, path: str, diff: ExtractionDiff) -> None: ...
# Stub for EmbeddingPreset: eight required fields (name, chunk_size, overlap,
# model_repo, pooling, model_file, dimensions, description).
class EmbeddingPreset:
    name: str
    chunk_size: int
    overlap: int
    model_repo: str
    pooling: str
    model_file: str
    dimensions: int
    description: str
    def __init__(
        self,
        name: str,
        chunk_size: int,
        overlap: int,
        model_repo: str,
        pooling: str,
        model_file: str,
        dimensions: int,
        description: str,
    ) -> None: ...
# Stub for YakeParams: a single `window_size` integer (optional in the
# constructor) and a static `default()` factory.
class YakeParams:
    window_size: int
    def __init__(self, window_size: int | None = None) -> None: ...
    @staticmethod
    def default() -> YakeParams: ...
# Stub for RakeParams: `min_word_length` and `max_words_per_phrase` integers
# (both optional in the constructor) and a static `default()` factory.
class RakeParams:
    min_word_length: int
    max_words_per_phrase: int
    def __init__(
        self,
        min_word_length: int | None = None,
        max_words_per_phrase: int | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> RakeParams: ...
# Stub for KeywordConfig: algorithm, limits, n-gram range, language and the
# optional per-algorithm parameter objects.
# NOTE(review): `yake_params` and `rake_params` are declared as attributes
# but are not constructor parameters — confirm this is intended by the generator.
class KeywordConfig:
    algorithm: KeywordAlgorithm
    max_keywords: int
    min_score: float
    ngram_range: list[int]
    language: str | None
    yake_params: YakeParams | None
    rake_params: RakeParams | None
    def __init__(
        self,
        algorithm: KeywordAlgorithm | str | None = None,
        max_keywords: int | None = None,
        min_score: float | None = None,
        ngram_range: list[int] | None = None,
        language: str | None = None,
    ) -> None: ...
    @staticmethod
    def default() -> KeywordConfig: ...
# Stub for Keyword: required text, score and algorithm (KeywordAlgorithm or
# str in the constructor) with an optional `positions` list.
class Keyword:
    text: str
    score: float
    algorithm: KeywordAlgorithm
    positions: list[int] | None
    def __init__(
        self,
        text: str,
        score: float,
        algorithm: KeywordAlgorithm | str,
        positions: list[int] | None = None,
    ) -> None: ...
# Stub for PaddleOcrConfig: twelve settings, all optional in the constructor.
# Each `with_*` method takes one value and returns a PaddleOcrConfig; whether
# it mutates in place or returns a copy is not visible from the stub.
# `default()` is a static factory.
class PaddleOcrConfig:
    language: str
    cache_dir: str | None
    use_angle_cls: bool
    enable_table_detection: bool
    det_db_thresh: float
    det_db_box_thresh: float
    det_db_unclip_ratio: float
    det_limit_side_len: int
    rec_batch_num: int
    padding: int
    drop_score: float
    model_tier: str
    def __init__(
        self,
        language: str | None = None,
        cache_dir: str | None = None,
        use_angle_cls: bool | None = None,
        enable_table_detection: bool | None = None,
        det_db_thresh: float | None = None,
        det_db_box_thresh: float | None = None,
        det_db_unclip_ratio: float | None = None,
        det_limit_side_len: int | None = None,
        rec_batch_num: int | None = None,
        padding: int | None = None,
        drop_score: float | None = None,
        model_tier: str | None = None,
    ) -> None: ...
    def with_cache_dir(self, path: str) -> PaddleOcrConfig: ...
    def with_table_detection(self, enable: bool) -> PaddleOcrConfig: ...
    def with_angle_cls(self, enable: bool) -> PaddleOcrConfig: ...
    def with_det_db_thresh(self, threshold: float) -> PaddleOcrConfig: ...
    def with_det_db_box_thresh(self, threshold: float) -> PaddleOcrConfig: ...
    def with_det_db_unclip_ratio(self, ratio: float) -> PaddleOcrConfig: ...
    def with_det_limit_side_len(self, length: int) -> PaddleOcrConfig: ...
    def with_rec_batch_num(self, batch_size: int) -> PaddleOcrConfig: ...
    def with_drop_score(self, score: float) -> PaddleOcrConfig: ...
    def with_padding(self, padding: int) -> PaddleOcrConfig: ...
    def with_model_tier(self, tier: str) -> PaddleOcrConfig: ...
    @staticmethod
    def default() -> PaddleOcrConfig: ...
# Stub for ModelPaths: four required strings (det_model, cls_model,
# rec_model, dict_file).
class ModelPaths:
    det_model: str
    cls_model: str
    rec_model: str
    dict_file: str
    def __init__(self, det_model: str, cls_model: str, rec_model: str, dict_file: str) -> None: ...
# Stub for OrientationResult: integer `degrees` and float `confidence`,
# both required.
class OrientationResult:
    degrees: int
    confidence: float
    def __init__(self, degrees: int, confidence: float) -> None: ...
# Stub for BBox: four required floats (x1, y1, x2, y2).
class BBox:
    x1: float
    y1: float
    x2: float
    y2: float
    def __init__(self, x1: float, y1: float, x2: float, y2: float) -> None: ...
# Stub for LayoutDetection: a LayoutClass (or str in the constructor), a
# confidence and a BBox; all required.
class LayoutDetection:
    class_name: LayoutClass
    confidence: float
    bbox: BBox
    def __init__(self, class_name: LayoutClass | str, confidence: float, bbox: BBox) -> None: ...
# Stub for RecognizedTable: the detection BBox, the cell grid and a markdown
# string; all required.
class RecognizedTable:
    detection_bbox: BBox
    cells: list[list[str]]
    markdown: str
    def __init__(self, detection_bbox: BBox, cells: list[list[str]], markdown: str) -> None: ...
# Stub for DetectionResult: page width/height integers and the list of
# LayoutDetection items; all required.
class DetectionResult:
    page_width: int
    page_height: int
    detections: list[LayoutDetection]
    def __init__(
        self,
        page_width: int,
        page_height: int,
        detections: list[LayoutDetection],
    ) -> None: ...
class EmbeddedFile:
|
||
|
|
name: str
|
||
|
|
data: bytes
|
||
|
|
compressed_size: int
|
||
|
|
mime_type: str | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
name: str,
|
||
|
|
data: bytes,
|
||
|
|
compressed_size: int,
|
||
|
|
mime_type: str | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
class PdfMetadata:
|
||
|
|
pdf_version: str | None
|
||
|
|
producer: str | None
|
||
|
|
is_encrypted: bool | None
|
||
|
|
width: int | None
|
||
|
|
height: int | None
|
||
|
|
page_count: int | None
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
pdf_version: str | None = None,
|
||
|
|
producer: str | None = None,
|
||
|
|
is_encrypted: bool | None = None,
|
||
|
|
width: int | None = None,
|
||
|
|
height: int | None = None,
|
||
|
|
page_count: int | None = None,
|
||
|
|
) -> None: ...
|
||
|
|
|
||
|
|
# Enum-like stub: each member is a class attribute typed as the class itself;
# the constructor accepts an int or a str.
class ExecutionProviderType:
    auto: ExecutionProviderType = ...
    cpu: ExecutionProviderType = ...
    core_ml: ExecutionProviderType = ...
    cuda: ExecutionProviderType = ...
    tensor_rt: ExecutionProviderType = ...
    def __init__(self, value: int | str) -> None: ...
# TypedDict variant tagged type == "plain"; no payload keys.
class OutputFormatPlainVariant(TypedDict):
    type: Literal["plain"]
# TypedDict variant tagged type == "markdown"; no payload keys.
class OutputFormatMarkdownVariant(TypedDict):
    type: Literal["markdown"]
# TypedDict variant tagged type == "djot"; no payload keys.
class OutputFormatDjotVariant(TypedDict):
    type: Literal["djot"]
# TypedDict variant tagged type == "html"; no payload keys.
class OutputFormatHtmlVariant(TypedDict):
    type: Literal["html"]
# TypedDict variant tagged type == "json"; no payload keys.
class OutputFormatJsonVariant(TypedDict):
    type: Literal["json"]
# TypedDict variant tagged type == "structured"; no payload keys.
class OutputFormatStructuredVariant(TypedDict):
    type: Literal["structured"]
# TypedDict variant tagged type == "custom"; the payload is a str stored
# under the positional-style key `_0`.
class OutputFormatCustomVariant(TypedDict):
    type: Literal["custom"]
    _0: str
# Stub for OutputFormat: exposes the `type` tag as a str and declares
# __str__/__repr__ explicitly (hence the PYI029 suppressions).
class OutputFormat:
    type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# Enum-like stub: members are class attributes typed as HtmlTheme; the
# constructor accepts an int or a str.
class HtmlTheme:
    default: HtmlTheme = ...
    git_hub: HtmlTheme = ...
    dark: HtmlTheme = ...
    light: HtmlTheme = ...
    unstyled: HtmlTheme = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as TableModel; the
# constructor accepts an int or a str.
class TableModel:
    tatr: TableModel = ...
    slanet_wired: TableModel = ...
    slanet_wireless: TableModel = ...
    slanet_plus: TableModel = ...
    slanet_auto: TableModel = ...
    disabled: TableModel = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as ChunkerType; the
# constructor accepts an int or a str.
class ChunkerType:
    text: ChunkerType = ...
    markdown: ChunkerType = ...
    yaml: ChunkerType = ...
    semantic: ChunkerType = ...
    def __init__(self, value: int | str) -> None: ...
# TypedDict variant tagged type == "characters"; no payload keys.
class ChunkSizingCharactersVariant(TypedDict):
    type: Literal["characters"]
class ChunkSizingTokenizerVariant(TypedDict):
|
||
|
|
type: Literal["tokenizer"]
|
||
|
|
model: str
|
||
|
|
cache_dir: str | None
|
||
|
|
|
||
|
|
# Stub for ChunkSizing: exposes the `type` tag as a str and declares
# __str__/__repr__ explicitly (hence the PYI029 suppressions).
class ChunkSizing:
    type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# TypedDict variant tagged type == "preset"; carries a `name` string.
class EmbeddingModelTypePresetVariant(TypedDict):
    type: Literal["preset"]
    name: str
# TypedDict variant tagged type == "custom"; carries `model_id` and an
# integer `dimensions`.
class EmbeddingModelTypeCustomVariant(TypedDict):
    type: Literal["custom"]
    model_id: str
    dimensions: int
# TypedDict variant tagged type == "llm"; carries an LlmConfig under `llm`.
class EmbeddingModelTypeLlmVariant(TypedDict):
    type: Literal["llm"]
    llm: LlmConfig
# TypedDict variant tagged type == "plugin"; carries a `name` string.
class EmbeddingModelTypePluginVariant(TypedDict):
    type: Literal["plugin"]
    name: str
# Stub for EmbeddingModelType: exposes the `type` tag as a str and declares
# __str__/__repr__ explicitly (hence the PYI029 suppressions).
class EmbeddingModelType:
    type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# Enum-like stub: members are class attributes typed as CodeContentMode; the
# constructor accepts an int or a str.
class CodeContentMode:
    chunks: CodeContentMode = ...
    raw: CodeContentMode = ...
    structure: CodeContentMode = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as ListType; the
# constructor accepts an int or a str.
class ListType:
    bullet: ListType = ...
    numbered: ListType = ...
    lettered: ListType = ...
    indented: ListType = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as OcrBackendType; the
# constructor accepts an int or a str.
class OcrBackendType:
    tesseract: OcrBackendType = ...
    easy_ocr: OcrBackendType = ...
    paddle_ocr: OcrBackendType = ...
    custom: OcrBackendType = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as ProcessingStage; the
# constructor accepts an int or a str.
class ProcessingStage:
    early: ProcessingStage = ...
    middle: ProcessingStage = ...
    late: ProcessingStage = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as ReductionLevel; the
# constructor accepts an int or a str.
class ReductionLevel:
    off: ReductionLevel = ...
    light: ReductionLevel = ...
    moderate: ReductionLevel = ...
    aggressive: ReductionLevel = ...
    maximum: ReductionLevel = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as PdfAnnotationType;
# the constructor accepts an int or a str.
class PdfAnnotationType:
    text: PdfAnnotationType = ...
    highlight: PdfAnnotationType = ...
    link: PdfAnnotationType = ...
    stamp: PdfAnnotationType = ...
    underline: PdfAnnotationType = ...
    strike_out: PdfAnnotationType = ...
    other: PdfAnnotationType = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub with sixteen members, each a class attribute typed as
# BlockType; the constructor accepts an int or a str.
class BlockType:
    paragraph: BlockType = ...
    heading: BlockType = ...
    blockquote: BlockType = ...
    code_block: BlockType = ...
    list_item: BlockType = ...
    ordered_list: BlockType = ...
    bullet_list: BlockType = ...
    task_list: BlockType = ...
    definition_list: BlockType = ...
    definition_term: BlockType = ...
    definition_description: BlockType = ...
    div: BlockType = ...
    section: BlockType = ...
    thematic_break: BlockType = ...
    raw_block: BlockType = ...
    math_display: BlockType = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub with sixteen members, each a class attribute typed as
# InlineType; the constructor accepts an int or a str.
class InlineType:
    text: InlineType = ...
    strong: InlineType = ...
    emphasis: InlineType = ...
    highlight: InlineType = ...
    subscript: InlineType = ...
    superscript: InlineType = ...
    insert: InlineType = ...
    delete: InlineType = ...
    code: InlineType = ...
    link: InlineType = ...
    image: InlineType = ...
    span: InlineType = ...
    math: InlineType = ...
    raw_inline: InlineType = ...
    footnote_ref: InlineType = ...
    symbol: InlineType = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as RelationshipKind;
# the constructor accepts an int or a str.
class RelationshipKind:
    footnote_reference: RelationshipKind = ...
    citation_reference: RelationshipKind = ...
    internal_link: RelationshipKind = ...
    caption: RelationshipKind = ...
    label: RelationshipKind = ...
    toc_entry: RelationshipKind = ...
    cross_reference: RelationshipKind = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as ContentLayer; the
# constructor accepts an int or a str.
class ContentLayer:
    body: ContentLayer = ...
    header: ContentLayer = ...
    footer: ContentLayer = ...
    footnote: ContentLayer = ...
    def __init__(self, value: int | str) -> None: ...
# TypedDict variant tagged node_type == "title"; carries `text`.
class NodeContentTitleVariant(TypedDict):
    node_type: Literal["title"]
    text: str
# TypedDict variant tagged node_type == "heading"; carries an integer
# `level` and `text`.
class NodeContentHeadingVariant(TypedDict):
    node_type: Literal["heading"]
    level: int
    text: str
# TypedDict variant tagged node_type == "paragraph"; carries `text`.
class NodeContentParagraphVariant(TypedDict):
    node_type: Literal["paragraph"]
    text: str
# TypedDict variant tagged node_type == "list"; carries the `ordered` flag.
class NodeContentListVariant(TypedDict):
    node_type: Literal["list"]
    ordered: bool
# TypedDict variant tagged node_type == "list_item"; carries `text`.
class NodeContentListItemVariant(TypedDict):
    node_type: Literal["list_item"]
    text: str
# TypedDict variant tagged node_type == "table"; carries a TableGrid under
# `grid`.
class NodeContentTableVariant(TypedDict):
    node_type: Literal["table"]
    grid: TableGrid
class NodeContentImageVariant(TypedDict):
|
||
|
|
node_type: Literal["image"]
|
||
|
|
description: str | None
|
||
|
|
image_index: int | None
|
||
|
|
src: str | None
|
||
|
|
|
||
|
|
class NodeContentCodeVariant(TypedDict):
|
||
|
|
node_type: Literal["code"]
|
||
|
|
text: str
|
||
|
|
language: str | None
|
||
|
|
|
||
|
|
# TypedDict variant tagged node_type == "quote"; no payload keys.
class NodeContentQuoteVariant(TypedDict):
    node_type: Literal["quote"]
# TypedDict variant tagged node_type == "formula"; carries `text`.
class NodeContentFormulaVariant(TypedDict):
    node_type: Literal["formula"]
    text: str
# TypedDict variant tagged node_type == "footnote"; carries `text`.
class NodeContentFootnoteVariant(TypedDict):
    node_type: Literal["footnote"]
    text: str
class NodeContentGroupVariant(TypedDict):
|
||
|
|
node_type: Literal["group"]
|
||
|
|
label: str | None
|
||
|
|
heading_level: int | None
|
||
|
|
heading_text: str | None
|
||
|
|
|
||
|
|
# TypedDict variant tagged node_type == "page_break"; no payload keys.
class NodeContentPageBreakVariant(TypedDict):
    node_type: Literal["page_break"]
class NodeContentSlideVariant(TypedDict):
|
||
|
|
node_type: Literal["slide"]
|
||
|
|
number: int
|
||
|
|
title: str | None
|
||
|
|
|
||
|
|
# TypedDict variant tagged node_type == "definition_list"; no payload keys.
class NodeContentDefinitionListVariant(TypedDict):
    node_type: Literal["definition_list"]
# TypedDict variant tagged node_type == "definition_item"; carries `term`
# and `definition` strings.
class NodeContentDefinitionItemVariant(TypedDict):
    node_type: Literal["definition_item"]
    term: str
    definition: str
# TypedDict variant tagged node_type == "citation"; carries `key` and `text`.
class NodeContentCitationVariant(TypedDict):
    node_type: Literal["citation"]
    key: str
    text: str
class NodeContentAdmonitionVariant(TypedDict):
|
||
|
|
node_type: Literal["admonition"]
|
||
|
|
kind: str
|
||
|
|
title: str | None
|
||
|
|
|
||
|
|
# TypedDict variant tagged node_type == "raw_block"; carries `format` and
# `content` strings.
class NodeContentRawBlockVariant(TypedDict):
    node_type: Literal["raw_block"]
    format: str
    content: str
# TypedDict variant tagged node_type == "metadata_block"; `entries` is a
# list of string lists.
class NodeContentMetadataBlockVariant(TypedDict):
    node_type: Literal["metadata_block"]
    entries: list[list[str]]
# Stub for NodeContent: exposes the `node_type` tag as a str and declares
# __str__/__repr__ explicitly (hence the PYI029 suppressions).
class NodeContent:
    node_type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# TypedDict variant tagged annotation_type == "bold"; no payload keys.
class AnnotationKindBoldVariant(TypedDict):
    annotation_type: Literal["bold"]
# TypedDict variant tagged annotation_type == "italic"; no payload keys.
class AnnotationKindItalicVariant(TypedDict):
    annotation_type: Literal["italic"]
# TypedDict variant tagged annotation_type == "underline"; no payload keys.
class AnnotationKindUnderlineVariant(TypedDict):
    annotation_type: Literal["underline"]
# TypedDict variant tagged annotation_type == "strikethrough"; no payload keys.
class AnnotationKindStrikethroughVariant(TypedDict):
    annotation_type: Literal["strikethrough"]
# TypedDict variant tagged annotation_type == "code"; no payload keys.
class AnnotationKindCodeVariant(TypedDict):
    annotation_type: Literal["code"]
# TypedDict variant tagged annotation_type == "subscript"; no payload keys.
class AnnotationKindSubscriptVariant(TypedDict):
    annotation_type: Literal["subscript"]
# TypedDict variant tagged annotation_type == "superscript"; no payload keys.
class AnnotationKindSuperscriptVariant(TypedDict):
    annotation_type: Literal["superscript"]
class AnnotationKindLinkVariant(TypedDict):
|
||
|
|
annotation_type: Literal["link"]
|
||
|
|
url: str
|
||
|
|
title: str | None
|
||
|
|
|
||
|
|
# TypedDict variant tagged annotation_type == "highlight"; no payload keys.
class AnnotationKindHighlightVariant(TypedDict):
    annotation_type: Literal["highlight"]
# TypedDict variant tagged annotation_type == "color"; carries a `value`
# string.
class AnnotationKindColorVariant(TypedDict):
    annotation_type: Literal["color"]
    value: str
# TypedDict variant tagged annotation_type == "font_size"; `value` is typed
# as a str, not a number.
class AnnotationKindFontSizeVariant(TypedDict):
    annotation_type: Literal["font_size"]
    value: str
class AnnotationKindCustomVariant(TypedDict):
|
||
|
|
annotation_type: Literal["custom"]
|
||
|
|
name: str
|
||
|
|
value: str | None
|
||
|
|
|
||
|
|
# Stub for AnnotationKind: exposes the `annotation_type` tag as a str and
# declares __str__/__repr__ explicitly (hence the PYI029 suppressions).
class AnnotationKind:
    annotation_type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# Enum-like stub: members are class attributes typed as ExtractionMethod;
# the constructor accepts an int or a str.
class ExtractionMethod:
    native: ExtractionMethod = ...
    ocr: ExtractionMethod = ...
    mixed: ExtractionMethod = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub with thirteen members, each a class attribute typed as
# ChunkType; the constructor accepts an int or a str.
class ChunkType:
    heading: ChunkType = ...
    party_list: ChunkType = ...
    definitions: ChunkType = ...
    operative_clause: ChunkType = ...
    signature_block: ChunkType = ...
    schedule: ChunkType = ...
    table_like: ChunkType = ...
    formula: ChunkType = ...
    code_block: ChunkType = ...
    image: ChunkType = ...
    org_chart: ChunkType = ...
    diagram: ChunkType = ...
    unknown: ChunkType = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub with twelve members, each a class attribute typed as
# ImageKind; the constructor accepts an int or a str.
class ImageKind:
    photograph: ImageKind = ...
    diagram: ImageKind = ...
    chart: ImageKind = ...
    drawing: ImageKind = ...
    text_block: ImageKind = ...
    decoration: ImageKind = ...
    logo: ImageKind = ...
    icon: ImageKind = ...
    tile_fragment: ImageKind = ...
    mask: ImageKind = ...
    page_raster: ImageKind = ...
    unknown: ImageKind = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as ResultFormat; the
# constructor accepts an int or a str.
class ResultFormat:
    unified: ResultFormat = ...
    element_based: ResultFormat = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as ElementType; the
# constructor accepts an int or a str.
# NOTE(review): only `title_` carries a trailing underscore; the reason is
# not visible here — confirm against the generator before renaming.
class ElementType:
    title_: ElementType = ...
    narrative_text: ElementType = ...
    heading: ElementType = ...
    list_item: ElementType = ...
    table: ElementType = ...
    image: ElementType = ...
    page_break: ElementType = ...
    code_block: ElementType = ...
    block_quote: ElementType = ...
    footer: ElementType = ...
    header: ElementType = ...
    def __init__(self, value: int | str) -> None: ...
# TypedDict variant tagged format_type == "pdf"; payload `_0` is a PdfMetadata.
class FormatMetadataPdfVariant(TypedDict):
    format_type: Literal["pdf"]
    _0: PdfMetadata
# TypedDict variant tagged format_type == "docx"; payload `_0` is a DocxMetadata.
class FormatMetadataDocxVariant(TypedDict):
    format_type: Literal["docx"]
    _0: DocxMetadata
# TypedDict variant tagged format_type == "excel"; payload `_0` is an ExcelMetadata.
class FormatMetadataExcelVariant(TypedDict):
    format_type: Literal["excel"]
    _0: ExcelMetadata
# TypedDict variant tagged format_type == "email"; payload `_0` is an EmailMetadata.
class FormatMetadataEmailVariant(TypedDict):
    format_type: Literal["email"]
    _0: EmailMetadata
# TypedDict variant tagged format_type == "pptx"; payload `_0` is a PptxMetadata.
class FormatMetadataPptxVariant(TypedDict):
    format_type: Literal["pptx"]
    _0: PptxMetadata
# TypedDict variant tagged format_type == "archive"; payload `_0` is an ArchiveMetadata.
class FormatMetadataArchiveVariant(TypedDict):
    format_type: Literal["archive"]
    _0: ArchiveMetadata
# TypedDict variant tagged format_type == "image"; payload `_0` is an ImageMetadata.
class FormatMetadataImageVariant(TypedDict):
    format_type: Literal["image"]
    _0: ImageMetadata
# TypedDict variant tagged format_type == "xml"; payload `_0` is an XmlMetadata.
class FormatMetadataXmlVariant(TypedDict):
    format_type: Literal["xml"]
    _0: XmlMetadata
# TypedDict variant tagged format_type == "text"; payload `_0` is a TextMetadata.
class FormatMetadataTextVariant(TypedDict):
    format_type: Literal["text"]
    _0: TextMetadata
# TypedDict variant tagged format_type == "html"; payload `_0` is an HtmlMetadata.
class FormatMetadataHtmlVariant(TypedDict):
    format_type: Literal["html"]
    _0: HtmlMetadata
# TypedDict variant tagged format_type == "ocr"; payload `_0` is an OcrMetadata.
class FormatMetadataOcrVariant(TypedDict):
    format_type: Literal["ocr"]
    _0: OcrMetadata
# TypedDict variant tagged format_type == "csv"; payload `_0` is a CsvMetadata.
class FormatMetadataCsvVariant(TypedDict):
    format_type: Literal["csv"]
    _0: CsvMetadata
# TypedDict variant tagged format_type == "bibtex"; payload `_0` is a BibtexMetadata.
class FormatMetadataBibtexVariant(TypedDict):
    format_type: Literal["bibtex"]
    _0: BibtexMetadata
# TypedDict variant tagged format_type == "citation"; payload `_0` is a CitationMetadata.
class FormatMetadataCitationVariant(TypedDict):
    format_type: Literal["citation"]
    _0: CitationMetadata
# TypedDict variant tagged format_type == "fiction_book"; payload `_0` is a
# FictionBookMetadata.
class FormatMetadataFictionBookVariant(TypedDict):
    format_type: Literal["fiction_book"]
    _0: FictionBookMetadata
# TypedDict variant tagged format_type == "dbf"; payload `_0` is a DbfMetadata.
class FormatMetadataDbfVariant(TypedDict):
    format_type: Literal["dbf"]
    _0: DbfMetadata
# TypedDict variant tagged format_type == "jats"; payload `_0` is a JatsMetadata.
class FormatMetadataJatsVariant(TypedDict):
    format_type: Literal["jats"]
    _0: JatsMetadata
# TypedDict variant tagged format_type == "epub"; payload `_0` is an EpubMetadata.
class FormatMetadataEpubVariant(TypedDict):
    format_type: Literal["epub"]
    _0: EpubMetadata
# TypedDict variant tagged format_type == "pst"; payload `_0` is a PstMetadata.
class FormatMetadataPstVariant(TypedDict):
    format_type: Literal["pst"]
    _0: PstMetadata
# TypedDict variant tagged format_type == "code"; unlike the sibling
# variants, the payload `_0` is a plain str.
class FormatMetadataCodeVariant(TypedDict):
    format_type: Literal["code"]
    _0: str
# Stub for FormatMetadata: exposes the `format_type` tag as a str and
# declares __str__/__repr__ explicitly (hence the PYI029 suppressions).
class FormatMetadata:
    format_type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# Enum-like stub: members are class attributes typed as TextDirection; the
# constructor accepts an int or a str.
class TextDirection:
    left_to_right: TextDirection = ...
    right_to_left: TextDirection = ...
    auto: TextDirection = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as LinkType; the
# constructor accepts an int or a str.
class LinkType:
    anchor: LinkType = ...
    internal: LinkType = ...
    external: LinkType = ...
    email: LinkType = ...
    phone: LinkType = ...
    other: LinkType = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as ImageType; the
# constructor accepts an int or a str.
class ImageType:
    data_uri: ImageType = ...
    inline_svg: ImageType = ...
    external: ImageType = ...
    relative: ImageType = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as StructuredDataType;
# the constructor accepts an int or a str.
class StructuredDataType:
    json_ld: StructuredDataType = ...
    microdata: StructuredDataType = ...
    rd_fa: StructuredDataType = ...
    def __init__(self, value: int | str) -> None: ...
# TypedDict variant tagged type == "rectangle"; carries integer `left`,
# `top`, `width` and `height`.
class OcrBoundingGeometryRectangleVariant(TypedDict):
    type: Literal["rectangle"]
    left: int
    top: int
    width: int
    height: int
# TypedDict variant tagged type == "quadrilateral".
# NOTE(review): `points` is typed as a plain str; its encoding is not visible
# here — confirm against the producer before parsing it.
class OcrBoundingGeometryQuadrilateralVariant(TypedDict):
    type: Literal["quadrilateral"]
    points: str
# Stub for OcrBoundingGeometry: exposes the `type` tag as a str and declares
# __str__/__repr__ explicitly (hence the PYI029 suppressions).
class OcrBoundingGeometry:
    type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# Enum-like stub: members are class attributes typed as OcrElementLevel; the
# constructor accepts an int or a str.
class OcrElementLevel:
    word: OcrElementLevel = ...
    line: OcrElementLevel = ...
    block: OcrElementLevel = ...
    page: OcrElementLevel = ...
    def __init__(self, value: int | str) -> None: ...
# Enum-like stub: members are class attributes typed as PageUnitType; the
# constructor accepts an int or a str.
class PageUnitType:
    page: PageUnitType = ...
    slide: PageUnitType = ...
    sheet: PageUnitType = ...
    def __init__(self, value: int | str) -> None: ...
# TypedDict variant tagged kind == "context"; payload `_0` is a str.
class DiffLineContextVariant(TypedDict):
    kind: Literal["context"]
    _0: str
# TypedDict variant tagged kind == "added"; payload `_0` is a str.
class DiffLineAddedVariant(TypedDict):
    kind: Literal["added"]
    _0: str
# TypedDict variant tagged kind == "removed"; payload `_0` is a str.
class DiffLineRemovedVariant(TypedDict):
    kind: Literal["removed"]
    _0: str
# Stub for DiffLine: exposes the `kind` tag as a str and declares
# __str__/__repr__ explicitly (hence the PYI029 suppressions).
class DiffLine:
    kind: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
# Type stub for `RevisionKind`: four class-level members (`insertion`,
# `deletion`, `format_change`, `comment`), each annotated as a `RevisionKind`,
# plus a constructor that accepts an `int` or a `str`.
class RevisionKind:
    insertion: RevisionKind = ...
    deletion: RevisionKind = ...
    format_change: RevisionKind = ...
    comment: RevisionKind = ...
    def __init__(self, value: int | str) -> None: ...
|
||
|
|
|
||
|
|
# TypedDict shape for the "paragraph" anchor variant: `type` is pinned to the
# literal "paragraph"; `index` is an integer.
# NOTE(review): whether `index` is 0- or 1-based is not stated here — confirm.
class RevisionAnchorParagraphVariant(TypedDict):
    type: Literal["paragraph"]
    index: int
|
||
|
|
|
||
|
|
# TypedDict shape for the "table_cell" anchor variant: `type` is pinned to the
# literal "table_cell"; the cell is addressed by three integer keys
# (`row`, `col`, `table_index`).
# NOTE(review): index base (0 vs 1) is not stated in this stub — confirm.
class RevisionAnchorTableCellVariant(TypedDict):
    type: Literal["table_cell"]
    row: int
    col: int
    table_index: int
|
||
|
|
|
||
|
|
# TypedDict shape for the "page" anchor variant: `type` is pinned to the
# literal "page"; `index` is an integer.
# NOTE(review): whether `index` is 0- or 1-based is not stated here — confirm.
class RevisionAnchorPageVariant(TypedDict):
    type: Literal["page"]
    index: int
|
||
|
|
|
||
|
|
# TypedDict shape for the "slide" anchor variant: `type` is pinned to the
# literal "slide"; `index` is an integer.
# NOTE(review): whether `index` is 0- or 1-based is not stated here — confirm.
class RevisionAnchorSlideVariant(TypedDict):
    type: Literal["slide"]
    index: int
|
||
|
|
|
||
|
|
# TypedDict shape for the "sheet" anchor variant: `type` is pinned to the
# literal "sheet"; `index` is an integer and `name` is a string or `None`.
# `name` is a required key whose value may be `None` (it is not declared
# NotRequired), so consumers must supply it explicitly.
class RevisionAnchorSheetVariant(TypedDict):
    type: Literal["sheet"]
    index: int
    name: str | None
|
||
|
|
|
||
|
|
# Type stub for `RevisionAnchor`: exposes a string `type` attribute and
# explicitly declares `__str__` / `__repr__`, both returning `str`.
# NOTE(review): presumably `type` matches the tags of the
# `RevisionAnchor*Variant` TypedDicts ("paragraph", "table_cell", "page",
# "slide", "sheet") — the stub only says `str`; confirm.
class RevisionAnchor:
    type: str
    def __str__(self) -> str: ...  # noqa: PYI029
    def __repr__(self) -> str: ...  # noqa: PYI029
|
||
|
|
|
||
|
|
# Type stub for `UriKind`: six class-level members (`hyperlink`, `image`,
# `anchor`, `citation`, `reference`, `email`), each annotated as a `UriKind`,
# plus a constructor that accepts an `int` or a `str`.
class UriKind:
    hyperlink: UriKind = ...
    image: UriKind = ...
    anchor: UriKind = ...
    citation: UriKind = ...
    reference: UriKind = ...
    email: UriKind = ...
    def __init__(self, value: int | str) -> None: ...
|
||
|
|
|
||
|
|
# Type stub for `KeywordAlgorithm`: two class-level members (`yake`, `rake`),
# each annotated as a `KeywordAlgorithm`, plus a constructor that accepts an
# `int` or a `str`.
class KeywordAlgorithm:
    yake: KeywordAlgorithm = ...
    rake: KeywordAlgorithm = ...
    def __init__(self, value: int | str) -> None: ...
|
||
|
|
|
||
|
|
# Type stub for `PSMMode`: eleven class-level members, each annotated as a
# `PSMMode`, plus a constructor that accepts an `int` or a `str`.
# NOTE(review): the member names resemble Tesseract's page-segmentation modes;
# the numeric value behind each member is not visible in this stub — confirm
# before relying on any int <-> member mapping.
class PSMMode:
    osd_only: PSMMode = ...
    auto_osd: PSMMode = ...
    auto_only: PSMMode = ...
    auto: PSMMode = ...
    single_column: PSMMode = ...
    single_block_vertical: PSMMode = ...
    single_block: PSMMode = ...
    single_line: PSMMode = ...
    single_word: PSMMode = ...
    circle_word: PSMMode = ...
    single_char: PSMMode = ...
    def __init__(self, value: int | str) -> None: ...
|
||
|
|
|
||
|
|
# Type stub for `PaddleLanguage`: sixteen class-level members, each annotated
# as a `PaddleLanguage`, plus a constructor that accepts an `int` or a `str`.
# NOTE(review): presumably the language/script families selectable for the
# PaddleOCR backend; the accepted string spellings are not visible here.
class PaddleLanguage:
    english: PaddleLanguage = ...
    chinese: PaddleLanguage = ...
    japanese: PaddleLanguage = ...
    korean: PaddleLanguage = ...
    german: PaddleLanguage = ...
    french: PaddleLanguage = ...
    latin: PaddleLanguage = ...
    cyrillic: PaddleLanguage = ...
    traditional_chinese: PaddleLanguage = ...
    thai: PaddleLanguage = ...
    greek: PaddleLanguage = ...
    east_slavic: PaddleLanguage = ...
    arabic: PaddleLanguage = ...
    devanagari: PaddleLanguage = ...
    tamil: PaddleLanguage = ...
    telugu: PaddleLanguage = ...
    def __init__(self, value: int | str) -> None: ...
|
||
|
|
|
||
|
|
# Type stub for `LayoutClass`: seventeen class-level members, each annotated as
# a `LayoutClass`, plus a constructor that accepts an `int` or a `str`.
# NOTE(review): `title_` is the only member with a trailing underscore —
# presumably the generator's way of avoiding a name clash; confirm, and keep
# the spelling because callers may depend on it.
class LayoutClass:
    caption: LayoutClass = ...
    footnote: LayoutClass = ...
    formula: LayoutClass = ...
    list_item: LayoutClass = ...
    page_footer: LayoutClass = ...
    page_header: LayoutClass = ...
    picture: LayoutClass = ...
    section_header: LayoutClass = ...
    table: LayoutClass = ...
    text: LayoutClass = ...
    title_: LayoutClass = ...
    document_index: LayoutClass = ...
    code: LayoutClass = ...
    checkbox_selected: LayoutClass = ...
    checkbox_unselected: LayoutClass = ...
    form: LayoutClass = ...
    key_value_region: LayoutClass = ...
    def __init__(self, value: int | str) -> None: ...
|
||
|
|
|
||
|
|
# Async stub: takes raw `bytes`, a MIME type string and an `ExtractionConfig`
# (all three required) and resolves to an `ExtractionResult`.
async def extract_bytes(
    content: bytes,
    mime_type: str,
    config: ExtractionConfig,
) -> ExtractionResult: ...
|
||
|
|
|
||
|
|
# Async stub: takes a path string and an `ExtractionConfig`, with an optional
# MIME type string (defaults to `None`), and resolves to an `ExtractionResult`.
# Parameter order differs from `extract_bytes`: `config` comes before the
# optional `mime_type` here.
# NOTE(review): presumably `mime_type=None` triggers auto-detection — confirm.
async def extract_file(
    path: str,
    config: ExtractionConfig,
    mime_type: str | None = None,
) -> ExtractionResult: ...
|
||
|
|
|
||
|
|
# Synchronous stub with the same parameters as `extract_file`: a path string,
# an `ExtractionConfig`, and an optional MIME type string (defaults to `None`);
# returns an `ExtractionResult`.
# NOTE(review): presumably `mime_type=None` triggers auto-detection — confirm.
def extract_file_sync(
    path: str,
    config: ExtractionConfig,
    mime_type: str | None = None,
) -> ExtractionResult: ...
|
||
|
|
|
||
|
|
# Synchronous stub with the same parameters as `extract_bytes`: raw `bytes`,
# a MIME type string and an `ExtractionConfig` (all required); returns an
# `ExtractionResult`.
def extract_bytes_sync(
    content: bytes,
    mime_type: str,
    config: ExtractionConfig,
) -> ExtractionResult: ...
|
||
|
|
|
||
|
|
# Synchronous stub: takes a list of `BatchFileItem` and one `ExtractionConfig`
# and returns a list of `ExtractionResult`.
# NOTE(review): presumably one result per input item, in input order — the
# stub does not state this; confirm against the implementation.
def batch_extract_files_sync(
    items: list[BatchFileItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...
|
||
|
|
|
||
|
|
# Synchronous stub: takes a list of `BatchBytesItem` and one `ExtractionConfig`
# and returns a list of `ExtractionResult`.
# NOTE(review): presumably one result per input item, in input order — the
# stub does not state this; confirm against the implementation.
def batch_extract_bytes_sync(
    items: list[BatchBytesItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...
|
||
|
|
|
||
|
|
# Async stub: takes a list of `BatchFileItem` and one `ExtractionConfig` and
# resolves to a list of `ExtractionResult`.
# NOTE(review): presumably one result per input item, in input order — the
# stub does not state this; confirm against the implementation.
async def batch_extract_files(
    items: list[BatchFileItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...
|
||
|
|
|
||
|
|
# Async stub: takes a list of `BatchBytesItem` and one `ExtractionConfig` and
# resolves to a list of `ExtractionResult`.
# NOTE(review): presumably one result per input item, in input order — the
# stub does not state this; confirm against the implementation.
async def batch_extract_bytes(
    items: list[BatchBytesItem],
    config: ExtractionConfig,
) -> list[ExtractionResult]: ...
|
||
|
|
|
||
|
|
# Stub: takes raw `bytes` and returns a `str`.
# NOTE(review): presumably the detected MIME type; behavior for unrecognized
# content is not visible in this stub — confirm.
def detect_mime_type_from_bytes(content: bytes) -> str: ...
|
||
|
|
# Stub: takes a MIME type string and returns a list of strings.
# NOTE(review): presumably file extensions; whether they include a leading dot
# is not visible in this stub — confirm.
def get_extensions_for_mime(mime_type: str) -> list[str]: ...
|
||
|
|
# Stub: takes no arguments and returns `None`.
# NOTE(review): this exact signature is declared a second time further down in
# this stub (right after `unregister_embedding_backend`). The redeclaration is
# redundant and may be flagged by type checkers — fix in the generator.
def clear_embedding_backends() -> None: ...
|
||
|
|
# Stub: takes no arguments and returns a list of strings.
# NOTE(review): presumably the names of registered embedding backends — confirm.
def list_embedding_backends() -> list[str]: ...
|
||
|
|
# Stub: takes no arguments and returns a list of strings.
# NOTE(review): presumably the names of registered document extractors — confirm.
def list_document_extractors() -> list[str]: ...
|
||
|
|
# Stub: takes no arguments and returns `None`.
# NOTE(review): this exact signature is declared a second time further down in
# this stub (right after `unregister_document_extractor`). The redeclaration is
# redundant and may be flagged by type checkers — fix in the generator.
def clear_document_extractors() -> None: ...
|
||
|
|
# Stub: takes no arguments and returns a list of strings.
# NOTE(review): presumably the names of registered OCR backends — confirm.
def list_ocr_backends() -> list[str]: ...
|
||
|
|
# Stub: takes no arguments and returns `None`.
# NOTE(review): this exact signature is declared a second time further down in
# this stub (right after `unregister_ocr_backend`). The redeclaration is
# redundant and may be flagged by type checkers — fix in the generator.
def clear_ocr_backends() -> None: ...
|
||
|
|
# Stub: takes no arguments and returns a list of strings.
# NOTE(review): presumably the names of registered post-processors — confirm.
def list_post_processors() -> list[str]: ...
|
||
|
|
# Stub: takes no arguments and returns `None`.
# NOTE(review): this exact signature is declared a second time further down in
# this stub (right after `unregister_post_processor`). The redeclaration is
# redundant and may be flagged by type checkers — fix in the generator.
def clear_post_processors() -> None: ...
|
||
|
|
# Stub: takes no arguments and returns a list of strings.
# NOTE(review): presumably the names of registered renderers — confirm.
def list_renderers() -> list[str]: ...
|
||
|
|
# Stub: takes no arguments and returns `None`.
# NOTE(review): this exact signature is declared a second time further down in
# this stub (right after `unregister_renderer`). The redeclaration is redundant
# and may be flagged by type checkers — fix in the generator.
def clear_renderers() -> None: ...
|
||
|
|
# Stub: takes no arguments and returns a list of strings.
# NOTE(review): presumably the names of registered validators — confirm.
def list_validators() -> list[str]: ...
|
||
|
|
# Stub: takes no arguments and returns `None`.
# NOTE(review): this exact signature is declared a second time further down in
# this stub (right after `unregister_validator`). The redeclaration is
# redundant and may be flagged by type checkers — fix in the generator.
def clear_validators() -> None: ...
|
||
|
|
# Stub: takes two `ExtractionResult` values and a required `DiffOptions`, and
# returns an `ExtractionDiff`.
# NOTE(review): which of `a` / `b` is treated as the "old" side is not visible
# in this stub — confirm before documenting added/removed direction.
def compare(a: ExtractionResult, b: ExtractionResult, opts: DiffOptions) -> ExtractionDiff: ...
|
||
|
|
# Async stub: takes a list of strings and an `EmbeddingConfig` and resolves to
# a list of float lists.
# NOTE(review): presumably one vector per input text, in input order — confirm.
async def embed_texts_async(texts: list[str], config: EmbeddingConfig) -> list[list[float]]: ...
|
||
|
|
# Stub: takes PDF content as `bytes` and an integer page index, with optional
# `dpi` (int) and `password` (str) that both default to `None`; returns `bytes`.
# NOTE(review): presumably the returned bytes are PNG-encoded and `page_index`
# is 0-based; the dpi used when `dpi` is `None` is not visible here — confirm.
def render_pdf_page_to_png(
    pdf_bytes: bytes,
    page_index: int,
    dpi: int | None = None,
    password: str | None = None,
) -> bytes: ...
|
||
|
|
|
||
|
|
# Stub: takes a path string and a required `check_exists` flag and returns a
# `str`.
# NOTE(review): presumably `check_exists=True` verifies the file is present
# before detection; the failure mode is not visible in this stub — confirm.
def detect_mime_type(path: str, check_exists: bool) -> str: ...
|
||
|
|
# Synchronous stub with the same parameters as `embed_texts_async`: a list of
# strings and an `EmbeddingConfig`; returns a list of float lists.
# NOTE(review): presumably one vector per input text, in input order — confirm.
def embed_texts(texts: list[str], config: EmbeddingConfig) -> list[list[float]]: ...
|
||
|
|
# Stub: takes a preset name and returns an `EmbeddingPreset` or `None`.
# NOTE(review): presumably `None` means no preset has that name — confirm.
def get_embedding_preset(name: str) -> EmbeddingPreset | None: ...
|
||
|
|
# Stub: takes no arguments and returns a list of strings.
# NOTE(review): presumably the names accepted by `get_embedding_preset` — confirm.
def list_embedding_presets() -> list[str]: ...
|
||
|
|
# Stub: takes any `object` as `backend` and returns `None`.
# NOTE(review): the interface the object must implement is not expressed in
# this stub (the parameter is typed `object`) — see the backend documentation.
def register_ocr_backend(backend: object) -> None: ...
|
||
|
|
# Stub: takes a name string and returns `None`.
# NOTE(review): behavior when no OCR backend has that name is not visible in
# this stub — confirm.
def unregister_ocr_backend(name: str) -> None: ...
|
||
|
|
# Stub: takes no arguments and returns `None`.
# NOTE(review): duplicate — `clear_ocr_backends` is already declared with this
# exact signature earlier in this stub (right after `list_ocr_backends`). One
# of the two should be dropped by the generator.
def clear_ocr_backends() -> None: ...
|
||
|
|
# Stub: takes any `object` as `backend` and returns `None`.
# NOTE(review): the interface the object must implement is not expressed in
# this stub (the parameter is typed `object`) — see the plugin documentation.
def register_post_processor(backend: object) -> None: ...
|
||
|
|
# Stub: takes a name string and returns `None`.
# NOTE(review): behavior when no post-processor has that name is not visible
# in this stub — confirm.
def unregister_post_processor(name: str) -> None: ...
|
||
|
|
# Stub: takes no arguments and returns `None`.
# NOTE(review): duplicate — `clear_post_processors` is already declared with
# this exact signature earlier in this stub (right after
# `list_post_processors`). One of the two should be dropped by the generator.
def clear_post_processors() -> None: ...
|
||
|
|
# Stub: takes any `object` as `backend` and returns `None`.
# NOTE(review): the interface the object must implement is not expressed in
# this stub (the parameter is typed `object`) — see the plugin documentation.
def register_validator(backend: object) -> None: ...
|
||
|
|
# Stub: takes a name string and returns `None`.
# NOTE(review): behavior when no validator has that name is not visible in
# this stub — confirm.
def unregister_validator(name: str) -> None: ...
|
||
|
|
# Stub: takes no arguments and returns `None`.
# NOTE(review): duplicate — `clear_validators` is already declared with this
# exact signature earlier in this stub (right after `list_validators`). One of
# the two should be dropped by the generator.
def clear_validators() -> None: ...
|
||
|
|
# Stub: takes any `object` as `backend` and returns `None`.
# NOTE(review): the interface the object must implement is not expressed in
# this stub (the parameter is typed `object`) — see the backend documentation.
def register_embedding_backend(backend: object) -> None: ...
|
||
|
|
# Stub: takes a name string and returns `None`.
# NOTE(review): behavior when no embedding backend has that name is not
# visible in this stub — confirm.
def unregister_embedding_backend(name: str) -> None: ...
|
||
|
|
# Stub: takes no arguments and returns `None`.
# NOTE(review): duplicate — `clear_embedding_backends` is already declared with
# this exact signature earlier in this stub (right before
# `list_embedding_backends`). One of the two should be dropped by the generator.
def clear_embedding_backends() -> None: ...
|
||
|
|
# Stub: takes any `object` as `backend` and returns `None`.
# NOTE(review): the interface the object must implement is not expressed in
# this stub (the parameter is typed `object`) — see the plugin documentation.
def register_document_extractor(backend: object) -> None: ...
|
||
|
|
# Stub: takes a name string and returns `None`.
# NOTE(review): behavior when no document extractor has that name is not
# visible in this stub — confirm.
def unregister_document_extractor(name: str) -> None: ...
|
||
|
|
# Stub: takes no arguments and returns `None`.
# NOTE(review): duplicate — `clear_document_extractors` is already declared
# with this exact signature earlier in this stub (right after
# `list_document_extractors`). One of the two should be dropped by the generator.
def clear_document_extractors() -> None: ...
|
||
|
|
# Stub: takes any `object` as `backend` and returns `None`.
# NOTE(review): the interface the object must implement is not expressed in
# this stub (the parameter is typed `object`) — see the plugin documentation.
def register_renderer(backend: object) -> None: ...
|
||
|
|
# Stub: takes a name string and returns `None`.
# NOTE(review): behavior when no renderer has that name is not visible in this
# stub — confirm.
def unregister_renderer(name: str) -> None: ...
|
||
|
|
# Stub: takes no arguments and returns `None`.
# NOTE(review): duplicate — `clear_renderers` is already declared with this
# exact signature earlier in this stub (right after `list_renderers`). One of
# the two should be dropped by the generator.
def clear_renderers() -> None: ...
|