Files
fil/packages/python/kreuzberg/api.py

1172 lines
47 KiB
Python
Raw Normal View History

2026-06-01 23:40:55 +02:00
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""Public API for conversion."""
import json
from typing import Any, TypeVar
import kreuzberg._kreuzberg as _rust
from ._kreuzberg import BatchBytesItem, BatchFileItem, EmbeddingPreset, ExtractionConfig, ExtractionDiff, ExtractionResult
from .options import AccelerationConfig, BoundingBox, ChunkingConfig, ContentFilterConfig, DiffOptions, DocumentStructure, EmailConfig, EmbeddingConfig, HierarchyConfig, HtmlOutputConfig, ImageExtractionConfig, ImagePreprocessingConfig, KeywordConfig, LanguageDetectionConfig, LayoutDetectionConfig, LlmConfig, LlmUsage, Metadata, OcrConfidence, OcrConfig, OcrElement, OcrElementConfig, OcrQualityThresholds, PageConfig, PdfConfig, PostProcessorConfig, RakeParams, SecurityLimits, Table, TesseractConfig, TokenReductionOptions, TreeSitterConfig, TreeSitterProcessConfig, YakeParams
_E = TypeVar("_E")
def _pascal_to_snake(value: str) -> str:
"""Convert PascalCase/camelCase to snake_case (AtxClosed -> atx_closed)."""
out_chars: list[str] = []
for index, ch in enumerate(value):
if ch.isupper() and index > 0 and (value[index - 1].islower() or (index + 1 < len(value) and value[index + 1].islower())):
out_chars.append("_")
out_chars.append(ch.lower())
return "".join(out_chars)
def _coerce_enum(enum_cls: type[_E], value: object) -> _E:
"""Coerce a string/alias value into the matching pyclass enum instance."""
if isinstance(value, enum_cls):
return value
if value is None:
msg = f"unknown {getattr(enum_cls, '__name__', enum_cls)!s} value: {value!r}"
raise ValueError(msg)
s = str(value).replace("-", "_").replace(" ", "_")
snake = _pascal_to_snake(s)
candidates = (
s,
s.upper(),
s.lower(),
snake,
snake.upper(),
"".join(part.capitalize() for part in s.split("_")),
"".join(part.capitalize() for part in snake.split("_")),
)
for candidate in candidates:
attr = getattr(enum_cls, candidate, None)
if isinstance(attr, enum_cls):
return attr
msg = f"unknown {getattr(enum_cls, '__name__', enum_cls)!s} value: {value!r}"
raise ValueError(msg)
def _to_rust_image_preprocessing_config(
value: ImagePreprocessingConfig | dict[str, Any] | None,
) -> _rust.ImagePreprocessingConfig | None:
"""Convert Python ImagePreprocessingConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = ImagePreprocessingConfig(**value)
if value is None:
return None
return _rust.ImagePreprocessingConfig(
target_dpi=value.target_dpi,
auto_rotate=value.auto_rotate,
deskew=value.deskew,
denoise=value.denoise,
contrast_enhance=value.contrast_enhance,
binarization_method=value.binarization_method,
invert_colors=value.invert_colors,
)
def _to_rust_tesseract_config(value: TesseractConfig | dict[str, Any] | None) -> _rust.TesseractConfig | None:
"""Convert Python TesseractConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "preprocessing" in value and value["preprocessing"] is not None:
value["preprocessing"] = _to_rust_image_preprocessing_config(value["preprocessing"])
value = TesseractConfig(**value)
if value is None:
return None
return _rust.TesseractConfig(
language=value.language,
psm=value.psm,
output_format=value.output_format,
oem=value.oem,
min_confidence=value.min_confidence,
preprocessing=_to_rust_image_preprocessing_config(value.preprocessing),
enable_table_detection=value.enable_table_detection,
table_min_confidence=value.table_min_confidence,
table_column_threshold=value.table_column_threshold,
table_row_threshold_ratio=value.table_row_threshold_ratio,
use_cache=value.use_cache,
classify_use_pre_adapted_templates=value.classify_use_pre_adapted_templates,
language_model_ngram_on=value.language_model_ngram_on,
tessedit_dont_blkrej_good_wds=value.tessedit_dont_blkrej_good_wds,
tessedit_dont_rowrej_good_wds=value.tessedit_dont_rowrej_good_wds,
tessedit_enable_dict_correction=value.tessedit_enable_dict_correction,
tessedit_char_whitelist=value.tessedit_char_whitelist,
tessedit_char_blacklist=value.tessedit_char_blacklist,
tessedit_use_primary_params_model=value.tessedit_use_primary_params_model,
textord_space_size_is_variable=value.textord_space_size_is_variable,
thresholding_method=value.thresholding_method,
)
def _to_rust_ocr_element_config(value: OcrElementConfig | dict[str, Any] | None) -> _rust.OcrElementConfig | None:
"""Convert Python OcrElementConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "min_level" in value and value["min_level"] is not None:
value["min_level"] = _coerce_enum(_rust.OcrElementLevel, value["min_level"])
value = OcrElementConfig(**value)
if value is None:
return None
return _rust.OcrElementConfig(
include_elements=value.include_elements,
**({"min_level": _coerce_enum(_rust.OcrElementLevel, value.min_level)} if value.min_level is not None else {}),
min_confidence=value.min_confidence,
build_hierarchy=value.build_hierarchy,
)
def _to_rust_ocr_quality_thresholds(
value: OcrQualityThresholds | dict[str, Any] | None,
) -> _rust.OcrQualityThresholds | None:
"""Convert Python OcrQualityThresholds to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = OcrQualityThresholds(**value)
if value is None:
return None
return _rust.OcrQualityThresholds(
min_total_non_whitespace=value.min_total_non_whitespace,
min_non_whitespace_per_page=value.min_non_whitespace_per_page,
min_meaningful_word_len=value.min_meaningful_word_len,
min_meaningful_words=value.min_meaningful_words,
min_alnum_ratio=value.min_alnum_ratio,
min_garbage_chars=value.min_garbage_chars,
max_fragmented_word_ratio=value.max_fragmented_word_ratio,
critical_fragmented_word_ratio=value.critical_fragmented_word_ratio,
min_avg_word_length=value.min_avg_word_length,
min_words_for_avg_length_check=value.min_words_for_avg_length_check,
min_consecutive_repeat_ratio=value.min_consecutive_repeat_ratio,
min_words_for_repeat_check=value.min_words_for_repeat_check,
substantive_min_chars=value.substantive_min_chars,
non_text_min_chars=value.non_text_min_chars,
alnum_ws_ratio_threshold=value.alnum_ws_ratio_threshold,
pipeline_min_quality=value.pipeline_min_quality,
)
def _to_rust_llm_config(value: LlmConfig | dict[str, Any] | None) -> _rust.LlmConfig | None:
"""Convert Python LlmConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = LlmConfig(**value)
if value is None:
return None
return _rust.LlmConfig(
model=value.model,
api_key=value.api_key,
base_url=value.base_url,
timeout_secs=value.timeout_secs,
max_retries=value.max_retries,
temperature=value.temperature,
max_tokens=value.max_tokens,
)
def _to_rust_acceleration_config(
value: AccelerationConfig | dict[str, Any] | None,
) -> _rust.AccelerationConfig | None:
"""Convert Python AccelerationConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "provider" in value and value["provider"] is not None:
value["provider"] = _coerce_enum(_rust.ExecutionProviderType, value["provider"])
value = AccelerationConfig(**value)
if value is None:
return None
return _rust.AccelerationConfig(
**({"provider": _coerce_enum(_rust.ExecutionProviderType, value.provider)} if value.provider is not None else {}),
device_id=value.device_id,
)
def _coerce_dict_ocr_config(value: dict[str, Any]) -> OcrConfig:
    """Turn a plain dict into an ``OcrConfig``, rewriting nested entries in place.

    Nested struct entries that are present and not ``None`` are replaced by the
    result of their ``_to_rust_*`` converter, and a non-``None``
    ``output_format`` that is not already a ``_rust.OutputFormat`` is wrapped
    in one. ``value`` itself is modified before being expanded into
    ``OcrConfig(**value)``.
    """
    nested_converters = (
        ("tesseract_config", _to_rust_tesseract_config),
        ("element_config", _to_rust_ocr_element_config),
        ("quality_thresholds", _to_rust_ocr_quality_thresholds),
        ("vlm_config", _to_rust_llm_config),
        ("acceleration", _to_rust_acceleration_config),
    )
    for key, convert in nested_converters:
        current = value.get(key)
        if current is not None:
            value[key] = convert(current)

    output_format_cls = _rust.OutputFormat
    output_format = value.get("output_format")
    if output_format is not None and not isinstance(output_format, output_format_cls):
        value["output_format"] = output_format_cls(output_format)
    return OcrConfig(**value)
def _to_rust_ocr_config(value: OcrConfig | dict[str, Any] | None) -> _rust.OcrConfig | None:
"""Convert Python OcrConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = _coerce_dict_ocr_config(value)
if value is None:
return None
return _rust.OcrConfig(
enabled=value.enabled,
backend=value.backend,
language=value.language,
tesseract_config=_to_rust_tesseract_config(value.tesseract_config),
output_format=(value.output_format if isinstance(value.output_format, _rust.OutputFormat) else _rust.OutputFormat(value.output_format)) if value.output_format is not None else None,
paddle_ocr_config=value.paddle_ocr_config,
backend_options=value.backend_options,
element_config=_to_rust_ocr_element_config(value.element_config),
quality_thresholds=_to_rust_ocr_quality_thresholds(value.quality_thresholds),
pipeline=value.pipeline,
auto_rotate=value.auto_rotate,
vlm_config=_to_rust_llm_config(value.vlm_config),
vlm_prompt=value.vlm_prompt,
acceleration=_to_rust_acceleration_config(value.acceleration),
tessdata_bytes=value.tessdata_bytes,
)
def _to_rust_embedding_config(value: EmbeddingConfig | dict[str, Any] | None) -> _rust.EmbeddingConfig | None:
"""Convert Python EmbeddingConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "acceleration" in value and value["acceleration"] is not None:
value["acceleration"] = _to_rust_acceleration_config(value["acceleration"])
if "model" in value and value["model"] is not None and not isinstance(value["model"], _rust.EmbeddingModelType):
value["model"] = _rust.EmbeddingModelType(value["model"])
value = EmbeddingConfig(**value)
if value is None:
return None
return _rust.EmbeddingConfig(
**({"model": value.model if isinstance(value.model, _rust.EmbeddingModelType) else _rust.EmbeddingModelType(value.model)} if value.model is not None else {}),
normalize=value.normalize,
batch_size=value.batch_size,
show_download_progress=value.show_download_progress,
cache_dir=value.cache_dir,
acceleration=_to_rust_acceleration_config(value.acceleration),
max_embed_duration_secs=value.max_embed_duration_secs,
)
def _to_rust_chunking_config(value: ChunkingConfig | dict[str, Any] | None) -> _rust.ChunkingConfig | None:
"""Convert Python ChunkingConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
# Alias serde-renamed keys back to Rust field names
if "max_chars" in value and "max_characters" not in value:
value["max_characters"] = value.pop("max_chars")
if "max_overlap" in value and "overlap" not in value:
value["overlap"] = value.pop("max_overlap")
if "chunker_type" in value and value["chunker_type"] is not None:
value["chunker_type"] = _coerce_enum(_rust.ChunkerType, value["chunker_type"])
if "embedding" in value and value["embedding"] is not None:
value["embedding"] = _to_rust_embedding_config(value["embedding"])
if "sizing" in value and value["sizing"] is not None and not isinstance(value["sizing"], _rust.ChunkSizing):
value["sizing"] = _rust.ChunkSizing(value["sizing"])
value = ChunkingConfig(**value)
if value is None:
return None
return _rust.ChunkingConfig(
max_chars=value.max_characters,
max_overlap=value.overlap,
trim=value.trim,
**({"chunker_type": _coerce_enum(_rust.ChunkerType, value.chunker_type)} if value.chunker_type is not None else {}),
embedding=_to_rust_embedding_config(value.embedding),
preset=value.preset,
**({"sizing": value.sizing if isinstance(value.sizing, _rust.ChunkSizing) else _rust.ChunkSizing(value.sizing)} if value.sizing is not None else {}),
prepend_heading_context=value.prepend_heading_context,
topic_threshold=value.topic_threshold,
)
def _to_rust_content_filter_config(
value: ContentFilterConfig | dict[str, Any] | None,
) -> _rust.ContentFilterConfig | None:
"""Convert Python ContentFilterConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = ContentFilterConfig(**value)
if value is None:
return None
return _rust.ContentFilterConfig(
include_headers=value.include_headers,
include_footers=value.include_footers,
strip_repeating_text=value.strip_repeating_text,
include_watermarks=value.include_watermarks,
)
def _to_rust_image_extraction_config(
value: ImageExtractionConfig | dict[str, Any] | None,
) -> _rust.ImageExtractionConfig | None:
"""Convert Python ImageExtractionConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = ImageExtractionConfig(**value)
if value is None:
return None
return _rust.ImageExtractionConfig(
extract_images=value.extract_images,
target_dpi=value.target_dpi,
max_image_dimension=value.max_image_dimension,
inject_placeholders=value.inject_placeholders,
auto_adjust_dpi=value.auto_adjust_dpi,
min_dpi=value.min_dpi,
max_dpi=value.max_dpi,
max_images_per_page=value.max_images_per_page,
classify=value.classify,
include_page_rasters=value.include_page_rasters,
run_ocr_on_images=value.run_ocr_on_images,
ocr_text_only=value.ocr_text_only,
append_ocr_text=value.append_ocr_text,
)
def _to_rust_hierarchy_config(value: HierarchyConfig | dict[str, Any] | None) -> _rust.HierarchyConfig | None:
"""Convert Python HierarchyConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = HierarchyConfig(**value)
if value is None:
return None
return _rust.HierarchyConfig(
enabled=value.enabled,
k_clusters=value.k_clusters,
include_bbox=value.include_bbox,
ocr_coverage_threshold=value.ocr_coverage_threshold,
)
def _to_rust_pdf_config(value: PdfConfig | dict[str, Any] | None) -> _rust.PdfConfig | None:
"""Convert Python PdfConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "hierarchy" in value and value["hierarchy"] is not None:
value["hierarchy"] = _to_rust_hierarchy_config(value["hierarchy"])
value = PdfConfig(**value)
if value is None:
return None
return _rust.PdfConfig(
extract_images=value.extract_images,
extract_tables=value.extract_tables,
passwords=value.passwords,
extract_metadata=value.extract_metadata,
hierarchy=_to_rust_hierarchy_config(value.hierarchy),
extract_annotations=value.extract_annotations,
top_margin_fraction=value.top_margin_fraction,
bottom_margin_fraction=value.bottom_margin_fraction,
allow_single_column_tables=value.allow_single_column_tables,
ocr_inline_images=value.ocr_inline_images,
)
def _to_rust_token_reduction_options(
value: TokenReductionOptions | dict[str, Any] | None,
) -> _rust.TokenReductionOptions | None:
"""Convert Python TokenReductionOptions to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = TokenReductionOptions(**value)
if value is None:
return None
return _rust.TokenReductionOptions(
mode=value.mode,
preserve_important_words=value.preserve_important_words,
)
def _to_rust_language_detection_config(
value: LanguageDetectionConfig | dict[str, Any] | None,
) -> _rust.LanguageDetectionConfig | None:
"""Convert Python LanguageDetectionConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = LanguageDetectionConfig(**value)
if value is None:
return None
return _rust.LanguageDetectionConfig(
enabled=value.enabled,
min_confidence=value.min_confidence,
detect_multiple=value.detect_multiple,
)
def _to_rust_page_config(value: PageConfig | dict[str, Any] | None) -> _rust.PageConfig | None:
"""Convert Python PageConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = PageConfig(**value)
if value is None:
return None
return _rust.PageConfig(
extract_pages=value.extract_pages,
insert_page_markers=value.insert_page_markers,
marker_format=value.marker_format,
)
def _to_rust_yake_params(value: YakeParams | dict[str, Any] | None) -> _rust.YakeParams | None:
"""Convert Python YakeParams to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = YakeParams(**value)
if value is None:
return None
return _rust.YakeParams(
window_size=value.window_size,
)
def _to_rust_rake_params(value: RakeParams | dict[str, Any] | None) -> _rust.RakeParams | None:
"""Convert Python RakeParams to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = RakeParams(**value)
if value is None:
return None
return _rust.RakeParams(
min_word_length=value.min_word_length,
max_words_per_phrase=value.max_words_per_phrase,
)
def _to_rust_keyword_config(value: KeywordConfig | dict[str, Any] | None) -> _rust.KeywordConfig | None:
"""Convert Python KeywordConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "algorithm" in value and value["algorithm"] is not None:
value["algorithm"] = _coerce_enum(_rust.KeywordAlgorithm, value["algorithm"])
if "yake_params" in value and value["yake_params"] is not None:
value["yake_params"] = _to_rust_yake_params(value["yake_params"])
if "rake_params" in value and value["rake_params"] is not None:
value["rake_params"] = _to_rust_rake_params(value["rake_params"])
value = KeywordConfig(**value)
if value is None:
return None
return _rust.KeywordConfig(
**({"algorithm": _coerce_enum(_rust.KeywordAlgorithm, value.algorithm)} if value.algorithm is not None else {}),
max_keywords=value.max_keywords,
min_score=value.min_score,
ngram_range=value.ngram_range,
language=value.language,
yake_params=_to_rust_yake_params(value.yake_params),
rake_params=_to_rust_rake_params(value.rake_params),
)
def _to_rust_post_processor_config(
value: PostProcessorConfig | dict[str, Any] | None,
) -> _rust.PostProcessorConfig | None:
"""Convert Python PostProcessorConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = PostProcessorConfig(**value)
if value is None:
return None
return _rust.PostProcessorConfig(
enabled=value.enabled,
enabled_processors=value.enabled_processors,
disabled_processors=value.disabled_processors,
enabled_set=value.enabled_set,
disabled_set=value.disabled_set,
)
def _to_rust_html_output_config(value: HtmlOutputConfig | dict[str, Any] | None) -> _rust.HtmlOutputConfig | None:
"""Convert Python HtmlOutputConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "theme" in value and value["theme"] is not None:
value["theme"] = _coerce_enum(_rust.HtmlTheme, value["theme"])
value = HtmlOutputConfig(**value)
if value is None:
return None
return _rust.HtmlOutputConfig(
css=value.css,
css_file=value.css_file,
**({"theme": _coerce_enum(_rust.HtmlTheme, value.theme)} if value.theme is not None else {}),
class_prefix=value.class_prefix,
embed_css=value.embed_css,
)
def _to_rust_security_limits(value: SecurityLimits | dict[str, Any] | None) -> _rust.SecurityLimits | None:
"""Convert Python SecurityLimits to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = SecurityLimits(**value)
if value is None:
return None
return _rust.SecurityLimits(
max_archive_size=value.max_archive_size,
max_compression_ratio=value.max_compression_ratio,
max_files_in_archive=value.max_files_in_archive,
max_nesting_depth=value.max_nesting_depth,
max_entity_length=value.max_entity_length,
max_content_size=value.max_content_size,
max_iterations=value.max_iterations,
max_xml_depth=value.max_xml_depth,
max_table_cells=value.max_table_cells,
)
def _to_rust_layout_detection_config(
value: LayoutDetectionConfig | dict[str, Any] | None,
) -> _rust.LayoutDetectionConfig | None:
"""Convert Python LayoutDetectionConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "table_model" in value and value["table_model"] is not None:
value["table_model"] = _coerce_enum(_rust.TableModel, value["table_model"])
if "acceleration" in value and value["acceleration"] is not None:
value["acceleration"] = _to_rust_acceleration_config(value["acceleration"])
value = LayoutDetectionConfig(**value)
if value is None:
return None
return _rust.LayoutDetectionConfig(
confidence_threshold=value.confidence_threshold,
apply_heuristics=value.apply_heuristics,
**({"table_model": _coerce_enum(_rust.TableModel, value.table_model)} if value.table_model is not None else {}),
acceleration=_to_rust_acceleration_config(value.acceleration),
)
def _to_rust_email_config(value: EmailConfig | dict[str, Any] | None) -> _rust.EmailConfig | None:
"""Convert Python EmailConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = EmailConfig(**value)
if value is None:
return None
return _rust.EmailConfig(
msg_fallback_codepage=value.msg_fallback_codepage,
)
def _to_rust_tree_sitter_process_config(
value: TreeSitterProcessConfig | dict[str, Any] | None,
) -> _rust.TreeSitterProcessConfig | None:
"""Convert Python TreeSitterProcessConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "content_mode" in value and value["content_mode"] is not None:
value["content_mode"] = _coerce_enum(_rust.CodeContentMode, value["content_mode"])
value = TreeSitterProcessConfig(**value)
if value is None:
return None
return _rust.TreeSitterProcessConfig(
structure=value.structure,
imports=value.imports,
exports=value.exports,
comments=value.comments,
docstrings=value.docstrings,
symbols=value.symbols,
diagnostics=value.diagnostics,
chunk_max_size=value.chunk_max_size,
**({"content_mode": _coerce_enum(_rust.CodeContentMode, value.content_mode)} if value.content_mode is not None else {}),
)
def _to_rust_tree_sitter_config(value: TreeSitterConfig | dict[str, Any] | None) -> _rust.TreeSitterConfig | None:
"""Convert Python TreeSitterConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "process" in value and value["process"] is not None:
value["process"] = _to_rust_tree_sitter_process_config(value["process"])
value = TreeSitterConfig(**value)
if value is None:
return None
return _rust.TreeSitterConfig(
enabled=value.enabled,
cache_dir=value.cache_dir,
languages=value.languages,
groups=value.groups,
process=_to_rust_tree_sitter_process_config(value.process),
)
def _coerce_dict_extraction_config(value: dict[str, Any]) -> ExtractionConfig:
    """Turn a plain dict into an ``ExtractionConfig``, rewriting nested entries in place.

    Nested struct entries that are present and not ``None`` are replaced by the
    result of their ``_to_rust_*`` converter, ``result_format`` is coerced with
    ``_coerce_enum``, and a non-``None`` ``output_format`` that is not already
    a ``_rust.OutputFormat`` is wrapped in one. ``value`` itself is modified
    before being expanded into ``ExtractionConfig(**value)``.
    """
    nested_converters = (
        ("ocr", _to_rust_ocr_config),
        ("chunking", _to_rust_chunking_config),
        ("content_filter", _to_rust_content_filter_config),
        ("images", _to_rust_image_extraction_config),
        ("pdf_options", _to_rust_pdf_config),
        ("token_reduction", _to_rust_token_reduction_options),
        ("language_detection", _to_rust_language_detection_config),
        ("pages", _to_rust_page_config),
        ("keywords", _to_rust_keyword_config),
        ("postprocessor", _to_rust_post_processor_config),
        ("html_output", _to_rust_html_output_config),
        ("security_limits", _to_rust_security_limits),
        ("layout", _to_rust_layout_detection_config),
        ("acceleration", _to_rust_acceleration_config),
        ("email", _to_rust_email_config),
        ("tree_sitter", _to_rust_tree_sitter_config),
    )
    for key, convert in nested_converters:
        current = value.get(key)
        if current is not None:
            value[key] = convert(current)

    # Unit-style enum: resolved by name through _coerce_enum.
    result_format_cls = _rust.ResultFormat
    result_format = value.get("result_format")
    if result_format is not None:
        value["result_format"] = _coerce_enum(result_format_cls, result_format)

    # Data-carrying enum: constructed directly unless already the right type.
    output_format_cls = _rust.OutputFormat
    output_format = value.get("output_format")
    if output_format is not None and not isinstance(output_format, output_format_cls):
        value["output_format"] = output_format_cls(output_format)
    return ExtractionConfig(**value)
def _to_rust_extraction_config(value: ExtractionConfig | dict[str, Any] | None) -> _rust.ExtractionConfig | None:
"""Convert Python ExtractionConfig to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = _coerce_dict_extraction_config(value)
if value is None:
return None
return _rust.ExtractionConfig(
use_cache=value.use_cache,
enable_quality_processing=value.enable_quality_processing,
ocr=_to_rust_ocr_config(value.ocr),
force_ocr=value.force_ocr,
force_ocr_pages=value.force_ocr_pages,
disable_ocr=value.disable_ocr,
chunking=_to_rust_chunking_config(value.chunking),
content_filter=_to_rust_content_filter_config(value.content_filter),
images=_to_rust_image_extraction_config(value.images),
pdf_options=_to_rust_pdf_config(value.pdf_options),
token_reduction=_to_rust_token_reduction_options(value.token_reduction),
language_detection=_to_rust_language_detection_config(value.language_detection),
pages=_to_rust_page_config(value.pages),
keywords=_to_rust_keyword_config(value.keywords),
postprocessor=_to_rust_post_processor_config(value.postprocessor),
html_options=value.html_options,
html_output=_to_rust_html_output_config(value.html_output),
extraction_timeout_secs=value.extraction_timeout_secs,
max_concurrent_extractions=value.max_concurrent_extractions,
**({"result_format": _coerce_enum(_rust.ResultFormat, value.result_format)} if value.result_format is not None else {}),
security_limits=_to_rust_security_limits(value.security_limits),
max_embedded_file_bytes=value.max_embedded_file_bytes,
**({"output_format": value.output_format if isinstance(value.output_format, _rust.OutputFormat) else _rust.OutputFormat(value.output_format)} if value.output_format is not None else {}),
layout=_to_rust_layout_detection_config(value.layout),
use_layout_for_markdown=value.use_layout_for_markdown,
include_document_structure=value.include_document_structure,
acceleration=_to_rust_acceleration_config(value.acceleration),
cache_namespace=value.cache_namespace,
cache_ttl_secs=value.cache_ttl_secs,
email=_to_rust_email_config(value.email),
concurrency=value.concurrency,
max_archive_depth=value.max_archive_depth,
tree_sitter=_to_rust_tree_sitter_config(value.tree_sitter),
structured_extraction=value.structured_extraction,
cancel_token=value.cancel_token,
)
def _to_rust_metadata(value: Metadata | dict[str, Any] | None) -> _rust.Metadata | None:
"""Convert Python Metadata to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "format" in value and value["format"] is not None and not isinstance(value["format"], _rust.FormatMetadata):
value["format"] = _rust.FormatMetadata(value["format"])
value = Metadata(**value)
if value is None:
return None
return _rust.Metadata(
title=value.title,
subject=value.subject,
authors=value.authors,
keywords=value.keywords,
language=value.language,
created_at=value.created_at,
modified_at=value.modified_at,
created_by=value.created_by,
modified_by=value.modified_by,
pages=value.pages,
format=(value.format if isinstance(value.format, _rust.FormatMetadata) else _rust.FormatMetadata(value.format)) if value.format is not None else None,
image_preprocessing=value.image_preprocessing,
json_schema=value.json_schema,
error=value.error,
extraction_duration_ms=value.extraction_duration_ms,
category=value.category,
tags=value.tags,
document_version=value.document_version,
abstract_text=value.abstract_text,
output_format=value.output_format,
ocr_used=value.ocr_used,
additional=value.additional,
)
def _to_rust_bounding_box(value: BoundingBox | dict[str, Any] | None) -> _rust.BoundingBox | None:
"""Convert Python BoundingBox to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = BoundingBox(**value)
if value is None:
return None
return _rust.BoundingBox(
x0=value.x0,
y0=value.y0,
x1=value.x1,
y1=value.y1,
)
def _to_rust_table(value: Table | dict[str, Any] | None) -> _rust.Table | None:
"""Convert Python Table to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "bounding_box" in value and value["bounding_box"] is not None:
value["bounding_box"] = _to_rust_bounding_box(value["bounding_box"])
value = Table(**value)
if value is None:
return None
return _rust.Table(
cells=value.cells,
markdown=value.markdown,
page_number=value.page_number,
bounding_box=_to_rust_bounding_box(value.bounding_box),
)
def _to_rust_ocr_confidence(value: OcrConfidence | dict[str, Any] | None) -> _rust.OcrConfidence | None:
"""Convert Python OcrConfidence to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = OcrConfidence(**value)
if value is None:
return None
return _rust.OcrConfidence(
detection=value.detection,
recognition=value.recognition,
)
def _to_rust_ocr_element(value: OcrElement | dict[str, Any] | None) -> _rust.OcrElement | None:
"""Convert Python OcrElement to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
if "level" in value and value["level"] is not None:
value["level"] = _coerce_enum(_rust.OcrElementLevel, value["level"])
if "confidence" in value and value["confidence"] is not None:
value["confidence"] = _to_rust_ocr_confidence(value["confidence"])
if "geometry" in value and value["geometry"] is not None and not isinstance(value["geometry"], _rust.OcrBoundingGeometry):
value["geometry"] = _rust.OcrBoundingGeometry(value["geometry"])
value = OcrElement(**value)
if value is None:
return None
return _rust.OcrElement(
text=value.text,
geometry=value.geometry if isinstance(value.geometry, _rust.OcrBoundingGeometry) else _rust.OcrBoundingGeometry(value.geometry),
confidence=_to_rust_ocr_confidence(value.confidence),
**({"level": _coerce_enum(_rust.OcrElementLevel, value.level)} if value.level is not None else {}),
rotation=value.rotation,
page_number=value.page_number,
parent_id=value.parent_id,
backend_metadata=value.backend_metadata,
)
def _to_rust_document_structure(value: DocumentStructure | dict[str, Any] | None) -> _rust.DocumentStructure | None:
"""Convert Python DocumentStructure to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = DocumentStructure(**value)
if value is None:
return None
return _rust.DocumentStructure(
nodes=value.nodes,
source_format=value.source_format,
relationships=value.relationships,
node_types=value.node_types,
)
def _to_rust_llm_usage(value: LlmUsage | dict[str, Any] | None) -> _rust.LlmUsage | None:
"""Convert Python LlmUsage to Rust binding type."""
if isinstance(value, str):
value = json.loads(value)
if isinstance(value, dict):
value = LlmUsage(**value)
if value is None:
return None
return _rust.LlmUsage(
model=value.model,
source=value.source,
input_tokens=value.input_tokens,
output_tokens=value.output_tokens,
total_tokens=value.total_tokens,
estimated_cost=value.estimated_cost,
finish_reason=value.finish_reason,
)
def _to_rust_extraction_result(value: ExtractionResult | dict[str, Any] | None) -> _rust.ExtractionResult | None:
    """Convert a Python-side extraction result into ``_rust.ExtractionResult``.

    Args:
        value: An ``ExtractionResult``, a ``dict`` of its fields, a JSON
            ``str`` (decoded with ``json.loads``), or ``None``.

    Returns:
        A freshly constructed ``_rust.ExtractionResult``, or ``None`` when
        the input is ``None`` or decodes to ``None``.

    A dict input is normalised on a shallow copy, so the caller's mapping is
    left untouched (previously the ``extraction_method``, ``metadata`` and
    ``document`` entries were overwritten in place).
    """
    if isinstance(value, str):
        value = json.loads(value)
    if isinstance(value, dict):
        # Shallow copy: only top-level keys are reassigned below, so this is
        # enough to keep the caller's dict unmodified.
        value = dict(value)
        if value.get("extraction_method") is not None:
            value["extraction_method"] = _coerce_enum(_rust.ExtractionMethod, value["extraction_method"])
        if value.get("metadata") is not None:
            value["metadata"] = _to_rust_metadata(value["metadata"])
        if value.get("document") is not None:
            value["document"] = _to_rust_document_structure(value["document"])
        value = ExtractionResult(**value)
    if value is None:
        return None
    return _rust.ExtractionResult(
        content=value.content,
        mime_type=value.mime_type,
        # Nested binding types are converted explicitly; every other field is
        # forwarded as-is.
        metadata=_to_rust_metadata(value.metadata),
        extraction_method=_coerce_enum(_rust.ExtractionMethod, value.extraction_method),
        tables=value.tables,
        detected_languages=value.detected_languages,
        chunks=value.chunks,
        images=value.images,
        pages=value.pages,
        elements=value.elements,
        djot_content=value.djot_content,
        ocr_elements=value.ocr_elements,
        document=_to_rust_document_structure(value.document),
        extracted_keywords=value.extracted_keywords,
        quality_score=value.quality_score,
        processing_warnings=value.processing_warnings,
        annotations=value.annotations,
        children=value.children,
        uris=value.uris,
        revisions=value.revisions,
        structured_output=value.structured_output,
        code_intelligence=value.code_intelligence,
        llm_usage=value.llm_usage,
        formatted_content=value.formatted_content,
        ocr_internal_document=value.ocr_internal_document,
    )
def _to_rust_diff_options(value: DiffOptions | dict[str, Any] | None) -> _rust.DiffOptions | None:
    """Build a ``_rust.DiffOptions`` from a Python-side value.

    A ``str`` is decoded with ``json.loads``; a ``dict`` is expanded into
    ``DiffOptions(**value)``; any other non-``None`` object is read attribute
    by attribute.

    Returns ``None`` when the input is ``None`` or decodes to ``None``.
    """
    decoded = json.loads(value) if isinstance(value, str) else value
    if decoded is None:
        return None
    options = DiffOptions(**decoded) if isinstance(decoded, dict) else decoded
    return _rust.DiffOptions(
        include_metadata=options.include_metadata,
        include_embedded=options.include_embedded,
        max_content_chars=options.max_content_chars,
    )
async def extract_bytes(
    content: bytes,
    mime_type: str,
    config: ExtractionConfig | None = None,
) -> ExtractionResult:
    """Extract content from in-memory bytes.

    ``config`` is run through ``_to_rust_extraction_config``; if that yields
    ``None``, a default ``_rust.ExtractionConfig()`` is used. The call is
    then awaited on ``_rust.extract_bytes``.
    """
    converted = _to_rust_extraction_config(config)
    native_config = _rust.ExtractionConfig() if converted is None else converted
    return await _rust.extract_bytes(content=content, mime_type=mime_type, config=native_config)
async def extract_file(
    path: str,
    mime_type: str | None = None,
    config: ExtractionConfig | None = None,
) -> ExtractionResult:
    """Extract content from the file at ``path``.

    When ``config`` is ``None`` a default ``_rust.ExtractionConfig()`` is
    used; otherwise ``config`` is converted with
    ``_to_rust_extraction_config``. The call is awaited on
    ``_rust.extract_file``.
    """
    if config is None:
        native_config = _rust.ExtractionConfig()
    else:
        native_config = _to_rust_extraction_config(config)
    return await _rust.extract_file(path=path, mime_type=mime_type, config=native_config)
def extract_file_sync(
    path: str,
    mime_type: str | None = None,
    config: ExtractionConfig | None = None,
) -> ExtractionResult:
    """Blocking counterpart of `extract_file`.

    When ``config`` is ``None`` a default ``_rust.ExtractionConfig()`` is
    used; otherwise ``config`` is converted with
    ``_to_rust_extraction_config``. Delegates to ``_rust.extract_file_sync``.
    """
    if config is None:
        native_config = _rust.ExtractionConfig()
    else:
        native_config = _to_rust_extraction_config(config)
    return _rust.extract_file_sync(path=path, mime_type=mime_type, config=native_config)
def extract_bytes_sync(
    content: bytes,
    mime_type: str,
    config: ExtractionConfig | None = None,
) -> ExtractionResult:
    """Blocking counterpart of `extract_bytes`.

    ``config`` is run through ``_to_rust_extraction_config``; if that yields
    ``None``, a default ``_rust.ExtractionConfig()`` is used. Delegates to
    ``_rust.extract_bytes_sync``.
    """
    converted = _to_rust_extraction_config(config)
    native_config = _rust.ExtractionConfig() if converted is None else converted
    return _rust.extract_bytes_sync(content=content, mime_type=mime_type, config=native_config)
def batch_extract_files_sync(
    items: list[BatchFileItem],
    config: ExtractionConfig | None = None,
) -> list[ExtractionResult]:
    """Blocking counterpart of `batch_extract_files`.

    ``config`` is converted with ``_to_rust_extraction_config``, falling back
    to a default ``_rust.ExtractionConfig()`` when the conversion yields
    ``None``. Delegates to ``_rust.batch_extract_files_sync``.
    """
    if (native_config := _to_rust_extraction_config(config)) is None:
        native_config = _rust.ExtractionConfig()
    return _rust.batch_extract_files_sync(items=items, config=native_config)
def batch_extract_bytes_sync(
    items: list[BatchBytesItem],
    config: ExtractionConfig | None = None,
) -> list[ExtractionResult]:
    """Blocking counterpart of `batch_extract_bytes`.

    ``config`` is converted with ``_to_rust_extraction_config``, falling back
    to a default ``_rust.ExtractionConfig()`` when the conversion yields
    ``None``. Delegates to ``_rust.batch_extract_bytes_sync``.
    """
    if (native_config := _to_rust_extraction_config(config)) is None:
        native_config = _rust.ExtractionConfig()
    return _rust.batch_extract_bytes_sync(items=items, config=native_config)
async def batch_extract_files(
    items: list[BatchFileItem],
    config: ExtractionConfig | None = None,
) -> list[ExtractionResult]:
    """Extract content from several files in one batch call.

    ``config`` is converted with ``_to_rust_extraction_config``, falling back
    to a default ``_rust.ExtractionConfig()`` when the conversion yields
    ``None``. The call is awaited on ``_rust.batch_extract_files``.
    """
    if (native_config := _to_rust_extraction_config(config)) is None:
        native_config = _rust.ExtractionConfig()
    return await _rust.batch_extract_files(items=items, config=native_config)
async def batch_extract_bytes(
    items: list[BatchBytesItem],
    config: ExtractionConfig | None = None,
) -> list[ExtractionResult]:
    """Extract content from several byte buffers in one batch call.

    ``config`` is converted with ``_to_rust_extraction_config``, falling back
    to a default ``_rust.ExtractionConfig()`` when the conversion yields
    ``None``. The call is awaited on ``_rust.batch_extract_bytes``.
    """
    if (native_config := _to_rust_extraction_config(config)) is None:
        native_config = _rust.ExtractionConfig()
    return await _rust.batch_extract_bytes(items=items, config=native_config)
def detect_mime_type_from_bytes(content: bytes) -> str:
    """Return the MIME type detected for the raw bytes in ``content``.

    Delegates to ``_rust.detect_mime_type_from_bytes``.
    """
    detected = _rust.detect_mime_type_from_bytes(content=content)
    return detected
def get_extensions_for_mime(mime_type: str) -> list[str]:
    """Return the file extensions associated with ``mime_type``.

    Delegates to ``_rust.get_extensions_for_mime``.
    """
    extensions = _rust.get_extensions_for_mime(mime_type=mime_type)
    return extensions
def clear_embedding_backends() -> None:
    """Remove every embedding backend from the global registry.

    Calls ``_rust.clear_embedding_backends`` and discards its result.
    """
    _rust.clear_embedding_backends()
    return None
def list_embedding_backends() -> list[str]:
    """Return the names of the registered embedding backends.

    Delegates to ``_rust.list_embedding_backends``.
    """
    names = _rust.list_embedding_backends()
    return names
def list_document_extractors() -> list[str]:
    """Return the names of the registered document extractors.

    Delegates to ``_rust.list_document_extractors``.
    """
    names = _rust.list_document_extractors()
    return names
def clear_document_extractors() -> None:
    """Remove every document extractor from the global registry.

    Calls ``_rust.clear_document_extractors`` and discards its result.
    """
    _rust.clear_document_extractors()
    return None
def list_ocr_backends() -> list[str]:
    """Return the registered OCR backends.

    Delegates to ``_rust.list_ocr_backends``.
    """
    names = _rust.list_ocr_backends()
    return names
def clear_ocr_backends() -> None:
    """Remove every OCR backend from the global registry.

    Calls ``_rust.clear_ocr_backends`` and discards its result.
    """
    _rust.clear_ocr_backends()
    return None
def list_post_processors() -> list[str]:
    """Return the names of the registered post-processors.

    Delegates to ``_rust.list_post_processors``.
    """
    names = _rust.list_post_processors()
    return names
def clear_post_processors() -> None:
    """Remove every registered post-processor.

    Calls ``_rust.clear_post_processors`` and discards its result.
    """
    _rust.clear_post_processors()
    return None
def list_renderers() -> list[str]:
    """Return the names of the registered renderers.

    Delegates to ``_rust.list_renderers``.
    """
    names = _rust.list_renderers()
    return names
def clear_renderers() -> None:
    """Remove every renderer from the global registry.

    Calls ``_rust.clear_renderers`` and discards its result.
    """
    _rust.clear_renderers()
    return None
def list_validators() -> list[str]:
    """Return the names of the registered validators.

    Delegates to ``_rust.list_validators``.
    """
    names = _rust.list_validators()
    return names
def clear_validators() -> None:
    """Remove every registered validator.

    Calls ``_rust.clear_validators`` and discards its result.
    """
    _rust.clear_validators()
    return None
def compare(
    a: ExtractionResult | None = None,
    b: ExtractionResult | None = None,
    opts: DiffOptions | None = None,
) -> ExtractionDiff:
    """Diff two extraction results and return the structured comparison.

    ``a`` and ``b`` go through ``_to_rust_extraction_result`` and ``opts``
    through ``_to_rust_diff_options``. Any argument whose conversion yields
    ``None`` is replaced by the default-constructed binding object
    (``_rust.ExtractionResult()`` / ``_rust.DiffOptions()``) before
    ``_rust.compare`` is called.
    """
    left = _to_rust_extraction_result(a)
    native_a = _rust.ExtractionResult() if left is None else left

    right = _to_rust_extraction_result(b)
    native_b = _rust.ExtractionResult() if right is None else right

    diff_options = _to_rust_diff_options(opts)
    native_opts = _rust.DiffOptions() if diff_options is None else diff_options

    return _rust.compare(a=native_a, b=native_b, opts=native_opts)
async def embed_texts_async(
    texts: list[str],
    config: EmbeddingConfig | None = None,
) -> list[list[float]]:
    """Asynchronously produce embeddings for ``texts``.

    ``config`` is converted with ``_to_rust_embedding_config``, falling back
    to a default ``_rust.EmbeddingConfig()`` when the conversion yields
    ``None``. The call is awaited on ``_rust.embed_texts_async``.
    """
    converted = _to_rust_embedding_config(config)
    native_config = _rust.EmbeddingConfig() if converted is None else converted
    return await _rust.embed_texts_async(texts=texts, config=native_config)
def render_pdf_page_to_png(
    pdf_bytes: bytes,
    page_index: int,
    dpi: int | None = None,
    password: str | None = None,
) -> bytes:
    """Render one page of a PDF to PNG bytes.

    All four arguments are forwarded unchanged, by keyword, to
    ``_rust.render_pdf_page_to_png``.
    """
    png_bytes = _rust.render_pdf_page_to_png(
        pdf_bytes=pdf_bytes,
        page_index=page_index,
        dpi=dpi,
        password=password,
    )
    return png_bytes
def detect_mime_type(path: str, check_exists: bool) -> str:
    """Return the MIME type detected for the file at ``path``.

    Both arguments are forwarded unchanged to ``_rust.detect_mime_type``.
    """
    detected = _rust.detect_mime_type(path=path, check_exists=check_exists)
    return detected
def embed_texts(texts: list[str], config: EmbeddingConfig | None = None) -> list[list[float]]:
    """Produce embeddings for ``texts`` with the given embedding configuration.

    ``config`` is converted with ``_to_rust_embedding_config``, falling back
    to a default ``_rust.EmbeddingConfig()`` when the conversion yields
    ``None``. Delegates to ``_rust.embed_texts``.
    """
    converted = _to_rust_embedding_config(config)
    native_config = _rust.EmbeddingConfig() if converted is None else converted
    return _rust.embed_texts(texts=texts, config=native_config)
def get_embedding_preset(name: str) -> EmbeddingPreset | None:
    """Look up the embedding preset called ``name``.

    Returns whatever ``_rust.get_embedding_preset`` yields, annotated as
    ``EmbeddingPreset | None``.
    """
    preset = _rust.get_embedding_preset(name=name)
    return preset
def list_embedding_presets() -> list[str]:
    """Return the names of the available embedding presets.

    Delegates to ``_rust.list_embedding_presets``.
    """
    names = _rust.list_embedding_presets()
    return names
def register_ocr_backend(backend: object) -> None:
    """Register ``backend`` as an OCR backend.

    Delegates to ``_rust.register_ocr_backend`` and returns its result.
    """
    outcome = _rust.register_ocr_backend(backend=backend)
    return outcome
def register_post_processor(backend: object) -> None:
    """Register ``backend`` as a post-processor.

    Delegates to ``_rust.register_post_processor`` and returns its result.
    """
    outcome = _rust.register_post_processor(backend=backend)
    return outcome
def register_validator(backend: object) -> None:
    """Register ``backend`` as a validator.

    Delegates to ``_rust.register_validator`` and returns its result.
    """
    outcome = _rust.register_validator(backend=backend)
    return outcome
def register_embedding_backend(backend: object) -> None:
    """Register ``backend`` as an embedding backend.

    Delegates to ``_rust.register_embedding_backend`` and returns its result.
    """
    outcome = _rust.register_embedding_backend(backend=backend)
    return outcome
def register_document_extractor(backend: object) -> None:
    """Register ``backend`` as a document extractor.

    Delegates to ``_rust.register_document_extractor`` and returns its result.
    """
    outcome = _rust.register_document_extractor(backend=backend)
    return outcome
def register_renderer(backend: object) -> None:
    """Register ``backend`` as a renderer.

    Delegates to ``_rust.register_renderer`` and returns its result.
    """
    outcome = _rust.register_renderer(backend=backend)
    return outcome
def unregister_ocr_backend(name: str) -> None:
    """Unregister the OCR backend identified by ``name``.

    Delegates to ``_rust.unregister_ocr_backend`` and returns its result.
    """
    outcome = _rust.unregister_ocr_backend(name=name)
    return outcome
def unregister_post_processor(name: str) -> None:
    """Unregister the post-processor identified by ``name``.

    Delegates to ``_rust.unregister_post_processor`` and returns its result.
    """
    outcome = _rust.unregister_post_processor(name=name)
    return outcome
def unregister_validator(name: str) -> None:
    """Unregister the validator identified by ``name``.

    Delegates to ``_rust.unregister_validator`` and returns its result.
    """
    outcome = _rust.unregister_validator(name=name)
    return outcome
def unregister_embedding_backend(name: str) -> None:
    """Unregister the embedding backend identified by ``name``.

    Delegates to ``_rust.unregister_embedding_backend`` and returns its result.
    """
    outcome = _rust.unregister_embedding_backend(name=name)
    return outcome
def unregister_document_extractor(name: str) -> None:
    """Unregister the document extractor identified by ``name``.

    Delegates to ``_rust.unregister_document_extractor`` and returns its result.
    """
    outcome = _rust.unregister_document_extractor(name=name)
    return outcome
def unregister_renderer(name: str) -> None:
    """Unregister the renderer identified by ``name``.

    Delegates to ``_rust.unregister_renderer`` and returns its result.
    """
    outcome = _rust.unregister_renderer(name=name)
    return outcome