Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/packages/python/LICENSE
+++ b/packages/python/LICENSE
@@ -0,0 +1,93 @@
+Elastic License 2.0 (ELv2)
+
+Copyright 2025-2026 Kreuzberg, Inc.
+
+Acceptance
+
+By using the software, you agree to all of the terms and conditions below.
+
+Copyright License
+
+The licensor grants you a non-exclusive, royalty-free, worldwide,
+non-sublicensable, non-transferable license to use, copy, distribute, make
+available, and prepare derivative works of the software, in each case subject to
+the limitations and conditions below.
+
+Limitations
+
+You may not provide the software to third parties as a hosted or managed
+service, where the service provides users with access to any substantial set of
+the features or functionality of the software.
+
+You may not move, change, disable, or circumvent the license key functionality
+in the software, and you may not remove or obscure any functionality in the
+software that is protected by the license key.
+
+You may not alter, remove, or obscure any licensing, copyright, or other notices
+of the licensor in the software. Any use of the licensor's trademarks is subject
+to applicable law.
+
+Patents
+
+The licensor grants you a license, under any patent claims the licensor can
+license, or becomes able to license, to make, have made, use, sell, offer for
+sale, import and have imported the software, in each case subject to the
+limitations and conditions in this license. This license does not cover any
+patent claims that you cause to be infringed by modifications or additions to the
+software. If you or your company make any written claim that the software
+infringes or contributes to infringement of any patent, your patent license for
+the software granted under these terms ends immediately. If your company makes
+such a claim, your patent license ends immediately for work on behalf of your
+company.
+
+Notices
+
+You must ensure that anyone who gets a copy of any part of the software from you
+also gets a copy of these terms.
+
+If you modify the software, you must include in any modified copies of the
+software prominent notices stating that you have modified the software.
+
+No Other Rights
+
+These terms do not imply any licenses other than those expressly granted in
+these terms.
+
+Termination
+
+If you use the software in violation of these terms, such use is not licensed,
+and your licenses will automatically terminate. If the licensor provides you with
+a notice of your violation, and you cease all violation of this license no later
+than 30 days after you receive that notice, your licenses will be reinstated
+retroactively. However, if you violate these terms after such reinstatement, any
+additional violation of these terms will cause your licenses to terminate
+automatically and permanently.
+
+No Liability
+
+As far as the law allows, the software comes as is, without any warranty or
+condition, and the licensor will not be liable to you for any damages arising out
+of these terms or the use or nature of the software, under any kind of legal
+claim.
+
+Definitions
+
+The licensor is the entity offering these terms, and the software is the
+software the licensor makes available under these terms, including any portion
+of it.
+
+you refers to the individual or entity agreeing to these terms.
+
+your company is any legal entity, sole proprietorship, or other kind of
+organization that you work for, plus all organizations that have control over,
+are under the control of, or are under common control with that organization.
+control means ownership of substantially all the assets of an entity, or the
+power to direct its management and policies by vote, contract, or otherwise.
+Control can be direct or indirect.
+
+your licenses are all the licenses granted to you for the software under these
+terms.
+
+use means anything you do with the software requiring one of your licenses.
+
+trademark means trademarks, service marks, and similar rights.
--- a/packages/python/README.md
+++ b/packages/python/README.md
@@ -0,0 +1,600 @@
+# Kreuzberg
+
+<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
+  <a href="https://github.com/kreuzberg-dev/alef">
+    <img src="https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6" alt="Bindings">
+  </a>
+  <!-- Language Bindings -->
+  <a href="https://crates.io/crates/kreuzberg">
+    <img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
+  </a>
+  <a href="https://pypi.org/project/kreuzberg/">
+    <img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
+  </a>
+  <a href="https://www.npmjs.com/package/@kreuzberg/node">
+    <img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
+  </a>
+  <a href="https://www.npmjs.com/package/@kreuzberg/wasm">
+    <img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
+  </a>
+  <a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
+    <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go/v5">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v5*" alt="Go">
+  </a>
+  <a href="https://www.nuget.org/packages/Kreuzberg/">
+    <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
+  </a>
+  <a href="https://packagist.org/packages/kreuzberg/kreuzberg">
+    <img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
+  </a>
+  <a href="https://rubygems.org/gems/kreuzberg">
+    <img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
+  </a>
+  <a href="https://hex.pm/packages/kreuzberg">
+    <img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
+  </a>
+  <a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
+    <img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
+  </a>
+  <a href="https://pub.dev/packages/kreuzberg">
+    <img src="https://img.shields.io/pub/v/kreuzberg?label=Dart&color=007ec6" alt="Dart">
+  </a>
+  <a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg-android">
+    <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg-android?label=Kotlin&color=007ec6" alt="Kotlin">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/swift">
+    <img src="https://img.shields.io/badge/Swift-SPM-007ec6" alt="Swift">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/zig">
+    <img src="https://img.shields.io/badge/Zig-package-007ec6" alt="Zig">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
+    <img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C FFI">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
+    <img src="https://img.shields.io/badge/Docker-ghcr.io-007ec6?logo=docker&logoColor=white" alt="Docker">
+  </a>
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/charts%2Fkreuzberg">
+    <img src="https://img.shields.io/badge/Helm-ghcr.io-007ec6?logo=helm&logoColor=white" alt="Helm">
+  </a>
+
+  <!-- Project Info -->
+  <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
+    <img src="https://img.shields.io/badge/License-Elastic--2.0-007ec6" alt="License">
+  </a>
+  <a href="https://docs.kreuzberg.dev">
+    <img src="https://img.shields.io/badge/Docs-kreuzberg-007ec6" alt="Documentation">
+  </a>
+  <a href="https://huggingface.co/Kreuzberg">
+    <img src="https://img.shields.io/badge/Hugging%20Face-Kreuzberg-007ec6" alt="Hugging Face">
+  </a>
+</div>
+
+<div align="center" style="margin: 24px 0 0;">
+  <a href="https://kreuzberg.dev">
+    <img alt="Kreuzberg" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
+  </a>
+</div>
+
+<div align="center" style="display: flex; flex-wrap: wrap; gap: 12px; justify-content: center; margin: 28px 0 24px;">
+  <a href="https://discord.gg/xt9WY3GnKR">
+    <img height="22" src="https://img.shields.io/badge/Discord-Chat-007ec6?logo=discord&logoColor=white" alt="Join Discord">
+  </a>
+  <a href="https://docs.kreuzberg.dev/demo.html">
+    <img height="22" src="https://img.shields.io/badge/Live%20Demo-Open-007ec6?logo=webassembly&logoColor=white" alt="Live Demo">
+  </a>
+</div>
+
+Extract text, tables, images, and metadata from 90+ file formats (including PDF, Office documents, and images) and 300+ programming languages. Native Python bindings with async/await support, multiple OCR backends (Tesseract, EasyOCR, PaddleOCR), and an extensible plugin system.
+
+## What This Package Provides
+
+- **Python-native extraction** — sync and async APIs for files, bytes, URLs, and batch ingestion.
+- **Structured results** — text, tables, images, metadata, language detection, chunks, and warnings in typed Python objects.
+- **OCR choices** — Tesseract, EasyOCR, PaddleOCR, and VLM OCR where configured.
+- **Same Rust engine as every binding** — behavior matches the Node.js, Ruby, Go, Java, .NET, PHP, Elixir, R, Dart, Swift, Zig, WASM, and C FFI packages.
+
+## Installation
+
+```bash
+pip install kreuzberg
+```
+
+### With OCR Support
+
+```bash
+pip install "kreuzberg[easyocr]"
+pip install "kreuzberg[paddleocr]"
+```
+
+### All Features
+
+```bash
+pip install "kreuzberg[all]"
+```
+
+## Quick Start
+
+### Basic Usage
+
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig
+
+async def main() -> None:
+    config = ExtractionConfig(
+        use_cache=True,
+        enable_quality_processing=True
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(result.content)
+
+asyncio.run(main())
+```
+
+### Simple Extraction
+
+```python title="Python"
+import asyncio
+from pathlib import Path
+from kreuzberg import extract_file
+
+async def main() -> None:
+    file_path: Path = Path("document.pdf")
+
+    result = await extract_file(file_path)
+
+    print(f"Content: {result.content}")
+    print(f"Format: {result.metadata.format.format_type if result.metadata.format else None}")
+    print(f"Tables: {len(result.tables)}")
+
+asyncio.run(main())
+```
+
+### Reading Content
+
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file
+
+async def main() -> None:
+    result = await extract_file("document.pdf")
+
+    content: str = result.content
+    tables: int = len(result.tables)
+    format_type: str | None = result.metadata.format.format_type if result.metadata.format else None
+
+    print(f"Content length: {len(content)} characters")
+    print(f"Tables found: {tables}")
+    print(f"Format: {format_type}")
+
+asyncio.run(main())
+```
+
+## OCR Support
+
+### Using OCR
+
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig, OcrConfig, TesseractConfig
+
+async def main() -> None:
+    config = ExtractionConfig(
+        force_ocr=True,
+        ocr=OcrConfig(
+            backend="tesseract",
+            language="eng",
+            tesseract_config=TesseractConfig(psm=3)
+        )
+    )
+    result = await extract_file("scanned.pdf", config=config)
+    print(result.content)
+    print(f"Detected Languages: {result.detected_languages}")
+
+asyncio.run(main())
+```
+
+### EasyOCR (GPU-Accelerated)
+
+```python
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config = ExtractionConfig(
+    ocr=OcrConfig(backend="easyocr", language="en")
+)
+
+result = extract_file_sync(
+    "photo.jpg",
+    config=config,
+    easyocr_kwargs={"use_gpu": True}
+)
+```
+
+### PaddleOCR (Complex Layouts)
+
+```python
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config = ExtractionConfig(
+    ocr=OcrConfig(backend="paddleocr", language="ch")
+)
+
+result = extract_file_sync(
+    "invoice.pdf",
+    config=config,
+)
+```
+
+## Table Extraction
+
+```python
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig, TesseractConfig
+
+config = ExtractionConfig(
+    ocr=OcrConfig(
+        backend="tesseract",
+        tesseract_config=TesseractConfig(
+            enable_table_detection=True
+        )
+    )
+)
+
+result = extract_file_sync("invoice.pdf", config=config)
+
+for table in result.tables:
+    print(table.markdown)
+    print(table.cells)
+```
+
+## Configuration
+
+### Complete Configuration Example
+
+```python
+from kreuzberg import (
+    extract_file_sync,
+    ExtractionConfig,
+    OcrConfig,
+    TesseractConfig,
+    ChunkingConfig,
+    ImageExtractionConfig,
+    PdfConfig,
+    TokenReductionConfig,
+    LanguageDetectionConfig,
+)
+
+config = ExtractionConfig(
+    use_cache=True,
+    enable_quality_processing=True,
+    ocr=OcrConfig(
+        backend="tesseract",
+        language="eng",
+        tesseract_config=TesseractConfig(
+            psm=6,
+            enable_table_detection=True,
+            min_confidence=50.0,
+        ),
+    ),
+    force_ocr=False,
+    chunking=ChunkingConfig(
+        max_chars=1000,
+        max_overlap=200,
+    ),
+    images=ImageExtractionConfig(
+        extract_images=True,
+        target_dpi=300,
+        max_image_dimension=4096,
+        auto_adjust_dpi=True,
+    ),
+    pdf_options=PdfConfig(
+        extract_images=True,
+        passwords=["password1", "password2"],
+        extract_metadata=True,
+    ),
+    token_reduction=TokenReductionConfig(
+        mode="moderate",
+        preserve_important_words=True,
+    ),
+    language_detection=LanguageDetectionConfig(
+        enabled=True,
+        min_confidence=0.8,
+        detect_multiple=False,
+    ),
+)
+
+result = extract_file_sync("document.pdf", config=config)
+```
+
+### HTML Conversion Options & Batch Concurrency
+
+```python
+from kreuzberg import ExtractionConfig
+
+config = ExtractionConfig(
+    max_concurrent_extractions=8,
+    html_options={
+        "extract_metadata": True,
+        "wrap": True,
+        "wrap_width": 100,
+        "strip_tags": ["script", "style"],
+        "preprocessing": {"enabled": True, "preset": "standard"},
+    },
+)
+```
+
+## Metadata Extraction
+
+```python
+from kreuzberg import extract_file_sync
+
+result = extract_file_sync("document.pdf")
+
+if result.images:
+    print(f"Extracted {len(result.images)} inline images")
+
+if result.chunks:
+    print(f"First chunk tokens: {result.chunks[0]['metadata']['token_count']}")
+
+print(result.metadata.get("pdf", {}))
+print(result.metadata.get("language"))
+print(result.metadata.get("format"))
+
+if "pdf" in result.metadata:
+    pdf_meta = result.metadata["pdf"]
+    print(f"Title: {pdf_meta.get('title')}")
+    print(f"Author: {pdf_meta.get('author')}")
+    print(f"Pages: {pdf_meta.get('page_count')}")
+    print(f"Created: {pdf_meta.get('creation_date')}")
+```
+
+## Password-Protected PDFs
+
+```python
+from kreuzberg import extract_file_sync, ExtractionConfig, PdfConfig
+
+config = ExtractionConfig(
+    pdf_options=PdfConfig(
+        passwords=["password1", "password2", "password3"]
+    )
+)
+
+result = extract_file_sync("protected.pdf", config=config)
+```
+
+## Language Detection
+
+```python
+from kreuzberg import extract_file_sync, ExtractionConfig, LanguageDetectionConfig
+
+config = ExtractionConfig(
+    language_detection=LanguageDetectionConfig(enabled=True)
+)
+
+result = extract_file_sync("multilingual.pdf", config=config)
+print(result.detected_languages)
+```
+
+## Text Chunking
+
+```python
+from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig
+
+config = ExtractionConfig(
+    chunking=ChunkingConfig(
+        max_chars=1000,
+        max_overlap=200,
+    )
+)
+
+result = extract_file_sync("long_document.pdf", config=config)
+
+for chunk in result.chunks:
+    print(chunk)
+```
+
+## Extract from Bytes
+
+```python
+from kreuzberg import extract_bytes_sync
+
+with open("document.pdf", "rb") as f:
+    data = f.read()
+
+result = extract_bytes_sync(data, "application/pdf")
+print(result.content)
+```
+
+## API Reference
+
+### Extraction Functions
+
+- `extract_file(file_path, mime_type=None, config=None, **kwargs)` – Async extraction
+- `extract_file_sync(file_path, mime_type=None, config=None, **kwargs)` – Sync extraction
+- `extract_bytes(data, mime_type, config=None, **kwargs)` – Async extraction from bytes
+- `extract_bytes_sync(data, mime_type, config=None, **kwargs)` – Sync extraction from bytes
+- `batch_extract_files(paths, config=None, **kwargs)` – Async batch extraction
+- `batch_extract_files_sync(paths, config=None, **kwargs)` – Sync batch extraction
+- `batch_extract_bytes(data_list, mime_types, config=None, **kwargs)` – Async batch from bytes
+- `batch_extract_bytes_sync(data_list, mime_types, config=None, **kwargs)` – Sync batch from bytes
+
+### Configuration Classes
+
+- `ExtractionConfig` – Main configuration
+- `OcrConfig` – OCR settings
+- `TesseractConfig` – Tesseract-specific options
+- `ChunkingConfig` – Text chunking settings
+- `ImageExtractionConfig` – Image extraction settings
+- `PdfConfig` – PDF-specific options
+- `TokenReductionConfig` – Token reduction settings
+- `LanguageDetectionConfig` – Language detection settings
+
+### Result Types
+
+- `ExtractionResult` – Main result object with `content`, `metadata`, `tables`, `detected_languages`, `chunks`
+- `Table` – Table with `cells`, `markdown`, `page_number`
+- `Metadata` – Typed metadata dictionary
+
+### Exceptions
+
+- `KreuzbergError` – Base exception
+- `ValidationError` – Invalid configuration or input
+- `ParsingError` – Document parsing failure
+- `OcrError` – OCR processing failure
+- `MissingDependencyError` – Missing optional dependency
+
+## Examples
+
+### Custom Processing
+
+```python
+from kreuzberg import extract_file_sync
+
+result = extract_file_sync("document.pdf")
+
+text = result.content
+text = text.lower()
+text = text.replace("old", "new")
+
+print(text)
+```
+
+### Multiple Files with Progress
+
+```python
+from kreuzberg import extract_file_sync
+from pathlib import Path
+
+files = list(Path("documents").glob("*.pdf"))
+results = []
+
+for i, file in enumerate(files, 1):
+    print(f"Processing {i}/{len(files)}: {file.name}")
+    result = extract_file_sync(str(file))
+    results.append((file.name, result))
+
+for name, result in results:
+    print(f"{name}: {len(result.content)} characters")
+```
+
+### Filter by Language
+
+```python
+from kreuzberg import extract_file_sync, ExtractionConfig, LanguageDetectionConfig
+
+config = ExtractionConfig(
+    language_detection=LanguageDetectionConfig(enabled=True)
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+if result.detected_languages and "en" in result.detected_languages:
+    print("English document detected")
+    print(result.content)
+```
+
+## System Requirements
+
+### ONNX Runtime (for embeddings)
+
+If using embeddings functionality, ONNX Runtime version 1.22.x must be installed:
+
+```bash
+# macOS
+brew install onnxruntime
+
+# Ubuntu/Debian (download from GitHub - Debian packages may have older versions)
+# Download from https://github.com/microsoft/onnxruntime/releases
+
+# Windows
+# Download from https://github.com/microsoft/onnxruntime/releases
+```
+
+**Important:** Kreuzberg requires ONNX Runtime version 1.22.x for embeddings.
+
+Without ONNX Runtime, embeddings will raise `MissingDependencyError` with installation instructions.
+
+### Tesseract OCR (Required for OCR)
+
+```bash
+brew install tesseract
+```
+
+```bash
+sudo apt-get install tesseract-ocr
+```
+
+### Pandoc (Optional, for some formats)
+
+```bash
+brew install pandoc
+```
+
+```bash
+sudo apt-get install pandoc
+```
+
+## Troubleshooting
+
+### Import Error: No module named '\_kreuzberg'
+
+This usually means the Rust extension wasn't built correctly. Try:
+
+```bash
+pip install --force-reinstall --no-cache-dir kreuzberg
+```
+
+### OCR Not Working
+
+Make sure Tesseract is installed:
+
+```bash
+tesseract --version
+```
+
+### Memory Issues with Large PDFs
+
+Use streaming or enable chunking:
+
+```python
+config = ExtractionConfig(
+    chunking=ChunkingConfig(max_chars=1000)
+)
+```
+
+## PDFium Integration
+
+PDF extraction is powered by PDFium, which is automatically bundled with this package. No system installation required.
+
+### Platform Support
+
+| Platform       | Status | Notes   |
+| -------------- | ------ | ------- |
+| Linux x86_64   | ✅     | Bundled |
+| macOS ARM64    | ✅     | Bundled |
+| macOS x86_64   | ✅     | Bundled |
+| Windows x86_64 | ✅     | Bundled |
+
+### Binary Size Impact
+
+PDFium adds approximately 8-15 MB to the package size depending on platform. This ensures consistent PDF extraction across all environments without external dependencies.
+
+## Documentation
+
+For comprehensive documentation, visit [https://kreuzberg.dev](https://kreuzberg.dev)
+
+## Part of Kreuzberg.dev
+
+- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
+- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
+- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
+- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
+- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
+- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
+- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
+
+## License
+
+Elastic License 2.0 (ELv2) — see [LICENSE](https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE) for details.
--- a/packages/python/kreuzberg/__init__.py
+++ b/packages/python/kreuzberg/__init__.py
@@ -0,0 +1,497 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+"""Public API for _kreuzberg.
+
+Version: 5.0.0-rc.3
+"""
+
+from ._kreuzberg import (
+    AnnotationKind,
+    ArchiveEntry,
+    BatchBytesItem,
+    BatchFileItem,
+    BBox,
+    BlockType,
+    CacheStats,
+    CellChange,
+    Chunk,
+    ChunkerType,
+    ChunkMetadata,
+    ChunkSizing,
+    ChunkType,
+    CodeContentMode,
+    ContentLayer,
+    ContributorRole,
+    DbfFieldInfo,
+    DetectionResult,
+    DetectResponse,
+    DiffHunk,
+    DiffLine,
+    DjotContent,
+    DjotImage,
+    DjotLink,
+    DocumentNode,
+    DocumentRelationship,
+    DocumentRevision,
+    Element,
+    ElementMetadata,
+    ElementType,
+    EmailAttachment,
+    EmailExtractionResult,
+    EmbeddedChanges,
+    EmbeddedDiff,
+    EmbeddedFile,
+    EmbeddingModelType,
+    EmbeddingPreset,
+    ErrorMetadata,
+    ExcelSheet,
+    ExcelWorkbook,
+    ExecutionProviderType,
+    ExtractedImage,
+    ExtractedUri,
+    ExtractionDiff,
+    ExtractionMethod,
+    Footnote,
+    FormatMetadata,
+    FormattedBlock,
+    GridCell,
+    HeaderMetadata,
+    HeadingContext,
+    HeadingLevel,
+    HierarchicalBlock,
+    HtmlTheme,
+    ImageKind,
+    ImageMetadataType,
+    ImagePreprocessingMetadata,
+    ImageType,
+    InlineElement,
+    InlineType,
+    Keyword,
+    KeywordAlgorithm,
+    LayoutClass,
+    LayoutDetection,
+    LinkMetadata,
+    LinkType,
+    ListType,
+    ModelPaths,
+    NodeContent,
+    OcrBackendType,
+    OcrBoundingGeometry,
+    OcrElementLevel,
+    OcrExtractionResult,
+    OcrPipelineConfig,
+    OcrPipelineStage,
+    OcrRotation,
+    OcrTable,
+    OcrTableBoundingBox,
+    OrientationResult,
+    OutputFormat,
+    PaddleLanguage,
+    PageBoundary,
+    PageContent,
+    PageHierarchy,
+    PageInfo,
+    PageStructure,
+    PageUnitType,
+    PdfAnnotation,
+    PdfAnnotationType,
+    PptxExtractionResult,
+    ProcessingStage,
+    ProcessingWarning,
+    PSMMode,
+    RecognizedTable,
+    ReductionLevel,
+    RelationshipKind,
+    ResultFormat,
+    RevisionAnchor,
+    RevisionKind,
+    StructuredData,
+    StructuredDataResult,
+    StructuredDataType,
+    StructuredExtractionConfig,
+    SupportedFormat,
+    TableDiff,
+    TableModel,
+    TextAnnotation,
+    TextDirection,
+    TextExtractionResult,
+    UriKind,
+    XmlExtractionResult,
+    YearRange,
+)
+from .api import (
+    batch_extract_bytes,
+    batch_extract_bytes_sync,
+    batch_extract_files,
+    batch_extract_files_sync,
+    clear_document_extractors,
+    clear_embedding_backends,
+    clear_ocr_backends,
+    clear_post_processors,
+    clear_renderers,
+    clear_validators,
+    compare,
+    detect_mime_type,
+    detect_mime_type_from_bytes,
+    embed_texts,
+    embed_texts_async,
+    extract_bytes,
+    extract_bytes_sync,
+    extract_file,
+    extract_file_sync,
+    get_embedding_preset,
+    get_extensions_for_mime,
+    list_document_extractors,
+    list_embedding_backends,
+    list_embedding_presets,
+    list_ocr_backends,
+    list_post_processors,
+    list_renderers,
+    list_validators,
+    register_document_extractor,
+    register_embedding_backend,
+    register_ocr_backend,
+    register_post_processor,
+    register_renderer,
+    register_validator,
+    render_pdf_page_to_png,
+    unregister_document_extractor,
+    unregister_embedding_backend,
+    unregister_ocr_backend,
+    unregister_post_processor,
+    unregister_renderer,
+    unregister_validator,
+)
+from .exceptions import (
+    CacheError,
+    CancelledError,
+    EmbeddingError,
+    ImageProcessingError,
+    IoError,
+    KreuzbergError,
+    KreuzbergTimeoutError,
+    LockPoisonedError,
+    MissingDependencyError,
+    OcrError,
+    OtherError,
+    ParsingError,
+    PluginError,
+    SecurityError,
+    SerializationError,
+    UnsupportedFormatError,
+    ValidationError,
+)
+from .options import (
+    AccelerationConfig,
+    ArchiveMetadata,
+    BibtexMetadata,
+    BoundingBox,
+    ChunkingConfig,
+    CitationMetadata,
+    ContentFilterConfig,
+    CoreProperties,
+    CsvMetadata,
+    DbfMetadata,
+    DiffOptions,
+    DocumentStructure,
+    DocxAppProperties,
+    DocxMetadata,
+    EmailConfig,
+    EmailMetadata,
+    EmbeddingConfig,
+    EpubMetadata,
+    ExcelMetadata,
+    ExtractionConfig,
+    ExtractionResult,
+    FictionBookMetadata,
+    FileExtractionConfig,
+    HierarchyConfig,
+    HtmlMetadata,
+    HtmlOutputConfig,
+    ImageExtractionConfig,
+    ImageMetadata,
+    ImagePreprocessingConfig,
+    JatsMetadata,
+    KeywordConfig,
+    LanguageDetectionConfig,
+    LayoutDetectionConfig,
+    LayoutRegion,
+    LlmConfig,
+    LlmUsage,
+    Metadata,
+    OcrConfidence,
+    OcrConfig,
+    OcrElement,
+    OcrElementConfig,
+    OcrMetadata,
+    OcrQualityThresholds,
+    PaddleOcrConfig,
+    PageConfig,
+    PdfConfig,
+    PdfMetadata,
+    PostProcessorConfig,
+    PptxAppProperties,
+    PptxMetadata,
+    PstMetadata,
+    RakeParams,
+    RevisionDelta,
+    SecurityLimits,
+    ServerConfig,
+    Table,
+    TableCell,
+    TableGrid,
+    TesseractConfig,
+    TextMetadata,
+    TokenReductionConfig,
+    TokenReductionOptions,
+    TreeSitterConfig,
+    TreeSitterProcessConfig,
+    XlsxAppProperties,
+    XmlMetadata,
+    YakeParams,
+)
+
+__all__ = [
+    "AccelerationConfig",
+    "AnnotationKind",
+    "ArchiveEntry",
+    "ArchiveMetadata",
+    "BBox",
+    "BatchBytesItem",
+    "BatchFileItem",
+    "BibtexMetadata",
+    "BlockType",
+    "BoundingBox",
+    "CacheError",
+    "CacheStats",
+    "CancelledError",
+    "CellChange",
+    "Chunk",
+    "ChunkMetadata",
+    "ChunkSizing",
+    "ChunkType",
+    "ChunkerType",
+    "ChunkingConfig",
+    "CitationMetadata",
+    "CodeContentMode",
+    "ContentFilterConfig",
+    "ContentLayer",
+    "ContributorRole",
+    "CoreProperties",
+    "CsvMetadata",
+    "DbfFieldInfo",
+    "DbfMetadata",
+    "DetectResponse",
+    "DetectionResult",
+    "DiffHunk",
+    "DiffLine",
+    "DiffOptions",
+    "DjotContent",
+    "DjotImage",
+    "DjotLink",
+    "DocumentNode",
+    "DocumentRelationship",
+    "DocumentRevision",
+    "DocumentStructure",
+    "DocxAppProperties",
+    "DocxMetadata",
+    "Element",
+    "ElementMetadata",
+    "ElementType",
+    "EmailAttachment",
+    "EmailConfig",
+    "EmailExtractionResult",
+    "EmailMetadata",
+    "EmbeddedChanges",
+    "EmbeddedDiff",
+    "EmbeddedFile",
+    "EmbeddingConfig",
+    "EmbeddingError",
+    "EmbeddingModelType",
+    "EmbeddingPreset",
+    "EpubMetadata",
+    "ErrorMetadata",
+    "ExcelMetadata",
+    "ExcelSheet",
+    "ExcelWorkbook",
+    "ExecutionProviderType",
+    "ExtractedImage",
+    "ExtractedUri",
+    "ExtractionConfig",
+    "ExtractionDiff",
+    "ExtractionMethod",
+    "ExtractionResult",
+    "FictionBookMetadata",
+    "FileExtractionConfig",
+    "Footnote",
+    "FormatMetadata",
+    "FormattedBlock",
+    "GridCell",
+    "HeaderMetadata",
+    "HeadingContext",
+    "HeadingLevel",
+    "HierarchicalBlock",
+    "HierarchyConfig",
+    "HtmlMetadata",
+    "HtmlOutputConfig",
+    "HtmlTheme",
+    "ImageExtractionConfig",
+    "ImageKind",
+    "ImageMetadata",
+    "ImageMetadataType",
+    "ImagePreprocessingConfig",
+    "ImagePreprocessingMetadata",
+    "ImageProcessingError",
+    "ImageType",
+    "InlineElement",
+    "InlineType",
+    "IoError",
+    "JatsMetadata",
+    "Keyword",
+    "KeywordAlgorithm",
+    "KeywordConfig",
+    "KreuzbergError",
+    "KreuzbergTimeoutError",
+    "LanguageDetectionConfig",
+    "LayoutClass",
+    "LayoutDetection",
+    "LayoutDetectionConfig",
+    "LayoutRegion",
+    "LinkMetadata",
+    "LinkType",
+    "ListType",
+    "LlmConfig",
+    "LlmUsage",
+    "LockPoisonedError",
+    "Metadata",
+    "MissingDependencyError",
+    "ModelPaths",
+    "NodeContent",
+    "OcrBackendType",
+    "OcrBoundingGeometry",
+    "OcrConfidence",
+    "OcrConfig",
+    "OcrElement",
+    "OcrElementConfig",
+    "OcrElementLevel",
+    "OcrError",
+    "OcrExtractionResult",
+    "OcrMetadata",
+    "OcrPipelineConfig",
+    "OcrPipelineStage",
+    "OcrQualityThresholds",
+    "OcrRotation",
+    "OcrTable",
+    "OcrTableBoundingBox",
+    "OrientationResult",
+    "OtherError",
+    "OutputFormat",
+    "PSMMode",
+    "PaddleLanguage",
+    "PaddleOcrConfig",
+    "PageBoundary",
+    "PageConfig",
+    "PageContent",
+    "PageHierarchy",
+    "PageInfo",
+    "PageStructure",
+    "PageUnitType",
+    "ParsingError",
+    "PdfAnnotation",
+    "PdfAnnotationType",
+    "PdfConfig",
+    "PdfMetadata",
+    "PluginError",
+    "PostProcessorConfig",
+    "PptxAppProperties",
+    "PptxExtractionResult",
+    "PptxMetadata",
+    "ProcessingStage",
+    "ProcessingWarning",
+    "PstMetadata",
+    "RakeParams",
+    "RecognizedTable",
+    "ReductionLevel",
+    "RelationshipKind",
+    "ResultFormat",
+    "RevisionAnchor",
+    "RevisionDelta",
+    "RevisionKind",
+    "SecurityError",
+    "SecurityLimits",
+    "SerializationError",
+    "ServerConfig",
+    "StructuredData",
+    "StructuredDataResult",
+    "StructuredDataType",
+    "StructuredExtractionConfig",
+    "SupportedFormat",
+    "Table",
+    "TableCell",
+    "TableDiff",
+    "TableGrid",
+    "TableModel",
+    "TesseractConfig",
+    "TextAnnotation",
+    "TextDirection",
+    "TextExtractionResult",
+    "TextMetadata",
+    "TokenReductionConfig",
+    "TokenReductionOptions",
+    "TreeSitterConfig",
+    "TreeSitterProcessConfig",
+    "UnsupportedFormatError",
+    "UriKind",
+    "ValidationError",
+    "XlsxAppProperties",
+    "XmlExtractionResult",
+    "XmlMetadata",
+    "YakeParams",
+    "YearRange",
+    "batch_extract_bytes",
+    "batch_extract_bytes_sync",
+    "batch_extract_files",
+    "batch_extract_files_sync",
+    "clear_document_extractors",
+    "clear_embedding_backends",
+    "clear_ocr_backends",
+    "clear_post_processors",
+    "clear_renderers",
+    "clear_validators",
+    "compare",
+    "detect_mime_type",
+    "detect_mime_type_from_bytes",
+    "embed_texts",
+    "embed_texts_async",
+    "extract_bytes",
+    "extract_bytes_sync",
+    "extract_file",
+    "extract_file_sync",
+    "get_embedding_preset",
+    "get_extensions_for_mime",
+    "list_document_extractors",
+    "list_embedding_backends",
+    "list_embedding_presets",
+    "list_ocr_backends",
+    "list_post_processors",
+    "list_renderers",
+    "list_validators",
+    "register_document_extractor",
+    "register_embedding_backend",
+    "register_ocr_backend",
+    "register_post_processor",
+    "register_renderer",
+    "register_validator",
+    "render_pdf_page_to_png",
+    "unregister_document_extractor",
+    "unregister_embedding_backend",
+    "unregister_ocr_backend",
+    "unregister_post_processor",
+    "unregister_renderer",
+    "unregister_validator",
+]
+
+__version__ = "5.0.0-rc.3"  # NOTE(review): pyproject.toml declares "5.0.0rc3", the PEP 440-normalized spelling of this same version
--- a/packages/python/kreuzberg/_kreuzberg.pyi
+++ b/packages/python/kreuzberg/_kreuzberg.pyi
--- a/packages/python/kreuzberg/api.py
+++ b/packages/python/kreuzberg/api.py
--- a/packages/python/kreuzberg/exceptions.py
+++ b/packages/python/kreuzberg/exceptions.py
@@ -0,0 +1,74 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+"""Exception hierarchy."""
+
+
+class KreuzbergError(Exception):
+    """Main error type for all Kreuzberg operations; every exception in this module derives from it."""
+
+
+class IoError(KreuzbergError):
+    """I/O error (a ``KreuzbergError``; not a subclass of the builtin ``OSError``)."""
+
+
+class ParsingError(KreuzbergError):
+    """Parsing error."""
+
+
+class OcrError(KreuzbergError):
+    """OCR error."""
+
+
+class ValidationError(KreuzbergError):
+    """Validation error."""
+
+
+class CacheError(KreuzbergError):
+    """Cache error."""
+
+
+class ImageProcessingError(KreuzbergError):
+    """Image processing error."""
+
+
+class SerializationError(KreuzbergError):
+    """Serialization error."""
+
+
+class MissingDependencyError(KreuzbergError):
+    """Missing dependency error."""
+
+
+class PluginError(KreuzbergError):
+    """Plugin error."""
+
+
+class LockPoisonedError(KreuzbergError):
+    """Lock poisoned error."""
+
+
+class UnsupportedFormatError(KreuzbergError):
+    """Unsupported format error."""
+
+
+class EmbeddingError(KreuzbergError):
+    """Embedding error."""
+
+
+class KreuzbergTimeoutError(KreuzbergError):
+    """Kreuzberg timeout error (a ``KreuzbergError``; not a subclass of the builtin ``TimeoutError``)."""
+
+
+class CancelledError(KreuzbergError):
+    """Cancelled error (a ``KreuzbergError``; not a subclass of ``asyncio.CancelledError``)."""
+
+
+class SecurityError(KreuzbergError):
+    """Security error."""
+
+
+class OtherError(KreuzbergError):
+    """Other error."""
--- a/packages/python/kreuzberg/options.py
+++ b/packages/python/kreuzberg/options.py
--- a/packages/python/kreuzberg/py.typed
+++ b/packages/python/kreuzberg/py.typed
--- a/packages/python/pyproject.toml
+++ b/packages/python/pyproject.toml
@@ -0,0 +1,98 @@
+[build-system]
+build-backend = "maturin"
+requires = [ "maturin>=1,<2" ]
+
+[project]
+name = "kreuzberg"
+version = "5.0.0rc3"
+description = "High-performance document intelligence library"
+keywords = [ "document", "extraction", "ocr", "pdf", "text" ]
+license = "Elastic-2.0"
+license-files = [ "LICENSE" ]
+authors = [ { name = "Na'aman Hirschfeld", email = "naaman@kreuzberg.dev" } ]
+requires-python = ">=3.10"
+classifiers = [
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: 3.14",
+]
+urls.repository = "https://github.com/kreuzberg-dev/kreuzberg"
+urls.homepage = "https://kreuzberg.dev"
+
+[dependency-groups]
+dev = [ "mypy>=1.19", "ruff>=0.14.8" ]
+
+[tool.maturin]
+module-name = "kreuzberg._kreuzberg"
+manifest-path = "../../crates/kreuzberg-py/Cargo.toml"
+# abi3-py310 produces a single wheel per platform that loads on Python 3.10+,
+# avoiding a per-Python-version build matrix.
+features = [ "pyo3/extension-module", "pyo3/abi3-py310" ]
+python-packages = [ "kreuzberg" ]
+
+[tool.ruff]
+target-version = "py310"
+line-length = 120
+format.docstring-code-line-length = 120
+format.docstring-code-format = true
+lint.select = [ "ALL" ]
+lint.ignore = [
+  "ANN401",
+  "ASYNC109",
+  "ASYNC110",
+  "BLE001",
+  "COM812",
+  "D100",
+  "D104",
+  "D107",
+  "D205",
+  "E501",
+  "EM",
+  "FBT",
+  "FIX",
+  "ISC001",
+  "PD011",
+  "PGH003",
+  "PLR2004",
+  "PLW0603",
+  "S104",
+  "S110",
+  "S603",
+  "TD",
+  "TRY",
+]
+lint.per-file-ignores."kreuzberg/__init__.py" = [ "I001" ]
+# The alef Python codegen still emits cosmetic warnings on the wrapper
+# modules: api.py keeps the legacy `from typing import AsyncIterator` and a
+# single-line import block; options.py carries `# noqa: TC001` / `F401`
+# markers that turn out to be unused on every regen; and the __init__.py
+# star-imports are re-sorted with a different convention. Silence these
+# specific rules on the wrappers until the codegen emits ruff-clean output.
+lint.per-file-ignores."kreuzberg/api.py" = [ "F401", "I001", "UP035" ]
+lint.per-file-ignores."kreuzberg/options.py" = [ "F401", "RUF100" ]
+lint.per-file-ignores."tests/**" = [ "ANN", "D103", "PLR2004", "S101" ]
+lint.mccabe.max-complexity = 15
+lint.pydocstyle.convention = "google"
+lint.pylint.max-args = 10
+lint.pylint.max-branches = 15
+lint.pylint.max-returns = 10
+
+[tool.mypy]
+python_version = "3.10"
+strict = true
+show_error_codes = true
+implicit_reexport = false
+namespace_packages = true
+overrides = [
+  # The alef-emitted `api.py` wrapper has a structural mismatch between its
+  # `options.*` dataclass signatures and the `_internal_bindings.*` pyclass
+  # types pyo3 accepts/returns at runtime. pyo3 reconciles them dynamically via
+  # FromPyObject — the Python e2e suite exercises the runtime path — but mypy
+  # sees only the static-type discrepancy. Disable the four error codes the
+  # discrepancy raises until the codegen emits matching `_to_rust_*` calls and
+  # casts the return values.
+  { module = "kreuzberg.api", disable_error_code = [ "call-arg", "arg-type", "return-value", "attr-defined" ] },
+]