Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/python/advanced/advanced_config.md
+++ b/docs/snippets/python/advanced/advanced_config.md
@@ -0,0 +1,29 @@
+```python title="Python"
+from kreuzberg import (
+    extract_file_sync,
+    ExtractionConfig,
+    OcrConfig,
+    ChunkingConfig,
+    TokenReductionConfig,
+    LanguageDetectionConfig,
+)
+
+config = ExtractionConfig(
+    ocr=OcrConfig(backend="tesseract", language="eng+deu"),
+    chunking=ChunkingConfig(max_chars=1000, max_overlap=100),
+    token_reduction=TokenReductionConfig(mode="light"),
+    language_detection=LanguageDetectionConfig(
+        enabled=True, detect_multiple=True
+    ),
+    use_cache=True,
+    enable_quality_processing=True,
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+for chunk in result.chunks or []:
+    print(f"Chunk: {chunk.content[:100]}")
+
+if result.detected_languages:
+    print(f"Languages: {result.detected_languages}")
+```
--- a/docs/snippets/python/advanced/chunk_page_mapping.md
+++ b/docs/snippets/python/advanced/chunk_page_mapping.md
@@ -0,0 +1,18 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig
+
+config = ExtractionConfig(
+    chunking=ChunkingConfig(max_characters=500, overlap=50),
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+if result.chunks:
+    for chunk in result.chunks:
+        first = chunk.metadata.first_page
+        last = chunk.metadata.last_page
+        if first is None:
+            continue
+        page_range = f"Page {first}" if first == last else f"Pages {first}-{last}"
+        print(f"Chunk: {chunk.content[:50]}... ({page_range})")
+```
--- a/docs/snippets/python/advanced/chunking_config.md
+++ b/docs/snippets/python/advanced/chunking_config.md
@@ -0,0 +1,59 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
+
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            max_characters=1000,
+            overlap=200,
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    for chunk in result.chunks or []:
+        print(f"Length: {len(chunk.content)}")
+
+
+asyncio.run(main())
+```
+
+```python title="Python - Semantic"
+import asyncio
+from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
+
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(chunker_type="semantic")
+    )
+    result = await extract_file("document.pdf", config=config)
+    for chunk in result.chunks or []:
+        print(f"Content: {chunk.content[:100]}...")
+
+
+asyncio.run(main())
+```
+
+```python title="Python - Prepend Heading Context"
+import asyncio
+from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
+
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            chunker_type="markdown",
+            max_characters=500,
+            overlap=50,
+            prepend_heading_context=True,
+        )
+    )
+    result = await extract_file("document.md", config=config)
+    for chunk in result.chunks or []:
+        # Each chunk's content is prefixed with its heading breadcrumb
+        print(f"Content: {chunk.content[:100]}...")
+
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/advanced/chunking_rag.md
+++ b/docs/snippets/python/advanced/chunking_rag.md
@@ -0,0 +1,39 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    ExtractionConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+    extract_file,
+)
+
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            max_characters=500,
+            overlap=50,
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
+                normalize=True,
+            ),
+        )
+    )
+
+    result = await extract_file("research_paper.pdf", config=config)
+
+    for chunk in result.chunks or []:
+        print(
+            f"Chunk {chunk.metadata.chunk_index + 1}/{chunk.metadata.total_chunks}"
+        )
+        print(
+            f"Position: {chunk.metadata.byte_start}-{chunk.metadata.byte_end}"
+        )
+        print(f"Content: {chunk.content[:100]}...")
+        if chunk.embedding is not None:
+            print(f"Embedding: {len(chunk.embedding)} dimensions")
+
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/advanced/combining_all_features.md
+++ b/docs/snippets/python/advanced/combining_all_features.md
@@ -0,0 +1,34 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    extract_file,
+    ExtractionConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+    LanguageDetectionConfig,
+    TokenReductionConfig,
+)
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        enable_quality_processing=True,
+        language_detection=LanguageDetectionConfig(enabled=True),
+        token_reduction=TokenReductionConfig(mode="moderate"),
+        chunking=ChunkingConfig(
+            max_chars=512,
+            max_overlap=50,
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.preset("balanced"), normalize=True
+            ),
+        ),
+    )
+    result = await extract_file("document.pdf", config=config)
+    quality = result.quality_score or 0
+    print(f"Quality: {quality:.2f}")
+    print(f"Languages: {result.detected_languages}")
+    if result.chunks:
+        print(f"Chunks: {len(result.chunks)}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/advanced/complete_example.md
+++ b/docs/snippets/python/advanced/complete_example.md
@@ -0,0 +1,36 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    extract_file,
+    ExtractionConfig,
+    OcrConfig,
+    TesseractConfig,
+    PdfConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+)
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        use_cache=True,
+        enable_quality_processing=True,
+        ocr=OcrConfig(
+            backend="tesseract",
+            language="eng+fra",
+            tesseract_config=TesseractConfig(psm=3),
+        ),
+        pdf_options=PdfConfig(extract_images=True),
+        chunking=ChunkingConfig(
+            max_chars=1000,
+            max_overlap=200,
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.preset("all-MiniLM-L6-v2")
+            ),
+        ),
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Content: {result.content[:100]}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/advanced/embedding_with_chunking.md
+++ b/docs/snippets/python/advanced/embedding_with_chunking.md
@@ -0,0 +1,21 @@
+```python title="Python"
+from kreuzberg import (
+    ExtractionConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+)
+
+config: ExtractionConfig = ExtractionConfig(
+    chunking=ChunkingConfig(
+        max_characters=1024,
+        overlap=100,
+        embedding=EmbeddingConfig(
+            model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
+            normalize=True,
+            batch_size=32,
+            show_download_progress=False,
+        ),
+    )
+)
+```
--- a/docs/snippets/python/advanced/keyword_extraction_config.md
+++ b/docs/snippets/python/advanced/keyword_extraction_config.md
@@ -0,0 +1,13 @@
+```python title="Python"
+from kreuzberg import ExtractionConfig, KeywordConfig, KeywordAlgorithm
+
+config: ExtractionConfig = ExtractionConfig(
+    keywords=KeywordConfig(
+        algorithm=KeywordAlgorithm.YAKE,
+        max_keywords=10,
+        min_score=0.3,
+        ngram_range=[1, 3],
+        language="en",
+    )
+)
+```
--- a/docs/snippets/python/advanced/keyword_extraction_example.md
+++ b/docs/snippets/python/advanced/keyword_extraction_example.md
@@ -0,0 +1,27 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    ExtractionConfig,
+    KeywordConfig,
+    KeywordAlgorithm,
+    extract_file,
+)
+
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.YAKE,
+            max_keywords=10,
+            min_score=0.3,
+        )
+    )
+
+    result = await extract_file("research_paper.pdf", config=config)
+
+    for keyword in result.extracted_keywords or []:
+        print(f"{keyword.text}: {keyword.score:.3f}")
+
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/advanced/language_detection_config.md
+++ b/docs/snippets/python/advanced/language_detection_config.md
@@ -0,0 +1,11 @@
+```python title="Python"
+from kreuzberg import ExtractionConfig, LanguageDetectionConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    language_detection=LanguageDetectionConfig(
+        enabled=True,
+        min_confidence=0.8,
+        detect_multiple=False,
+    )
+)
+```
--- a/docs/snippets/python/advanced/language_detection_multilingual.md
+++ b/docs/snippets/python/advanced/language_detection_multilingual.md
@@ -0,0 +1,21 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
+
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        language_detection=LanguageDetectionConfig(
+            enabled=True,
+            min_confidence=0.8,
+            detect_multiple=True,
+        )
+    )
+
+    result = await extract_file("multilingual_document.pdf", config=config)
+
+    print(f"Detected languages: {result.detected_languages}")
+
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/advanced/quality_processing_config.md
+++ b/docs/snippets/python/advanced/quality_processing_config.md
@@ -0,0 +1,7 @@
+```python title="Python"
+from kreuzberg import ExtractionConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    enable_quality_processing=True,
+)
+```
--- a/docs/snippets/python/advanced/quality_processing_example.md
+++ b/docs/snippets/python/advanced/quality_processing_example.md
@@ -0,0 +1,21 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, extract_file
+
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        enable_quality_processing=True,
+    )
+
+    result = await extract_file("scanned_document.pdf", config=config)
+
+    if result.quality_score is not None:
+        if result.quality_score < 0.5:
+            print(f"Warning: Low quality extraction ({result.quality_score:.2f})")
+        else:
+            print(f"Quality score: {result.quality_score:.2f}")
+
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/advanced/token_reduction_config.md
+++ b/docs/snippets/python/advanced/token_reduction_config.md
@@ -0,0 +1,12 @@
+```python title="Python"
+from kreuzberg import ExtractionConfig, TokenReductionConfig, ReductionLevel
+
+config: ExtractionConfig = ExtractionConfig(
+    token_reduction=TokenReductionConfig(
+        level=ReductionLevel.MODERATE,
+        preserve_markdown=True,
+        preserve_code=True,
+        language_hint="eng",
+    )
+)
+```
--- a/docs/snippets/python/advanced/token_reduction_example.md
+++ b/docs/snippets/python/advanced/token_reduction_example.md
@@ -0,0 +1,25 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    ExtractionConfig,
+    TokenReductionConfig,
+    ReductionLevel,
+    extract_file,
+)
+
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        token_reduction=TokenReductionConfig(
+            level=ReductionLevel.MODERATE,
+            preserve_markdown=True,
+        )
+    )
+
+    result = await extract_file("verbose_document.pdf", config=config)
+
+    print(f"Reduced content length: {len(result.content)} chars")
+
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/advanced/vector_database_integration.md
+++ b/docs/snippets/python/advanced/vector_database_integration.md
@@ -0,0 +1,59 @@
+```python title="Python"
+import asyncio
+from dataclasses import dataclass, field
+from kreuzberg import (
+    ExtractionConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+    extract_file,
+)
+
+
+@dataclass
+class VectorRecord:  # one chunk plus its embedding, as built by extract_and_vectorize
+    id: str  # "<document_id>_chunk_<index>"
+    content: str  # the chunk's content
+    embedding: list[float]  # the chunk's embedding
+    metadata: dict[str, str] = field(default_factory=dict)  # string-valued extras
+
+
+async def extract_and_vectorize(
+    document_path: str,
+    document_id: str,
+) -> list[VectorRecord]:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            max_characters=512,
+            overlap=50,
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
+                normalize=True,
+                batch_size=32,
+            ),
+        )
+    )
+
+    result = await extract_file(document_path, config=config)
+
+    records: list[VectorRecord] = []
+    for index, chunk in enumerate(result.chunks or []):
+        if chunk.embedding is None:
+            continue
+        records.append(
+            VectorRecord(
+                id=f"{document_id}_chunk_{index}",
+                content=chunk.content,
+                embedding=chunk.embedding,
+                metadata={
+                    "document_id": document_id,
+                    "chunk_index": str(index),
+                    "content_length": str(len(chunk.content)),
+                },
+            )
+        )
+    return records
+
+
+asyncio.run(extract_and_vectorize("document.pdf", "doc_001"))
+```
--- a/docs/snippets/python/api/batch_extract_bytes_sync.md
+++ b/docs/snippets/python/api/batch_extract_bytes_sync.md
@@ -0,0 +1,13 @@
+```python title="Python"
+from kreuzberg import batch_extract_bytes_sync, BatchBytesItem, ExtractionConfig
+
+items = [
+    BatchBytesItem(content=b"PDF content", mime_type="application/pdf"),
+    BatchBytesItem(content=b"<html>...</html>", mime_type="text/html"),
+]
+
+results = batch_extract_bytes_sync(items, ExtractionConfig())
+
+for i, result in enumerate(results):
+    print(f"Item {i}: {len(result.content)} chars extracted")
+```
--- a/docs/snippets/python/api/batch_extract_files_sync.md
+++ b/docs/snippets/python/api/batch_extract_files_sync.md
@@ -0,0 +1,14 @@
+```python title="Python"
+from kreuzberg import batch_extract_files_sync, BatchFileItem, ExtractionConfig
+
+items = [
+    BatchFileItem(path="doc1.pdf"),
+    BatchFileItem(path="doc2.docx"),
+    BatchFileItem(path="doc3.html"),
+]
+
+results = batch_extract_files_sync(items, ExtractionConfig())
+
+for i, result in enumerate(results):
+    print(f"Document {i}: {len(result.content)} chars, {len(result.tables)} tables")
+```
--- a/docs/snippets/python/api/client_chunk_text.md
+++ b/docs/snippets/python/api/client_chunk_text.md
@@ -0,0 +1,29 @@
+```python title="Python"
+import asyncio
+
+import httpx
+
+
+async def main() -> None:
+    payload = {
+        "text": "Your long text content here...",
+        "chunker_type": "text",
+        "config": {
+            "max_characters": 1000,
+            "overlap": 50,
+            "trim": True,
+        },
+    }
+
+    async with httpx.AsyncClient() as client:
+        response = await client.post("http://localhost:8000/chunk", json=payload)
+    result = response.json()
+
+    print(f"Created {result['chunk_count']} chunks")
+    for chunk in result["chunks"]:
+        preview = chunk["content"][:50]
+        print(f"Chunk {chunk['chunk_index']}: {preview}...")
+
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/api/client_extract_single_file.md
+++ b/docs/snippets/python/api/client_extract_single_file.md
@@ -0,0 +1,19 @@
+```python title="Python"
+import asyncio
+import json
+
+import httpx
+
+
+async def main() -> None:
+    async with httpx.AsyncClient() as client, open("document.pdf", "rb") as f:
+        response = await client.post(
+            "http://localhost:8000/extract",
+            files={"files": f},
+        )
+    data = response.json()
+    print(json.dumps(data, indent=2))
+
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/api/combining_all_features.md
+++ b/docs/snippets/python/api/combining_all_features.md
@@ -0,0 +1,52 @@
+```python title="Python"
+from kreuzberg import (
+    ExtractionConfig,
+    OcrConfig,
+    ChunkingConfig,
+    ChunkerType,
+    ImageExtractionConfig,
+    OutputFormat,
+    extract_file_sync,
+)
+
+config = ExtractionConfig(
+    # OCR: extract text from images, fallback to Tesseract
+    ocr=OcrConfig(
+        enabled=True,
+        backend="tesseract",
+        language="eng",
+    ),
+    # Chunking: semantic markdown chunks of ~800 chars, 100-char overlap
+    chunking=ChunkingConfig(
+        max_characters=800,
+        overlap=100,
+        chunker_type=ChunkerType.Markdown,
+        prepend_heading_context=True,
+    ),
+    # Output: Markdown format with document structure preserved
+    output_format=OutputFormat.Markdown,
+    include_document_structure=True,
+    # Images: extract embedded images
+    images=ImageExtractionConfig(
+        extract_images=True,
+    ),
+    # Cache extracted results on disk
+    use_cache=True,
+)
+
+result = extract_file_sync("report.pdf", config=config)
+
+print(f"Content ({len(result.content)} chars):")
+print(result.content[:200])
+
+if result.chunks:
+    print(f"\nChunks: {len(result.chunks)}")
+
+print(f"Tables: {len(result.tables)}")
+
+if result.detected_languages:
+    print(f"Languages: {result.detected_languages}")
+
+if result.extraction_method:
+    print(f"Extraction method: {result.extraction_method}")
+```
--- a/docs/snippets/python/api/error_handling.md
+++ b/docs/snippets/python/api/error_handling.md
@@ -0,0 +1,11 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, KreuzbergError
+
+config = ExtractionConfig()
+
+try:
+    result = extract_file_sync("missing.pdf", config=config)
+except KreuzbergError as e:
+    print(f"Extraction failed: {e}")
+    raise
+```
--- a/docs/snippets/python/api/error_handling_extract.md
+++ b/docs/snippets/python/api/error_handling_extract.md
@@ -0,0 +1,27 @@
+```python title="Python"
+from kreuzberg import (
+    batch_extract_files_sync,
+    BatchFileItem,
+    ExtractionConfig,
+    KreuzbergError,
+)
+
+items = [
+    BatchFileItem(path="doc1.pdf"),
+    BatchFileItem(path="doc2.docx"),
+    BatchFileItem(path="missing.html"),
+]
+
+config = ExtractionConfig()
+
+try:
+    results = batch_extract_files_sync(items, config=config)
+    for i, result in enumerate(results):
+        if result.metadata.error:
+            print(f"Document {i}: ERROR - {result.metadata.error}")
+        else:
+            print(f"Document {i}: {len(result.content)} chars, {len(result.tables)} tables")
+except KreuzbergError as e:
+    print(f"Batch extraction failed: {e}")
+    raise
+```
--- a/docs/snippets/python/api/extract_bytes_async.md
+++ b/docs/snippets/python/api/extract_bytes_async.md
@@ -0,0 +1,14 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_bytes, ExtractionConfig
+
+async def main() -> None:
+    with open("document.pdf", "rb") as f:
+        content = f.read()
+
+    result = await extract_bytes(content, "application/pdf", config=ExtractionConfig())
+    print(result.content[:200])
+    print(f"Tables: {len(result.tables)}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/api/extract_bytes_sync.md
+++ b/docs/snippets/python/api/extract_bytes_sync.md
@@ -0,0 +1,11 @@
+```python title="Python"
+from kreuzberg import extract_bytes_sync, ExtractionConfig
+
+with open("document.pdf", "rb") as f:
+    content = f.read()
+
+result = extract_bytes_sync(content, "application/pdf", config=ExtractionConfig())
+
+print(result.content[:200])
+print(f"Tables: {len(result.tables)}")
+```
--- a/docs/snippets/python/api/extract_file_async.md
+++ b/docs/snippets/python/api/extract_file_async.md
@@ -0,0 +1,12 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig
+
+async def main() -> None:  # extract document.pdf with a default config and summarise it
+    result = await extract_file("document.pdf", config=ExtractionConfig())
+    print(result.content[:200])  # first 200 characters of the extracted content
+    print(f"Tables: {len(result.tables)}")
+    print(f"Format: {result.metadata.format_type}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/api/extract_file_sync.md
+++ b/docs/snippets/python/api/extract_file_sync.md
@@ -0,0 +1,9 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig
+
+result = extract_file_sync("document.pdf", config=ExtractionConfig())
+
+print(result.content[:200])
+print(f"Tables: {len(result.tables)}")
+print(f"Format: {result.metadata.format_type}")
+```
--- a/docs/snippets/python/benchmarking/simple_benchmark.py
+++ b/docs/snippets/python/benchmarking/simple_benchmark.py
@@ -0,0 +1,51 @@
+```python title="simple_benchmark.py"
+import time
+import asyncio
+from pathlib import Path
+from kreuzberg import Kreuzberg, ExtractionConfig
+
+async def benchmark_extractions():
+    config = ExtractionConfig(use_cache=False)
+    kreuzberg = Kreuzberg(config)
+    file_path = "document.pdf"
+    num_runs = 10
+
+    start = time.perf_counter()
+    for _ in range(num_runs):
+        result = kreuzberg.extract_file(file_path)
+    sync_duration = time.perf_counter() - start
+    avg_sync = sync_duration / num_runs
+
+    print(f"Sync extraction ({num_runs} runs):")
+    print(f"  - Total time: {sync_duration:.3f}s")
+    print(f"  - Average: {avg_sync:.3f}s per extraction")
+
+    start = time.perf_counter()
+    tasks = [kreuzberg.extract_file_async(file_path) for _ in range(num_runs)]
+    await asyncio.gather(*tasks)
+    async_duration = time.perf_counter() - start
+
+    print(f"\nAsync extraction ({num_runs} parallel runs):")
+    print(f"  - Total time: {async_duration:.3f}s")
+    print(f"  - Average: {async_duration / num_runs:.3f}s per extraction")
+    print(f"  - Speedup: {sync_duration / async_duration:.1f}x")
+
+    cache_config = ExtractionConfig(use_cache=True)
+    kreuzberg_cached = Kreuzberg(cache_config)
+
+    print("\nFirst extraction (populates cache)...")
+    start = time.perf_counter()
+    result1 = await kreuzberg_cached.extract_file_async(file_path)
+    first_duration = time.perf_counter() - start
+    print(f"  - Time: {first_duration:.3f}s")
+
+    print("Second extraction (from cache)...")
+    start = time.perf_counter()
+    result2 = await kreuzberg_cached.extract_file_async(file_path)
+    cached_duration = time.perf_counter() - start
+    print(f"  - Time: {cached_duration:.3f}s")
+    print(f"  - Cache speedup: {first_duration / cached_duration:.1f}x")
+
+if __name__ == "__main__":
+    asyncio.run(benchmark_extractions())
+```
--- a/docs/snippets/python/cache/disk_cache.py
+++ b/docs/snippets/python/cache/disk_cache.py
@@ -0,0 +1,37 @@
+```python title="disk_cache.py"
+from pathlib import Path
+from kreuzberg import Kreuzberg, ExtractionConfig, CacheConfig
+
+cache_dir = Path.home() / ".cache" / "kreuzberg"
+cache_dir.mkdir(parents=True, exist_ok=True)
+
+config = ExtractionConfig(
+    use_cache=True,
+    cache_config=CacheConfig(
+        cache_path=str(cache_dir),
+        max_cache_size=500 * 1024 * 1024,
+        cache_ttl_seconds=7 * 86400,
+        enable_compression=True,
+    ),
+)
+
+kreuzberg = Kreuzberg(config)
+
+print("First extraction (will be cached)...")
+result1 = kreuzberg.extract_file("document.pdf")
+print(f"  - Content length: {len(result1.content)}")
+print(f"  - Cached: {result1.metadata.get('was_cached', False)}")
+
+print("\nSecond extraction (from cache)...")
+result2 = kreuzberg.extract_file("document.pdf")
+print(f"  - Content length: {len(result2.content)}")
+print(f"  - Cached: {result2.metadata.get('was_cached', False)}")
+
+print(f"\nResults are identical: {result1.content == result2.content}")
+
+cache_stats = kreuzberg.get_cache_stats()
+print(f"\nCache Statistics:")
+print(f"  - Total entries: {cache_stats.get('total_entries', 0)}")
+print(f"  - Cache size: {cache_stats.get('cache_size_bytes', 0) / 1024 / 1024:.1f} MB")
+print(f"  - Hit rate: {cache_stats.get('hit_rate', 0):.1%}")
+```
--- a/docs/snippets/python/cli/basic_cli.py
+++ b/docs/snippets/python/cli/basic_cli.py
@@ -0,0 +1,28 @@
+```python title="basic_cli.py"
+import subprocess
+import json
+import sys
+from pathlib import Path
+
+def extract_with_cli(file_path: str, output_format: str = "text") -> str:
+    result = subprocess.run(
+        ["kreuzberg", "extract", file_path, "--format", output_format],
+        capture_output=True,
+        text=True,
+    )
+
+    if result.returncode != 0:
+        print(f"Error: {result.stderr}", file=sys.stderr)
+        sys.exit(1)
+
+    if output_format == "json":
+        return json.loads(result.stdout)
+    return result.stdout
+
+document = "document.pdf"
+text_output = extract_with_cli(document, "text")
+print(f"Extracted: {len(text_output)} characters")
+
+json_output = extract_with_cli(document, "json")
+print(f"Format: {json_output.get('format', 'unknown')}")
+```
--- a/docs/snippets/python/cli/cli_with_config.py
+++ b/docs/snippets/python/cli/cli_with_config.py
@@ -0,0 +1,29 @@
+```python title="cli_with_config.py"
+import subprocess
+import json
+import sys
+from pathlib import Path
+
+def extract_with_config(file_path: str, config_path: str) -> dict:
+    result = subprocess.run(
+        ["kreuzberg", "extract", file_path, "--config", config_path, "--format", "json"],
+        capture_output=True,
+        text=True,
+    )
+
+    if result.returncode != 0:
+        print(f"Error: {result.stderr}", file=sys.stderr)
+        sys.exit(1)
+
+    return json.loads(result.stdout)
+
+config_file = Path("kreuzberg.toml")
+document = "document.pdf"
+
+print(f"Extracting {document} with config {config_file}")
+result = extract_with_config(str(document), str(config_file))
+
+print(f"Content length: {len(result.get('content', ''))}")
+print(f"Format: {result.get('format')}")
+print(f"Languages: {result.get('languages')}")
+```
--- a/docs/snippets/python/config/advanced_config.md
+++ b/docs/snippets/python/config/advanced_config.md
@@ -0,0 +1,60 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    extract_file,
+    ExtractionConfig,
+    OcrConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+    LanguageDetectionConfig,
+    TokenReductionOptions,
+    PostProcessorConfig,
+    KeywordConfig,
+)
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        use_cache=True,
+        enable_quality_processing=True,
+        ocr=OcrConfig(
+            backend="tesseract",
+            language="eng",
+        ),
+        chunking=ChunkingConfig(
+            max_characters=1000,
+            overlap=200,
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.preset("balanced"),
+                batch_size=32,
+                normalize=True,
+            ),
+        ),
+        language_detection=LanguageDetectionConfig(
+            enabled=True,
+            min_confidence=0.8,
+            detect_multiple=False,
+        ),
+        keywords=KeywordConfig(
+            algorithm="yake",
+            max_keywords=10,
+            min_score=0.1,
+            ngram_range=[1, 3],
+            language="en",
+        ),
+        token_reduction=TokenReductionOptions(
+            mode="moderate",
+            preserve_important_words=True,
+        ),
+        postprocessor=PostProcessorConfig(enabled=True),
+    )
+
+    result = await extract_file("document.pdf", config=config)
+    print(f"Content: {result.content[:100]}")
+    if result.detected_languages:
+        print(f"Languages: {result.detected_languages}")
+    if result.chunks:
+        print(f"Chunks: {len(result.chunks)}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/chunking_config.md
+++ b/docs/snippets/python/config/chunking_config.md
@@ -0,0 +1,80 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            max_chars=1000,
+            max_overlap=200,
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Chunks: {len(result.chunks or [])}")
+    for chunk in result.chunks or []:
+        print(f"Length: {len(chunk.content)}")
+
+asyncio.run(main())
+```
+
+```python title="Python - Markdown with Heading Context"
+import asyncio
+from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            chunker_type="markdown",
+            max_chars=500,
+            max_overlap=50,
+            sizing_type="tokenizer",
+            sizing_model="Xenova/gpt-4o",
+        )
+    )
+    result = await extract_file("document.md", config=config)
+    for chunk in result.chunks or []:
+        heading_context = chunk.metadata.get("heading_context")
+        if heading_context:
+            headings = heading_context.get("headings", [])
+            for h in headings:
+                print(f"Heading L{h['level']}: {h['text']}")
+        print(f"Content: {chunk.content[:100]}...")
+
+asyncio.run(main())
+```
+
+```python title="Python - Semantic"
+import asyncio
+from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(chunker_type="semantic")
+    )
+    result = await extract_file("document.pdf", config=config)
+    for chunk in result.chunks or []:
+        print(f"Content: {chunk.content[:100]}...")
+
+asyncio.run(main())
+```
+
+```python title="Python - Prepend Heading Context"
+import asyncio
+from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            chunker_type="markdown",
+            max_chars=500,
+            max_overlap=50,
+            prepend_heading_context=True,
+        )
+    )
+    result = await extract_file("document.md", config=config)
+    for chunk in result.chunks or []:
+        # Each chunk's content is prefixed with its heading breadcrumb
+        print(f"Content: {chunk.content[:100]}...")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/config_basic.md
+++ b/docs/snippets/python/config/config_basic.md
@@ -0,0 +1,14 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig
+
+async def main() -> None:
+    config = ExtractionConfig(
+        use_cache=True,
+        enable_quality_processing=True
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(result.content)
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/config_discover.md
+++ b/docs/snippets/python/config/config_discover.md
@@ -0,0 +1,16 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig()
+    result = await extract_file("document.pdf", config=config)
+
+    content: str = result.content
+    content_preview: str = content[:100]
+
+    print(f"Content preview: {content_preview}")
+    print(f"Total length: {len(content)}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/config_ocr.md
+++ b/docs/snippets/python/config/config_ocr.md
@@ -0,0 +1,16 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        ocr=OcrConfig(
+            backend="tesseract", language="eng+fra",
+            tesseract_config=TesseractConfig(psm=3)
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(result.content)
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/config_programmatic.md
+++ b/docs/snippets/python/config/config_programmatic.md
@@ -0,0 +1,18 @@
+```python title="Python"
+from kreuzberg import (
+    extract_file_sync,
+    ExtractionConfig,
+    OcrConfig,
+    ChunkingConfig,
+)
+
+config: ExtractionConfig = ExtractionConfig(
+    use_cache=True,
+    ocr=OcrConfig(backend="tesseract", language="eng"),
+    chunking=ChunkingConfig(max_chars=1000, max_overlap=200),
+)
+
+result = extract_file_sync("document.pdf", config=config)
+content_length: int = len(result.content)
+print(f"Content length: {content_length}")
+```
--- a/docs/snippets/python/config/document_structure_config.md
+++ b/docs/snippets/python/config/document_structure_config.md
@@ -0,0 +1,15 @@
+```python title="Document Structure Config (Python)"
+from kreuzberg import extract_file_sync, ExtractionConfig
+
+# Enable document structure output
+config = ExtractionConfig(include_document_structure=True)
+
+result = extract_file_sync("document.pdf", config=config)
+
+# Access the document tree
+if result.document:
+    for node in result.document["nodes"]:
+        node_type = node["content"]["node_type"]
+        text = node["content"].get("text", "")
+        print(f"[{node_type}] {text[:80]}")
+```
--- a/docs/snippets/python/config/element_based_output.md
+++ b/docs/snippets/python/config/element_based_output.md
@@ -0,0 +1,29 @@
+```python title="Element-Based Output (Python)"
+from kreuzberg import extract_file_sync, ExtractionConfig
+
+# Configure element-based output
+config = ExtractionConfig(result_format="element_based")
+
+# Extract document
+result = extract_file_sync("document.pdf", config=config)
+
+# Access elements
+for element in result.elements:
+    print(f"Type: {element.element_type}")
+    print(f"Text: {element.text[:100]}")
+
+    if element.metadata.page_number:
+        print(f"Page: {element.metadata.page_number}")
+
+    if element.metadata.coordinates:
+        coords = element.metadata.coordinates
+        print(f"Coords: ({coords.left}, {coords.top}) - ({coords.right}, {coords.bottom})")
+
+    print("---")
+
+# Filter by element type
+titles = [e for e in result.elements if e.element_type == "title"]
+for title in titles:
+    level = title.metadata.additional.get("level", "unknown")
+    print(f"[{level}] {title.text}")
+```
--- a/docs/snippets/python/config/embedding_config.md
+++ b/docs/snippets/python/config/embedding_config.md
@@ -0,0 +1,15 @@
+```python title="Python"
+from kreuzberg import ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType
+
+config = ExtractionConfig(
+    chunking=ChunkingConfig(
+        max_chars=1000,
+        embedding=EmbeddingConfig(
+            model=EmbeddingModelType.preset("all-mpnet-base-v2"),
+            batch_size=16,
+            normalize=True,
+            show_download_progress=True
+        )
+    )
+)
+```
--- a/docs/snippets/python/config/embedding_config.py
+++ b/docs/snippets/python/config/embedding_config.py
@@ -0,0 +1,83 @@
+from kreuzberg import EmbeddingConfig, EmbeddingModelType, ChunkingConfig, ExtractionConfig
+
+# Example 1: Preset model (recommended)
+# Fast, balanced, or quality preset configurations optimized for common use cases.
+embedding_config = EmbeddingConfig(
+    model=EmbeddingModelType.preset("balanced"),
+    batch_size=32,
+    normalize=True,
+    show_download_progress=True,
+    cache_dir="~/.cache/kreuzberg/embeddings",
+)
+
+# Available presets:
+# - "fast" (384 dims): Quick prototyping, development, resource-constrained
+# - "balanced" (768 dims): Production, general-purpose RAG, English documents
+# - "quality" (1024 dims): Complex documents, maximum accuracy
+# - "multilingual" (768 dims): International documents, 100+ languages
+
+
+# Example 2: Custom ONNX model (requires embeddings feature)
+# Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
+embedding_config = EmbeddingConfig(
+    model=EmbeddingModelType.custom(
+        model_id="BAAI/bge-small-en-v1.5",
+        dimensions=384,
+    ),
+    batch_size=32,
+    normalize=True,
+    show_download_progress=True,
+    cache_dir=None,  # Uses default: .kreuzberg/embeddings/
+)
+
+# Popular ONNX-compatible models:
+# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
+# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
+# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
+# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
+
+
+# Example 3: Alternative Custom Model
+# For advanced users wanting alternative ONNX embedding models.
+embedding_config = EmbeddingConfig(
+    model=EmbeddingModelType.custom(
+        model_id="sentence-transformers/all-mpnet-base-v2",
+        dimensions=768,
+    ),
+    batch_size=16,  # Larger model requires smaller batch size
+    normalize=True,
+    show_download_progress=True,
+    cache_dir="/var/cache/embeddings",
+)
+
+
+# Integration with ChunkingConfig
+# Add embeddings to your chunking configuration:
+chunking_with_embeddings = ChunkingConfig(
+    max_chars=1024,
+    max_overlap=100,
+    preset="balanced",
+    embedding=EmbeddingConfig(),  # Uses balanced preset
+)
+
+extraction_config = ExtractionConfig(
+    chunking=chunking_with_embeddings,
+)
+
+
+# Key parameter explanations:
+#
+# batch_size: Number of texts to embed at once (32-128 typical)
+#   - Larger batches are faster but use more memory
+#   - Smaller batches for resource-constrained environments
+#
+# normalize: Whether to normalize vectors (L2 norm)
+#   - True (recommended): Enables cosine similarity in vector DBs
+#   - False: Raw embedding values
+#
+# cache_dir: Where to store downloaded models
+#   - None: Uses .kreuzberg/embeddings/ in current directory
+#   - String path: Custom directory for model storage
+#
+# show_download_progress: Display download progress bar
+#   - Useful for monitoring large model downloads
--- a/docs/snippets/python/config/hierarchy_config.py
+++ b/docs/snippets/python/config/hierarchy_config.py
@@ -0,0 +1,72 @@
+from kreuzberg import PdfConfig, HierarchyConfig, ExtractionConfig, extract_file_sync
+
+# Example 1: Basic hierarchy extraction
+# Enabled with default k_clusters=6 for standard H1-H6 heading hierarchy.
+# Extract bounding box information for spatial layout awareness.
+hierarchy_config_basic = HierarchyConfig(
+    enabled=True,
+    k_clusters=6,  # Default: creates 6 font size clusters (H1-H6 structure)
+    include_bbox=True,  # Include bounding box coordinates
+    ocr_coverage_threshold=None  # No OCR coverage threshold
+)
+
+pdf_config_basic = PdfConfig(hierarchy=hierarchy_config_basic)
+extraction_config_basic = ExtractionConfig(pdf_options=pdf_config_basic)
+
+result = extract_file_sync("document.pdf", config=extraction_config_basic)
+
+
+# Example 2: Custom k_clusters for minimal structure
+# Use 3 clusters for simpler hierarchy with minimal structure.
+# Useful when you only need major section divisions (Main, Subsection, Detail).
+hierarchy_config_minimal = HierarchyConfig(
+    enabled=True,
+    k_clusters=3,  # Minimal clustering: just 3 levels
+    include_bbox=True,
+    ocr_coverage_threshold=None
+)
+
+pdf_config_minimal = PdfConfig(hierarchy=hierarchy_config_minimal)
+extraction_config_minimal = ExtractionConfig(pdf_options=pdf_config_minimal)
+
+result = extract_file_sync("document.pdf", config=extraction_config_minimal)
+
+
+# Example 3: With OCR coverage threshold
+# Trigger OCR when text block coverage falls below 50%.
+# Useful for documents with mixed digital and scanned content.
+hierarchy_config_ocr = HierarchyConfig(
+    enabled=True,
+    k_clusters=6,
+    include_bbox=True,
+    ocr_coverage_threshold=0.5  # Trigger OCR if text coverage < 50%
+)
+
+pdf_config_ocr = PdfConfig(hierarchy=hierarchy_config_ocr)
+extraction_config_ocr = ExtractionConfig(pdf_options=pdf_config_ocr)
+
+result = extract_file_sync("document.pdf", config=extraction_config_ocr)
+
+
+# Field descriptions:
+#
+# enabled: bool (default: True)
+#   - Enable or disable hierarchy extraction
+#   - When False, hierarchy structure is not analyzed
+#
+# k_clusters: int (default: 6, valid: 1-7)
+#   - Number of font size clusters for hierarchy levels
+#   - 6 provides H1-H6 heading levels with body text
+#   - Higher values create more fine-grained hierarchy
+#   - Lower values create simpler structure
+#
+# include_bbox: bool (default: True)
+#   - Include bounding box coordinates in hierarchy blocks
+#   - Required for spatial layout awareness and document structure
+#   - Set to False only if space optimization is critical
+#
+# ocr_coverage_threshold: float | None (default: None)
+#   - Range: 0.0 to 1.0
+#   - Triggers OCR when text block coverage falls below this fraction
+#   - Example: 0.5 means "run OCR if less than 50% of page has text data"
+#   - None means no OCR coverage-based triggering
--- a/docs/snippets/python/config/html_output.md
+++ b/docs/snippets/python/config/html_output.md
@@ -0,0 +1,17 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, extract_file
+
+async def main() -> None:
+    config = ExtractionConfig(
+        output_format="html",
+        html_output={
+            "theme": "github",
+            "embed_css": True,
+        },
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(result.content)  # HTML with kb-* classes and GitHub theme
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/keyword_config.py
+++ b/docs/snippets/python/config/keyword_config.py
@@ -0,0 +1,75 @@
+import asyncio
+from kreuzberg import (
+    ExtractionConfig,
+    KeywordConfig,
+    KeywordAlgorithm,
+    YakeParams,
+    RakeParams,
+    extract_file,
+)
+
+
+# Example 1: Basic YAKE configuration
+# Uses YAKE algorithm with default parameters and English stopword filtering
+async def basic_yake() -> None:
+    config = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.YAKE,
+            max_keywords=10,
+            min_score=0.0,
+            ngram_range=(1, 3),
+            language="en",
+            yake_params=None,
+            rake_params=None,
+        )
+    )
+
+    result = await extract_file("document.pdf", config=config)
+    print(f"Keywords: {result.keywords}")
+
+
+# Example 2: Advanced YAKE with custom parameters
+# Fine-tunes YAKE with custom window size for co-occurrence analysis
+async def advanced_yake() -> None:
+    config = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.YAKE,
+            max_keywords=15,
+            min_score=0.1,
+            ngram_range=(1, 2),
+            language="en",
+            yake_params=YakeParams(
+                window_size=1,
+            ),
+            rake_params=None,
+        )
+    )
+
+    result = await extract_file("document.pdf", config=config)
+    print(f"Keywords: {result.keywords}")
+
+
+# Example 3: RAKE configuration
+# Uses RAKE algorithm for rapid keyword extraction with phrase constraints
+async def rake_config() -> None:
+    config = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.RAKE,
+            max_keywords=10,
+            min_score=5.0,
+            ngram_range=(1, 3),
+            language="en",
+            yake_params=None,
+            rake_params=RakeParams(
+                min_word_length=1,
+                max_words_per_phrase=3,
+            ),
+        )
+    )
+
+    result = await extract_file("document.pdf", config=config)
+    print(f"Keywords: {result.keywords}")
+
+
+if __name__ == "__main__":
+    asyncio.run(basic_yake())
--- a/docs/snippets/python/config/keyword_extraction_config.md
+++ b/docs/snippets/python/config/keyword_extraction_config.md
@@ -0,0 +1,24 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    ExtractionConfig,
+    KeywordConfig,
+    KeywordAlgorithm,
+    extract_file,
+)
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.YAKE,
+            max_keywords=10,
+            min_score=0.3,
+            ngram_range=(1, 3),
+            language="en"
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Content extracted: {len(result.content)} chars")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/language_detection_config.md
+++ b/docs/snippets/python/config/language_detection_config.md
@@ -0,0 +1,19 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        language_detection=LanguageDetectionConfig(
+            enabled=True,
+            min_confidence=0.85,
+            detect_multiple=False
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    if result.detected_languages:
+        print(f"Primary language: {result.detected_languages[0]}")
+    print(f"Content length: {len(result.content)} chars")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/ocr_dpi_config.md
+++ b/docs/snippets/python/config/ocr_dpi_config.md
@@ -0,0 +1,26 @@
+```python title="Python"
+from kreuzberg import (
+    extract_file_sync,
+    ExtractionConfig,
+    OcrConfig,
+    TesseractConfig,
+    ImagePreprocessingConfig,
+)
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(
+        backend="tesseract",
+        tesseract_config=TesseractConfig(
+            preprocessing=ImagePreprocessingConfig(target_dpi=300),
+        ),
+    ),
+)
+
+result = extract_file_sync("scanned.pdf", config=config)
+
+content_length: int = len(result.content)
+table_count: int = len(result.tables)
+
+print(f"Content length: {content_length} characters")
+print(f"Tables detected: {table_count}")
+```
--- a/docs/snippets/python/config/pdf_config.md
+++ b/docs/snippets/python/config/pdf_config.md
@@ -0,0 +1,18 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, PdfConfig, HierarchyConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        pdf_options=PdfConfig(
+            extract_images=True,
+            extract_metadata=True,
+            passwords=["password1", "password2"],
+            hierarchy=HierarchyConfig(enabled=True, k_clusters=6)
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Content: {result.content[:100]}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/pdf_hierarchy_config.md
+++ b/docs/snippets/python/config/pdf_hierarchy_config.md
@@ -0,0 +1,22 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    pdf_options=PdfConfig(
+        extract_metadata=True,
+        hierarchy=HierarchyConfig(
+            enabled=True,
+            k_clusters=6,
+            include_bbox=True,
+            ocr_coverage_threshold=0.8
+        )
+    )
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+# Access hierarchy information
+for page in result.pages or []:
+    print(f"Page {page.page_number}:")
+    print(f"  Content: {page.content[:100]}...")
+```
--- a/docs/snippets/python/config/postprocessor_config.md
+++ b/docs/snippets/python/config/postprocessor_config.md
@@ -0,0 +1,16 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, PostProcessorConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        postprocessor=PostProcessorConfig(
+            enabled=True,
+            enabled_processors=["deduplication"],
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Content: {result.content[:100]}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/quality_processing_config.md
+++ b/docs/snippets/python/config/quality_processing_config.md
@@ -0,0 +1,15 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        enable_quality_processing=True
+    )
+    result = await extract_file("document.pdf", config=config)
+
+    quality_score: float = result.quality_score or 0.0
+    print(f"Quality score: {quality_score:.2f}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/tesseract_config.md
+++ b/docs/snippets/python/config/tesseract_config.md
@@ -0,0 +1,21 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        ocr=OcrConfig(
+            language="eng+fra+deu",
+            tesseract_config=TesseractConfig(
+                psm=6,
+                oem=1,
+                min_confidence=0.8,
+                enable_table_detection=True,
+            ),
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Content: {result.content[:100]}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/token_reduction_config.md
+++ b/docs/snippets/python/config/token_reduction_config.md
@@ -0,0 +1,10 @@
+```python title="Python"
+from kreuzberg import ExtractionConfig, TokenReductionConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    token_reduction=TokenReductionConfig(
+        mode="moderate",
+        preserve_important_words=True,
+    )
+)
+```
--- a/docs/snippets/python/docker/usage.py
+++ b/docs/snippets/python/docker/usage.py
@@ -0,0 +1,58 @@
+```python title="usage.py"
+import subprocess
+import httpx
+import json
+from pathlib import Path
+
+class DockerKreuzbergClient:
+    def __init__(self, container_name: str = "kreuzberg-api", port: int = 8000):
+        self.container_name = container_name
+        self.port = port
+        self.api_url = f"http://localhost:{port}/api/extract"
+
+    def start_container(self, image: str = "kreuzberg:latest"):
+        print("Starting Kreuzberg Docker container...")
+        subprocess.run(
+            [
+                "docker", "run", "-d",
+                "--name", self.container_name,
+                "-p", f"{self.port}:8000",
+                image,
+            ],
+            check=True,
+        )
+        print(f"Container started on http://localhost:{self.port}")
+
+    async def extract_file(self, file_path: str) -> str:
+        file_bytes = Path(file_path).read_bytes()
+        files = {"file": (Path(file_path).name, file_bytes)}
+
+        async with httpx.AsyncClient() as client:
+            response = await client.post(self.api_url, files=files)
+            response.raise_for_status()
+            result = response.json()
+            return result.get("content", "")
+
+    def stop_container(self):
+        print("Stopping Kreuzberg Docker container...")
+        subprocess.run(["docker", "stop", self.container_name], check=True)
+        subprocess.run(["docker", "rm", self.container_name], check=True)
+        print("Container stopped and removed")
+
+async def main():
+    docker_client = DockerKreuzbergClient()
+
+    try:
+        docker_client.start_container()
+        import asyncio
+        await asyncio.sleep(2)
+
+        content = await docker_client.extract_file("document.pdf")
+        print(f"Extracted content:\n{content}")
+    finally:
+        docker_client.stop_container()
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
+```
--- a/docs/snippets/python/getting-started/basic_usage.md
+++ b/docs/snippets/python/getting-started/basic_usage.md
@@ -0,0 +1,14 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig
+
+async def main() -> None:
+    config = ExtractionConfig(
+        use_cache=True,
+        enable_quality_processing=True
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(result.content)
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/getting-started/extract_file.md
+++ b/docs/snippets/python/getting-started/extract_file.md
@@ -0,0 +1,16 @@
+```python title="Python"
+import asyncio
+from pathlib import Path
+from kreuzberg import extract_file
+
+async def main() -> None:
+    file_path: Path = Path("document.pdf")
+
+    result = await extract_file(file_path)
+
+    print(f"Content: {result.content}")
+    print(f"Format: {result.metadata.format.format_type if result.metadata.format else None}")
+    print(f"Tables: {len(result.tables)}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/getting-started/extract_with_ocr.md
+++ b/docs/snippets/python/getting-started/extract_with_ocr.md
@@ -0,0 +1,19 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig, OcrConfig, TesseractConfig
+
+async def main() -> None:
+    config = ExtractionConfig(
+        force_ocr=True,
+        ocr=OcrConfig(
+            backend="tesseract",
+            language="eng",
+            tesseract_config=TesseractConfig(psm=3)
+        )
+    )
+    result = await extract_file("scanned.pdf", config=config)
+    print(result.content)
+    print(f"Detected Languages: {result.detected_languages}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/getting-started/hello_world.md
+++ b/docs/snippets/python/getting-started/hello_world.md
@@ -0,0 +1,10 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file
+
+async def main() -> None:
+    result = await extract_file("document.pdf")
+    print(result.content)
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/getting-started/install_verify.md
+++ b/docs/snippets/python/getting-started/install_verify.md
@@ -0,0 +1,12 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, __version__
+
+async def main() -> None:
+    print(f"Kreuzberg version: {__version__}")
+
+    result = await extract_file("document.pdf")
+    print(f"Extraction successful: {len(result.content) > 0}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/getting-started/read_content.md
+++ b/docs/snippets/python/getting-started/read_content.md
@@ -0,0 +1,17 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file
+
+async def main() -> None:
+    result = await extract_file("document.pdf")
+
+    content: str = result.content
+    tables: int = len(result.tables)
+    format_type: str | None = result.metadata.format.format_type if result.metadata.format else None
+
+    print(f"Content length: {len(content)} characters")
+    print(f"Tables found: {tables}")
+    print(f"Format: {format_type}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/llm/structured_extraction.md
+++ b/docs/snippets/python/llm/structured_extraction.md
@@ -0,0 +1,27 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig, StructuredExtractionConfig, LlmConfig
+
+async def main() -> None:
+    config = ExtractionConfig(
+        structured_extraction=StructuredExtractionConfig(
+            schema={
+                "type": "object",
+                "properties": {
+                    "title": {"type": "string"},
+                    "authors": {"type": "array", "items": {"type": "string"}},
+                    "date": {"type": "string"},
+                },
+                "required": ["title", "authors", "date"],
+                "additionalProperties": False,
+            },
+            llm=LlmConfig(model="openai/gpt-4o-mini"),
+            strict=True,
+        ),
+    )
+    result = await extract_file("paper.pdf", config=config)
+    print(result.structured_output)
+    # {"title": "...", "authors": ["..."], "date": "..."}
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/llm/vlm_embeddings.md
+++ b/docs/snippets/python/llm/vlm_embeddings.md
@@ -0,0 +1,16 @@
+```python title="Python"
+import asyncio
+from kreuzberg import embed, EmbeddingConfig, EmbeddingModelType, LlmConfig
+
+async def main() -> None:
+    config = EmbeddingConfig(
+        model=EmbeddingModelType.llm(
+            LlmConfig(model="openai/text-embedding-3-small")
+        ),
+        normalize=True,
+    )
+    embeddings = await embed(["Hello world"], config=config)
+    print(len(embeddings[0]))  # 1536
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/llm/vlm_ocr.md
+++ b/docs/snippets/python/llm/vlm_ocr.md
@@ -0,0 +1,17 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig, OcrConfig, LlmConfig
+
+async def main() -> None:
+    config = ExtractionConfig(
+        force_ocr=True,
+        ocr=OcrConfig(
+            backend="vlm",
+            vlm_config=LlmConfig(model="openai/gpt-4o-mini"),
+        ),
+    )
+    result = await extract_file("scan.pdf", config=config)
+    print(result.content)
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/mcp/mcp_custom_client.md
+++ b/docs/snippets/python/mcp/mcp_custom_client.md
@@ -0,0 +1,23 @@
+```python title="Python"
+import asyncio
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+
+async def main() -> None:
+    server_params: StdioServerParameters = StdioServerParameters(
+        command="kreuzberg", args=["mcp"]
+    )
+
+    async with stdio_client(server_params) as (read, write):
+        async with ClientSession(read, write) as session:
+            await session.initialize()
+            tools = await session.list_tools()
+            tool_names: list[str] = [t.name for t in tools.tools]
+            print(f"Available tools: {tool_names}")
+            result = await session.call_tool(
+                "extract_file", arguments={"path": "document.pdf", "async": True}
+            )
+            print(result)
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/mcp/mcp_langchain_integration.md
+++ b/docs/snippets/python/mcp/mcp_langchain_integration.md
@@ -0,0 +1,36 @@
+```python title="Python"
+from langchain.agents import initialize_agent, AgentType
+from langchain.tools import Tool
+from langchain_openai import ChatOpenAI
+import subprocess
+import json
+
+mcp_process = subprocess.Popen(
+    ["kreuzberg", "mcp"],
+    stdin=subprocess.PIPE,
+    stdout=subprocess.PIPE,
+    stderr=subprocess.PIPE,
+)
+
+def extract_file(path: str) -> str:
+    request: dict = {
+        "method": "tools/call",
+        "params": {
+            "name": "extract_file",
+            "arguments": {"path": path, "async": True},
+        },
+    }
+    mcp_process.stdin.write(json.dumps(request).encode() + b"\n")
+    mcp_process.stdin.flush()
+    response = mcp_process.stdout.readline()
+    return json.loads(response)["result"]["content"]
+
+tools: list[Tool] = [
+    Tool(name="extract_document", func=extract_file, description="Extract")
+]
+
+llm = ChatOpenAI(temperature=0)
+agent = initialize_agent(
+    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION
+)
+```
--- a/docs/snippets/python/mcp/mcp_server_start.md
+++ b/docs/snippets/python/mcp/mcp_server_start.md
@@ -0,0 +1,17 @@
+```python title="Python"
+import subprocess
+import time
+from typing import Optional
+
+mcp_process: subprocess.Popen = subprocess.Popen(
+    ["python", "-m", "kreuzberg", "mcp"],
+    stdout=subprocess.PIPE,
+    stderr=subprocess.PIPE,
+)
+
+pid: Optional[int] = mcp_process.pid
+print(f"MCP server started with PID: {pid}")
+
+time.sleep(1)
+print("Server is running, listening for connections")
+```
--- a/docs/snippets/python/metadata/language_detection.md
+++ b/docs/snippets/python/metadata/language_detection.md
@@ -0,0 +1,13 @@
+```python title="Python"
+from kreuzberg import ExtractionConfig, LanguageDetectionConfig
+
+config = ExtractionConfig(
+    language_detection=LanguageDetectionConfig(
+        enabled=True,
+        min_confidence=0.9,
+        detect_multiple=True,
+    ),
+)
+
+print(config.language_detection)
+```
--- a/docs/snippets/python/metadata/language_detection_multilingual.md
+++ b/docs/snippets/python/metadata/language_detection_multilingual.md
@@ -0,0 +1,16 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, LanguageDetectionConfig
+
+config = ExtractionConfig(
+    language_detection=LanguageDetectionConfig(
+        enabled=True,
+        min_confidence=0.8,
+        detect_multiple=True,
+    ),
+)
+
+result = extract_file_sync("multilingual_document.pdf", config=config)
+
+if result.detected_languages:
+    print(f"Detected languages: {', '.join(result.detected_languages)}")
+```
--- a/docs/snippets/python/metadata/metadata.md
+++ b/docs/snippets/python/metadata/metadata.md
@@ -0,0 +1,80 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig
+
+result = extract_file_sync("document.pdf", config=ExtractionConfig())
+
+# Metadata is flat — format-specific fields are at the top level
+metadata = result.metadata
+if metadata.get("page_count"):
+    print(f"Pages: {metadata['page_count']}")
+if metadata.get("title"):
+    print(f"Title: {metadata['title']}")
+if metadata.get("authors"):
+    print(f"Authors: {', '.join(metadata['authors'])}")
+
+result = extract_file_sync("page.html", config=ExtractionConfig())
+metadata = result.metadata
+if metadata.get("title"):
+    print(f"Title: {metadata['title']}")
+if metadata.get("description"):
+    print(f"Description: {metadata['description']}")
+
+# Access keywords as array
+keywords = metadata.get('keywords', [])
+if keywords:
+    print(f"Keywords: {', '.join(keywords)}")
+
+# Access canonical URL (renamed from canonical)
+canonical_url = metadata.get('canonical_url')
+if canonical_url:
+    print(f"Canonical URL: {canonical_url}")
+
+# Access Open Graph fields from map
+open_graph = metadata.get('open_graph', {})
+if open_graph:
+    if 'image' in open_graph:
+        print(f"Open Graph Image: {open_graph['image']}")
+    if 'title' in open_graph:
+        print(f"Open Graph Title: {open_graph['title']}")
+    if 'type' in open_graph:
+        print(f"Open Graph Type: {open_graph['type']}")
+
+# Access Twitter Card fields from map
+twitter_card = metadata.get('twitter_card', {})
+if twitter_card:
+    if 'card' in twitter_card:
+        print(f"Twitter Card Type: {twitter_card['card']}")
+    if 'creator' in twitter_card:
+        print(f"Twitter Creator: {twitter_card['creator']}")
+
+# Access language and text direction
+language = metadata.get('language')
+if language:
+    print(f"Language: {language}")
+
+text_direction = metadata.get('text_direction')
+if text_direction:
+    print(f"Text Direction: {text_direction}")
+
+# Access headers
+headers = metadata.get('headers', [])
+if headers:
+    print(f"Headers: {', '.join([h['text'] for h in headers])}")
+
+# Access links
+links = metadata.get('links', [])
+if links:
+    for link in links:
+        print(f"Link: {link.get('href')} ({link.get('text')})")
+
+# Access images
+images = metadata.get('images', [])
+if images:
+    for image in images:
+        print(f"Image: {image.get('src')}")
+
+# Access structured data
+structured_data = metadata.get('structured_data', [])
+if structured_data:
+    print(f"Structured data items: {len(structured_data)}")
+```
--- a/docs/snippets/python/metadata/page_boundaries.md
+++ b/docs/snippets/python/metadata/page_boundaries.md
@@ -0,0 +1,17 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig
+
+result = extract_file_sync("document.pdf", config=ExtractionConfig())  # no config overrides
+
+if result.metadata.pages and result.metadata.pages.boundaries:  # skip when no page boundaries are present
+    boundaries = result.metadata.pages.boundaries
+    content_bytes = result.content.encode("utf-8")  # boundaries carry byte_start/byte_end, so slice encoded bytes
+
+    for boundary in boundaries[:3]:  # preview only the first three boundaries
+        page_bytes = content_bytes[boundary.byte_start:boundary.byte_end]
+        page_text = page_bytes.decode("utf-8")  # back to str for printing
+
+        print(f"Page {boundary.page_number}:")
+        print(f"  Byte range: {boundary.byte_start}-{boundary.byte_end}")
+        print(f"  Preview: {page_text[:100]}...")  # first 100 characters of the page text
+```
--- a/docs/snippets/python/metadata/page_tracking_basic.md
+++ b/docs/snippets/python/metadata/page_tracking_basic.md
@@ -0,0 +1,16 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, PageConfig
+
+config = ExtractionConfig(
+    pages=PageConfig(extract_pages=True)
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+if result.pages:
+    for page in result.pages:
+        print(f"Page {page.page_number}:")
+        print(f" Content: {len(page.content)} chars")
+        print(f" Tables: {len(page.tables)}")
+        print(f" Images: {len(page.images)}")
+```
--- a/docs/snippets/python/metadata/pdf_metadata_extractor.md
+++ b/docs/snippets/python/metadata/pdf_metadata_extractor.md
@@ -0,0 +1,39 @@
+```python title="Python"
+from kreuzberg import register_post_processor, ExtractionResult
+import logging
+
+logger = logging.getLogger(__name__)
+
+class PdfMetadataExtractor:  # post-processor plugin; an instance is passed to register_post_processor below
+    def __init__(self) -> None:
+        self.processed_count: int = 0  # incremented by process(), reported by shutdown()
+
+    def name(self) -> str:
+        return "pdf_metadata_extractor"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def description(self) -> str:
+        return "Extracts and enriches PDF metadata"
+
+    def processing_stage(self) -> str:
+        return "early"  # NOTE(review): presumably selects an early pipeline stage; confirm the accepted values
+
+    def should_process(self, result: ExtractionResult) -> bool:
+        return result.mime_type == "application/pdf"  # True only for PDF results
+
+    def process(self, result: ExtractionResult) -> ExtractionResult:
+        self.processed_count += 1
+        result.metadata["pdf_processed"] = True  # tags the result in place
+        return result  # the same object that was passed in
+
+    def initialize(self) -> None:
+        logger.info("PDF metadata extractor initialized")
+
+    def shutdown(self) -> None:
+        logger.info(f"Processed {self.processed_count} PDFs")  # logs the running total kept by process()
+
+processor: PdfMetadataExtractor = PdfMetadataExtractor()
+register_post_processor(processor)
+```
--- a/docs/snippets/python/metadata/vector_database_integration.md
+++ b/docs/snippets/python/metadata/vector_database_integration.md
@@ -0,0 +1,35 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig, EmbeddingConfig
+
+config = ExtractionConfig(
+    chunking=ChunkingConfig(
+        max_characters=512,
+        overlap=50,
+        embedding=EmbeddingConfig(
+            normalize=True,
+            batch_size=32,
+            preset="balanced",
+        ),
+    ),
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+records: list[dict] = []
+if result.chunks:
+    for index, chunk in enumerate(result.chunks):
+        if chunk.embedding is None:
+            continue
+        records.append({
+            "id": f"document_chunk_{index}",
+            "content": chunk.content,
+            "embedding": chunk.embedding,
+            "metadata": {
+                "document_id": "document.pdf",
+                "chunk_index": index,
+                "content_length": len(chunk.content),
+            },
+        })
+
+print(f"Prepared {len(records)} vector records")
+```
--- a/docs/snippets/python/ocr/cloud_ocr_backend.md
+++ b/docs/snippets/python/ocr/cloud_ocr_backend.md
@@ -0,0 +1,37 @@
+```python title="Python"
+from kreuzberg import register_ocr_backend
+import httpx
+
+class CloudOcrBackend:
+    def __init__(self, api_key: str):
+        self.api_key: str = api_key
+        self.langs: list[str] = ["eng", "deu", "fra"]
+
+    def name(self) -> str:
+        return "cloud-ocr"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def supported_languages(self) -> list[str]:
+        return self.langs
+
+    def process_image(self, image_bytes: bytes, config: dict) -> dict:
+        with httpx.Client() as client:
+            response = client.post(
+                "https://api.example.com/ocr",
+                files={"image": image_bytes},
+                json={"language": config.get("language", "eng")},
+            )
+            text: str = response.json()["text"]
+            return {"content": text, "mime_type": "text/plain"}
+
+    def initialize(self) -> None:
+        pass
+
+    def shutdown(self) -> None:
+        pass
+
+backend: CloudOcrBackend = CloudOcrBackend(api_key="your-api-key")
+register_ocr_backend(backend)
+```
--- a/docs/snippets/python/ocr/image_extraction.md
+++ b/docs/snippets/python/ocr/image_extraction.md
@@ -0,0 +1,17 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, ImageExtractionConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    images=ImageExtractionConfig(
+        extract_images=True,
+        target_dpi=200,
+        max_image_dimension=2048,
+        inject_placeholders=True,  # set to False to extract images without markdown references
+        auto_adjust_dpi=True,
+    )
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+print(f"Content length: {len(result.content)} characters")
+```
--- a/docs/snippets/python/ocr/image_preprocessing.md
+++ b/docs/snippets/python/ocr/image_preprocessing.md
@@ -0,0 +1,29 @@
+```python title="Python"
+from kreuzberg import (
+    extract_file_sync,
+    ExtractionConfig,
+    ImagePreprocessingConfig,
+    OcrConfig,
+    TesseractConfig,
+)
+
+preprocessing: ImagePreprocessingConfig = ImagePreprocessingConfig(
+    target_dpi=300,
+    denoise=True,
+    deskew=True,
+    contrast_enhance=True,
+    binarization_method="otsu",
+)
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(
+        backend="tesseract",
+        language="eng",
+        tesseract_config=TesseractConfig(preprocessing=preprocessing),
+    )
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+print(f"Content length: {len(result.content)} characters")
+```
--- a/docs/snippets/python/ocr/ocr_easyocr.md
+++ b/docs/snippets/python/ocr/ocr_easyocr.md
@@ -0,0 +1,18 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(backend="easyocr", language="en")
+)
+
+# EasyOCR-specific options (use_gpu, beam_width, etc.) go in easyocr_kwargs,
+# not in OcrConfig — OcrConfig only accepts backend, language, and backend-specific configs.
+result = extract_file_sync("scanned.pdf", config=config, easyocr_kwargs={"use_gpu": True})
+
+content: str = result.content
+preview: str = content[:100]
+total_length: int = len(content)
+
+print(f"Extracted content (preview): {preview}")
+print(f"Total characters: {total_length}")
+```
--- a/docs/snippets/python/ocr/ocr_elements.md
+++ b/docs/snippets/python/ocr/ocr_elements.md
@@ -0,0 +1,18 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(backend="paddleocr", language="en")
+)
+
+result = extract_file_sync("scanned.pdf", config=config)
+
+if result.ocr_elements:  # nothing is printed when no OCR elements come back
+    for element in result.ocr_elements:
+        print(f"Text: {element.text}")
+        print(f"Confidence: {element.confidence.recognition:.2f}")  # recognition confidence, two decimals
+        print(f"Geometry: {element.geometry}")
+        if element.rotation:  # rotation is printed only when set
+            print(f"Rotation: {element.rotation.angle}°")
+        print()  # blank line between elements
+```
--- a/docs/snippets/python/ocr/ocr_extraction.md
+++ b/docs/snippets/python/ocr/ocr_extraction.md
@@ -0,0 +1,16 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(backend="tesseract", language="eng")
+)
+
+result = extract_file_sync("scanned.pdf", config=config)
+
+content: str = result.content
+preview: str = content[:100]
+total_length: int = len(content)
+
+print(f"Extracted content (preview): {preview}")
+print(f"Total characters: {total_length}")
+```
--- a/docs/snippets/python/ocr/ocr_force_all_pages.md
+++ b/docs/snippets/python/ocr/ocr_force_all_pages.md
@@ -0,0 +1,17 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(backend="tesseract"),
+    force_ocr=True,
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+content: str = result.content
+preview: str = content[:100]
+total_length: int = len(content)
+
+print(f"Extracted content (preview): {preview}")
+print(f"Total characters: {total_length}")
+```
--- a/docs/snippets/python/ocr/ocr_multi_language.md
+++ b/docs/snippets/python/ocr/ocr_multi_language.md
@@ -0,0 +1,16 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(backend="tesseract", language="eng+deu+fra")
+)
+
+result = extract_file_sync("multilingual.pdf", config=config)
+
+content: str = result.content
+preview: str = content[:100]
+total_length: int = len(content)
+
+print(f"Extracted content (preview): {preview}")
+print(f"Total characters: {total_length}")
+```
--- a/docs/snippets/python/ocr/ocr_paddleocr.md
+++ b/docs/snippets/python/ocr/ocr_paddleocr.md
@@ -0,0 +1,16 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(backend="paddleocr", language="en")  # model_tier="server" for max accuracy
+)
+
+result = extract_file_sync("scanned.pdf", config=config)
+
+content: str = result.content
+preview: str = content[:100]
+total_length: int = len(content)
+
+print(f"Extracted content (preview): {preview}")
+print(f"Total characters: {total_length}")
+```
--- a/docs/snippets/python/plugins/clear_plugins.md
+++ b/docs/snippets/python/plugins/clear_plugins.md
@@ -0,0 +1,15 @@
+```python title="Python"
+from kreuzberg import (
+    clear_document_extractors,
+    clear_post_processors,
+    clear_ocr_backends,
+    clear_validators,
+)
+
+clear_post_processors()
+clear_validators()
+clear_ocr_backends()
+clear_document_extractors()
+
+print("All plugins cleared")
+```
--- a/docs/snippets/python/plugins/embedding_backend.md
+++ b/docs/snippets/python/plugins/embedding_backend.md
@@ -0,0 +1,44 @@
+```python title="Python"
+from kreuzberg import register_embedding_backend, EmbeddingConfig, embed_texts
+from sentence_transformers import SentenceTransformer
+
+# Wrap an already-loaded embedder (e.g. sentence-transformers, llama-cpp-python,
+# or a tuned ONNX session) so kreuzberg can call back into it during chunking
+# and standalone embed requests.
+class MyEmbedder:
+    def __init__(self) -> None:
+        self._model = SentenceTransformer("BAAI/bge-base-en-v1.5")  # model is loaded once, at construction
+
+    # Plugin trait hooks
+    def name(self) -> str:
+        return "my-embedder"  # the config further down refers to the backend by this name
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def initialize(self) -> None:
+        # Optional warm-up; runs once at registration before dimensions() is cached.
+        pass
+
+    def shutdown(self) -> None:
+        pass
+
+    # EmbeddingBackend hooks
+    def dimensions(self) -> int:
+        # Captured once at registration; the dispatcher uses this for shape validation.
+        return self._model.get_sentence_embedding_dimension()
+
+    def embed(self, texts: list[str]) -> list[list[float]]:
+        return self._model.encode(texts).tolist()  # encode the whole batch, then convert to plain nested lists
+
+
+# Register once at startup. Reference by name in config.
+register_embedding_backend(MyEmbedder())
+
+config: EmbeddingConfig = {
+    "model": {"type": "plugin", "name": "my-embedder"},
+    # Optional: bound the wait on a hung backend (default: 60s; None disables)
+    "max_embed_duration_secs": 30,
+}
+vectors = embed_texts(["Hello, world!", "Second text"], config)
+```
--- a/docs/snippets/python/plugins/extractor_registration.md
+++ b/docs/snippets/python/plugins/extractor_registration.md
@@ -0,0 +1,14 @@
+```python title="Python"
+from kreuzberg import register_document_extractor
+
+class CustomExtractor:  # minimal example: only name() and version() are defined
+    def name(self) -> str:
+        return "custom"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+extractor = CustomExtractor()
+register_document_extractor(extractor)
+print("Extractor registered")
+```
--- a/docs/snippets/python/plugins/list_plugins.md
+++ b/docs/snippets/python/plugins/list_plugins.md
@@ -0,0 +1,18 @@
+```python title="Python"
+from kreuzberg import (
+    list_document_extractors,
+    list_post_processors,
+    list_ocr_backends,
+    list_validators,
+)
+
+extractors: list[str] = list_document_extractors()
+processors: list[str] = list_post_processors()
+ocr_backends: list[str] = list_ocr_backends()
+validators: list[str] = list_validators()
+
+print(f"Extractors: {extractors}")
+print(f"Processors: {processors}")
+print(f"OCR backends: {ocr_backends}")
+print(f"Validators: {validators}")
+```
--- a/docs/snippets/python/plugins/min_length_validator.md
+++ b/docs/snippets/python/plugins/min_length_validator.md
@@ -0,0 +1,33 @@
+```python title="Python"
+from kreuzberg import register_validator, ExtractionResult, ValidationError
+
+class MinLengthValidator:  # validator plugin; an instance is passed to register_validator below
+    def __init__(self, min_length: int = 100) -> None:
+        self.min_length: int = min_length  # minimum number of characters validate() accepts
+
+    def name(self) -> str:
+        return "min_length_validator"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def priority(self) -> int:
+        return 100  # NOTE(review): ordering semantics of priority are not shown here; confirm
+
+    def validate(self, result: ExtractionResult) -> None:
+        content_len: int = len(result.content)
+        if content_len < self.min_length:  # content of exactly min_length characters passes
+            raise ValidationError(f"Content too short: {content_len}")
+
+    def should_validate(self, result: ExtractionResult) -> bool:
+        return True  # applies to every result
+
+    def initialize(self) -> None:
+        pass  # nothing to set up
+
+    def shutdown(self) -> None:
+        pass  # nothing to release
+
+validator: MinLengthValidator = MinLengthValidator(min_length=100)
+register_validator(validator)
+```
--- a/docs/snippets/python/plugins/pdf_metadata_extractor.md
+++ b/docs/snippets/python/plugins/pdf_metadata_extractor.md
@@ -0,0 +1,42 @@
+```python title="Python"
+from kreuzberg import register_post_processor, ExtractionResult
+import logging
+
+logger = logging.getLogger(__name__)
+
+class PdfMetadataExtractor:  # post-processor plugin; an instance is passed to register_post_processor below
+    def __init__(self) -> None:
+        self.processed_count: int = 0  # incremented by process(), reported by shutdown()
+
+    def name(self) -> str:
+        return "pdf_metadata_extractor"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def description(self) -> str:
+        return "Extracts and enriches PDF metadata"
+
+    def processing_stage(self) -> str:
+        return "early"  # NOTE(review): presumably selects an early pipeline stage; confirm the accepted values
+
+    def should_process(self, result: ExtractionResult) -> bool:
+        return result.mime_type == "application/pdf"  # True only for PDF results
+
+    def process(self, result: ExtractionResult) -> ExtractionResult:
+        self.processed_count += 1
+        result.metadata["pdf_processed"] = True  # tags the result in place
+        result.metadata["pdf_order"] = self.processed_count  # 1-based position among results processed so far
+        result.metadata["content_length"] = len(result.content)  # length of the content in characters
+        result.metadata["pdf_processor_version"] = "1.0.0"  # same literal that version() returns
+        return result  # the same object that was passed in
+
+    def initialize(self) -> None:
+        logger.info("PDF metadata extractor initialized")
+
+    def shutdown(self) -> None:
+        logger.info(f"Processed {self.processed_count} PDFs")  # logs the running total kept by process()
+
+processor: PdfMetadataExtractor = PdfMetadataExtractor()
+register_post_processor(processor)
+```
--- a/docs/snippets/python/plugins/pdf_only_processor.md
+++ b/docs/snippets/python/plugins/pdf_only_processor.md
@@ -0,0 +1,19 @@
+```python title="Python"
+from kreuzberg import ExtractionResult, register_post_processor
+
+class PdfOnlyProcessor:  # shows how should_process() can restrict a post-processor to one MIME type
+    def name(self) -> str:
+        return "pdf-only-processor"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def process(self, result: ExtractionResult) -> ExtractionResult:
+        return result  # pass-through: the result is returned unchanged
+
+    def should_process(self, result: ExtractionResult) -> bool:
+        return result.mime_type == "application/pdf"  # True only for PDF results
+
+processor: PdfOnlyProcessor = PdfOnlyProcessor()
+register_post_processor(processor)
+```
--- a/docs/snippets/python/plugins/plugin_extractor.md
+++ b/docs/snippets/python/plugins/plugin_extractor.md
@@ -0,0 +1,42 @@
+```python title="Python"
+from kreuzberg import register_document_extractor, ExtractionResult
+import json
+
+class CustomJsonExtractor:  # document extractor for JSON; an instance is passed to register_document_extractor below
+    def name(self) -> str:
+        return "custom-json-extractor"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def supported_mime_types(self) -> list[str]:
+        return ["application/json"]
+
+    def priority(self) -> int:
+        return 50  # NOTE(review): ordering semantics of priority are not shown here; confirm
+
+    def extract_bytes(
+        self, content: bytes, mime_type: str, config: dict
+    ) -> ExtractionResult:
+        data: object = json.loads(content)  # top-level JSON may be an object, an array or a scalar
+        text: str = self._extract_text(data)
+        return {"content": text, "mime_type": "application/json"}  # the result is built as a plain dict
+
+    def _extract_text(self, obj: object) -> str:
+        if isinstance(obj, str):
+            return f"{obj}\n"  # every string value becomes its own line
+        if isinstance(obj, list):
+            return "".join(self._extract_text(item) for item in obj)  # recurse into array items
+        if isinstance(obj, dict):
+            return "".join(self._extract_text(v) for v in obj.values())  # values only; keys are dropped
+        return ""  # numbers, booleans and null contribute no text
+
+    def initialize(self) -> None:
+        pass  # nothing to set up
+
+    def shutdown(self) -> None:
+        pass  # nothing to release
+
+extractor: CustomJsonExtractor = CustomJsonExtractor()
+register_document_extractor(extractor)
+```
--- a/docs/snippets/python/plugins/plugin_logging.md
+++ b/docs/snippets/python/plugins/plugin_logging.md
@@ -0,0 +1,27 @@
+```python title="Python"
+import logging
+
+logger = logging.getLogger(__name__)
+
+class MyPlugin:  # demonstrates logging from plugin lifecycle and extraction hooks
+    def name(self) -> str:
+        return "my-plugin"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def initialize(self) -> None:
+        logger.info(f"Initializing plugin: {self.name()}")
+
+    def shutdown(self) -> None:
+        logger.info(f"Shutting down plugin: {self.name()}")
+
+    def extract_bytes(
+        self, content: bytes, mime_type: str, config: dict
+    ) -> dict:
+        logger.info(f"Extracting {mime_type} ({len(content)} bytes)")
+        result: dict = {"content": "", "mime_type": mime_type}  # placeholder: no real extraction happens here
+        if not result["content"]:  # always true for the placeholder above, so the warning always fires
+            logger.warning("Extraction resulted in empty content")
+        return result
+```
--- a/docs/snippets/python/plugins/plugin_testing.md
+++ b/docs/snippets/python/plugins/plugin_testing.md
@@ -0,0 +1,14 @@
+```python title="Python"
+import pytest
+from kreuzberg import ExtractionResult
+
+def test_custom_extractor() -> None:
+    extractor = CustomJsonExtractor()
+    json_data: bytes = b'{"message": "Hello, world!"}'
+    config: dict = {}
+    result: ExtractionResult = extractor.extract_bytes(
+        json_data, "application/json", config
+    )
+    assert "Hello, world!" in result.content
+    assert result.mime_type == "application/json"
+```
--- a/docs/snippets/python/plugins/plugin_validator.md
+++ b/docs/snippets/python/plugins/plugin_validator.md
@@ -0,0 +1,34 @@
+```python title="Python"
+from kreuzberg import (
+    ExtractionResult,
+    ValidationError,
+    extract_file_sync,
+    register_validator,
+)
+
+class MinLengthValidator:  # validator plugin; an instance is passed to register_validator below
+    def name(self) -> str:
+        return "min_length"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def validate(self, result: ExtractionResult) -> None:
+        if len(result.content) < 50:  # fixed threshold: content of 50 or more characters passes
+            raise ValidationError(f"Content too short: {len(result.content)}")
+
+    def should_validate(self, result: ExtractionResult) -> bool:
+        return True  # applies to every result
+
+    def initialize(self) -> None:
+        pass  # nothing to set up
+
+    def shutdown(self) -> None:
+        pass  # nothing to release
+
+validator: MinLengthValidator = MinLengthValidator()
+register_validator(validator)
+
+result = extract_file_sync("document.pdf")
+print(f"Content length: {len(result.content)}")
+```
--- a/docs/snippets/python/plugins/quality_score_validator.md
+++ b/docs/snippets/python/plugins/quality_score_validator.md
@@ -0,0 +1,20 @@
+```python title="Python"
+from kreuzberg import ExtractionResult, ValidationError, register_validator
+
+class QualityValidator:  # validator plugin; an instance is passed to register_validator below
+    def name(self) -> str:
+        return "quality-validator"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def validate(self, result: ExtractionResult) -> None:
+        score: float = result.metadata.get("quality_score", 0.0)  # a missing score counts as 0.0 and is rejected
+        if score < 0.5:  # a score of exactly 0.5 passes
+            raise ValidationError(
+                f"Quality score too low: {score:.2f}"
+            )
+
+validator: QualityValidator = QualityValidator()
+register_validator(validator)
+```
--- a/docs/snippets/python/plugins/stateful_plugin.md
+++ b/docs/snippets/python/plugins/stateful_plugin.md
@@ -0,0 +1,28 @@
+```python title="Python"
+import threading
+from kreuzberg import ExtractionResult
+
+class StatefulPlugin:  # keeps mutable state across process() calls and guards it with a lock
+    def __init__(self) -> None:
+        self.lock: threading.Lock = threading.Lock()  # held while call_count and cache are updated
+        self.call_count: int = 0  # number of process() calls so far
+        self.cache: dict = {}  # only the "last_mime" key is written in this example
+
+    def name(self) -> str:
+        return "stateful-plugin"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def process(self, result: ExtractionResult) -> ExtractionResult:
+        with self.lock:  # both updates happen under the same lock acquisition
+            self.call_count += 1
+            self.cache["last_mime"] = result.mime_type
+        return result  # the result itself is not modified
+
+    def initialize(self) -> None:
+        pass  # nothing to set up
+
+    def shutdown(self) -> None:
+        pass  # nothing to release
+```
--- a/docs/snippets/python/plugins/unregister_plugins.md
+++ b/docs/snippets/python/plugins/unregister_plugins.md
@@ -0,0 +1,20 @@
+```python title="Python"
+from kreuzberg import (
+    unregister_document_extractor,
+    unregister_post_processor,
+    unregister_ocr_backend,
+    unregister_validator,
+)
+
+names: list[str] = [
+    "custom-json-extractor",  # document extractor name
+    "word_count",  # post-processor name
+    "cloud-ocr",  # OCR backend name
+    "min_length_validator",  # validator name
+]
+
+unregister_document_extractor(names[0])  # each call takes the entry for its own plugin kind
+unregister_post_processor(names[1])
+unregister_ocr_backend(names[2])
+unregister_validator(names[3])
+```
--- a/docs/snippets/python/plugins/word_count_processor.md
+++ b/docs/snippets/python/plugins/word_count_processor.md
@@ -0,0 +1,30 @@
+```python title="Python"
+from kreuzberg import register_post_processor, ExtractionResult
+
+class WordCountProcessor:  # post-processor plugin; an instance is passed to register_post_processor below
+    def name(self) -> str:
+        return "word_count"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def processing_stage(self) -> str:
+        return "early"  # NOTE(review): presumably selects an early pipeline stage; confirm the accepted values
+
+    def process(self, result: ExtractionResult) -> ExtractionResult:
+        word_count: int = len(result.content.split())  # words are whitespace-separated tokens
+        result.metadata["word_count"] = word_count  # stored on the result in place
+        return result
+
+    def should_process(self, result: ExtractionResult) -> bool:
+        return bool(result.content)  # False for empty content
+
+    def initialize(self) -> None:
+        pass  # nothing to set up
+
+    def shutdown(self) -> None:
+        pass  # nothing to release
+
+processor: WordCountProcessor = WordCountProcessor()
+register_post_processor(processor)
+```
--- a/docs/snippets/python/utils/chunking.md
+++ b/docs/snippets/python/utils/chunking.md
@@ -0,0 +1,18 @@
+```python title="Python"
+from kreuzberg import (
+    ExtractionConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+)
+
+config: ExtractionConfig = ExtractionConfig(
+    chunking=ChunkingConfig(
+        max_chars=1500,
+        max_overlap=200,
+        embedding=EmbeddingConfig(
+            model=EmbeddingModelType.preset("all-minilm-l6-v2")
+        ),
+    )
+)
+```
--- a/docs/snippets/python/utils/chunking_rag.md
+++ b/docs/snippets/python/utils/chunking_rag.md
@@ -0,0 +1,36 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    extract_file,
+    ExtractionConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+)
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            max_chars=500,
+            max_overlap=50,
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.preset("balanced"),
+                normalize=True,
+                batch_size=16
+            )
+        )
+    )
+    result = await extract_file("research_paper.pdf", config=config)
+
+    chunks_with_embeddings: list = []
+    for chunk in result.chunks or []:
+        if chunk.embedding:
+            chunks_with_embeddings.append({
+                "content": chunk.content[:100],
+                "embedding_dims": len(chunk.embedding)
+            })
+
+    print(f"Chunks with embeddings: {len(chunks_with_embeddings)}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/utils/embedding_with_chunking.md
+++ b/docs/snippets/python/utils/embedding_with_chunking.md
@@ -0,0 +1,21 @@
+```python title="Python"
+from kreuzberg import (
+    ExtractionConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+)
+
+config: ExtractionConfig = ExtractionConfig(
+    chunking=ChunkingConfig(
+        max_chars=1024,
+        max_overlap=100,
+        embedding=EmbeddingConfig(
+            model=EmbeddingModelType.preset("balanced"),
+            normalize=True,
+            batch_size=32,
+            show_download_progress=False,
+        ),
+    )
+)
+```
--- a/Show More
+++ b/Show More