Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/python/config/advanced_config.md
+++ b/docs/snippets/python/config/advanced_config.md
@@ -0,0 +1,60 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    extract_file,
+    ExtractionConfig,
+    OcrConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+    LanguageDetectionConfig,
+    TokenReductionOptions,
+    PostProcessorConfig,
+    KeywordConfig,
+)
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        use_cache=True,
+        enable_quality_processing=True,
+        ocr=OcrConfig(
+            backend="tesseract",
+            language="eng",
+        ),
+        chunking=ChunkingConfig(
+            max_characters=1000,
+            overlap=200,
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.preset("balanced"),
+                batch_size=32,
+                normalize=True,
+            ),
+        ),
+        language_detection=LanguageDetectionConfig(
+            enabled=True,
+            min_confidence=0.8,
+            detect_multiple=False,
+        ),
+        keywords=KeywordConfig(
+            algorithm="yake",
+            max_keywords=10,
+            min_score=0.1,
+            ngram_range=[1, 3],
+            language="en",
+        ),
+        token_reduction=TokenReductionOptions(
+            mode="moderate",
+            preserve_important_words=True,
+        ),
+        postprocessor=PostProcessorConfig(enabled=True),
+    )
+
+    result = await extract_file("document.pdf", config=config)
+    print(f"Content: {result.content[:100]}")
+    if result.detected_languages:
+        print(f"Languages: {result.detected_languages}")
+    if result.chunks:
+        print(f"Chunks: {len(result.chunks)}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/chunking_config.md
+++ b/docs/snippets/python/config/chunking_config.md
@@ -0,0 +1,80 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            max_chars=1000,
+            max_overlap=200,
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Chunks: {len(result.chunks or [])}")
+    for chunk in result.chunks or []:
+        print(f"Length: {len(chunk.content)}")
+
+asyncio.run(main())
+```
+
+```python title="Python - Markdown with Heading Context"
+import asyncio
+from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            chunker_type="markdown",
+            max_chars=500,
+            max_overlap=50,
+            sizing_type="tokenizer",
+            sizing_model="Xenova/gpt-4o",
+        )
+    )
+    result = await extract_file("document.md", config=config)
+    for chunk in result.chunks or []:
+        heading_context = chunk.metadata.get("heading_context")
+        if heading_context:
+            headings = heading_context.get("headings", [])
+            for h in headings:
+                print(f"Heading L{h['level']}: {h['text']}")
+        print(f"Content: {chunk.content[:100]}...")
+
+asyncio.run(main())
+```
+
+```python title="Python - Semantic"
+import asyncio
+from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(chunker_type="semantic")
+    )
+    result = await extract_file("document.pdf", config=config)
+    for chunk in result.chunks or []:
+        print(f"Content: {chunk.content[:100]}...")
+
+asyncio.run(main())
+```
+
+```python title="Python - Prepend Heading Context"
+import asyncio
+from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            chunker_type="markdown",
+            max_chars=500,
+            max_overlap=50,
+            prepend_heading_context=True,
+        )
+    )
+    result = await extract_file("document.md", config=config)
+    for chunk in result.chunks or []:
+        # Each chunk's content is prefixed with its heading breadcrumb
+        print(f"Content: {chunk.content[:100]}...")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/config_basic.md
+++ b/docs/snippets/python/config/config_basic.md
@@ -0,0 +1,14 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig
+
+async def main() -> None:
+    config = ExtractionConfig(
+        use_cache=True,
+        enable_quality_processing=True
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(result.content)
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/config_discover.md
+++ b/docs/snippets/python/config/config_discover.md
@@ -0,0 +1,16 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig()
+    result = await extract_file("document.pdf", config=config)
+
+    content: str = result.content
+    content_preview: str = content[:100]
+
+    print(f"Content preview: {content_preview}")
+    print(f"Total length: {len(content)}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/config_ocr.md
+++ b/docs/snippets/python/config/config_ocr.md
@@ -0,0 +1,16 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        ocr=OcrConfig(
+            backend="tesseract", language="eng+fra",
+            tesseract_config=TesseractConfig(psm=3)
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(result.content)
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/config_programmatic.md
+++ b/docs/snippets/python/config/config_programmatic.md
@@ -0,0 +1,18 @@
+```python title="Python"
+from kreuzberg import (
+    extract_file_sync,
+    ExtractionConfig,
+    OcrConfig,
+    ChunkingConfig,
+)
+
+config: ExtractionConfig = ExtractionConfig(
+    use_cache=True,
+    ocr=OcrConfig(backend="tesseract", language="eng"),
+    chunking=ChunkingConfig(max_chars=1000, max_overlap=200),
+)
+
+result = extract_file_sync("document.pdf", config=config)
+content_length: int = len(result.content)
+print(f"Content length: {content_length}")
+```
--- a/docs/snippets/python/config/document_structure_config.md
+++ b/docs/snippets/python/config/document_structure_config.md
@@ -0,0 +1,15 @@
+```python title="Document Structure Config (Python)"
+from kreuzberg import extract_file_sync, ExtractionConfig
+
+# Enable document structure output
+config = ExtractionConfig(include_document_structure=True)
+
+result = extract_file_sync("document.pdf", config=config)
+
+# Access the document tree
+if result.document:
+    for node in result.document["nodes"]:
+        node_type = node["content"]["node_type"]
+        text = node["content"].get("text", "")
+        print(f"[{node_type}] {text[:80]}")
+```
--- a/docs/snippets/python/config/element_based_output.md
+++ b/docs/snippets/python/config/element_based_output.md
@@ -0,0 +1,29 @@
+```python title="Element-Based Output (Python)"
+from kreuzberg import extract_file_sync, ExtractionConfig
+
+# Configure element-based output
+config = ExtractionConfig(result_format="element_based")
+
+# Extract document
+result = extract_file_sync("document.pdf", config=config)
+
+# Access elements
+for element in result.elements:
+    print(f"Type: {element.element_type}")
+    print(f"Text: {element.text[:100]}")
+
+    if element.metadata.page_number:
+        print(f"Page: {element.metadata.page_number}")
+
+    if element.metadata.coordinates:
+        coords = element.metadata.coordinates
+        print(f"Coords: ({coords.left}, {coords.top}) - ({coords.right}, {coords.bottom})")
+
+    print("---")
+
+# Filter by element type
+titles = [e for e in result.elements if e.element_type == "title"]
+for title in titles:
+    level = title.metadata.additional.get("level", "unknown")
+    print(f"[{level}] {title.text}")
+```
--- a/docs/snippets/python/config/embedding_config.md
+++ b/docs/snippets/python/config/embedding_config.md
@@ -0,0 +1,15 @@
+```python title="Python"
+from kreuzberg import ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType
+
+config = ExtractionConfig(
+    chunking=ChunkingConfig(
+        max_chars=1000,
+        embedding=EmbeddingConfig(
+            model=EmbeddingModelType.preset("all-mpnet-base-v2"),
+            batch_size=16,
+            normalize=True,
+            show_download_progress=True
+        )
+    )
+)
+```
--- a/docs/snippets/python/config/embedding_config.py
+++ b/docs/snippets/python/config/embedding_config.py
@@ -0,0 +1,83 @@
+from kreuzberg import EmbeddingConfig, EmbeddingModelType, ChunkingConfig, ExtractionConfig
+
+# Example 1: Preset model (recommended)
+# Fast, balanced, or quality preset configurations optimized for common use cases.
+embedding_config = EmbeddingConfig(
+    model=EmbeddingModelType.preset("balanced"),
+    batch_size=32,
+    normalize=True,
+    show_download_progress=True,
+    cache_dir="~/.cache/kreuzberg/embeddings",
+)
+
+# Available presets:
+# - "fast" (384 dims): Quick prototyping, development, resource-constrained
+# - "balanced" (768 dims): Production, general-purpose RAG, English documents
+# - "quality" (1024 dims): Complex documents, maximum accuracy
+# - "multilingual" (768 dims): International documents, 100+ languages
+
+
+# Example 2: Custom ONNX model (requires embeddings feature)
+# Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
+embedding_config = EmbeddingConfig(
+    model=EmbeddingModelType.custom(
+        model_id="BAAI/bge-small-en-v1.5",
+        dimensions=384,
+    ),
+    batch_size=32,
+    normalize=True,
+    show_download_progress=True,
+    cache_dir=None,  # Uses default: .kreuzberg/embeddings/
+)
+
+# Popular ONNX-compatible models:
+# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
+# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
+# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
+# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
+
+
+# Example 3: Alternative Custom Model
+# For advanced users wanting alternative ONNX embedding models.
+embedding_config = EmbeddingConfig(
+    model=EmbeddingModelType.custom(
+        model_id="sentence-transformers/all-mpnet-base-v2",
+        dimensions=768,
+    ),
+    batch_size=16,  # Larger model requires smaller batch size
+    normalize=True,
+    show_download_progress=True,
+    cache_dir="/var/cache/embeddings",
+)
+
+
+# Integration with ChunkingConfig
+# Add embeddings to your chunking configuration:
+chunking_with_embeddings = ChunkingConfig(
+    max_chars=1024,
+    max_overlap=100,
+    preset="balanced",
+    embedding=EmbeddingConfig(),  # Uses balanced preset
+)
+
+extraction_config = ExtractionConfig(
+    chunking=chunking_with_embeddings,
+)
+
+
+# Key parameter explanations:
+#
+# batch_size: Number of texts to embed at once (32-128 typical)
+#   - Larger batches are faster but use more memory
+#   - Smaller batches for resource-constrained environments
+#
+# normalize: Whether to normalize vectors (L2 norm)
+#   - True (recommended): Enables cosine similarity in vector DBs
+#   - False: Raw embedding values
+#
+# cache_dir: Where to store downloaded models
+#   - None: Uses .kreuzberg/embeddings/ in current directory
+#   - String path: Custom directory for model storage
+#
+# show_download_progress: Display download progress bar
+#   - Useful for monitoring large model downloads
--- a/docs/snippets/python/config/hierarchy_config.py
+++ b/docs/snippets/python/config/hierarchy_config.py
@@ -0,0 +1,72 @@
+from kreuzberg import PdfConfig, HierarchyConfig, ExtractionConfig, extract_file_sync
+
+# Example 1: Basic hierarchy extraction
+# Enabled with default k_clusters=6 for standard H1-H6 heading hierarchy.
+# Extract bounding box information for spatial layout awareness.
+hierarchy_config_basic = HierarchyConfig(
+    enabled=True,
+    k_clusters=6,  # Default: creates 6 font size clusters (H1-H6 structure)
+    include_bbox=True,  # Include bounding box coordinates
+    ocr_coverage_threshold=None  # No OCR coverage threshold
+)
+
+pdf_config_basic = PdfConfig(hierarchy=hierarchy_config_basic)
+extraction_config_basic = ExtractionConfig(pdf_options=pdf_config_basic)
+
+result = extract_file_sync("document.pdf", config=extraction_config_basic)
+
+
+# Example 2: Custom k_clusters for minimal structure
+# Use 3 clusters for simpler hierarchy with minimal structure.
+# Useful when you only need major section divisions (Main, Subsection, Detail).
+hierarchy_config_minimal = HierarchyConfig(
+    enabled=True,
+    k_clusters=3,  # Minimal clustering: just 3 levels
+    include_bbox=True,
+    ocr_coverage_threshold=None
+)
+
+pdf_config_minimal = PdfConfig(hierarchy=hierarchy_config_minimal)
+extraction_config_minimal = ExtractionConfig(pdf_options=pdf_config_minimal)
+
+result = extract_file_sync("document.pdf", config=extraction_config_minimal)
+
+
+# Example 3: With OCR coverage threshold
+# Trigger OCR when text block coverage falls below 50%.
+# Useful for documents with mixed digital and scanned content.
+hierarchy_config_ocr = HierarchyConfig(
+    enabled=True,
+    k_clusters=6,
+    include_bbox=True,
+    ocr_coverage_threshold=0.5  # Trigger OCR if text coverage < 50%
+)
+
+pdf_config_ocr = PdfConfig(hierarchy=hierarchy_config_ocr)
+extraction_config_ocr = ExtractionConfig(pdf_options=pdf_config_ocr)
+
+result = extract_file_sync("document.pdf", config=extraction_config_ocr)
+
+
+# Field descriptions:
+#
+# enabled: bool (default: True)
+#   - Enable or disable hierarchy extraction
+#   - When False, hierarchy structure is not analyzed
+#
+# k_clusters: int (default: 6, valid: 1-7)
+#   - Number of font size clusters for hierarchy levels
+#   - 6 provides H1-H6 heading levels with body text
+#   - Higher values create more fine-grained hierarchy
+#   - Lower values create simpler structure
+#
+# include_bbox: bool (default: True)
+#   - Include bounding box coordinates in hierarchy blocks
+#   - Required for spatial layout awareness and document structure
+#   - Set to False only if space optimization is critical
+#
+# ocr_coverage_threshold: float | None (default: None)
+#   - Range: 0.0 to 1.0
+#   - Triggers OCR when text block coverage falls below this fraction
+#   - Example: 0.5 means "run OCR if less than 50% of page has text data"
+#   - None means no OCR coverage-based triggering
--- a/docs/snippets/python/config/html_output.md
+++ b/docs/snippets/python/config/html_output.md
@@ -0,0 +1,17 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, extract_file
+
+async def main() -> None:
+    config = ExtractionConfig(
+        output_format="html",
+        html_output={
+            "theme": "github",
+            "embed_css": True,
+        },
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(result.content)  # HTML with kb-* classes and GitHub theme
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/keyword_config.py
+++ b/docs/snippets/python/config/keyword_config.py
@@ -0,0 +1,75 @@
+import asyncio
+from kreuzberg import (
+    ExtractionConfig,
+    KeywordConfig,
+    KeywordAlgorithm,
+    YakeParams,
+    RakeParams,
+    extract_file,
+)
+
+
+# Example 1: Basic YAKE configuration
+# Uses YAKE algorithm with default parameters and English stopword filtering
+async def basic_yake() -> None:
+    config = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.YAKE,
+            max_keywords=10,
+            min_score=0.0,
+            ngram_range=(1, 3),
+            language="en",
+            yake_params=None,
+            rake_params=None,
+        )
+    )
+
+    result = await extract_file("document.pdf", config=config)
+    print(f"Keywords: {result.keywords}")
+
+
+# Example 2: Advanced YAKE with custom parameters
+# Fine-tunes YAKE with custom window size for co-occurrence analysis
+async def advanced_yake() -> None:
+    config = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.YAKE,
+            max_keywords=15,
+            min_score=0.1,
+            ngram_range=(1, 2),
+            language="en",
+            yake_params=YakeParams(
+                window_size=1,
+            ),
+            rake_params=None,
+        )
+    )
+
+    result = await extract_file("document.pdf", config=config)
+    print(f"Keywords: {result.keywords}")
+
+
+# Example 3: RAKE configuration
+# Uses RAKE algorithm for rapid keyword extraction with phrase constraints
+async def rake_config() -> None:
+    config = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.RAKE,
+            max_keywords=10,
+            min_score=5.0,
+            ngram_range=(1, 3),
+            language="en",
+            yake_params=None,
+            rake_params=RakeParams(
+                min_word_length=1,
+                max_words_per_phrase=3,
+            ),
+        )
+    )
+
+    result = await extract_file("document.pdf", config=config)
+    print(f"Keywords: {result.keywords}")
+
+
+if __name__ == "__main__":
+    asyncio.run(basic_yake())
--- a/docs/snippets/python/config/keyword_extraction_config.md
+++ b/docs/snippets/python/config/keyword_extraction_config.md
@@ -0,0 +1,24 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    ExtractionConfig,
+    KeywordConfig,
+    KeywordAlgorithm,
+    extract_file,
+)
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.YAKE,
+            max_keywords=10,
+            min_score=0.3,
+            ngram_range=(1, 3),
+            language="en"
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Content extracted: {len(result.content)} chars")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/language_detection_config.md
+++ b/docs/snippets/python/config/language_detection_config.md
@@ -0,0 +1,19 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        language_detection=LanguageDetectionConfig(
+            enabled=True,
+            min_confidence=0.85,
+            detect_multiple=False
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    if result.detected_languages:
+        print(f"Primary language: {result.detected_languages[0]}")
+    print(f"Content length: {len(result.content)} chars")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/ocr_dpi_config.md
+++ b/docs/snippets/python/config/ocr_dpi_config.md
@@ -0,0 +1,26 @@
+```python title="Python"
+from kreuzberg import (
+    extract_file_sync,
+    ExtractionConfig,
+    OcrConfig,
+    TesseractConfig,
+    ImagePreprocessingConfig,
+)
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(
+        backend="tesseract",
+        tesseract_config=TesseractConfig(
+            preprocessing=ImagePreprocessingConfig(target_dpi=300),
+        ),
+    ),
+)
+
+result = extract_file_sync("scanned.pdf", config=config)
+
+content_length: int = len(result.content)
+table_count: int = len(result.tables)
+
+print(f"Content length: {content_length} characters")
+print(f"Tables detected: {table_count}")
+```
--- a/docs/snippets/python/config/pdf_config.md
+++ b/docs/snippets/python/config/pdf_config.md
@@ -0,0 +1,18 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, PdfConfig, HierarchyConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        pdf_options=PdfConfig(
+            extract_images=True,
+            extract_metadata=True,
+            passwords=["password1", "password2"],
+            hierarchy=HierarchyConfig(enabled=True, k_clusters=6)
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Content: {result.content[:100]}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/pdf_hierarchy_config.md
+++ b/docs/snippets/python/config/pdf_hierarchy_config.md
@@ -0,0 +1,22 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    pdf_options=PdfConfig(
+        extract_metadata=True,
+        hierarchy=HierarchyConfig(
+            enabled=True,
+            k_clusters=6,
+            include_bbox=True,
+            ocr_coverage_threshold=0.8
+        )
+    )
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+# Access hierarchy information
+for page in result.pages or []:
+    print(f"Page {page.page_number}:")
+    print(f"  Content: {page.content[:100]}...")
+```
--- a/docs/snippets/python/config/postprocessor_config.md
+++ b/docs/snippets/python/config/postprocessor_config.md
@@ -0,0 +1,16 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, PostProcessorConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        postprocessor=PostProcessorConfig(
+            enabled=True,
+            enabled_processors=["deduplication"],
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Content: {result.content[:100]}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/quality_processing_config.md
+++ b/docs/snippets/python/config/quality_processing_config.md
@@ -0,0 +1,15 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        enable_quality_processing=True
+    )
+    result = await extract_file("document.pdf", config=config)
+
+    quality_score: float = result.quality_score or 0.0
+    print(f"Quality score: {quality_score:.2f}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/tesseract_config.md
+++ b/docs/snippets/python/config/tesseract_config.md
@@ -0,0 +1,21 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        ocr=OcrConfig(
+            language="eng+fra+deu",
+            tesseract_config=TesseractConfig(
+                psm=6,
+                oem=1,
+                min_confidence=0.8,
+                enable_table_detection=True,
+            ),
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Content: {result.content[:100]}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/config/token_reduction_config.md
+++ b/docs/snippets/python/config/token_reduction_config.md
@@ -0,0 +1,10 @@
+```python title="Python"
+from kreuzberg import ExtractionConfig, TokenReductionConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    token_reduction=TokenReductionConfig(
+        mode="moderate",
+        preserve_important_words=True,
+    )
+)
+```