Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/skills/kreuzberg/references/advanced-features.md
+++ b/skills/kreuzberg/references/advanced-features.md
@@ -0,0 +1,975 @@
+# Advanced Features Reference
+
+Kreuzberg provides powerful advanced features for customization, semantic processing, and integration with external systems.
+
+## Plugin System
+
+The plugin system allows you to extend Kreuzberg's extraction pipeline with custom post-processors, validators, and OCR backends. Plugins run within the extraction pipeline and have direct access to extraction results.
+
+### Custom Post-Processors
+
+Post-processors enrich extraction results after document parsing. They run non-destructively—if a post-processor fails, the extraction succeeds anyway (errors are logged).
+
+=== "Python"
+
+    ```python
+    from kreuzberg import register_post_processor, ExtractionResult
+
+    class MetadataEnricher:
+        def name(self) -> str:
+            return "metadata_enricher"
+
+        def process(self, result: ExtractionResult) -> ExtractionResult:
+            result.metadata["processed_by"] = "metadata_enricher"
+            result.metadata["char_count"] = len(result.content)
+            return result
+
+        def processing_stage(self) -> str:
+            # "early", "middle", or "late"
+            return "middle"
+
+        def initialize(self) -> None:
+            print("Initializing metadata enricher")
+
+        def shutdown(self) -> None:
+            print("Shutting down metadata enricher")
+
+    register_post_processor(MetadataEnricher())
+
+    # Now use extraction with the registered processor
+    from kreuzberg import extract_file_sync
+    result = extract_file_sync("document.pdf")
+    print(result.metadata["char_count"])
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { registerPostProcessor, extractFile, ExtractionResult } from '@kreuzberg/node';
+
+    const enricher = {
+        name(): string {
+            return "metadata_enricher";
+        },
+
+        async process(result: ExtractionResult): Promise<ExtractionResult> {
+            result.metadata.processed_by = "metadata_enricher";
+            result.metadata.char_count = result.content.length;
+            return result;
+        },
+
+        // Optional
+        processingStage(): "early" | "middle" | "late" {
+            return "middle";
+        },
+
+        // Optional
+        async initialize(): Promise<void> {
+            console.log("Initializing metadata enricher");
+        },
+
+        // Optional
+        async shutdown(): Promise<void> {
+            console.log("Shutting down metadata enricher");
+        }
+    };
+
+    registerPostProcessor(enricher);
+
+    // Now use extraction with the registered processor
+    const result = await extractFile("document.pdf");
+    console.log(result.metadata.char_count);
+    ```
+
+### Custom Validators
+
+Validators perform quality checks on extraction results. Unlike post-processors, validator failures cause the entire extraction to fail. Use validators to enforce quality standards.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import register_validator, extract_file_sync, ExtractionResult, ValidationError
+
+    class MinimumContentValidator:
+        def name(self) -> str:
+            return "min_content_validator"
+
+        def validate(self, result: ExtractionResult) -> None:
+            if len(result.content) < 100:
+                raise ValidationError("Extracted content too short (< 100 chars)")
+
+        def priority(self) -> int:
+            # Higher priority runs first (0-1000, default 50)
+            return 100
+
+        def should_validate(self, result: ExtractionResult) -> bool:
+            # Only validate PDFs
+            return "pdf" in result.mime_type.lower()
+
+        def initialize(self) -> None:
+            pass
+
+        def shutdown(self) -> None:
+            pass
+
+    register_validator(MinimumContentValidator())
+
+    # Extraction will fail if content < 100 chars
+    result = extract_file_sync("document.pdf")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { registerValidator, extractFile, ExtractionResult } from '@kreuzberg/node';
+
+    const validator = {
+        name(): string {
+            return "min_content_validator";
+        },
+
+        async validate(result: ExtractionResult): Promise<void> {
+            if (result.content.length < 100) {
+                throw new Error("Extracted content too short (< 100 chars)");
+            }
+        },
+
+        // Optional
+        priority(): number {
+            return 100;
+        },
+
+        // Optional
+        shouldValidate(result: ExtractionResult): boolean {
+            return result.mimeType.toLowerCase().includes("pdf");
+        },
+
+        // Optional
+        async initialize(): Promise<void> {},
+
+        // Optional
+        async shutdown(): Promise<void> {}
+    };
+
+    registerValidator(validator);
+
+    // Extraction will fail if content < 100 chars
+    const result = await extractFile("document.pdf");
+    ```
+
+### Custom OCR Backends
+
+Implement custom OCR engines by registering an OCR backend. This allows integration with proprietary or specialized OCR solutions.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import register_ocr_backend
+
+    class CustomOcrBackend:
+        def name(self) -> str:
+            return "custom_ocr"
+
+        def supported_languages(self) -> list[str]:
+            return ["eng", "deu", "fra", "spa"]
+
+        def process_image(self, image_bytes: bytes, language: str) -> dict:
+            # image_bytes: raw image data
+            # language: ISO 639-3 code (e.g., "eng", "deu")
+
+            # Call your OCR engine here
+            # text = my_ocr_engine.recognize(image_bytes, language)
+
+            return {
+                "content": "Extracted text from image",
+                "metadata": {"confidence": 0.95, "language": language},
+                "tables": []
+            }
+
+        def process_file(self, path: str, language: str) -> dict:
+            # Optional: custom file processing
+            # Called when extracting OCR from a file path
+            with open(path, "rb") as f:
+                image_bytes = f.read()
+            return self.process_image(image_bytes, language)
+
+        def initialize(self) -> None:
+            # Load models, initialize engine
+            pass
+
+        def shutdown(self) -> None:
+            # Clean up resources
+            pass
+
+        def version(self) -> str:
+            return "1.0.0"
+
+    register_ocr_backend(CustomOcrBackend())
+
+    # Use in extraction config
+    from kreuzberg import ExtractionConfig, OcrConfig, extract_file_sync
+
+    config = ExtractionConfig(
+        ocr=OcrConfig(backend="custom_ocr", language="eng")
+    )
+    result = extract_file_sync("scanned.pdf", config=config)
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { registerOcrBackend, ExtractionConfig, extractFile } from '@kreuzberg/node';
+
+    const backend = {
+        name(): string {
+            return "custom_ocr";
+        },
+
+        supportedLanguages(): string[] {
+            return ["eng", "deu", "fra", "spa"];
+        },
+
+        async processImage(
+            imageBytes: Uint8Array | string,
+            language: string
+        ): Promise<{
+            content: string;
+            mime_type: string;
+            metadata: Record<string, unknown>;
+            tables: unknown[];
+        }> {
+            const buffer = typeof imageBytes === "string"
+                ? Buffer.from(imageBytes, "base64")
+                : Buffer.from(imageBytes);
+
+            // Call your OCR engine
+            // const text = await myOcrEngine.recognize(buffer, language);
+
+            return {
+                content: "Extracted text from image",
+                mime_type: "text/plain",
+                metadata: { confidence: 0.95, language },
+                tables: []
+            };
+        },
+
+        // Optional
+        async initialize(): Promise<void> {
+            // Load models, initialize engine
+        },
+
+        // Optional
+        async shutdown(): Promise<void> {
+            // Clean up resources
+        }
+    };
+
+    registerOcrBackend(backend);
+
+    // Use in extraction config
+    const config: ExtractionConfig = {
+        ocr: { backend: "custom_ocr", language: "eng" }
+    };
+    const result = await extractFile("scanned.pdf", null, config);
+    ```
+
+## Per-File Configuration in Batch Operations
+
+Use `FileExtractionConfig` to override extraction settings for individual files within a batch. This is useful for mixed-format batches where different documents need different OCR, output, or processing settings.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import (
+        batch_extract_files_sync,
+        ExtractionConfig, FileExtractionConfig, OcrConfig,
+    )
+
+    config = ExtractionConfig(output_format="markdown")
+    paths = ["report.pdf", "scan.tiff"]
+    file_configs = [
+        None,  # use batch defaults
+        FileExtractionConfig(
+            force_ocr=True,
+            ocr=OcrConfig(backend="tesseract", language="deu"),
+        ),
+    ]
+    results = batch_extract_files_sync(paths, config, file_configs=file_configs)
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { batchExtractFilesSync } from '@kreuzberg/node';
+
+    const results = batchExtractFilesSync(
+      ['report.pdf', 'scan.tiff'],
+      { outputFormat: 'markdown' },
+      [null, { forceOcr: true, ocr: { backend: 'tesseract', language: 'deu' } }],
+    );
+    ```
+
+All `ExtractionConfig` fields except batch-level concerns (`max_concurrent_extractions`, `use_cache`, `acceleration`, `security_limits`) can be overridden. `None`/`null` fields inherit from the batch default.
+
+## Embeddings
+
+Generate vector embeddings for text chunks using ONNX-based models. Embeddings enable semantic search, clustering, and similarity operations on extracted content.
+
+**Requirements:** ONNX Runtime 1.22.x or later
+
+=== "Python"
+
+    ```python
+    from kreuzberg import (
+        ExtractionConfig, ChunkingConfig, EmbeddingConfig,
+        EmbeddingModelType, list_embedding_presets,
+        get_embedding_preset, extract_file_sync
+    )
+
+    # List available embedding presets
+    presets = list_embedding_presets()
+    print(f"Available presets: {presets}")  # ['balanced', 'compact', 'large']
+
+    # Get details about a preset
+    preset_info = get_embedding_preset("balanced")
+    print(f"Model: {preset_info.model_name}")
+    print(f"Dimensions: {preset_info.dimensions}")
+    print(f"Recommended chunk size: {preset_info.chunk_size}")
+
+    # Method 1: Use preset (recommended)
+    config = ExtractionConfig(
+        chunking=ChunkingConfig(
+            max_chars=512,
+            max_overlap=100,
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.preset("balanced"),
+                normalize=True,
+                batch_size=32
+            )
+        )
+    )
+
+    # Method 2: Use specific fastembed model
+    config = ExtractionConfig(
+        chunking=ChunkingConfig(
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.fastembed(
+                    model="BAAI/bge-small-en-v1.5",
+                    dimensions=384
+                ),
+                normalize=True
+            )
+        )
+    )
+
+    # Method 3: Use custom ONNX model from HuggingFace
+    config = ExtractionConfig(
+        chunking=ChunkingConfig(
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.custom(
+                    model_id="sentence-transformers/all-MiniLM-L6-v2",
+                    dimensions=384
+                ),
+                cache_dir="/path/to/model/cache"
+            )
+        )
+    )
+
+    result = extract_file_sync("document.pdf", config=config)
+
+    # Access embeddings in chunks
+    for chunk in result.chunks:
+        embedding = chunk.embedding  # list[float] or None
+        print(f"Chunk: {chunk.content[:50]}...")
+        print(f"Embedding dimensions: {len(embedding) if embedding else 0}")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import {
+        ExtractionConfig, ChunkingConfig,
+        listEmbeddingPresets, getEmbeddingPreset,
+        extractFile
+    } from '@kreuzberg/node';
+
+    // List available embedding presets
+    const presets = listEmbeddingPresets();
+    console.log(`Available presets: ${presets}`);  // ['balanced', 'compact', 'large']
+
+    // Get details about a preset
+    const preset = getEmbeddingPreset("balanced");
+    console.log(`Model: ${preset.modelName}`);
+    console.log(`Dimensions: ${preset.dimensions}`);
+    console.log(`Recommended chunk size: ${preset.chunkSize}`);
+
+    // Method 1: Use preset (recommended)
+    const config: ExtractionConfig = {
+        chunking: {
+            maxChars: 512,
+            maxOverlap: 100,
+            embedding: {
+                model: { type: 'preset', name: 'balanced' },
+                normalize: true,
+                batchSize: 32
+            }
+        }
+    };
+
+    // Method 2: Use specific fastembed model
+    const config2: ExtractionConfig = {
+        chunking: {
+            embedding: {
+                model: {
+                    type: 'fastembed',
+                    model: 'BAAI/bge-small-en-v1.5',
+                    dimensions: 384
+                },
+                normalize: true
+            }
+        }
+    };
+
+    // Method 3: Use custom ONNX model
+    const config3: ExtractionConfig = {
+        chunking: {
+            embedding: {
+                model: {
+                    type: 'custom',
+                    modelId: 'sentence-transformers/all-MiniLM-L6-v2',
+                    dimensions: 384
+                },
+                cacheDir: '/path/to/model/cache'
+            }
+        }
+    };
+
+    const result = await extractFile("document.pdf", null, config);
+
+    // Access embeddings in chunks
+    if (result.chunks) {
+        for (const chunk of result.chunks) {
+            const embedding = chunk.embedding;  // number[] | null
+            console.log(`Chunk: ${chunk.content.substring(0, 50)}...`);
+            console.log(`Embedding dimensions: ${embedding?.length ?? 0}`);
+        }
+    }
+    ```
+
+## Keyword Extraction
+
+Extract important keywords and phrases from documents using YAKE (Yet Another Keyword Extractor) or RAKE (Rapid Automatic Keyword Extraction) algorithms.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import (
+        ExtractionConfig, KeywordConfig, KeywordAlgorithm,
+        YakeParams, RakeParams, extract_file_sync
+    )
+
+    # YAKE algorithm (unsupervised, good for general use)
+    config = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.Yake,
+            max_keywords=15,
+            min_score=0.1,
+            ngram_range=(1, 3),
+            language="en",
+            yake_params=YakeParams(window_size=2)
+        )
+    )
+
+    # RAKE algorithm (co-occurrence based)
+    config = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.Rake,
+            max_keywords=10,
+            min_score=0.0,
+            language="en",
+            rake_params=RakeParams(
+                min_word_length=3,
+                max_words_per_phrase=3
+            )
+        )
+    )
+
+    result = extract_file_sync("document.pdf", config=config)
+
+    # Access extracted keywords
+    if result.keywords:
+        for keyword in result.keywords:
+            print(f"Text: {keyword.text}")
+            print(f"Score: {keyword.score}")
+            print(f"Algorithm: {keyword.algorithm}")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import {
+        ExtractionConfig, KeywordConfig,
+        extractFile
+    } from '@kreuzberg/node';
+
+    // YAKE algorithm
+    const config: ExtractionConfig = {
+        keywords: {
+            algorithm: "yake",
+            maxKeywords: 15,
+            minScore: 0.1,
+            ngramRange: [1, 3],
+            language: "en",
+            yakeParams: {
+                windowSize: 2
+            }
+        }
+    };
+
+    // RAKE algorithm
+    const config2: ExtractionConfig = {
+        keywords: {
+            algorithm: "rake",
+            maxKeywords: 10,
+            minScore: 0.0,
+            language: "en",
+            rakeParams: {
+                minWordLength: 3,
+                maxWordsPerPhrase: 3
+            }
+        }
+    };
+
+    const result = await extractFile("document.pdf", null, config);
+
+    // Access extracted keywords
+    if (result.keywords) {
+        for (const keyword of result.keywords) {
+            console.log(`Text: ${keyword.text}`);
+            console.log(`Score: ${keyword.score}`);
+            console.log(`Algorithm: ${keyword.algorithm}`);
+        }
+    }
+    ```
+
+## Language Detection
+
+Automatically detect the language(s) of a document. Detected languages are reported as ISO 639-1 codes (for example `en`, `de`, `fr`).
+
+=== "Python"
+
+    ```python
+    from kreuzberg import (
+        ExtractionConfig, LanguageDetectionConfig,
+        extract_file_sync
+    )
+
+    # Enable language detection
+    config = ExtractionConfig(
+        language_detection=LanguageDetectionConfig(
+            enabled=True,
+            min_confidence=0.8,
+            detect_multiple=False
+        )
+    )
+
+    result = extract_file_sync("multilingual.pdf", config=config)
+
+    # Access detected languages
+    if result.detected_languages:
+        for lang_code in result.detected_languages:
+            print(f"Detected language: {lang_code}")  # e.g., "en", "de", "fr"
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import {
+        ExtractionConfig, LanguageDetectionConfig,
+        extractFile
+    } from '@kreuzberg/node';
+
+    const config: ExtractionConfig = {
+        languageDetection: {
+            enabled: true,
+            minConfidence: 0.8,
+            detectMultiple: false
+        }
+    };
+
+    const result = await extractFile("multilingual.pdf", null, config);
+
+    // Access detected languages
+    if (result.detectedLanguages) {
+        for (const langCode of result.detectedLanguages) {
+            console.log(`Detected language: ${langCode}`);  // e.g., "en", "de", "fr"
+        }
+    }
+    ```
+
+## Token Reduction
+
+Reduce the number of tokens in extracted content to lower costs when working with LLM APIs. More aggressive modes remove more tokens but may also lose more information.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import (
+        ExtractionConfig, TokenReductionConfig,
+        extract_file_sync
+    )
+
+    # Light token reduction
+    config = ExtractionConfig(
+        token_reduction=TokenReductionConfig(
+            mode="light",
+            preserve_important_words=True
+        )
+    )
+
+    # Moderate reduction
+    config = ExtractionConfig(
+        token_reduction=TokenReductionConfig(
+            mode="moderate",
+            preserve_important_words=True
+        )
+    )
+
+    # Aggressive reduction
+    config = ExtractionConfig(
+        token_reduction=TokenReductionConfig(
+            mode="aggressive",
+            preserve_important_words=True
+        )
+    )
+
+    # Maximum reduction
+    config = ExtractionConfig(
+        token_reduction=TokenReductionConfig(
+            mode="maximum",
+            preserve_important_words=True
+        )
+    )
+
+    result = extract_file_sync("document.pdf", config=config)
+    print(f"Reduced content length: {len(result.content)}")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import {
+        ExtractionConfig, TokenReductionConfig,
+        extractFile
+    } from '@kreuzberg/node';
+
+    const config: ExtractionConfig = {
+        tokenReduction: {
+            mode: "moderate",
+            preserveImportantWords: true
+        }
+    };
+
+    const result = await extractFile("document.pdf", null, config);
+    console.log(`Reduced content length: ${result.content.length}`);
+    ```
+
+**Token Reduction Modes:**
+
+- `off`: No reduction (default)
+- `light`: Remove extra whitespace and redundant punctuation
+- `moderate`: Also remove common filler words and some formatting
+- `aggressive`: Also remove longer stopwords and collapse similar phrases
+- `maximum`: Maximum reduction while preserving semantic content
+
+## Page Extraction
+
+Extract and track per-page content separately. Useful for multi-page documents where you need page-level granularity.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import (
+        ExtractionConfig, PageConfig,
+        extract_file_sync
+    )
+
+    config = ExtractionConfig(
+        pages=PageConfig(
+            extract_pages=True,
+            insert_page_markers=True,
+            marker_format="\n\n<!-- PAGE {page_num} -->\n\n"
+        )
+    )
+
+    result = extract_file_sync("multi_page.pdf", config=config)
+
+    # Access per-page content
+    if result.pages:
+        for page in result.pages:
+            print(f"Page {page.page_number}:")
+            print(f"Content: {page.content[:100]}...")
+            print(f"Tables: {len(page.tables)}")
+            print(f"Images: {len(page.images)}")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import {
+        ExtractionConfig, PageExtractionConfig,
+        extractFile
+    } from '@kreuzberg/node';
+
+    const config: ExtractionConfig = {
+        pages: {
+            extractPages: true,
+            insertPageMarkers: true,
+            markerFormat: "\n\n<!-- PAGE {page_num} -->\n\n"
+        }
+    };
+
+    const result = await extractFile("multi_page.pdf", null, config);
+
+    // Access per-page content
+    if (result.pages) {
+        for (const page of result.pages) {
+            console.log(`Page ${page.pageNumber}:`);
+            console.log(`Content: ${page.content.substring(0, 100)}...`);
+            console.log(`Tables: ${page.tables.length}`);
+            console.log(`Images: ${page.images.length}`);
+        }
+    }
+    ```
+
+## Element-Based Output
+
+Extract semantic elements instead of unified content. This format is compatible with the Unstructured library and provides structured access to different content types (titles, headings, text, tables, images, etc.).
+
+=== "Python"
+
+    ```python
+    from kreuzberg import ExtractionConfig, ResultFormat, extract_file_sync
+
+    config = ExtractionConfig(
+        result_format="element_based"
+    )
+
+    result = extract_file_sync("document.pdf", config=config)
+
+    # Access semantic elements
+    if result.elements:
+        for element in result.elements:
+            print(f"Type: {element.element_type}")  # title, heading, narrative_text, etc.
+            print(f"Text: {element.text}")
+            if element.metadata.get("page_number"):
+                print(f"Page: {element.metadata['page_number']}")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { ExtractionConfig, extractFile } from '@kreuzberg/node';
+
+    const config: ExtractionConfig = {
+        resultFormat: "element_based"
+    };
+
+    const result = await extractFile("document.pdf", null, config);
+
+    // Access semantic elements
+    if (result.elements) {
+        for (const element of result.elements) {
+            console.log(`Type: ${element.elementType}`);
+            console.log(`Text: ${element.text}`);
+            if (element.metadata.pageNumber) {
+                console.log(`Page: ${element.metadata.pageNumber}`);
+            }
+        }
+    }
+    ```
+
+**Element Types:**
+
+- `title`: Document or section title
+- `heading`: Section headings
+- `narrative_text`: Regular paragraph text
+- `list_item`: Items in bullet/numbered lists
+- `table`: Table structures
+- `image`: Images or figures
+- `page_break`: Page boundaries
+- `code_block`: Code snippets
+- `block_quote`: Quoted text
+- `footer`: Footer content
+- `header`: Header content
+
+## Djot Content
+
+Output extracted content in Djot markup format (a lighter alternative to Markdown with enhanced structure).
+
+=== "Python"
+
+    ```python
+    from kreuzberg import ExtractionConfig, OutputFormat, extract_file_sync
+
+    config = ExtractionConfig(
+        output_format="djot"
+    )
+
+    result = extract_file_sync("document.pdf", config=config)
+    print(result.content)  # Djot-formatted content
+
+    # Access structured Djot content
+    if result.djot_content:
+        print(f"Plain text: {result.djot_content['plain_text']}")
+        print(f"Blocks: {result.djot_content['blocks']}")
+        print(f"Links: {result.djot_content['links']}")
+        print(f"Images: {result.djot_content['images']}")
+        print(f"Footnotes: {result.djot_content['footnotes']}")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { ExtractionConfig, extractFile } from '@kreuzberg/node';
+
+    const config: ExtractionConfig = {
+        outputFormat: "djot"
+    };
+
+    const result = await extractFile("document.pdf", null, config);
+    console.log(result.content);  // Djot-formatted content
+
+    // Access structured Djot content (if available)
+    if (result.djotContent) {
+        console.log(`Plain text: ${result.djotContent.plain_text}`);
+        console.log(`Blocks: ${result.djotContent.blocks}`);
+        console.log(`Links: ${result.djotContent.links}`);
+        console.log(`Images: ${result.djotContent.images}`);
+        console.log(`Footnotes: ${result.djotContent.footnotes}`);
+    }
+    ```
+
+## API Server
+
+Run Kreuzberg as an HTTP API server for integration with external services.
+
+```bash
+# Start server on default port 8000
+kreuzberg serve
+
+# Custom host and port
+kreuzberg serve --host 0.0.0.0 --port 9000
+
+# Bind explicitly to localhost on port 8000
+kreuzberg serve --host localhost --port 8000
+```
+
+**API Endpoints:**
+
+- `POST /extract` - Extract from uploaded file
+- `POST /batch` - Batch extraction
+- `POST /detect` - Detect MIME type
+
+**Example:**
+
+```bash
+curl -X POST -F "file=@document.pdf" http://localhost:8000/extract
+```
+
+## MCP Server
+
+Run Kreuzberg as a Model Context Protocol server for integration with Claude and other AI models.
+
+```bash
+# Start MCP server with stdio transport
+kreuzberg mcp --transport stdio
+
+# Start MCP server with HTTP transport
+kreuzberg mcp --transport http --host 127.0.0.1 --port 8001
+```
+
+The MCP server exposes extraction functions to AI models, allowing them to process documents directly.
+
+## Security Limits
+
+Set resource limits to prevent abuse and control memory/file size consumption.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import ExtractionConfig, extract_file_sync
+
+    config = ExtractionConfig(
+        security_limits={
+            "max_file_size": 100_000_000,      # 100 MB
+            "max_archive_files": 1000,
+            "max_text_length": 10_000_000,     # 10 MB of text
+            "max_pages": 10000,
+            "max_concurrent_extractions": 4
+        }
+    )
+
+    result = extract_file_sync("document.pdf", config=config)
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { ExtractionConfig, extractFile } from '@kreuzberg/node';
+
+    const config: ExtractionConfig = {
+        securityLimits: {
+            max_file_size: 100_000_000,        // 100 MB
+            max_archive_files: 1000,
+            max_text_length: 10_000_000,       // 10 MB of text
+            max_pages: 10000,
+            max_concurrent_extractions: 4
+        }
+    };
+
+    const result = await extractFile("document.pdf", null, config);
+    ```
+
+**Common Limits:**
+
+- `max_file_size`: Maximum input file size in bytes
+- `max_archive_files`: Maximum files in archives (zip, tar, etc.)
+- `max_text_length`: Maximum extracted text length
+- `max_pages`: Maximum number of pages to process
+- `max_concurrent_extractions`: Maximum concurrent extraction operations
+
+## Caching
+
+Extraction results are cached by default to improve performance on repeated extractions of identical documents. Control caching behavior through configuration.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import ExtractionConfig, extract_file_sync
+
+    # Enable caching (default)
+    config = ExtractionConfig(use_cache=True)
+    result = extract_file_sync("document.pdf", config=config)
+
+    # Disable caching for a specific extraction
+    config = ExtractionConfig(use_cache=False)
+    result = extract_file_sync("document.pdf", config=config)
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { ExtractionConfig, extractFile } from '@kreuzberg/node';
+
+    // Enable caching (default)
+    const config: ExtractionConfig = { useCache: true };
+    const result = await extractFile("document.pdf", null, config);
+
+    // Disable caching
+    const config2: ExtractionConfig = { useCache: false };
+    const result2 = await extractFile("document.pdf", null, config2);
+    ```
+
+**CLI Cache Management:**
+
+```bash
+# View cache statistics
+kreuzberg cache stats
+
+# Clear all cached results
+kreuzberg cache clear
+```
+
+Caching is transparent and automatic: extracting the same input again returns the cached result instead of re-processing the document.
--- a/skills/kreuzberg/references/cli-reference.md
+++ b/skills/kreuzberg/references/cli-reference.md
@@ -0,0 +1,449 @@
+# Kreuzberg CLI Reference
+
+Comprehensive command-line interface for the Kreuzberg document intelligence library.
+
+## Installation
+
+Install from crates.io:
+
+```bash
+cargo install kreuzberg-cli
+```
+
+Or download pre-built binaries from [GitHub Releases](https://github.com/kreuzberg-dev/kreuzberg/releases).
+
+## Commands
+
+### extract
+
+Extract text and structure from a single document.
+
+```bash
+kreuzberg extract <path> [FLAGS]
+```
+
+**Positional Arguments**
+
+- `<path>` — Path to the document file
+
+**Flags**
+
+- `-c, --config <path>` — Path to config file (TOML, YAML, or JSON). Auto-discovers `kreuzberg.{toml,yaml,json}` in current and parent directories if omitted.
+- `--config-json <json>` — Inline JSON configuration (merged after config file, before CLI flags).
+- `--config-json-base64 <base64>` — Base64-encoded JSON configuration.
+- `-m, --mime-type <type>` — MIME type hint (auto-detected if not provided).
+- `-f, --format <text|json>` — CLI output format (default: `text`). Controls how results display, not extraction content format.
+- `--content-format <plain|markdown|djot|html>` — Extraction content format (default: `plain`). Controls format of extracted content. (Note: `--output-format` is a deprecated alias.)
+- `--ocr <bool>` — Enable OCR processing.
+- `--ocr-backend <BACKEND>` — OCR backend: `tesseract`, `paddle-ocr`, `easyocr`.
+- `--ocr-language <LANG>` — OCR language code.
+- `--ocr-auto-rotate <bool>` — Auto-rotate images before OCR.
+- `--force-ocr <bool>` — Force OCR even if text extraction succeeds.
+- `--disable-ocr <bool>` — Disable OCR entirely (even for images).
+- `--no-cache <bool>` — Disable caching.
+- `--chunk <bool>` — Enable text chunking.
+- `--chunk-size <n>` — Chunk size in characters.
+- `--chunk-overlap <n>` — Chunk overlap in characters.
+- `--chunking-tokenizer <model>` — Tokenizer model for token-based sizing.
+- `--include-structure <bool>` — Include hierarchical document structure.
+- `--quality <bool>` — Enable quality processing.
+- `--detect-language <bool>` — Enable language detection.
+- `--layout` — Enable layout detection (RT-DETR v2). Use `--layout false` to disable.
+- `--layout-confidence <float>` — Layout confidence threshold (0.0-1.0).
+- `--layout-table-model <model>` — Table structure model: `tatr`, `slanet_wired`, `slanet_wireless`, `slanet_plus`, `slanet_auto`, `disabled`.
+- `--acceleration <provider>` — ONNX execution provider: `auto`, `cpu`, `coreml`, `cuda`, `tensorrt`.
+- `--extract-pages <bool>` — Extract pages as separate array.
+- `--page-markers <bool>` — Insert page marker comments.
+- `--extract-images <bool>` — Enable image extraction.
+- `--target-dpi <n>` — Target DPI for images (36-2400).
+- `--pdf-password <pass>` — Password for encrypted PDFs (repeatable).
+- `--pdf-extract-images <bool>` — Extract images from PDF pages.
+- `--pdf-extract-metadata <bool>` — Extract PDF metadata.
+- `--token-reduction <level>` — Token reduction: `off`, `light`, `moderate`, `aggressive`, `maximum`.
+- `--msg-codepage <n>` — Windows codepage fallback for MSG files.
+- `--max-concurrent <n>` — Max parallel extractions in batch mode.
+- `--max-threads <n>` — Cap all internal thread pools.
+- `--cache-namespace <name>` — Cache namespace for tenant isolation.
+- `--cache-ttl-secs <n>` — Per-request cache TTL in seconds.
+
+**Examples**
+
+```bash
+# Extract with default settings
+kreuzberg extract document.pdf
+
+# Extract with OCR enabled
+kreuzberg extract scanned.pdf --ocr true
+
+# Extract with specific content format
+kreuzberg extract doc.docx --content-format markdown
+
+# Extract with inline JSON config
+kreuzberg extract file.pdf --config-json '{"ocr":{"backend":"tesseract"}}'
+
+# Extract with base64-encoded config
+kreuzberg extract file.pdf --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
+
+# Extract and output as JSON
+kreuzberg extract doc.pdf --format json
+
+# Extract with chunking
+kreuzberg extract large-doc.pdf --chunk true --chunk-size 2000 --chunk-overlap 200
+
+# Layout-aware markdown extraction
+kreuzberg extract document.pdf --layout --content-format markdown
+
+# With custom confidence threshold
+kreuzberg extract document.pdf --layout-confidence 0.7 --content-format markdown
+```
+
+### batch
+
+Batch extract from multiple documents in parallel.
+
+```bash
+kreuzberg batch <paths...> [FLAGS]
+```
+
+**Positional Arguments**
+
+- `<paths...>` — One or more document file paths
+
+**Flags**
+
+- `-c, --config <path>` — Path to config file (TOML, YAML, or JSON). Auto-discovers `kreuzberg.{toml,yaml,json}` in current and parent directories if omitted.
+- `--config-json <json>` — Inline JSON configuration (merged after config file, before CLI flags).
+- `--config-json-base64 <base64>` — Base64-encoded JSON configuration.
+- `-f, --format <text|json>` — CLI output format (default: `json`). Controls how results display, not extraction content format.
+- All other extraction override flags from `extract` are also supported (e.g., `--content-format`, `--ocr`, `--layout`, `--force-ocr`, `--no-cache`, `--quality`, `--acceleration`, etc.), except `--mime-type` and `--detect-language`. See the `extract` command flags for the full list.
+
+**Notes**
+
+- Batch command defaults to JSON output format (unlike `extract` which defaults to text).
+- Does not support `--mime-type` or `--detect-language` flags.
+
+**Examples**
+
+```bash
+# Batch extract multiple PDFs
+kreuzberg batch document1.pdf document2.pdf document3.pdf
+
+# Batch extract with glob patterns (shell expansion)
+kreuzberg batch *.pdf
+
+# Batch extract with custom content format
+kreuzberg batch doc1.pdf doc2.pdf --content-format markdown
+
+# Batch extract with OCR
+kreuzberg batch scanned*.pdf --ocr true
+
+# Batch extract with text output format
+kreuzberg batch files*.docx --format text
+```
+
+### detect
+
+Identify MIME type of a file.
+
+```bash
+kreuzberg detect <path> [FLAGS]
+```
+
+**Positional Arguments**
+
+- `<path>` — Path to the file
+
+**Flags**
+
+- `-f, --format <text|json>` — Output format (default: `text`)
+
+**Examples**
+
+```bash
+# Detect MIME type (text output)
+kreuzberg detect unknown-file.bin
+
+# Detect MIME type (JSON output)
+kreuzberg detect file.xyz --format json
+```
+
+### version
+
+Display version information.
+
+```bash
+kreuzberg version [FLAGS]
+```
+
+**Flags**
+
+- `-f, --format <text|json>` — Output format (default: `text`)
+
+**Examples**
+
+```bash
+# Show version as text
+kreuzberg version
+
+# Show version as JSON
+kreuzberg version --format json
+```
+
+### cache
+
+Manage extraction cache.
+
+#### cache stats
+
+Display cache statistics.
+
+```bash
+kreuzberg cache stats [FLAGS]
+```
+
+**Flags**
+
+- `--cache-dir <path>` — Cache directory (default: `.kreuzberg` in current directory)
+- `-f, --format <text|json>` — Output format (default: `text`)
+
+**Examples**
+
+```bash
+# Show cache stats
+kreuzberg cache stats
+
+# Show cache stats as JSON
+kreuzberg cache stats --format json
+
+# Show stats for specific cache directory
+kreuzberg cache stats --cache-dir /tmp/my-cache
+```
+
+#### cache clear
+
+Clear all cached extractions.
+
+```bash
+kreuzberg cache clear [FLAGS]
+```
+
+**Flags**
+
+- `--cache-dir <path>` — Cache directory (default: `.kreuzberg` in current directory)
+- `-f, --format <text|json>` — Output format (default: `text`)
+
+**Examples**
+
+```bash
+# Clear cache
+kreuzberg cache clear
+
+# Clear specific cache directory
+kreuzberg cache clear --cache-dir /tmp/my-cache
+```
+
+### serve
+
+Start the API server (requires `api` feature).
+
+```bash
+kreuzberg serve [FLAGS]
+```
+
+**Flags**
+
+- `-H, --host <host>` — Host to bind to (e.g., `127.0.0.1` or `0.0.0.0`). CLI arg overrides config file and environment variables.
+- `-p, --port <port>` — Port to bind to. CLI arg overrides config file and environment variables.
+- `-c, --config <path>` — Path to config file (TOML, YAML, or JSON). Auto-discovers `kreuzberg.{toml,yaml,json}` in current and parent directories if omitted.
+
+**Configuration Precedence**
+
+1. CLI arguments (`--host`, `--port`)
+2. Environment variables (`KREUZBERG_HOST`, `KREUZBERG_PORT`)
+3. Config file (`[server]` section)
+4. Built-in defaults (`127.0.0.1:8000`)
+
+**Examples**
+
+```bash
+# Start server with defaults
+kreuzberg serve
+
+# Start server on specific host and port
+kreuzberg serve --host 0.0.0.0 --port 3000
+
+# Start server with config file
+kreuzberg serve --config kreuzberg.toml
+
+# Start server (environment variables override defaults)
+KREUZBERG_HOST=192.168.1.100 KREUZBERG_PORT=8080 kreuzberg serve
+```
+
+### mcp
+
+Start the Model Context Protocol (MCP) server (requires `mcp` feature).
+
+```bash
+kreuzberg mcp [FLAGS]
+```
+
+**Flags**
+
+- `-c, --config <path>` — Path to config file (TOML, YAML, or JSON). Auto-discovers `kreuzberg.{toml,yaml,json}` in current and parent directories if omitted.
+- `--transport <stdio|http>` — Transport mode (default: `stdio`)
+- `--host <host>` — HTTP host for http transport (default: `127.0.0.1`)
+- `--port <port>` — HTTP port for http transport (default: `8001`)
+
+**Examples**
+
+```bash
+# Start MCP server with stdio transport
+kreuzberg mcp
+
+# Start MCP server with HTTP transport
+kreuzberg mcp --transport http
+
+# Start MCP server on custom HTTP host/port
+kreuzberg mcp --transport http --host 0.0.0.0 --port 9000
+
+# Start MCP server with config file
+kreuzberg mcp --config kreuzberg.toml
+```
+
+## Configuration
+
+### File Format
+
+Configuration files support three formats with automatic detection:
+
+- **TOML** — `.toml` extension (recommended)
+- **YAML** — `.yaml` or `.yml` extension
+- **JSON** — `.json` extension
+
+### Configuration Precedence
+
+Settings are applied in order from highest to lowest priority:
+
+1. **Individual CLI flags** (e.g., `--ocr true`, `--content-format markdown`)
+2. **Inline JSON config** (`--config-json` or `--config-json-base64`)
+3. **Config file** (explicit `--config path.toml` or auto-discovered)
+4. **Default values** (built-in library defaults)
+
+### Auto-Discovery
+
+When no config file is specified, Kreuzberg searches for configuration in this order:
+
+1. `kreuzberg.toml` in current directory
+2. `kreuzberg.yaml` in current directory
+3. `kreuzberg.json` in current directory
+4. Parent directories (same search pattern, up to filesystem root)
+
+### Example Configuration
+
+```toml
+# Top-level extraction options
+use_cache = true
+enable_quality_processing = true
+force_ocr = false
+output_format = "markdown"
+
+# OCR settings
+[ocr]
+backend = "tesseract"
+language = "eng"
+
+# Chunking settings
+[chunking]
+max_chars = 2000
+max_overlap = 200
+
+# Language detection
+[language_detection]
+enabled = true
+
+# Server configuration (for serve command)
+[server]
+host = "127.0.0.1"
+port = 8000
+```
+
+## Exit Codes
+
+- `0` — Success
+- Non-zero — Error (see stderr for details)
+
+## Error Handling
+
+The CLI validates input and provides clear error messages:
+
+- **File not found** — Verify path exists and is readable
+- **Invalid MIME type** — Ensure file is accessible and format is supported
+- **Invalid JSON** — Check `--config-json` syntax
+- **Invalid config file** — Verify TOML/YAML/JSON format
+- **Invalid chunk parameters** — Ensure chunk-size > 0 and overlap < chunk-size
+
+## Environment Variables
+
+- `RUST_LOG` — Set logging level (e.g., `RUST_LOG=debug`)
+- `KREUZBERG_HOST` — Server bind host (used by `serve` command)
+- `KREUZBERG_PORT` — Server bind port (used by `serve` command)
+
+## Common Patterns
+
+### Extract with Custom Configuration
+
+```bash
+kreuzberg extract document.pdf \
+  --content-format markdown \
+  --ocr true \
+  --quality true
+```
+
+### Batch Process with Config File
+
+```bash
+kreuzberg batch *.pdf --config extraction-config.toml
+```
+
+### CI/CD Integration
+
+```bash
+# Extract to JSON for downstream processing
+kreuzberg extract file.pdf --format json | jq '.content'
+
+# Batch process with error handling
+kreuzberg batch docs/*.pdf --format json || exit 1
+```
+
+### Performance Tuning
+
+```bash
+# Disable cache for temporary processing
+kreuzberg extract file.pdf --no-cache true
+
+# Enable chunking for large documents
+kreuzberg extract large-file.pdf \
+  --chunk true \
+  --chunk-size 5000 \
+  --chunk-overlap 500
+```
+
+## Debugging
+
+Enable detailed logging:
+
+```bash
+RUST_LOG=debug kreuzberg extract document.pdf
+```
+
+Check cache statistics:
+
+```bash
+kreuzberg cache stats --format json
+```
+
+Detect file MIME type:
+
+```bash
+kreuzberg detect unknown-file --format json
+```
--- a/skills/kreuzberg/references/configuration.md
+++ b/skills/kreuzberg/references/configuration.md
@@ -0,0 +1,417 @@
+# Configuration Reference
+
+Kreuzberg uses a hierarchical configuration system supporting multiple formats and auto-discovery mechanisms. This reference covers all available configuration options, field names across programming languages, and loading strategies.
+
+## Supported Formats
+
+Kreuzberg configurations can be defined in three formats:
+
+- **TOML** (recommended): `kreuzberg.toml`
+- **YAML**: `kreuzberg.yaml`
+- **JSON**: `kreuzberg.json`
+
+All formats support the same schema and configuration options.
+
+## Auto-Discovery
+
+When no configuration file is explicitly specified, Kreuzberg searches for configuration files in the following order:
+
+1. Current working directory: `kreuzberg.toml`, `kreuzberg.yaml`, `kreuzberg.json`
+2. Parent directories (recursively up the tree, same file name pattern)
+
+The first matching configuration file is loaded.
+
+## Programmatic Loading
+
+### Python
+
+```python
+from kreuzberg import ExtractionConfig
+
+# Load from explicit path
+config = ExtractionConfig.from_file("kreuzberg.toml")
+
+# Auto-discover configuration
+config = ExtractionConfig.discover()
+```
+
+### Node.js / TypeScript
+
+```typescript
+import { ExtractionConfig } from "@kreuzberg/node";
+
+// Load from explicit path
+const config = ExtractionConfig.fromFile("kreuzberg.toml");
+
+// Auto-discover configuration
+const discoveredConfig = ExtractionConfig.discover();
+```
+
+### CLI
+
+```bash
+# Explicit configuration file
+kreuzberg extract --config kreuzberg.toml document.pdf
+
+# Auto-discovery (searches default locations)
+kreuzberg extract document.pdf
+```
+
+## Configuration Schema
+
+The complete TOML schema with all available sections and options:
+
+### Top-Level Options
+
+```toml
+use_cache = true
+enable_quality_processing = true
+force_ocr = false
+output_format = "markdown"
+result_format = "text"
+max_concurrent_extractions = 4
+```
+
+| Option                       | Type    | Default      | Description                                                                         |
+| ---------------------------- | ------- | ------------ | ----------------------------------------------------------------------------------- |
+| `use_cache`                  | boolean | `true`       | Enable caching of extraction results                                                |
+| `enable_quality_processing`  | boolean | `true`       | Enable post-processing for output quality                                           |
+| `force_ocr`                  | boolean | `false`      | Force OCR processing even for searchable PDFs                                       |
+| `disable_ocr`                | boolean | `false`      | Disable OCR entirely — image files return empty content instead of errors (v4.7.0+) |
+| `output_format`              | string  | `"markdown"` | Output format (markdown, html, text)                                                |
+| `result_format`              | string  | `"text"`     | Result format for structured output                                                 |
+| `max_concurrent_extractions` | integer | `4`          | Maximum concurrent document extractions                                             |
+
+### OCR Configuration
+
+```toml
+[ocr]
+backend = "tesseract"
+language = "eng"
+```
+
+| Option     | Type   | Default       | Description                                   |
+| ---------- | ------ | ------------- | --------------------------------------------- |
+| `backend`  | string | `"tesseract"` | OCR backend (tesseract, paddle-ocr, easyocr)  |
+| `language` | string | `"eng"`       | ISO 639-3 language code (eng, deu, fra, etc.) |
+
+#### Tesseract Configuration
+
+```toml
+[ocr.tesseract_config]
+psm = 3
+oem = 3
+min_confidence = 0.0
+output_format = "text"
+enable_table_detection = false
+table_min_confidence = 0.5
+table_column_threshold = 50
+table_row_threshold_ratio = 0.5
+use_cache = true
+```
+
+| Option                      | Type    | Default  | Description                                |
+| --------------------------- | ------- | -------- | ------------------------------------------ |
+| `psm`                       | integer | `3`      | Page Segmentation Mode (0-13)              |
+| `oem`                       | integer | `3`      | OCR Engine Mode (0-3)                      |
+| `min_confidence`            | float   | `0.0`    | Minimum OCR confidence threshold (0.0-1.0) |
+| `output_format`             | string  | `"text"` | Output format from OCR                     |
+| `enable_table_detection`    | boolean | `false`  | Enable table detection during OCR          |
+| `table_min_confidence`      | float   | `0.5`    | Minimum confidence for table cells         |
+| `table_column_threshold`    | integer | `50`     | Pixel threshold for column detection       |
+| `table_row_threshold_ratio` | float   | `0.5`    | Row height ratio threshold                 |
+| `use_cache`                 | boolean | `true`   | Cache OCR results                          |
+
+#### Tesseract Preprocessing
+
+```toml
+[ocr.tesseract_config.preprocessing]
+target_dpi = 300
+auto_rotate = true
+deskew = true
+denoise = true
+contrast_enhance = true
+binarization_method = "otsu"
+invert_colors = false
+```
+
+| Option                | Type    | Default  | Description                                    |
+| --------------------- | ------- | -------- | ---------------------------------------------- |
+| `target_dpi`          | integer | `300`    | Target DPI for preprocessing                   |
+| `auto_rotate`         | boolean | `true`   | Automatically detect and correct page rotation |
+| `deskew`              | boolean | `true`   | Correct skewed pages                           |
+| `denoise`             | boolean | `true`   | Remove noise from images                       |
+| `contrast_enhance`    | boolean | `true`   | Enhance image contrast                         |
+| `binarization_method` | string  | `"otsu"` | Method for image binarization                  |
+| `invert_colors`       | boolean | `false`  | Invert image colors if needed                  |
+
+### PDF Options
+
+```toml
+[pdf_options]
+extract_images = true
+extract_metadata = true
+
+[pdf_options.hierarchy]
+enabled = true
+k_clusters = 6
+include_bbox = true
+ocr_coverage_threshold = 0.5
+```
+
+| Option                             | Type    | Default | Description                                    |
+| ---------------------------------- | ------- | ------- | ---------------------------------------------- |
+| `extract_images`                   | boolean | `true`  | Extract images from PDF documents              |
+| `extract_metadata`                 | boolean | `true`  | Extract PDF metadata                           |
+| `hierarchy.enabled`                | boolean | `true`  | Enable PDF hierarchy extraction (v4.0.0+)      |
+| `hierarchy.k_clusters`             | integer | `6`     | Number of clusters for hierarchy detection     |
+| `hierarchy.include_bbox`           | boolean | `true`  | Include bounding boxes in hierarchy            |
+| `hierarchy.ocr_coverage_threshold` | float   | `0.5`   | OCR coverage threshold for hierarchy (0.0-1.0) |
+
+### Image Processing
+
+```toml
+[images]
+extract_images = true
+target_dpi = 300
+max_image_dimension = 4096
+auto_adjust_dpi = true
+min_dpi = 72
+max_dpi = 600
+```
+
+| Option                | Type    | Default | Description                                  |
+| --------------------- | ------- | ------- | -------------------------------------------- |
+| `extract_images`      | boolean | `true`  | Extract images from documents                |
+| `target_dpi`          | integer | `300`   | Target DPI for image processing              |
+| `max_image_dimension` | integer | `4096`  | Maximum image dimension in pixels            |
+| `auto_adjust_dpi`     | boolean | `true`  | Automatically adjust DPI based on image size |
+| `min_dpi`             | integer | `72`    | Minimum DPI threshold                        |
+| `max_dpi`             | integer | `600`   | Maximum DPI threshold                        |
+
+### Chunking Configuration
+
+```toml
+[chunking]
+max_chars = 1000
+max_overlap = 200
+
+[chunking.embedding]
+batch_size = 32
+normalize = true
+show_download_progress = true
+cache_dir = "~/.cache/kreuzberg/embeddings"
+
+[chunking.embedding.model]
+type = "preset"
+name = "balanced"
+```
+
+| Option                             | Type    | Default                           | Description                                                |
+| ---------------------------------- | ------- | --------------------------------- | ---------------------------------------------------------- |
+| `max_chars`                        | integer | `1000`                            | Maximum characters per chunk                               |
+| `max_overlap`                      | integer | `200`                             | Overlap between consecutive chunks                         |
+| `embedding.batch_size`             | integer | `32`                              | Batch size for embedding generation                        |
+| `embedding.normalize`              | boolean | `true`                            | Normalize embeddings to unit length                        |
+| `embedding.show_download_progress` | boolean | `true`                            | Show progress when downloading models                      |
+| `embedding.cache_dir`              | string  | `"~/.cache/kreuzberg/embeddings"` | Directory for caching embeddings                           |
+| `embedding.model.type`             | string  | `"preset"`                        | Model type: preset, fastembed, or custom                   |
+| `embedding.model.name`             | string  | `"balanced"`                      | Preset model name (balanced, fast, accurate, multilingual) |
+| `embedding.model.model`            | string  |                                   | FastEmbed model identifier                                 |
+| `embedding.model.model_id`         | string  |                                   | Custom HuggingFace model ID                                |
+| `embedding.model.dimensions`       | integer |                                   | Embedding dimensions                                       |
+
+### Keywords Configuration
+
+```toml
+[keywords]
+algorithm = "yake"
+max_keywords = 10
+min_score = 0.0
+ngram_range = [1, 3]
+language = "en"
+```
+
+| Option         | Type    | Default  | Description                                 |
+| -------------- | ------- | -------- | ------------------------------------------- |
+| `algorithm`    | string  | `"yake"` | Keyword extraction algorithm (yake or rake) |
+| `max_keywords` | integer | `10`     | Maximum keywords to extract                 |
+| `min_score`    | float   | `0.0`    | Minimum relevance score for keywords        |
+| `ngram_range`  | array   | `[1, 3]` | N-gram size range [min, max]                |
+| `language`     | string  | `"en"`   | Language code for keyword extraction        |
+
+### Token Reduction
+
+```toml
+[token_reduction]
+mode = "off"
+preserve_important_words = true
+```
+
+| Option                     | Type    | Default | Description                                     |
+| -------------------------- | ------- | ------- | ----------------------------------------------- |
+| `mode`                     | string  | `"off"` | Mode: off, light, moderate, aggressive, maximum |
+| `preserve_important_words` | boolean | `true`  | Preserve important words during reduction       |
+
+### Language Detection
+
+```toml
+[language_detection]
+enabled = true
+min_confidence = 0.8
+detect_multiple = false
+```
+
+| Option            | Type    | Default | Description                                |
+| ----------------- | ------- | ------- | ------------------------------------------ |
+| `enabled`         | boolean | `true`  | Enable automatic language detection        |
+| `min_confidence`  | float   | `0.8`   | Minimum confidence threshold for detection |
+| `detect_multiple` | boolean | `false` | Detect multiple languages in document      |
+
+### Post-Processor
+
+```toml
+[postprocessor]
+enabled = true
+```
+
+| Option    | Type    | Default | Description                                 |
+| --------- | ------- | ------- | ------------------------------------------- |
+| `enabled` | boolean | `true`  | Enable post-processing of extracted content |
+
+## FileExtractionConfig (Per-File Overrides)
+
+Passed as an optional parameter to `batch_extract_file` / `batch_extract_bytes` (and their sync variants) to override settings per file in a batch. All fields optional — `None` = use batch default. The separate `_with_configs` functions were removed in v4.5.0.
+
+**Overridable fields:** `enable_quality_processing`, `ocr`, `force_ocr`, `chunking`, `images`, `pdf_options`, `token_reduction`, `language_detection`, `pages`, `keywords`, `postprocessor`, `html_options`, `result_format`, `output_format`, `include_document_structure`, `layout`.
+
+**Batch-level only (not overridable):** `max_concurrent_extractions`, `use_cache`, `acceleration`, `security_limits`.
+
+**Merge semantics:** For each file, `FileExtractionConfig` fields are overlaid on the batch `ExtractionConfig`. `None` falls through to batch default; `Some(value)` replaces the batch default for that file.
+
+```toml
+# FileExtractionConfig cannot be specified in config files —
+# it is a programmatic API for per-file overrides at runtime.
+```
+
+## Naming Conventions
+
+Kreuzberg uses consistent naming conventions across different contexts:
+
+| Context              | Convention | Example                                       |
+| -------------------- | ---------- | --------------------------------------------- |
+| Python               | snake_case | `max_chars`, `pdf_options`, `use_cache`       |
+| Node.js / TypeScript | camelCase  | `maxChars`, `pdfOptions`, `useCache`          |
+| Rust                 | snake_case | `max_chars`, `pdf_options`, `use_cache`       |
+| TOML / YAML / JSON   | snake_case | `max_chars`, `pdf_options`, `use_cache`       |
+| CLI flags            | kebab-case | `--force-ocr`, `--disable-ocr`, `--no-cache`  |
+
+When switching between languages, apply the appropriate conversion:
+
+- Python → Node.js: `snake_case` to `camelCase`
+- CLI → Python: `kebab-case` to `snake_case`
+- TOML → Python: No conversion needed (both use `snake_case`)
+
+## Environment Variables
+
+The following environment variables can override configuration:
+
+| Variable         | Purpose                             | Example     |
+| ---------------- | ----------------------------------- | ----------- |
+| `KREUZBERG_HOST` | Server bind address (serve command) | `127.0.0.1` |
+| `KREUZBERG_PORT` | Server port (serve command)         | `8080`      |
+
+## Configuration Merging
+
+Configuration sources are merged in priority order (highest to lowest):
+
+1. **CLI flags** (highest priority)
+2. **Inline JSON configuration** (programmatic)
+3. **Configuration file** (lowest priority)
+
+Higher-priority sources override lower-priority ones. For example, the CLI flag `--chunk-size 2000` overrides `max_chars = 1000` set under `[chunking]` in the configuration file.
+
+## Example Configurations
+
+### Minimal Configuration
+
+```toml
+use_cache = true
+enable_quality_processing = true
+
+[ocr]
+backend = "tesseract"
+language = "eng"
+```
+
+### High-Quality PDF Extraction
+
+```toml
+use_cache = true
+enable_quality_processing = true
+force_ocr = false
+
+[ocr]
+backend = "tesseract"
+language = "eng"
+
+[ocr.tesseract_config]
+psm = 3
+oem = 3
+enable_table_detection = true
+table_min_confidence = 0.7
+
+[pdf_options]
+extract_images = true
+extract_metadata = true
+
+[pdf_options.hierarchy]
+enabled = true
+k_clusters = 6
+
+[images]
+extract_images = true
+target_dpi = 300
+```
+
+### Semantic Search Configuration
+
+```toml
+[chunking]
+max_chars = 800
+max_overlap = 150
+
+[chunking.embedding]
+batch_size = 32
+normalize = true
+cache_dir = "~/.cache/kreuzberg/embeddings"
+
+[chunking.embedding.model]
+type = "preset"
+name = "accurate"
+
+[keywords]
+algorithm = "yake"
+max_keywords = 15
+```
+
+## Field Name Reference
+
+Critical field names to use in configuration files:
+
+- `max_chars` (NOT `max_characters`)
+- `max_overlap` (NOT `overlap`)
+- `table_min_confidence`
+- `table_column_threshold`
+- `table_row_threshold_ratio`
+- `ocr_coverage_threshold`
+- `k_clusters`
+- `include_bbox`
+- `enable_table_detection`
+- `auto_rotate`
+- `auto_adjust_dpi`
+- `show_download_progress`
+- `min_confidence`
+- `detect_multiple`
+
+Always verify field names against the source configuration file when adding new options.
--- a/skills/kreuzberg/references/nodejs-api.md
+++ b/skills/kreuzberg/references/nodejs-api.md
--- a/skills/kreuzberg/references/other-bindings.md
+++ b/skills/kreuzberg/references/other-bindings.md
@@ -0,0 +1,212 @@
+# Language Bindings Reference
+
+Kreuzberg provides native bindings for multiple programming languages, each with precompiled binaries for x86_64 and aarch64 on Linux and macOS. This reference covers installation and basic usage for each binding.
+
+## Go
+
+**Installation:**
+
+```bash
+go get github.com/kreuzberg-dev/kreuzberg/packages/go/v5
+```
+
+**Basic Extraction:**
+
+```go
+package main
+
+import (
+    "context"
+    "fmt"
+    "github.com/kreuzberg-dev/kreuzberg/packages/go/v5/kreuzberg"
+)
+
+func main() {
+    ctx := context.Background()
+    result, err := kreuzberg.ExtractFile(ctx, "document.pdf", nil)
+    if err != nil {
+        panic(err)
+    }
+    fmt.Println(result.Content)
+}
+```
+
+See the [Go binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go) for complete API reference.
+
+## Ruby
+
+**Installation:**
+
+```bash
+gem install kreuzberg
+```
+
+Or in your Gemfile:
+
+```ruby
+gem 'kreuzberg'
+```
+
+**Basic Extraction:**
+
+```ruby
+require 'kreuzberg'
+
+result = Kreuzberg.extract_file_sync('document.pdf')
+puts result.content
+```
+
+See the [Ruby binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/ruby) for complete API reference.
+
+## Java
+
+**Installation:**
+Add to your Maven `pom.xml`:
+
+```xml
+<dependency>
+    <groupId>dev.kreuzberg</groupId>
+    <artifactId>kreuzberg</artifactId>
+    <version>4.2.x</version>
+</dependency>
+```
+
+**Basic Extraction:**
+
+```java
+import dev.kreuzberg.Kreuzberg;
+import dev.kreuzberg.ExtractionResult;
+
+public class Example {
+    public static void main(String[] args) throws Exception {
+        ExtractionResult result = Kreuzberg.extractFile("document.pdf");
+        System.out.println(result.getContent());
+    }
+}
+```
+
+See the [Java binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/java) for complete API reference.
+
+## C#
+
+**Installation:**
+
+```bash
+dotnet add package Kreuzberg
+```
+
+**Basic Extraction:**
+
+```csharp
+using Kreuzberg;
+
+var result = KreuzbergClient.ExtractFileSync("document.pdf");
+Console.WriteLine(result.Content);
+```
+
+See the [C# binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/csharp) for complete API reference.
+
+## PHP
+
+**Installation:**
+
+```bash
+composer require kreuzberg/kreuzberg
+```
+
+**Basic Extraction:**
+
+```php
+<?php
+require 'vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+
+$kreuzberg = new Kreuzberg();
+$result = $kreuzberg->extractFile('document.pdf');
+echo $result->content;
+```
+
+See the [PHP binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/php) for complete API reference.
+
+## Elixir
+
+**Installation:**
+Add to your `mix.exs` dependencies:
+
+```elixir
+def deps do
+  [
+    kreuzberg: "~> 4.2"
+  ]
+end
+```
+
+**Basic Extraction:**
+
+```elixir
+{:ok, result} = Kreuzberg.extract_file("document.pdf")
+IO.puts(result.content)
+```
+
+See the [Elixir binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/elixir) for complete API reference.
+
+## WebAssembly (WASM)
+
+**Installation:**
+
+```bash
+npm install @kreuzberg/wasm
+```
+
+**Basic Extraction:**
+
+```typescript
+import fs from "node:fs";
+import { extractBytes } from "@kreuzberg/wasm";
+
+const fileData = await fs.promises.readFile("document.pdf");
+const result = await extractBytes(fileData, "application/pdf");
+console.log(result.content);
+```
+
+Supports browsers, Deno, and Cloudflare Workers.
+
+See the [WASM binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/typescript) for complete API reference.
+
+## Docker
+
+**Installation:**
+Pull the official image from GitHub Container Registry:
+
+```bash
+docker pull ghcr.io/kreuzberg-dev/kreuzberg
+```
+
+**API Server Mode:**
+
+```bash
+docker run -p 8000:8000 ghcr.io/kreuzberg-dev/kreuzberg serve --host 0.0.0.0
+```
+
+**CLI Mode:**
+
+```bash
+docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg extract /data/document.pdf
+```
+
+**MCP Server Mode:**
+
+```bash
+docker run -i ghcr.io/kreuzberg-dev/kreuzberg mcp
+```
+
+Image sizes:
+
+- Core image: 1.0-1.3GB
+- Full image: ~1.0-1.3GB
+
+See the [Docker guide](https://docs.kreuzberg.dev/guides/docker/) for deployment details.
+
+## Platform Support
+
+All language bindings include precompiled binaries for x86_64 and aarch64 on Linux and macOS. Windows support varies by binding. Refer to the main [README](https://github.com/kreuzberg-dev/kreuzberg) for platform compatibility matrix.
--- a/skills/kreuzberg/references/python-api.md
+++ b/skills/kreuzberg/references/python-api.md
--- a/skills/kreuzberg/references/rust-api.md
+++ b/skills/kreuzberg/references/rust-api.md
@@ -0,0 +1,866 @@
+# Kreuzberg Rust API Reference
+
+Complete API reference for the Kreuzberg document extraction library in Rust.
+
+## Setup
+
+Add to your `Cargo.toml`:
+
+```toml
+[dependencies]
+kreuzberg = { version = "4", features = [
+    "tokio-runtime",
+    "pdf",
+    "ocr",
+    "chunking",
+    "embeddings",
+    "language-detection",
+    "keywords-yake",
+    "keywords-rake",
+    "api",
+    "mcp"
+] }
+tokio = { version = "1", features = ["full"] }
+```
+
+### Core Features
+
+- **tokio-runtime**: Enables async/sync extraction (default). Required for `extract_file_sync`, `batch_extract_file`, `batch_extract_bytes`, and `batch_extract_file_sync`
+- **pdf**: PDF extraction with PDFium
+- **ocr**: Tesseract-based OCR for scanned documents
+- **chunking**: Text chunking for RAG pipelines
+- **embeddings**: Vector embeddings generation
+- **language-detection**: Detect document language
+- **keywords-yake** / **keywords-rake**: Extract keywords using YAKE or RAKE
+- **api**: HTTP API with Axum
+- **mcp**: Model Context Protocol support
+
+---
+
+## Core Extraction Functions
+
+### `extract_file` (async)
+
+Extract content from a file path.
+
+```rust
+pub async fn extract_file(
+    path: impl AsRef<Path>,
+    mime_type: Option<&str>,
+    config: &ExtractionConfig,
+) -> Result<ExtractionResult>
+```
+
+**Always available.** Requires async context (`#[tokio::main]`, `tokio::spawn`, etc.).
+
+```rust
+use kreuzberg::{extract_file, ExtractionConfig};
+use std::path::Path;
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let result = extract_file("document.pdf", None, &config).await?;
+    println!("Content: {}", result.content);
+    Ok(())
+}
+```
+
+### `extract_bytes` (async)
+
+Extract content from byte data.
+
+```rust
+pub async fn extract_bytes(
+    data: &[u8],
+    mime_type: &str,
+    config: &ExtractionConfig,
+) -> Result<ExtractionResult>
+```
+
+**Always available.** Requires async context.
+
+```rust
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let pdf_bytes = std::fs::read("document.pdf")?;
+    let result = extract_bytes(&pdf_bytes, "application/pdf", &config).await?;
+    Ok(())
+}
+```
+
+### `extract_file_sync` (sync)
+
+Synchronous wrapper around `extract_file`.
+
+```rust
+pub fn extract_file_sync(
+    path: impl AsRef<Path>,
+    mime_type: Option<&str>,
+    config: &ExtractionConfig,
+) -> Result<ExtractionResult>
+```
+
+**Requires tokio-runtime feature.** Blocks the current thread using a global Tokio runtime.
+
+```rust
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let result = extract_file_sync("document.pdf", None, &config)?;
+    println!("Content: {}", result.content);
+    Ok(())
+}
+```
+
+### `extract_bytes_sync` (sync)
+
+Synchronous wrapper around `extract_bytes`.
+
+```rust
+pub fn extract_bytes_sync(
+    content: &[u8],
+    mime_type: &str,
+    config: &ExtractionConfig,
+) -> Result<ExtractionResult>
+```
+
+**Always available.** Works in sync and async contexts.
+
+```rust
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let bytes = b"Hello, world!";
+    let result = extract_bytes_sync(bytes, "text/plain", &config)?;
+    Ok(())
+}
+```
+
+### `batch_extract_file` (async, parallel)
+
+Extract multiple files concurrently.
+
+```rust
+pub async fn batch_extract_file(
+    paths: Vec<impl AsRef<Path>>,
+    config: &ExtractionConfig,
+) -> Result<Vec<ExtractionResult>>
+```
+
+**Requires tokio-runtime feature.** Processes files in parallel with automatic concurrency management. The concurrency limit defaults to `num_cpus * 1.5` and can be set with `ExtractionConfig::max_concurrent_extractions`.
+
+```rust
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let paths = vec!["doc1.pdf", "doc2.pdf", "doc3.pdf"];
+    let results = batch_extract_file(paths, &config).await?;
+    println!("Processed {} files", results.len());
+    Ok(())
+}
+```
+
+### `batch_extract_bytes` (async, parallel)
+
+Extract multiple byte arrays concurrently.
+
+```rust
+pub async fn batch_extract_bytes(
+    contents: Vec<(Vec<u8>, String)>,
+    config: &ExtractionConfig,
+) -> Result<Vec<ExtractionResult>>
+```
+
+**Requires tokio-runtime feature.** Each tuple is `(bytes, mime_type)`.
+
+```rust
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let contents = vec![
+        (b"PDF content".to_vec(), "application/pdf".to_string()),
+        (b"Text content".to_vec(), "text/plain".to_string()),
+    ];
+    let results = batch_extract_bytes(contents, &config).await?;
+    Ok(())
+}
+```
+
+### `batch_extract_file_sync` (sync, parallel)
+
+Synchronous wrapper for batch file extraction.
+
+```rust
+pub fn batch_extract_file_sync(
+    paths: Vec<impl AsRef<Path>>,
+    config: &ExtractionConfig,
+) -> Result<Vec<ExtractionResult>>
+```
+
+**Requires tokio-runtime feature.** Uses global runtime for concurrency.
+
+```rust
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let paths = vec!["doc1.pdf", "doc2.pdf"];
+    let results = batch_extract_file_sync(paths, &config)?;
+    Ok(())
+}
+```
+
+### `batch_extract_bytes_sync` (sync, parallel)
+
+Synchronous wrapper for batch byte extraction.
+
+```rust
+pub fn batch_extract_bytes_sync(
+    contents: Vec<(Vec<u8>, String)>,
+    config: &ExtractionConfig,
+) -> Result<Vec<ExtractionResult>>
+```
+
+**Always available.** Each tuple is `(bytes, mime_type)`.
+
+```rust
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let contents = vec![
+        (b"content 1".to_vec(), "text/plain".to_string()),
+        (b"content 2".to_vec(), "text/plain".to_string()),
+    ];
+    let results = batch_extract_bytes_sync(contents, &config)?;
+    Ok(())
+}
+```
+
+### `FileExtractionConfig`
+
+Per-file overrides for batch operations, passed as an optional parameter to `batch_extract_file` / `batch_extract_bytes` (and their sync variants). All fields are `Option<T>`; a value of `None` means the batch-level default is used for that field.
+
+> **Note (v4.5.0):** The separate `batch_extract_file_with_configs` / `batch_extract_bytes_with_configs` functions have been removed. Per-file configs are now an optional parameter on the unified batch functions.
+
+```rust
+pub struct FileExtractionConfig {
+    pub enable_quality_processing: Option<bool>,
+    pub ocr: Option<OcrConfig>,
+    pub force_ocr: Option<bool>,
+    pub chunking: Option<ChunkingConfig>,
+    pub images: Option<ImageExtractionConfig>,
+    pub pdf_options: Option<PdfConfig>,
+    pub token_reduction: Option<TokenReductionConfig>,
+    pub language_detection: Option<LanguageDetectionConfig>,
+    pub pages: Option<PageConfig>,
+    pub postprocessor: Option<PostProcessorConfig>,
+    pub output_format: Option<OutputFormat>,
+    pub include_document_structure: Option<bool>,
+}
+```
+
+The following batch-level fields are excluded and cannot be overridden per file: `max_concurrent_extractions`, `use_cache`, `acceleration`, `security_limits`.
+
+---
+
+## Configuration
+
+### `ExtractionConfig`
+
+Main configuration struct for all extraction operations.
+
+```rust
+pub struct ExtractionConfig {
+    /// Enable caching (default: true)
+    pub use_cache: bool,
+
+    /// Enable quality post-processing (default: true)
+    pub enable_quality_processing: bool,
+
+    /// OCR configuration (None = OCR disabled)
+    pub ocr: Option<OcrConfig>,
+
+    /// Force OCR even for searchable PDFs (default: false)
+    pub force_ocr: bool,
+
+    /// Text chunking configuration (None = disabled)
+    pub chunking: Option<ChunkingConfig>,
+
+    /// Image extraction configuration (None = disabled)
+    pub images: Option<ImageExtractionConfig>,
+
+    /// PDF-specific options (requires pdf feature)
+    #[cfg(feature = "pdf")]
+    pub pdf_options: Option<PdfConfig>,
+
+    /// Token reduction configuration (None = disabled)
+    pub token_reduction: Option<TokenReductionConfig>,
+
+    /// Language detection configuration (None = disabled)
+    pub language_detection: Option<LanguageDetectionConfig>,
+
+    /// Page extraction configuration (None = disabled)
+    pub pages: Option<PageConfig>,
+
+    /// Keyword extraction configuration (requires keywords-yake or keywords-rake)
+    #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
+    pub keywords: Option<KeywordConfig>,
+
+    /// Post-processor configuration (None = use defaults)
+    pub postprocessor: Option<PostProcessorConfig>,
+
+    /// HTML to Markdown conversion options (requires html feature)
+    #[cfg(feature = "html")]
+    pub html_options: Option<ConversionOptions>,
+
+    /// Maximum concurrent extractions in batch (None = num_cpus * 1.5)
+    pub max_concurrent_extractions: Option<usize>,
+
+    /// Result structure format (default: Unified)
+    /// Uses types::OutputFormat (Unified | ElementBased)
+    pub result_format: types::OutputFormat,
+
+    /// Security limits for archives (requires archives feature)
+    #[cfg(feature = "archives")]
+    pub security_limits: Option<SecurityLimits>,
+
+    /// Content output format (default: Plain)
+    /// Uses config::OutputFormat (Plain | Markdown | Djot | Html)
+    pub output_format: OutputFormat,
+}
+```
+
+#### Creating Configs
+
+```rust
+use kreuzberg::{ExtractionConfig, OcrConfig, ChunkingConfig, OutputFormat};
+
+// Default configuration
+let config = ExtractionConfig::default();
+
+// With OCR
+let config = ExtractionConfig {
+    ocr: Some(OcrConfig {
+        backend: "tesseract".to_string(),
+        ..Default::default()
+    }),
+    ..Default::default()
+};
+
+// With chunking
+let config = ExtractionConfig {
+    chunking: Some(ChunkingConfig {
+        max_characters: 512,
+        overlap: 50,
+        ..Default::default()
+    }),
+    output_format: OutputFormat::Markdown,
+    ..Default::default()
+};
+```
+
+---
+
+## Output Formats
+
+There are two separate enums both named `OutputFormat` in different modules:
+
+### Content `OutputFormat` (`core::config::formats::OutputFormat`)
+
+Controls the format of the `content` field text. Used by `ExtractionConfig::output_format`.
+
+```rust
+pub enum OutputFormat {
+    /// Plain text (default)
+    Plain,
+    /// Markdown formatted
+    Markdown,
+    /// Djot markup format
+    Djot,
+    /// HTML format
+    Html,
+}
+```
+
+### Result `OutputFormat` (`types::extraction::OutputFormat`)
+
+Controls the result structure. Used by `ExtractionConfig::result_format`.
+
+```rust
+pub enum OutputFormat {
+    /// Unified format with all content in `content` field (default)
+    Unified,
+    /// Element-based format with semantic element extraction
+    ElementBased,
+}
+```
+
+```rust
+use kreuzberg::{ExtractionConfig, OutputFormat};
+
+let config = ExtractionConfig {
+    output_format: OutputFormat::Markdown,  // content format (Plain/Markdown/Djot/Html)
+    // result_format uses types::OutputFormat (Unified/ElementBased) — defaults to Unified
+    ..Default::default()
+};
+```
+
+---
+
+## Extraction Result
+
+### `ExtractionResult`
+
+Result returned by all extraction functions.
+
+```rust
+pub struct ExtractionResult {
+    /// Main extracted content
+    pub content: String,
+
+    /// Document MIME type
+    pub mime_type: Cow<'static, str>,
+
+    /// Metadata about extraction
+    pub metadata: Metadata,
+
+    /// Extracted tables (HTML/Markdown)
+    pub tables: Vec<Table>,
+
+    /// Detected languages (if language-detection enabled)
+    pub detected_languages: Option<Vec<String>>,
+
+    /// Text chunks (if chunking enabled)
+    pub chunks: Option<Vec<Chunk>>,
+
+    /// Extracted images (if image extraction enabled)
+    pub images: Option<Vec<ExtractedImage>>,
+
+    /// Per-page content (if page extraction enabled)
+    pub pages: Option<Vec<PageContent>>,
+
+    /// Semantic elements (if element-based format enabled)
+    pub elements: Option<Vec<Element>>,
+
+    /// Djot document structure (if extracting Djot)
+    pub djot_content: Option<DjotContent>,
+
+    /// Extracted keywords with relevance scores (if keyword extraction enabled)
+    pub extracted_keywords: Option<Vec<ExtractedKeyword>>,
+
+    /// Quality score for extraction result (0.0-1.0)
+    pub quality_score: Option<f64>,
+
+    /// Non-fatal warnings during processing pipeline
+    pub processing_warnings: Vec<ProcessingWarning>,
+}
+```
+
+### `ExtractedKeyword`
+
+Extracted keyword with relevance score and position information.
+
+```rust
+pub struct ExtractedKeyword {
+    /// Keyword text
+    pub text: String,
+
+    /// Relevance score (0.0-1.0)
+    pub score: f32,
+
+    /// Algorithm used for extraction ("tfidf", "textrank", "yake", etc.)
+    pub algorithm: String,
+
+    /// Character positions in content (if available)
+    pub positions: Option<Vec<usize>>,
+}
+```
+
+### `ProcessingWarning`
+
+Non-fatal warning encountered during document processing.
+
+```rust
+pub struct ProcessingWarning {
+    /// Component that generated the warning
+    pub source: String,
+
+    /// Warning message describing the issue
+    pub message: String,
+}
+```
+
+### `Chunk`
+
+Text chunk with optional embedding.
+
+```rust
+pub struct Chunk {
+    /// Chunk text content
+    pub content: String,
+
+    /// Optional embedding vector
+    pub embedding: Option<Vec<f32>>,
+
+    /// Chunk metadata
+    pub metadata: ChunkMetadata,
+}
+
+pub struct ChunkMetadata {
+    pub byte_start: usize,
+    pub byte_end: usize,
+    pub token_count: Option<usize>,
+    pub chunk_index: usize,
+    pub total_chunks: usize,
+    pub first_page: Option<usize>,
+    pub last_page: Option<usize>,
+}
+```
+
+### `ExtractedImage`
+
+Image extracted from document.
+
+```rust
+pub struct ExtractedImage {
+    /// Raw image bytes
+    pub data: Bytes,
+
+    /// Format: "jpeg", "png", "webp", etc.
+    pub format: Cow<'static, str>,
+
+    /// Zero-indexed position
+    pub image_index: usize,
+
+    /// Page number (1-indexed)
+    pub page_number: Option<usize>,
+
+    /// Image dimensions
+    pub width: Option<u32>,
+    pub height: Option<u32>,
+
+    /// Colorspace: "RGB", "CMYK", "Gray"
+    pub colorspace: Option<String>,
+
+    /// Bits per component
+    pub bits_per_component: Option<u32>,
+
+    /// Whether this is a mask image
+    pub is_mask: bool,
+
+    /// Image description
+    pub description: Option<String>,
+
+    /// Nested OCR result (if OCRed)
+    pub ocr_result: Option<Box<ExtractionResult>>,
+}
+```
+
+---
+
+## Error Handling
+
+### `KreuzbergError` enum
+
+```rust
+pub enum KreuzbergError {
+    /// File system errors (always bubble up)
+    Io(std::io::Error),
+
+    /// Document parsing errors
+    Parsing {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+
+    /// OCR processing errors
+    Ocr {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+
+    /// Configuration/input validation errors
+    Validation {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+
+    /// Cache operation errors
+    Cache {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+
+    /// Image processing errors
+    ImageProcessing {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+
+    /// Serialization errors (JSON, MessagePack)
+    Serialization {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+
+    /// Missing system dependency (e.g. Tesseract)
+    MissingDependency(String),
+
+    /// Plugin-specific errors
+    Plugin {
+        message: String,
+        plugin_name: String,
+    },
+
+    /// Mutex/RwLock poisoning
+    LockPoisoned(String),
+
+    /// Unsupported MIME type or format
+    UnsupportedFormat(String),
+
+    /// Other errors
+    Other(String),
+}
+```
+
+#### Error Constructors
+
+```rust
+use kreuzberg::KreuzbergError;
+
+// Create errors
+let err = KreuzbergError::parsing("invalid PDF");
+let err = KreuzbergError::ocr("Tesseract failed");
+let err = KreuzbergError::validation("config invalid");
+let err = KreuzbergError::unsupported_format("application/unknown");
+let err = KreuzbergError::missing_dependency("tesseract");
+
+// With source
+let source = std::io::Error::new(std::io::ErrorKind::NotFound, "file missing");
+let err = KreuzbergError::parsing_with_source("corrupt PDF", source);
+```
+
+#### Handling Errors
+
+```rust
+use kreuzberg::extract_file;
+
+match extract_file("doc.pdf", None, &config).await {
+    Ok(result) => println!("Success: {}", result.content),
+    Err(kreuzberg::KreuzbergError::Io(e)) => {
+        println!("File error: {}", e);
+    }
+    Err(kreuzberg::KreuzbergError::UnsupportedFormat(fmt)) => {
+        println!("Unsupported: {}", fmt);
+    }
+    Err(e) => println!("Other error: {}", e),
+}
+```
+
+---
+
+## MIME Type Detection
+
+### `detect_mime_type`
+
+Detect MIME type from file path.
+
+```rust
+pub fn detect_mime_type(path: impl AsRef<Path>) -> Result<String>
+```
+
+```rust
+use kreuzberg::detect_mime_type;
+
+let mime = detect_mime_type("document.pdf")?;
+assert_eq!(mime, "application/pdf");
+```
+
+### `detect_mime_type_from_bytes`
+
+Detect MIME type from byte data.
+
+```rust
+pub fn detect_mime_type_from_bytes(data: &[u8]) -> Result<String>
+```
+
+### `validate_mime_type`
+
+Check if a MIME type is supported.
+
+```rust
+pub fn validate_mime_type(mime_type: &str) -> Result<()>
+```
+
+```rust
+use kreuzberg::validate_mime_type;
+
+validate_mime_type("application/pdf")?;  // OK
+validate_mime_type("application/unknown")?;  // Error
+```
+
+### `get_extensions_for_mime`
+
+Get file extensions for a MIME type.
+
+```rust
+pub fn get_extensions_for_mime(mime_type: &str) -> Vec<String>
+```
+
+```rust
+use kreuzberg::get_extensions_for_mime;
+
+let exts = get_extensions_for_mime("application/pdf");
+// ["pdf"]
+
+let exts = get_extensions_for_mime("text/plain");
+// ["txt", "text"]
+```
+
+### MIME Type Constants
+
+```rust
+use kreuzberg::{
+    PDF_MIME_TYPE,
+    PLAIN_TEXT_MIME_TYPE,
+    HTML_MIME_TYPE,
+    MARKDOWN_MIME_TYPE,
+    JSON_MIME_TYPE,
+    XML_MIME_TYPE,
+    DOCX_MIME_TYPE,
+    POWER_POINT_MIME_TYPE,
+    EXCEL_MIME_TYPE,
+};
+
+assert_eq!(PDF_MIME_TYPE, "application/pdf");
+assert_eq!(PLAIN_TEXT_MIME_TYPE, "text/plain");
+```
+
+---
+
+## Plugin Registry
+
+Access the registries for document extractors, OCR backends, post-processors, and validators.
+
+### `get_document_extractor_registry`
+
+Get a shared handle to the registry of available document extractors.
+
+```rust
+pub fn get_document_extractor_registry() -> Arc<RwLock<DocumentExtractorRegistry>>
+```
+
+### `get_ocr_backend_registry`
+
+Get a shared handle to the registry of available OCR backends.
+
+```rust
+pub fn get_ocr_backend_registry() -> Arc<RwLock<OcrBackendRegistry>>
+```
+
+### `get_post_processor_registry`
+
+Get a shared handle to the registry of available post-processors.
+
+```rust
+pub fn get_post_processor_registry() -> Arc<RwLock<PostProcessorRegistry>>
+```
+
+### `get_validator_registry`
+
+Get a shared handle to the registry of available validators.
+
+```rust
+pub fn get_validator_registry() -> Arc<RwLock<ValidatorRegistry>>
+```
+
+---
+
+## Complete Example
+
+```rust
+use kreuzberg::{
+    extract_file, ExtractionConfig, OutputFormat,
+    ChunkingConfig, OcrConfig, LanguageDetectionConfig,
+};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    // Configure extraction
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        chunking: Some(ChunkingConfig {
+            max_characters: 512,
+            overlap: 50,
+            ..Default::default()
+        }),
+        language_detection: Some(LanguageDetectionConfig::default()),
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            ..Default::default()
+        }),
+        force_ocr: false,
+        ..Default::default()
+    };
+
+    // Extract from file
+    let result = extract_file("document.pdf", None, &config).await?;
+
+    // Use results
+    println!("Content:\n{}", result.content);
+    println!("MIME: {}", result.mime_type);
+
+    if let Some(langs) = result.detected_languages {
+        println!("Languages: {:?}", langs);
+    }
+
+    if let Some(chunks) = result.chunks {
+        println!("Chunks: {}", chunks.len());
+        for chunk in chunks {
+            println!("  - {}", &chunk.content[..50.min(chunk.content.len())]);
+        }
+    }
+
+    if let Some(images) = result.images {
+        println!("Images: {}", images.len());
+    }
+
+    if let Some(pages) = result.pages {
+        println!("Pages: {}", pages.len());
+    }
+
+    Ok(())
+}
+```
+
+---
+
+## Result Type Alias
+
+```rust
+pub type Result<T> = std::result::Result<T, KreuzbergError>;
+```
+
+All fallible operations return `Result<T>` where errors are `KreuzbergError`.
+
+---
+
+## Feature Flags Summary
+
+| Feature            | Availability | Dependencies                                   |
+| ------------------ | ------------ | ---------------------------------------------- |
+| tokio-runtime      | Default      | Tokio runtime for async/sync                   |
+| pdf                | Default      | PDFium                                         |
+| ocr                | Optional     | Tesseract                                      |
+| chunking           | Optional     | text-splitter                                  |
+| embeddings         | Optional     | FastEmbed, requires tokio-runtime              |
+| language-detection | Optional     | whatlang                                       |
+| keywords-yake      | Optional     | yake-rust                                      |
+| keywords-rake      | Optional     | rake                                           |
+| api                | Optional     | Axum, requires tokio-runtime                   |
+| mcp                | Optional     | Model Context Protocol, requires tokio-runtime |
+
+---
+
+## Version
+
+This reference is for Kreuzberg 4.x.
--- a/skills/kreuzberg/references/supported-formats.md
+++ b/skills/kreuzberg/references/supported-formats.md
@@ -0,0 +1,223 @@
+# Supported Formats Reference
+
+Kreuzberg supports 91+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction. All formats support text and metadata extraction. Additional capabilities like OCR and table detection are noted per format.
+
+## Office Documents
+
+### Word Processing
+
+| Format             | Extensions               | MIME Type                                                                 | Capabilities                                                    |
+| ------------------ | ------------------------ | ------------------------------------------------------------------------- | --------------------------------------------------------------- |
+| Microsoft Word     | `.docx`                  | `application/vnd.openxmlformats-officedocument.wordprocessingml.document` | Full text extraction, tables, embedded images, metadata, styles |
+| Word Macro-Enabled | `.docm`                  | `application/vnd.ms-word.document.macroEnabled.12`                        | Macro-enabled document extraction, metadata                     |
+| Word Template      | `.dotx`, `.dotm`, `.dot` | Various Word template MIME types                                          | Template document extraction, metadata                          |
+| OpenDocument Text  | `.odt`                   | `application/vnd.oasis.opendocument.text`                                 | Full text extraction, tables, embedded images, metadata, styles |
+
+### Spreadsheets
+
+| Format                   | Extensions | MIME Type                                                              | Capabilities                                             |
+| ------------------------ | ---------- | ---------------------------------------------------------------------- | -------------------------------------------------------- |
+| Excel Workbook           | `.xlsx`    | `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet`    | Sheet data, cell values, formulas, cell metadata, charts |
+| Excel Macro-Enabled      | `.xlsm`    | `application/vnd.ms-excel.sheet.macroEnabled.12`                       | Sheet data, formulas, macros (text only), metadata       |
+| Excel Binary             | `.xlsb`    | `application/vnd.ms-excel.sheet.binary.macroEnabled.12`                | Binary sheet data extraction, metadata                   |
+| Excel Legacy             | `.xls`     | `application/vnd.ms-excel`                                             | Legacy sheet data extraction, metadata                   |
+| Excel Add-in             | `.xla`     | `application/vnd.ms-excel`                                             | Add-in data extraction                                   |
+| Excel Macro Add-in       | `.xlam`    | `application/vnd.ms-excel.addin.macroEnabled.12`                       | Macro add-in metadata                                    |
+| Excel Template           | `.xltm`    | `application/vnd.ms-excel.template.macroEnabled.12`                    | Template data and metadata                               |
+| Excel Template (XML)     | `.xltx`    | `application/vnd.openxmlformats-officedocument.spreadsheetml.template` | XML template data and metadata                           |
+| Excel Template (Legacy)  | `.xlt`     | `application/vnd.ms-excel`                                             | Legacy template data extraction                          |
+| OpenDocument Spreadsheet | `.ods`     | `application/vnd.oasis.opendocument.spreadsheet`                       | Sheet data, cell values, formulas, metadata              |
+
+### Presentations
+
+| Format                  | Extensions               | MIME Type                                                                   | Capabilities                                         |
+| ----------------------- | ------------------------ | --------------------------------------------------------------------------- | ---------------------------------------------------- |
+| PowerPoint Presentation | `.pptx`                  | `application/vnd.openxmlformats-officedocument.presentationml.presentation` | Slide text, speaker notes, embedded images, metadata |
+| PowerPoint Legacy       | `.ppt`                   | `application/vnd.ms-powerpoint`                                             | Legacy slide text extraction, metadata               |
+| PowerPoint Slideshow    | `.ppsx`                  | `application/vnd.openxmlformats-officedocument.presentationml.slideshow`    | Slideshow content, speaker notes, metadata           |
+| PowerPoint Template     | `.potx`, `.potm`, `.pot` | Various PowerPoint template MIME types                                      | Template slide extraction, metadata                  |
+
+### PDF
+
+| Format                   | Extensions | MIME Type         | Capabilities                                                                                       |
+| ------------------------ | ---------- | ----------------- | -------------------------------------------------------------------------------------------------- |
+| Portable Document Format | `.pdf`     | `application/pdf` | Text extraction, tables, embedded images, metadata, OCR (when needed), password protection support |
+
+### eBooks
+
+| Format      | Extensions | MIME Type                       | Capabilities                                           |
+| ----------- | ---------- | ------------------------------- | ------------------------------------------------------ |
+| EPUB        | `.epub`    | `application/epub+zip`          | Chapter text, metadata, embedded resources, navigation |
+| FictionBook | `.fb2`     | `application/x-fictionbook+xml` | Book content, metadata, chapter structure              |
+
+### Database
+
+| Format | Extensions | MIME Type           | Capabilities                                          |
+| ------ | ---------- | ------------------- | ----------------------------------------------------- |
+| dBASE  | `.dbf`     | `application/x-dbf` | Table data extraction as markdown, field type support |
+
+### Hangul
+
+| Format                | Extensions      | MIME Type                                       | Capabilities                            |
+| --------------------- | --------------- | ----------------------------------------------- | --------------------------------------- |
+| Hangul Word Processor | `.hwp`, `.hwpx` | `application/x-hwp`, `application/haansofthwpx` | Korean document format, text extraction |
+
+## Images (OCR-Enabled)
+
+### Raster Images
+
+| Format | Extensions      | MIME Type    | Capabilities                                                                 |
+| ------ | --------------- | ------------ | ---------------------------------------------------------------------------- |
+| PNG    | `.png`          | `image/png`  | OCR text extraction, table detection, EXIF metadata, dimensions, color space |
+| JPEG   | `.jpg`, `.jpeg` | `image/jpeg` | OCR text extraction, table detection, EXIF metadata, color profile           |
+| GIF    | `.gif`          | `image/gif`  | OCR text extraction, animation metadata, dimensions                          |
+| WebP   | `.webp`         | `image/webp` | OCR text extraction, metadata, lossy/lossless detection                      |
+| Bitmap | `.bmp`          | `image/bmp`  | OCR text extraction, dimensions, color depth                                 |
+| TIFF   | `.tiff`, `.tif` | `image/tiff` | OCR text extraction, multi-page support, EXIF metadata, compression info     |
+
+### Advanced Image Formats
+
+| Format             | Extensions                     | MIME Type                 | Capabilities                                                                     |
+| ------------------ | ------------------------------ | ------------------------- | -------------------------------------------------------------------------------- |
+| JPEG 2000          | `.jp2`                         | `image/jp2`               | OCR via pure Rust decoder (hayro-jpeg2000), table detection, resolution metadata |
+| JPEG 2000 Extended | `.jpx`                         | `image/jpx`               | Advanced JPEG 2000 features, high-resolution content, metadata                   |
+| JPEG 2000 Compound | `.jpm`                         | `image/jpm`               | Compound image support, mixed content                                            |
+| Motion JPEG 2000   | `.mj2`                         | `video/mj2`               | JPEG 2000 video/sequence metadata                                                |
+| JBIG2              | `.jbig2`, `.jb2`               | `image/jbig2`             | Bi-level image OCR, high compression, technical documents                        |
+| Portable PixMap    | `.pnm`, `.pbm`, `.pgm`, `.ppm` | `image/x-portable-pixmap` | OCR for plain image formats, raw pixel data                                      |
+
+### Vector Graphics
+
+| Format                   | Extensions | MIME Type       | Capabilities                                                              |
+| ------------------------ | ---------- | --------------- | ------------------------------------------------------------------------- |
+| Scalable Vector Graphics | `.svg`     | `image/svg+xml` | DOM parsing, embedded text extraction, graphics metadata, vector elements |
+
+## Web & Data
+
+### Markup & Structured Text
+
+| Format           | Extensions      | MIME Type               | Capabilities                                                                       |
+| ---------------- | --------------- | ----------------------- | ---------------------------------------------------------------------------------- |
+| HyperText Markup | `.html`, `.htm` | `text/html`             | DOM parsing, text extraction, metadata (Open Graph, Twitter Card), link extraction |
+| XHTML            | `.xhtml`        | `application/xhtml+xml` | XHTML parsing, metadata extraction, semantic structure                             |
+| XML              | `.xml`          | `application/xml`       | DOM parsing, namespace handling, text extraction, structure analysis               |
+
+### Structured Data Formats
+
+| Format | Extensions      | MIME Type                   | Capabilities                                               |
+| ------ | --------------- | --------------------------- | ---------------------------------------------------------- |
+| JSON   | `.json`         | `application/json`          | Schema detection, nested structure parsing, validation     |
+| YAML   | `.yaml`, `.yml` | `application/x-yaml`        | Hierarchical data parsing, custom tags, nested structures  |
+| TOML   | `.toml`         | `application/toml`          | Configuration parsing, table structures, type preservation |
+| CSV    | `.csv`          | `text/csv`                  | Delimiter detection, header inference, type detection      |
+| TSV    | `.tsv`          | `text/tab-separated-values` | Tab-separated value parsing, header detection              |
+
+### Text & Markup Languages
+
+| Format           | Extensions         | MIME Type         | Capabilities                                      |
+| ---------------- | ------------------ | ----------------- | ------------------------------------------------- |
+| Plain Text       | `.txt`             | `text/plain`      | Raw text extraction, encoding detection           |
+| Markdown         | `.md`, `.markdown` | `text/markdown`   | CommonMark parsing, GFM extensions, front matter  |
+| Djot             | `.djot`            | `text/djot`       | Djot format parsing, semantic structure           |
+| reStructuredText | `.rst`             | `text/x-rst`      | RST parsing, directive handling, role extraction  |
+| Org Mode         | `.org`             | `text/org`        | Org mode structure, outline parsing, metadata     |
+| Rich Text Format | `.rtf`             | `application/rtf` | Text with formatting extraction, font information |
+
+## Email & Archives
+
+### Email Formats
+
+| Format            | Extensions | MIME Type                    | Capabilities                                                                           |
+| ----------------- | ---------- | ---------------------------- | -------------------------------------------------------------------------------------- |
+| Email Message     | `.eml`     | `message/rfc822`             | Headers (from, to, subject, date), body (HTML/plain text), attachments, threading info |
+| Microsoft Outlook | `.msg`     | `application/vnd.ms-outlook` | Outlook headers, body content, attachments, recipient metadata                         |
+
+### Archive Formats
+
+| Format      | Extensions | MIME Type                     | Capabilities                                               |
+| ----------- | ---------- | ----------------------------- | ---------------------------------------------------------- |
+| ZIP Archive | `.zip`     | `application/zip`             | File listing, nested archive support, compression metadata |
+| Tar Archive | `.tar`     | `application/x-tar`           | File listing, permission metadata, nested archives         |
+| Gzip Tar    | `.tgz`     | `application/gzip`            | Compressed archive listing, metadata                       |
+| Gzip        | `.gz`      | `application/gzip`            | Compressed file metadata                                   |
+| 7-Zip       | `.7z`      | `application/x-7z-compressed` | File listing, compression info, nested archives            |
+
+## Academic & Scientific
+
+### Citation Formats
+
+| Format                  | Extensions  | MIME Type                                | Capabilities                                      |
+| ----------------------- | ----------- | ---------------------------------------- | ------------------------------------------------- |
+| BibTeX                  | `.bib`      | `text/bibtex`                            | Structured parsing, entry types, field extraction |
+| BibLaTeX                | `.biblatex` | `text/bibtex`                            | Extended BibTeX format, advanced field support    |
+| RIS                     | `.ris`      | `application/x-research-info-systems`    | Structured RIS format parsing, type detection     |
+| NIH RIS                 | `.nbib`     | `application/x-research-info-systems`    | NIH/PubMed format, structured citation data       |
+| EndNote                 | `.enw`      | `application/x-endnote`                  | EndNote XML format, citation metadata             |
+| Citation Style Language | `.csl`      | `application/vnd.citationstyles.csl+xml` | CSL JSON/XML parsing, style definitions           |
+
+### Scientific & Technical Formats
+
+| Format           | Extensions       | MIME Type                  | Capabilities                                                |
+| ---------------- | ---------------- | -------------------------- | ----------------------------------------------------------- |
+| LaTeX            | `.tex`, `.latex` | `application/x-latex`      | LaTeX source parsing, commands, document structure          |
+| Typst            | `.typ`           | `text/plain`               | Typst markup parsing, document structure                    |
+| JATS XML         | `.jats`          | `application/xml`          | PubMed JATS parsing, article structure, metadata            |
+| Jupyter Notebook | `.ipynb`         | `application/x-ipynb+json` | Cell extraction (code + markdown), output parsing, metadata |
+| DocBook          | `.docbook`       | `application/docbook+xml`  | DocBook XML parsing, semantic structure                     |
+
+### Documentation Formats
+
+| Format      | Extensions | MIME Type                | Capabilities                                    |
+| ----------- | ---------- | ------------------------ | ----------------------------------------------- |
+| OPML        | `.opml`    | `application/x-opml+xml` | Outline parsing, hierarchy extraction, metadata |
+| Perl POD    | `.pod`     | `text/x-pod`             | Perl documentation parsing, section extraction  |
+| Manual Page | `.mdoc`    | `text/plain`             | UNIX manual page parsing, section structure     |
+| Troff/Groff | `.troff`   | `text/troff`             | Typesetting markup parsing, document structure  |
+
+## Format Capabilities Summary
+
+### Text Extraction
+
+All 91+ supported formats provide full or partial text extraction. Document structure and encoding are automatically detected.
+
+### Metadata Support
+
+Comprehensive metadata extraction includes:
+
+- Document properties (title, author, subject, creation date, modification date)
+- Format-specific metadata (page count, dimensions, encoding, language)
+- EXIF data (for images)
+- Document statistics (word count, character count)
+
+### OCR (Optical Character Recognition)
+
+OCR is available for image formats (and for PDFs when needed, e.g. scanned pages):
+
+- **Raster Images**: PNG, JPEG, GIF, WebP, BMP, TIFF
+- **Advanced Formats**: JPEG 2000, JBIG2, PNM/PBM/PGM/PPM
+- **Configurable Backends**: Tesseract (all language bindings), EasyOCR, PaddleOCR (Python), Guten (Node.js)
+
+### Table Detection
+
+Smart table detection and reconstruction available for:
+
+- PDF documents (native tables and scanned content with OCR)
+- Office documents (Excel, Word)
+- Images (via OCR backends)
+- HTML/XML (from markup structure)
+
+### Archive & Nested Document Support
+
+Archives and nested formats support file listing and sequential extraction:
+
+- ZIP, TAR, TGZ, 7Z archives
+- Email attachments
+- Nested archives within archives
+
+## Getting Started
+
+For language-specific examples and detailed API documentation, see the [API Reference](https://docs.kreuzberg.dev/reference/api-python/).
+
+For OCR configuration and backend selection, see the [OCR Backends Guide](https://docs.kreuzberg.dev/guides/ocr/).
+
+For comprehensive format details and format detection, see the [Complete Format Reference](https://docs.kreuzberg.dev/reference/formats/).