# Advanced Features Reference Kreuzberg provides powerful advanced features for customization, semantic processing, and integration with external systems. ## Plugin System The plugin system allows you to extend Kreuzberg's extraction pipeline with custom post-processors, validators, and OCR backends. Plugins run within the extraction pipeline and have direct access to extraction results. ### Custom Post-Processors Post-processors enrich extraction results after document parsing. They run non-destructively—if a post-processor fails, the extraction succeeds anyway (errors are logged). === "Python" ```python from kreuzberg import register_post_processor, ExtractionResult class MetadataEnricher: def name(self) -> str: return "metadata_enricher" def process(self, result: ExtractionResult) -> ExtractionResult: result.metadata["processed_by"] = "metadata_enricher" result.metadata["char_count"] = len(result.content) return result def processing_stage(self) -> str: # "early", "middle", or "late" return "middle" def initialize(self) -> None: print("Initializing metadata enricher") def shutdown(self) -> None: print("Shutting down metadata enricher") register_post_processor(MetadataEnricher()) # Now use extraction with the registered processor from kreuzberg import extract_file_sync result = extract_file_sync("document.pdf") print(result.metadata["char_count"]) ``` === "TypeScript" ```typescript import { registerPostProcessor, ExtractionResult } from '@kreuzberg/node'; const enricher = { name(): string { return "metadata_enricher"; }, async process(result: ExtractionResult): Promise<ExtractionResult> { result.metadata.processed_by = "metadata_enricher"; result.metadata.char_count = result.content.length; return result; }, processingStage?(): "early" | "middle" | "late" { return "middle"; }, async initialize?(): Promise<void> { console.log("Initializing metadata enricher"); }, async shutdown?(): Promise<void> { console.log("Shutting down metadata enricher"); } }; registerPostProcessor(enricher); // Now use 
extraction with the registered processor const result = await extractFile("document.pdf"); console.log(result.metadata.char_count); ``` ### Custom Validators Validators perform quality checks on extraction results. Unlike post-processors, validator failures cause the entire extraction to fail. Use validators to enforce quality standards. === "Python" ```python from kreuzberg import register_validator, ExtractionResult, ValidationError class MinimumContentValidator: def name(self) -> str: return "min_content_validator" def validate(self, result: ExtractionResult) -> None: if len(result.content) < 100: raise ValidationError("Extracted content too short (< 100 chars)") def priority(self) -> int: # Higher priority runs first (0-1000, default 50) return 100 def should_validate(self, result: ExtractionResult) -> bool: # Only validate PDFs return "pdf" in result.mime_type.lower() def initialize(self) -> None: pass def shutdown(self) -> None: pass register_validator(MinimumContentValidator()) # Extraction will fail if content < 100 chars result = extract_file_sync("document.pdf") ``` === "TypeScript" ```typescript import { registerValidator, ExtractionResult } from '@kreuzberg/node'; const validator = { name(): string { return "min_content_validator"; }, async validate(result: ExtractionResult): Promise<void> { if (result.content.length < 100) { throw new Error("Extracted content too short (< 100 chars)"); } }, priority?(): number { return 100; }, shouldValidate?(result: ExtractionResult): boolean { return result.mimeType.toLowerCase().includes("pdf"); }, async initialize?(): Promise<void> {}, async shutdown?(): Promise<void> {} }; registerValidator(validator); // Extraction will fail if content < 100 chars const result = await extractFile("document.pdf"); ``` ### Custom OCR Backends Implement custom OCR engines by registering an OCR backend. This allows integration with proprietary or specialized OCR solutions. 
=== "Python" ```python from kreuzberg import register_ocr_backend class CustomOcrBackend: def name(self) -> str: return "custom_ocr" def supported_languages(self) -> list[str]: return ["eng", "deu", "fra", "spa"] def process_image(self, image_bytes: bytes, language: str) -> dict: # image_bytes: raw image data # language: ISO 639-3 code (e.g., "eng", "deu") # Call your OCR engine here # text = my_ocr_engine.recognize(image_bytes, language) return { "content": "Extracted text from image", "metadata": {"confidence": 0.95, "language": language}, "tables": [] } def process_file(self, path: str, language: str) -> dict: # Optional: custom file processing # Called when extracting OCR from a file path with open(path, "rb") as f: image_bytes = f.read() return self.process_image(image_bytes, language) def initialize(self) -> None: # Load models, initialize engine pass def shutdown(self) -> None: # Clean up resources pass def version(self) -> str: return "1.0.0" register_ocr_backend(CustomOcrBackend()) # Use in extraction config from kreuzberg import ExtractionConfig, OcrConfig, extract_file_sync config = ExtractionConfig( ocr=OcrConfig(backend="custom_ocr", language="eng") ) result = extract_file_sync("scanned.pdf", config=config) ``` === "TypeScript" ```typescript import { registerOcrBackend, ExtractionConfig, extractFile } from '@kreuzberg/node'; const backend = { name(): string { return "custom_ocr"; }, supportedLanguages(): string[] { return ["eng", "deu", "fra", "spa"]; }, async processImage( imageBytes: Uint8Array | string, language: string ): Promise<{ content: string; mime_type: string; metadata: Record<string, unknown>; tables: unknown[]; }> { const buffer = typeof imageBytes === "string" ? 
Buffer.from(imageBytes, "base64") : Buffer.from(imageBytes); // Call your OCR engine // const text = await myOcrEngine.recognize(buffer, language); return { content: "Extracted text from image", mime_type: "text/plain", metadata: { confidence: 0.95, language }, tables: [] }; }, async initialize?(): Promise<void> { // Load models, initialize engine }, async shutdown?(): Promise<void> { // Clean up resources } }; registerOcrBackend(backend); // Use in extraction config const config: ExtractionConfig = { ocr: { backend: "custom_ocr", language: "eng" } }; const result = await extractFile("scanned.pdf", null, config); ``` ## Per-File Configuration in Batch Operations Use `FileExtractionConfig` to override extraction settings for individual files within a batch. This is useful for mixed-format batches where different documents need different OCR, output, or processing settings. === "Python" ```python from kreuzberg import ( batch_extract_files_sync, ExtractionConfig, FileExtractionConfig, OcrConfig, ) config = ExtractionConfig(output_format="markdown") paths = ["report.pdf", "scan.tiff"] file_configs = [ None, # use batch defaults FileExtractionConfig( force_ocr=True, ocr=OcrConfig(backend="tesseract", language="deu"), ), ] results = batch_extract_files_sync(paths, config, file_configs=file_configs) ``` === "TypeScript" ```typescript import { batchExtractFilesSync } from '@kreuzberg/node'; const results = batchExtractFilesSync( ['report.pdf', 'scan.tiff'], { outputFormat: 'markdown' }, [null, { forceOcr: true, ocr: { backend: 'tesseract', language: 'deu' } }], ); ``` All `ExtractionConfig` fields except batch-level concerns (`max_concurrent_extractions`, `use_cache`, `acceleration`, `security_limits`) can be overridden. `None`/`null` fields inherit from the batch default. ## Embeddings Generate vector embeddings for text chunks using ONNX-based models. Embeddings enable semantic search, clustering, and similarity operations on extracted content. 
**Requirements:** ONNX Runtime 1.22.x or later === "Python" ```python from kreuzberg import ( ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType, list_embedding_presets, get_embedding_preset, extract_file_sync ) # List available embedding presets presets = list_embedding_presets() print(f"Available presets: {presets}") # ['balanced', 'compact', 'large'] # Get details about a preset preset_info = get_embedding_preset("balanced") print(f"Model: {preset_info.model_name}") print(f"Dimensions: {preset_info.dimensions}") print(f"Recommended chunk size: {preset_info.chunk_size}") # Method 1: Use preset (recommended) config = ExtractionConfig( chunking=ChunkingConfig( max_chars=512, max_overlap=100, embedding=EmbeddingConfig( model=EmbeddingModelType.preset("balanced"), normalize=True, batch_size=32 ) ) ) # Method 2: Use specific fastembed model config = ExtractionConfig( chunking=ChunkingConfig( embedding=EmbeddingConfig( model=EmbeddingModelType.fastembed( model="BAAI/bge-small-en-v1.5", dimensions=384 ), normalize=True ) ) ) # Method 3: Use custom ONNX model from HuggingFace config = ExtractionConfig( chunking=ChunkingConfig( embedding=EmbeddingConfig( model=EmbeddingModelType.custom( model_id="sentence-transformers/all-MiniLM-L6-v2", dimensions=384 ), cache_dir="/path/to/model/cache" ) ) ) result = extract_file_sync("document.pdf", config=config) # Access embeddings in chunks for chunk in result.chunks: embedding = chunk.embedding # list[float] or None print(f"Chunk: {chunk.content[:50]}...") print(f"Embedding dimensions: {len(embedding) if embedding else 0}") ``` === "TypeScript" ```typescript import { ExtractionConfig, ChunkingConfig, listEmbeddingPresets, getEmbeddingPreset, extractFile } from '@kreuzberg/node'; // List available embedding presets const presets = listEmbeddingPresets(); console.log(`Available presets: ${presets}`); // ['balanced', 'compact', 'large'] // Get details about a preset const preset = getEmbeddingPreset("balanced"); 
console.log(`Model: ${preset.modelName}`); console.log(`Dimensions: ${preset.dimensions}`); console.log(`Recommended chunk size: ${preset.chunkSize}`); // Method 1: Use preset (recommended) const config: ExtractionConfig = { chunking: { maxChars: 512, maxOverlap: 100, embedding: { model: { type: 'preset', name: 'balanced' }, normalize: true, batchSize: 32 } } }; // Method 2: Use specific fastembed model const config2: ExtractionConfig = { chunking: { embedding: { model: { type: 'fastembed', model: 'BAAI/bge-small-en-v1.5', dimensions: 384 }, normalize: true } } }; // Method 3: Use custom ONNX model const config3: ExtractionConfig = { chunking: { embedding: { model: { type: 'custom', modelId: 'sentence-transformers/all-MiniLM-L6-v2', dimensions: 384 }, cacheDir: '/path/to/model/cache' } } }; const result = await extractFile("document.pdf", null, config); // Access embeddings in chunks if (result.chunks) { for (const chunk of result.chunks) { const embedding = chunk.embedding; // number[] | null console.log(`Chunk: ${chunk.content.substring(0, 50)}...`); console.log(`Embedding dimensions: ${embedding?.length ?? 0}`); } } ``` ## Keyword Extraction Extract important keywords and phrases from documents using YAKE (Yet Another Keyword Extractor) or RAKE (Rapid Automatic Keyword Extraction) algorithms. 
=== "Python" ```python from kreuzberg import ( ExtractionConfig, KeywordConfig, KeywordAlgorithm, YakeParams, RakeParams, extract_file_sync ) # YAKE algorithm (unsupervised, good for general use) config = ExtractionConfig( keywords=KeywordConfig( algorithm=KeywordAlgorithm.Yake, max_keywords=15, min_score=0.1, ngram_range=(1, 3), language="en", yake_params=YakeParams(window_size=2) ) ) # RAKE algorithm (co-occurrence based) config = ExtractionConfig( keywords=KeywordConfig( algorithm=KeywordAlgorithm.Rake, max_keywords=10, min_score=0.0, language="en", rake_params=RakeParams( min_word_length=3, max_words_per_phrase=3 ) ) ) result = extract_file_sync("document.pdf", config=config) # Access extracted keywords if result.keywords: for keyword in result.keywords: print(f"Text: {keyword.text}") print(f"Score: {keyword.score}") print(f"Algorithm: {keyword.algorithm}") ``` === "TypeScript" ```typescript import { ExtractionConfig, KeywordConfig, extractFile } from '@kreuzberg/node'; // YAKE algorithm const config: ExtractionConfig = { keywords: { algorithm: "yake", maxKeywords: 15, minScore: 0.1, ngramRange: [1, 3], language: "en", yakeParams: { windowSize: 2 } } }; // RAKE algorithm const config2: ExtractionConfig = { keywords: { algorithm: "rake", maxKeywords: 10, minScore: 0.0, language: "en", rakeParams: { minWordLength: 3, maxWordsPerPhrase: 3 } } }; const result = await extractFile("document.pdf", null, config); // Access extracted keywords if (result.keywords) { for (const keyword of result.keywords) { console.log(`Text: ${keyword.text}`); console.log(`Score: ${keyword.score}`); console.log(`Algorithm: ${keyword.algorithm}`); } } ``` ## Language Detection Automatically detect the language(s) in documents using ISO 639-1 language codes. 
=== "Python" ```python from kreuzberg import ( ExtractionConfig, LanguageDetectionConfig, extract_file_sync ) # Enable language detection config = ExtractionConfig( language_detection=LanguageDetectionConfig( enabled=True, min_confidence=0.8, detect_multiple=False ) ) result = extract_file_sync("multilingual.pdf", config=config) # Access detected languages if result.detected_languages: for lang_code in result.detected_languages: print(f"Detected language: {lang_code}") # e.g., "en", "de", "fr" ``` === "TypeScript" ```typescript import { ExtractionConfig, LanguageDetectionConfig, extractFile } from '@kreuzberg/node'; const config: ExtractionConfig = { languageDetection: { enabled: true, minConfidence: 0.8, detectMultiple: false } }; const result = await extractFile("multilingual.pdf", null, config); // Access detected languages if (result.detectedLanguages) { for (const langCode of result.detectedLanguages) { console.log(`Detected language: ${langCode}`); // e.g., "en", "de", "fr" } } ``` ## Token Reduction Reduce the number of tokens in extracted content for cost optimization when working with LLM APIs. Higher modes are more aggressive but may lose more information. 
=== "Python" ```python from kreuzberg import ( ExtractionConfig, TokenReductionConfig, extract_file_sync ) # Light token reduction config = ExtractionConfig( token_reduction=TokenReductionConfig( mode="light", preserve_important_words=True ) ) # Moderate reduction config = ExtractionConfig( token_reduction=TokenReductionConfig( mode="moderate", preserve_important_words=True ) ) # Aggressive reduction config = ExtractionConfig( token_reduction=TokenReductionConfig( mode="aggressive", preserve_important_words=True ) ) # Maximum reduction config = ExtractionConfig( token_reduction=TokenReductionConfig( mode="maximum", preserve_important_words=True ) ) result = extract_file_sync("document.pdf", config=config) print(f"Reduced content length: {len(result.content)}") ``` === "TypeScript" ```typescript import { ExtractionConfig, TokenReductionConfig, extractFile } from '@kreuzberg/node'; const config: ExtractionConfig = { tokenReduction: { mode: "moderate", preserveImportantWords: true } }; const result = await extractFile("document.pdf", null, config); console.log(`Reduced content length: ${result.content.length}`); ``` **Token Reduction Modes:** - `off`: No reduction (default) - `light`: Remove extra whitespace and redundant punctuation - `moderate`: Also remove common filler words and some formatting - `aggressive`: Also remove longer stopwords and collapse similar phrases - `maximum`: Maximum reduction while preserving semantic content ## Page Extraction Extract and track per-page content separately. Useful for multi-page documents where you need page-level granularity. 
=== "Python" ```python from kreuzberg import ( ExtractionConfig, PageConfig, extract_file_sync ) config = ExtractionConfig( pages=PageConfig( extract_pages=True, insert_page_markers=True, marker_format="\n\n<!-- PAGE {page_num} -->\n\n" ) ) result = extract_file_sync("multi_page.pdf", config=config) # Access per-page content if result.pages: for page in result.pages: print(f"Page {page.page_number}:") print(f"Content: {page.content[:100]}...") print(f"Tables: {len(page.tables)}") print(f"Images: {len(page.images)}") ``` === "TypeScript" ```typescript import { ExtractionConfig, PageExtractionConfig, extractFile } from '@kreuzberg/node'; const config: ExtractionConfig = { pages: { extractPages: true, insertPageMarkers: true, markerFormat: "\n\n<!-- PAGE {page_num} -->\n\n" } }; const result = await extractFile("multi_page.pdf", null, config); // Access per-page content if (result.pages) { for (const page of result.pages) { console.log(`Page ${page.pageNumber}:`); console.log(`Content: ${page.content.substring(0, 100)}...`); console.log(`Tables: ${page.tables.length}`); console.log(`Images: ${page.images.length}`); } } ``` ## Element-Based Output Extract semantic elements instead of unified content. This format is compatible with the Unstructured library and provides structured access to different content types (titles, headings, text, tables, images, etc.). === "Python" ```python from kreuzberg import ExtractionConfig, ResultFormat, extract_file_sync config = ExtractionConfig( result_format="element_based" ) result = extract_file_sync("document.pdf", config=config) # Access semantic elements if result.elements: for element in result.elements: print(f"Type: {element.element_type}") # title, heading, narrative_text, etc. 
print(f"Text: {element.text}") if element.metadata.get("page_number"): print(f"Page: {element.metadata['page_number']}") ``` === "TypeScript" ```typescript import { ExtractionConfig, extractFile } from '@kreuzberg/node'; const config: ExtractionConfig = { resultFormat: "element_based" }; const result = await extractFile("document.pdf", null, config); // Access semantic elements if (result.elements) { for (const element of result.elements) { console.log(`Type: ${element.elementType}`); console.log(`Text: ${element.text}`); if (element.metadata.pageNumber) { console.log(`Page: ${element.metadata.pageNumber}`); } } } ``` **Element Types:** - `title`: Document or section title - `heading`: Section headings - `narrative_text`: Regular paragraph text - `list_item`: Items in bullet/numbered lists - `table`: Table structures - `image`: Images or figures - `page_break`: Page boundaries - `code_block`: Code snippets - `block_quote`: Quoted text - `footer`: Footer content - `header`: Header content ## Djot Content Output extracted content in Djot markup format (a lighter alternative to Markdown with enhanced structure). 
=== "Python" ```python from kreuzberg import ExtractionConfig, OutputFormat, extract_file_sync config = ExtractionConfig( output_format="djot" ) result = extract_file_sync("document.pdf", config=config) print(result.content) # Djot-formatted content # Access structured Djot content if result.djot_content: print(f"Plain text: {result.djot_content['plain_text']}") print(f"Blocks: {result.djot_content['blocks']}") print(f"Links: {result.djot_content['links']}") print(f"Images: {result.djot_content['images']}") print(f"Footnotes: {result.djot_content['footnotes']}") ``` === "TypeScript" ```typescript import { ExtractionConfig, extractFile } from '@kreuzberg/node'; const config: ExtractionConfig = { outputFormat: "djot" }; const result = await extractFile("document.pdf", null, config); console.log(result.content); // Djot-formatted content // Access structured Djot content (if available) if (result.djotContent) { console.log(`Plain text: ${result.djotContent.plain_text}`); console.log(`Blocks: ${result.djotContent.blocks}`); console.log(`Links: ${result.djotContent.links}`); console.log(`Images: ${result.djotContent.images}`); console.log(`Footnotes: ${result.djotContent.footnotes}`); } ``` ## API Server Run Kreuzberg as an HTTP API server for integration with external services. ```bash # Start server on default port 8000 kreuzberg serve # Custom host and port kreuzberg serve --host 0.0.0.0 --port 9000 # Bind explicitly to localhost on port 8000 kreuzberg serve --host localhost --port 8000 ``` **API Endpoints:** - `POST /extract` - Extract from uploaded file - `POST /batch` - Batch extraction - `POST /detect` - Detect MIME type **Example:** ```bash curl -X POST -F "file=@document.pdf" http://localhost:8000/extract ``` ## MCP Server Run Kreuzberg as a Model Context Protocol server for integration with Claude and other AI models. 
```bash # Start MCP server with stdio transport kreuzberg mcp --transport stdio # Start MCP server with HTTP transport kreuzberg mcp --transport http --host 127.0.0.1 --port 8001 ``` The MCP server exposes extraction functions to AI models, allowing them to process documents directly. ## Security Limits Set resource limits to prevent abuse and control memory/file size consumption. === "Python" ```python from kreuzberg import ExtractionConfig, extract_file_sync config = ExtractionConfig( security_limits={ "max_file_size": 100_000_000, # 100 MB "max_archive_files": 1000, "max_text_length": 10_000_000, # 10 MB of text "max_pages": 10000, "max_concurrent_extractions": 4 } ) result = extract_file_sync("document.pdf", config=config) ``` === "TypeScript" ```typescript import { ExtractionConfig, extractFile } from '@kreuzberg/node'; const config: ExtractionConfig = { securityLimits: { max_file_size: 100_000_000, // 100 MB max_archive_files: 1000, max_text_length: 10_000_000, // 10 MB of text max_pages: 10000, max_concurrent_extractions: 4 } }; const result = await extractFile("document.pdf", null, config); ``` **Common Limits:** - `max_file_size`: Maximum input file size in bytes - `max_archive_files`: Maximum files in archives (zip, tar, etc.) - `max_text_length`: Maximum extracted text length - `max_pages`: Maximum number of pages to process - `max_concurrent_extractions`: Maximum concurrent extraction operations ## Caching Extraction results are cached by default to improve performance on repeated extractions of identical documents. Control caching behavior through configuration. 
=== "Python" ```python from kreuzberg import ExtractionConfig, extract_file_sync # Enable caching (default) config = ExtractionConfig(use_cache=True) result = extract_file_sync("document.pdf", config=config) # Disable caching for a specific extraction config = ExtractionConfig(use_cache=False) result = extract_file_sync("document.pdf", config=config) ``` === "TypeScript" ```typescript import { ExtractionConfig, extractFile } from '@kreuzberg/node'; // Enable caching (default) const config: ExtractionConfig = { useCache: true }; const result = await extractFile("document.pdf", null, config); // Disable caching const config2: ExtractionConfig = { useCache: false }; const result2 = await extractFile("document.pdf", null, config2); ``` **CLI Cache Management:** ```bash # View cache statistics kreuzberg cache stats # Clear all cached results kreuzberg cache clear ``` Caching is transparent and automatic—same input produces cached output instantly on subsequent extractions.