hjess/fil

Fork 0

Files

Henrik Jess Nielsen b4c07d3693

Deploy fil (kreuzberg) / deploy (push) Successful in 49s

Details

Nomad changes

2026-06-01 23:40:55 +02:00

28 KiB

Raw Permalink Blame History

Advanced Features Reference

Kreuzberg provides powerful advanced features for customization, semantic processing, and integration with external systems.

Plugin System

The plugin system allows you to extend Kreuzberg's extraction pipeline with custom post-processors, validators, and OCR backends. Plugins run within the extraction pipeline and have direct access to extraction results.

Custom Post-Processors

Post-processors enrich extraction results after document parsing. They run non-destructively—if a post-processor fails, the extraction succeeds anyway (errors are logged).

=== "Python"

```python
from kreuzberg import register_post_processor, ExtractionResult

class MetadataEnricher:
    """Example post-processor that records extra metadata on a result.

    ``process`` writes two keys into ``result.metadata`` and returns the
    same result object; the remaining methods supply the plugin's name,
    its processing stage, and start-up/shutdown hooks.
    """

    def initialize(self) -> None:
        """Print a start-up message."""
        print("Initializing metadata enricher")

    def shutdown(self) -> None:
        """Print a shutdown message."""
        print("Shutting down metadata enricher")

    def name(self) -> str:
        """Return the identifier of this post-processor."""
        return "metadata_enricher"

    def processing_stage(self) -> str:
        """Return the stage name: "early", "middle", or "late"."""
        return "middle"

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Tag ``result`` with the processor name and its content length.

        The metadata mapping is modified in place and the very same
        ``result`` object is returned.
        """
        result.metadata["processed_by"] = "metadata_enricher"
        # Length of the extracted text, in characters.
        result.metadata["char_count"] = len(result.content)
        return result

register_post_processor(MetadataEnricher())

# Now use extraction with the registered processor
from kreuzberg import extract_file_sync
result = extract_file_sync("document.pdf")
print(result.metadata["char_count"])
```

=== "TypeScript"

```typescript
// extractFile is needed by the usage example at the end of this snippet.
import { registerPostProcessor, extractFile, ExtractionResult } from '@kreuzberg/node';

// Example post-processor plugin: tags each result with the processor name
// and the length of its extracted content.
//
// Note: object-literal members cannot carry the `?` modifier (TypeScript
// reports TS1162), so every hook below is declared as a plain method.
const enricher = {
    // Identifier of this post-processor.
    name(): string {
        return "metadata_enricher";
    },

    // Writes two metadata fields on the result and returns the same object.
    async process(result: ExtractionResult): Promise<ExtractionResult> {
        result.metadata.processed_by = "metadata_enricher";
        result.metadata.char_count = result.content.length;
        return result;
    },

    // Stage at which this processor runs: "early", "middle", or "late".
    processingStage(): "early" | "middle" | "late" {
        return "middle";
    },

    // Start-up hook.
    async initialize(): Promise<void> {
        console.log("Initializing metadata enricher");
    },

    // Shutdown hook.
    async shutdown(): Promise<void> {
        console.log("Shutting down metadata enricher");
    }
};

registerPostProcessor(enricher);

// Now use extraction with the registered processor
const result = await extractFile("document.pdf");
console.log(result.metadata.char_count);
```

Custom Validators

Validators perform quality checks on extraction results. Unlike post-processors, validator failures cause the entire extraction to fail. Use validators to enforce quality standards.

=== "Python"

```python
# extract_file_sync is needed by the usage example at the end of this snippet.
from kreuzberg import (
    register_validator, ExtractionResult, ValidationError,
    extract_file_sync
)

class MinimumContentValidator:
    """Example validator enforcing a minimum content length.

    ``validate`` raises ``ValidationError`` when the extracted content has
    fewer than 100 characters; ``should_validate`` answers True only for
    results whose MIME type mentions "pdf".
    """

    def name(self) -> str:
        """Return the identifier of this validator."""
        return "min_content_validator"

    def priority(self) -> int:
        """Return this validator's priority."""
        # Higher priority runs first (0-1000, default 50)
        return 100

    def should_validate(self, result: ExtractionResult) -> bool:
        """Return True when the result's MIME type contains "pdf"."""
        # Case-insensitive match, so only PDFs are validated.
        mime_type = result.mime_type.lower()
        return "pdf" in mime_type

    def validate(self, result: ExtractionResult) -> None:
        """Raise ``ValidationError`` if the content is under 100 characters."""
        if len(result.content) >= 100:
            return
        raise ValidationError("Extracted content too short (< 100 chars)")

    def initialize(self) -> None:
        """Start-up hook; nothing to set up."""

    def shutdown(self) -> None:
        """Shutdown hook; nothing to release."""

register_validator(MinimumContentValidator())

# Extraction will fail if content < 100 chars
result = extract_file_sync("document.pdf")
```

=== "TypeScript"

```typescript
// extractFile is needed by the usage example at the end of this snippet.
import { registerValidator, extractFile, ExtractionResult } from '@kreuzberg/node';

// Example validator plugin: rejects results whose extracted content is
// shorter than 100 characters.
//
// Note: object-literal members cannot carry the `?` modifier (TypeScript
// reports TS1162), so every hook below is declared as a plain method.
const validator = {
    // Identifier of this validator.
    name(): string {
        return "min_content_validator";
    },

    // Throws when the extracted content has fewer than 100 characters.
    async validate(result: ExtractionResult): Promise<void> {
        if (result.content.length < 100) {
            throw new Error("Extracted content too short (< 100 chars)");
        }
    },

    // Priority of this validator.
    priority(): number {
        return 100;
    },

    // True only when the MIME type contains "pdf" (case-insensitive).
    shouldValidate(result: ExtractionResult): boolean {
        return result.mimeType.toLowerCase().includes("pdf");
    },

    // Start-up hook; nothing to set up.
    async initialize(): Promise<void> {},

    // Shutdown hook; nothing to release.
    async shutdown(): Promise<void> {}
};

registerValidator(validator);

// Extraction will fail if content < 100 chars
const result = await extractFile("document.pdf");
```

Custom OCR Backends

Implement custom OCR engines by registering an OCR backend. This allows integration with proprietary or specialized OCR solutions.

=== "Python"

```python
from kreuzberg import register_ocr_backend

class CustomOcrBackend:
    """Template for a custom OCR backend.

    The recognition step is a placeholder: ``process_image`` returns a
    fixed result dictionary, and ``process_file`` reads the file's bytes
    and delegates to ``process_image``.
    """

    def name(self) -> str:
        """Return the identifier of this backend."""
        return "custom_ocr"

    def version(self) -> str:
        """Return the backend's version string."""
        return "1.0.0"

    def supported_languages(self) -> list[str]:
        """Return the language codes this backend reports as supported."""
        languages = ["eng", "deu", "fra", "spa"]
        return languages

    def initialize(self) -> None:
        """Start-up hook: the place to load models and initialize the engine."""

    def shutdown(self) -> None:
        """Shutdown hook: the place to clean up resources."""

    def process_image(self, image_bytes: bytes, language: str) -> dict:
        """Return the OCR result for one image.

        Args:
            image_bytes: Raw image data.
            language: ISO 639-3 code (e.g., "eng", "deu").

        Returns:
            A dict with "content", "metadata" and "tables" keys.
        """
        # Call your OCR engine here
        # text = my_ocr_engine.recognize(image_bytes, language)
        metadata = {"confidence": 0.95, "language": language}
        return dict(
            content="Extracted text from image",
            metadata=metadata,
            tables=[],
        )

    def process_file(self, path: str, language: str) -> dict:
        """Read the file at ``path`` and run ``process_image`` on its bytes.

        Optional: custom file processing, called when extracting OCR from
        a file path.
        """
        with open(path, "rb") as image_file:
            raw = image_file.read()
        return self.process_image(raw, language)

register_ocr_backend(CustomOcrBackend())

# Use in extraction config
from kreuzberg import ExtractionConfig, OcrConfig, extract_file_sync

config = ExtractionConfig(
    ocr=OcrConfig(backend="custom_ocr", language="eng")
)
result = extract_file_sync("scanned.pdf", config=config)
```

=== "TypeScript"

```typescript
import { registerOcrBackend, ExtractionConfig, extractFile } from '@kreuzberg/node';

// Template for a custom OCR backend. The recognition step is a placeholder
// that returns a fixed result.
//
// Note: object-literal members cannot carry the `?` modifier (TypeScript
// reports TS1162), so the lifecycle hooks are declared as plain methods.
const backend = {
    // Identifier of this backend.
    name(): string {
        return "custom_ocr";
    },

    // Language codes this backend reports as supported.
    supportedLanguages(): string[] {
        return ["eng", "deu", "fra", "spa"];
    },

    // Returns the OCR result for one image. A string argument is decoded
    // as base64; a Uint8Array is wrapped in a Buffer.
    async processImage(
        imageBytes: Uint8Array | string,
        language: string
    ): Promise<{
        content: string;
        mime_type: string;
        metadata: Record<string, unknown>;
        tables: unknown[];
    }> {
        const buffer = typeof imageBytes === "string"
            ? Buffer.from(imageBytes, "base64")
            : Buffer.from(imageBytes);

        // Call your OCR engine
        // const text = await myOcrEngine.recognize(buffer, language);

        return {
            content: "Extracted text from image",
            mime_type: "text/plain",
            metadata: { confidence: 0.95, language },
            tables: []
        };
    },

    // Start-up hook.
    async initialize(): Promise<void> {
        // Load models, initialize engine
    },

    // Shutdown hook.
    async shutdown(): Promise<void> {
        // Clean up resources
    }
};

registerOcrBackend(backend);

// Use in extraction config
const config: ExtractionConfig = {
    ocr: { backend: "custom_ocr", language: "eng" }
};
const result = await extractFile("scanned.pdf", null, config);
```

Per-File Configuration in Batch Operations

Use FileExtractionConfig to override extraction settings for individual files within a batch. This is useful for mixed-format batches where different documents need different OCR, output, or processing settings.

=== "Python"

```python
from kreuzberg import (
    batch_extract_files_sync,
    ExtractionConfig, FileExtractionConfig, OcrConfig,
)

config = ExtractionConfig(output_format="markdown")
paths = ["report.pdf", "scan.tiff"]
file_configs = [
    None,  # use batch defaults
    FileExtractionConfig(
        force_ocr=True,
        ocr=OcrConfig(backend="tesseract", language="deu"),
    ),
]
results = batch_extract_files_sync(paths, config, file_configs=file_configs)
```

=== "TypeScript"

```typescript
import { batchExtractFilesSync } from '@kreuzberg/node';

const results = batchExtractFilesSync(
  ['report.pdf', 'scan.tiff'],
  { outputFormat: 'markdown' },
  [null, { forceOcr: true, ocr: { backend: 'tesseract', language: 'deu' } }],
);
```

All ExtractionConfig fields can be overridden per file, except the batch-level concerns (max_concurrent_extractions, use_cache, acceleration, security_limits). Fields left as None/null inherit the batch default.

Embeddings

Generate vector embeddings for text chunks using ONNX-based models. Embeddings enable semantic search, clustering, and similarity operations on extracted content.

Requirements: ONNX Runtime 1.22.x or later

=== "Python"

```python
from kreuzberg import (
    ExtractionConfig, ChunkingConfig, EmbeddingConfig,
    EmbeddingModelType, list_embedding_presets,
    get_embedding_preset, extract_file_sync
)

# List available embedding presets
presets = list_embedding_presets()
print(f"Available presets: {presets}")  # ['balanced', 'compact', 'large']

# Get details about a preset
preset_info = get_embedding_preset("balanced")
print(f"Model: {preset_info.model_name}")
print(f"Dimensions: {preset_info.dimensions}")
print(f"Recommended chunk size: {preset_info.chunk_size}")

# Method 1: Use preset (recommended)
config = ExtractionConfig(
    chunking=ChunkingConfig(
        max_chars=512,
        max_overlap=100,
        embedding=EmbeddingConfig(
            model=EmbeddingModelType.preset("balanced"),
            normalize=True,
            batch_size=32
        )
    )
)

# Method 2: Use specific fastembed model
config = ExtractionConfig(
    chunking=ChunkingConfig(
        embedding=EmbeddingConfig(
            model=EmbeddingModelType.fastembed(
                model="BAAI/bge-small-en-v1.5",
                dimensions=384
            ),
            normalize=True
        )
    )
)

# Method 3: Use custom ONNX model from HuggingFace
config = ExtractionConfig(
    chunking=ChunkingConfig(
        embedding=EmbeddingConfig(
            model=EmbeddingModelType.custom(
                model_id="sentence-transformers/all-MiniLM-L6-v2",
                dimensions=384
            ),
            cache_dir="/path/to/model/cache"
        )
    )
)

result = extract_file_sync("document.pdf", config=config)

# Access embeddings in chunks
for chunk in result.chunks:
    embedding = chunk.embedding  # list[float] or None
    print(f"Chunk: {chunk.content[:50]}...")
    print(f"Embedding dimensions: {len(embedding) if embedding else 0}")
```

=== "TypeScript"

```typescript
import {
    ExtractionConfig, ChunkingConfig,
    listEmbeddingPresets, getEmbeddingPreset,
    extractFile
} from '@kreuzberg/node';

// List available embedding presets
const presets = listEmbeddingPresets();
console.log(`Available presets: ${presets}`);  // ['balanced', 'compact', 'large']

// Get details about a preset
const preset = getEmbeddingPreset("balanced");
console.log(`Model: ${preset.modelName}`);
console.log(`Dimensions: ${preset.dimensions}`);
console.log(`Recommended chunk size: ${preset.chunkSize}`);

// Method 1: Use preset (recommended)
const config: ExtractionConfig = {
    chunking: {
        maxChars: 512,
        maxOverlap: 100,
        embedding: {
            model: { type: 'preset', name: 'balanced' },
            normalize: true,
            batchSize: 32
        }
    }
};

// Method 2: Use specific fastembed model
const config2: ExtractionConfig = {
    chunking: {
        embedding: {
            model: {
                type: 'fastembed',
                model: 'BAAI/bge-small-en-v1.5',
                dimensions: 384
            },
            normalize: true
        }
    }
};

// Method 3: Use custom ONNX model
const config3: ExtractionConfig = {
    chunking: {
        embedding: {
            model: {
                type: 'custom',
                modelId: 'sentence-transformers/all-MiniLM-L6-v2',
                dimensions: 384
            },
            cacheDir: '/path/to/model/cache'
        }
    }
};

const result = await extractFile("document.pdf", null, config);

// Access embeddings in chunks
if (result.chunks) {
    for (const chunk of result.chunks) {
        const embedding = chunk.embedding;  // number[] | null
        console.log(`Chunk: ${chunk.content.substring(0, 50)}...`);
        console.log(`Embedding dimensions: ${embedding?.length ?? 0}`);
    }
}
```

Keyword Extraction

Extract important keywords and phrases from documents using YAKE (Yet Another Keyword Extractor) or RAKE (Rapid Automatic Keyword Extraction) algorithms.

=== "Python"

```python
from kreuzberg import (
    ExtractionConfig, KeywordConfig, KeywordAlgorithm,
    YakeParams, RakeParams, extract_file_sync
)

# YAKE algorithm (unsupervised, good for general use)
config = ExtractionConfig(
    keywords=KeywordConfig(
        algorithm=KeywordAlgorithm.Yake,
        max_keywords=15,
        min_score=0.1,
        ngram_range=(1, 3),
        language="en",
        yake_params=YakeParams(window_size=2)
    )
)

# RAKE algorithm (co-occurrence based)
config = ExtractionConfig(
    keywords=KeywordConfig(
        algorithm=KeywordAlgorithm.Rake,
        max_keywords=10,
        min_score=0.0,
        language="en",
        rake_params=RakeParams(
            min_word_length=3,
            max_words_per_phrase=3
        )
    )
)

result = extract_file_sync("document.pdf", config=config)

# Access extracted keywords
if result.keywords:
    for keyword in result.keywords:
        print(f"Text: {keyword.text}")
        print(f"Score: {keyword.score}")
        print(f"Algorithm: {keyword.algorithm}")
```

=== "TypeScript"

```typescript
import {
    ExtractionConfig, KeywordConfig,
    extractFile
} from '@kreuzberg/node';

// YAKE algorithm
const config: ExtractionConfig = {
    keywords: {
        algorithm: "yake",
        maxKeywords: 15,
        minScore: 0.1,
        ngramRange: [1, 3],
        language: "en",
        yakeParams: {
            windowSize: 2
        }
    }
};

// RAKE algorithm
const config2: ExtractionConfig = {
    keywords: {
        algorithm: "rake",
        maxKeywords: 10,
        minScore: 0.0,
        language: "en",
        rakeParams: {
            minWordLength: 3,
            maxWordsPerPhrase: 3
        }
    }
};

const result = await extractFile("document.pdf", null, config);

// Access extracted keywords
if (result.keywords) {
    for (const keyword of result.keywords) {
        console.log(`Text: ${keyword.text}`);
        console.log(`Score: ${keyword.score}`);
        console.log(`Algorithm: ${keyword.algorithm}`);
    }
}
```

Language Detection

Automatically detect the language(s) of a document. Detected languages are reported as ISO 639-1 codes (e.g., "en", "de", "fr").

=== "Python"

```python
from kreuzberg import (
    ExtractionConfig, LanguageDetectionConfig,
    extract_file_sync
)

# Enable language detection
config = ExtractionConfig(
    language_detection=LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.8,
        detect_multiple=False
    )
)

result = extract_file_sync("multilingual.pdf", config=config)

# Access detected languages
if result.detected_languages:
    for lang_code in result.detected_languages:
        print(f"Detected language: {lang_code}")  # e.g., "en", "de", "fr"
```

=== "TypeScript"

```typescript
import {
    ExtractionConfig, LanguageDetectionConfig,
    extractFile
} from '@kreuzberg/node';

const config: ExtractionConfig = {
    languageDetection: {
        enabled: true,
        minConfidence: 0.8,
        detectMultiple: false
    }
};

const result = await extractFile("multilingual.pdf", null, config);

// Access detected languages
if (result.detectedLanguages) {
    for (const langCode of result.detectedLanguages) {
        console.log(`Detected language: ${langCode}`);  // e.g., "en", "de", "fr"
    }
}
```

Token Reduction

Reduce the number of tokens in extracted content to lower costs when working with LLM APIs. Each successive mode is more aggressive than the previous one and may lose more information.

=== "Python"

```python
from kreuzberg import (
    ExtractionConfig, TokenReductionConfig,
    extract_file_sync
)

# Light token reduction
config = ExtractionConfig(
    token_reduction=TokenReductionConfig(
        mode="light",
        preserve_important_words=True
    )
)

# Moderate reduction
config = ExtractionConfig(
    token_reduction=TokenReductionConfig(
        mode="moderate",
        preserve_important_words=True
    )
)

# Aggressive reduction
config = ExtractionConfig(
    token_reduction=TokenReductionConfig(
        mode="aggressive",
        preserve_important_words=True
    )
)

# Maximum reduction
config = ExtractionConfig(
    token_reduction=TokenReductionConfig(
        mode="maximum",
        preserve_important_words=True
    )
)

result = extract_file_sync("document.pdf", config=config)
print(f"Reduced content length: {len(result.content)}")
```

=== "TypeScript"

```typescript
import {
    ExtractionConfig, TokenReductionConfig,
    extractFile
} from '@kreuzberg/node';

const config: ExtractionConfig = {
    tokenReduction: {
        mode: "moderate",
        preserveImportantWords: true
    }
};

const result = await extractFile("document.pdf", null, config);
console.log(`Reduced content length: ${result.content.length}`);
```

Token Reduction Modes:

off: No reduction (default)
light: Remove extra whitespace and redundant punctuation
moderate: Also remove common filler words and some formatting
aggressive: Also remove longer stopwords and collapse similar phrases
maximum: Maximum reduction while preserving semantic content

Page Extraction

Extract and track per-page content separately. Useful for multi-page documents where you need page-level granularity.

=== "Python"

```python
from kreuzberg import (
    ExtractionConfig, PageConfig,
    extract_file_sync
)

config = ExtractionConfig(
    pages=PageConfig(
        extract_pages=True,
        insert_page_markers=True,
        marker_format="\n\n<!-- PAGE {page_num} -->\n\n"
    )
)

result = extract_file_sync("multi_page.pdf", config=config)

# Access per-page content
if result.pages:
    for page in result.pages:
        print(f"Page {page.page_number}:")
        print(f"Content: {page.content[:100]}...")
        print(f"Tables: {len(page.tables)}")
        print(f"Images: {len(page.images)}")
```

=== "TypeScript"

```typescript
import {
    ExtractionConfig, PageExtractionConfig,
    extractFile
} from '@kreuzberg/node';

const config: ExtractionConfig = {
    pages: {
        extractPages: true,
        insertPageMarkers: true,
        markerFormat: "\n\n<!-- PAGE {page_num} -->\n\n"
    }
};

const result = await extractFile("multi_page.pdf", null, config);

// Access per-page content
if (result.pages) {
    for (const page of result.pages) {
        console.log(`Page ${page.pageNumber}:`);
        console.log(`Content: ${page.content.substring(0, 100)}...`);
        console.log(`Tables: ${page.tables.length}`);
        console.log(`Images: ${page.images.length}`);
    }
}
```

Element-Based Output

Extract semantic elements instead of unified content. This format is compatible with the Unstructured library and provides structured access to different content types (titles, headings, text, tables, images, etc.).

=== "Python"

```python
from kreuzberg import ExtractionConfig, ResultFormat, extract_file_sync

config = ExtractionConfig(
    result_format="element_based"
)

result = extract_file_sync("document.pdf", config=config)

# Access semantic elements
if result.elements:
    for element in result.elements:
        print(f"Type: {element.element_type}")  # title, heading, narrative_text, etc.
        print(f"Text: {element.text}")
        if element.metadata.get("page_number"):
            print(f"Page: {element.metadata['page_number']}")
```

=== "TypeScript"

```typescript
import { ExtractionConfig, extractFile } from '@kreuzberg/node';

const config: ExtractionConfig = {
    resultFormat: "element_based"
};

const result = await extractFile("document.pdf", null, config);

// Access semantic elements
if (result.elements) {
    for (const element of result.elements) {
        console.log(`Type: ${element.elementType}`);
        console.log(`Text: ${element.text}`);
        if (element.metadata.pageNumber) {
            console.log(`Page: ${element.metadata.pageNumber}`);
        }
    }
}
```

Element Types:

title: Document or section title
heading: Section headings
narrative_text: Regular paragraph text
list_item: Items in bullet/numbered lists
table: Table structures
image: Images or figures
page_break: Page boundaries
code_block: Code snippets
block_quote: Quoted text
footer: Footer content
header: Header content

Djot Content

Output extracted content in Djot markup format (a lighter alternative to Markdown with enhanced structure).

=== "Python"

```python
from kreuzberg import ExtractionConfig, OutputFormat, extract_file_sync

config = ExtractionConfig(
    output_format="djot"
)

result = extract_file_sync("document.pdf", config=config)
print(result.content)  # Djot-formatted content

# Access structured Djot content
if result.djot_content:
    print(f"Plain text: {result.djot_content['plain_text']}")
    print(f"Blocks: {result.djot_content['blocks']}")
    print(f"Links: {result.djot_content['links']}")
    print(f"Images: {result.djot_content['images']}")
    print(f"Footnotes: {result.djot_content['footnotes']}")
```

=== "TypeScript"

```typescript
import { ExtractionConfig, extractFile } from '@kreuzberg/node';

const config: ExtractionConfig = {
    outputFormat: "djot"
};

const result = await extractFile("document.pdf", null, config);
console.log(result.content);  // Djot-formatted content

// Access structured Djot content (if available)
if (result.djotContent) {
    console.log(`Plain text: ${result.djotContent.plain_text}`);
    console.log(`Blocks: ${result.djotContent.blocks}`);
    console.log(`Links: ${result.djotContent.links}`);
    console.log(`Images: ${result.djotContent.images}`);
    console.log(`Footnotes: ${result.djotContent.footnotes}`);
}
```

API Server

Run Kreuzberg as an HTTP API server for integration with external services.

# Start server on default port 8000
kreuzberg serve

# Custom host and port
kreuzberg serve --host 0.0.0.0 --port 9000

# Bind explicitly to localhost on port 8000 (this command sets no CORS flags)
kreuzberg serve --host localhost --port 8000

API Endpoints:

POST /extract - Extract from uploaded file
POST /batch - Batch extraction
POST /detect - Detect MIME type

Example:

curl -X POST -F "file=@document.pdf" http://localhost:8000/extract

MCP Server

Run Kreuzberg as a Model Context Protocol server for integration with Claude and other AI models.

# Start MCP server with stdio transport
kreuzberg mcp --transport stdio

# Start MCP server with HTTP transport
kreuzberg mcp --transport http --host 127.0.0.1 --port 8001

The MCP server exposes extraction functions to AI models, allowing them to process documents directly.

Security Limits

Set resource limits to prevent abuse and control memory/file size consumption.

=== "Python"

```python
from kreuzberg import ExtractionConfig, extract_file_sync

config = ExtractionConfig(
    security_limits={
        "max_file_size": 100_000_000,      # 100 MB
        "max_archive_files": 1000,
        "max_text_length": 10_000_000,     # 10 MB of text
        "max_pages": 10000,
        "max_concurrent_extractions": 4
    }
)

result = extract_file_sync("document.pdf", config=config)
```

=== "TypeScript"

```typescript
import { ExtractionConfig, extractFile } from '@kreuzberg/node';

const config: ExtractionConfig = {
    securityLimits: {
        max_file_size: 100_000_000,        // 100 MB
        max_archive_files: 1000,
        max_text_length: 10_000_000,       // 10 MB of text
        max_pages: 10000,
        max_concurrent_extractions: 4
    }
};

const result = await extractFile("document.pdf", null, config);
```

Common Limits:

max_file_size: Maximum input file size in bytes
max_archive_files: Maximum files in archives (zip, tar, etc.)
max_text_length: Maximum extracted text length
max_pages: Maximum number of pages to process
max_concurrent_extractions: Maximum concurrent extraction operations

Caching

Extraction results are cached by default to improve performance on repeated extractions of identical documents. Control caching behavior through configuration.

=== "Python"

```python
from kreuzberg import ExtractionConfig, extract_file_sync

# Enable caching (default)
config = ExtractionConfig(use_cache=True)
result = extract_file_sync("document.pdf", config=config)

# Disable caching for a specific extraction
config = ExtractionConfig(use_cache=False)
result = extract_file_sync("document.pdf", config=config)
```

=== "TypeScript"

```typescript
import { ExtractionConfig, extractFile } from '@kreuzberg/node';

// Enable caching (default)
const config: ExtractionConfig = { useCache: true };
const result = await extractFile("document.pdf", null, config);

// Disable caching
const config2: ExtractionConfig = { useCache: false };
const result2 = await extractFile("document.pdf", null, config2);
```

CLI Cache Management:

# View cache statistics
kreuzberg cache stats

# Clear all cached results
kreuzberg cache clear

Caching is transparent and automatic: extracting the same input again returns the cached result instantly.

28 KiB Raw Permalink Blame History

Advanced Features Reference

Plugin System

Custom Post-Processors

Custom Validators

Custom OCR Backends

Per-File Configuration in Batch Operations

Embeddings

Keyword Extraction

Language Detection

Token Reduction

Page Extraction

Element-Based Output

Djot Content

API Server

MCP Server

Security Limits

Caching

28 KiB

Raw Permalink Blame History