976 lines
28 KiB
Markdown
976 lines
28 KiB
Markdown
# Advanced Features Reference
|
|
|
|
Kreuzberg provides powerful advanced features for customization, semantic processing, and integration with external systems.
|
|
|
|
## Plugin System
|
|
|
|
The plugin system allows you to extend Kreuzberg's extraction pipeline with custom post-processors, validators, and OCR backends. Plugins run within the extraction pipeline and have direct access to extraction results.
|
|
|
|
### Custom Post-Processors
|
|
|
|
Post-processors enrich extraction results after document parsing. They are fault-tolerant: if a post-processor fails, the error is logged and the extraction still succeeds.
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import register_post_processor, ExtractionResult
|
|
|
|
class MetadataEnricher:
|
|
def name(self) -> str:
|
|
return "metadata_enricher"
|
|
|
|
def process(self, result: ExtractionResult) -> ExtractionResult:
|
|
result.metadata["processed_by"] = "metadata_enricher"
|
|
result.metadata["char_count"] = len(result.content)
|
|
return result
|
|
|
|
def processing_stage(self) -> str:
|
|
# "early", "middle", or "late"
|
|
return "middle"
|
|
|
|
def initialize(self) -> None:
|
|
print("Initializing metadata enricher")
|
|
|
|
def shutdown(self) -> None:
|
|
print("Shutting down metadata enricher")
|
|
|
|
register_post_processor(MetadataEnricher())
|
|
|
|
# Now use extraction with the registered processor
|
|
from kreuzberg import extract_file_sync
|
|
result = extract_file_sync("document.pdf")
|
|
print(result.metadata["char_count"])
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import { registerPostProcessor, ExtractionResult } from '@kreuzberg/node';
|
|
|
|
const enricher = {
|
|
name(): string {
|
|
return "metadata_enricher";
|
|
},
|
|
|
|
async process(result: ExtractionResult): Promise<ExtractionResult> {
|
|
result.metadata.processed_by = "metadata_enricher";
|
|
result.metadata.char_count = result.content.length;
|
|
return result;
|
|
},
|
|
|
|
processingStage?(): "early" | "middle" | "late" {
|
|
return "middle";
|
|
},
|
|
|
|
async initialize?(): Promise<void> {
|
|
console.log("Initializing metadata enricher");
|
|
},
|
|
|
|
async shutdown?(): Promise<void> {
|
|
console.log("Shutting down metadata enricher");
|
|
}
|
|
};
|
|
|
|
registerPostProcessor(enricher);
|
|
|
|
// Now use extraction with the registered processor
|
|
const result = await extractFile("document.pdf");
|
|
console.log(result.metadata.char_count);
|
|
```
|
|
|
|
### Custom Validators
|
|
|
|
Validators perform quality checks on extraction results. Unlike post-processor failures, validator failures cause the entire extraction to fail. Use validators to enforce quality standards.
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import register_validator, ExtractionResult, ValidationError
|
|
|
|
class MinimumContentValidator:
|
|
def name(self) -> str:
|
|
return "min_content_validator"
|
|
|
|
def validate(self, result: ExtractionResult) -> None:
|
|
if len(result.content) < 100:
|
|
raise ValidationError("Extracted content too short (< 100 chars)")
|
|
|
|
def priority(self) -> int:
|
|
# Higher priority runs first (0-1000, default 50)
|
|
return 100
|
|
|
|
def should_validate(self, result: ExtractionResult) -> bool:
|
|
# Only validate PDFs
|
|
return "pdf" in result.mime_type.lower()
|
|
|
|
def initialize(self) -> None:
|
|
pass
|
|
|
|
def shutdown(self) -> None:
|
|
pass
|
|
|
|
register_validator(MinimumContentValidator())
|
|
|
|
# Extraction will fail if content < 100 chars
|
|
result = extract_file_sync("document.pdf")
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import { registerValidator, ExtractionResult } from '@kreuzberg/node';
|
|
|
|
const validator = {
|
|
name(): string {
|
|
return "min_content_validator";
|
|
},
|
|
|
|
async validate(result: ExtractionResult): Promise<void> {
|
|
if (result.content.length < 100) {
|
|
throw new Error("Extracted content too short (< 100 chars)");
|
|
}
|
|
},
|
|
|
|
priority?(): number {
|
|
return 100;
|
|
},
|
|
|
|
shouldValidate?(result: ExtractionResult): boolean {
|
|
return result.mimeType.toLowerCase().includes("pdf");
|
|
},
|
|
|
|
async initialize?(): Promise<void> {},
|
|
|
|
async shutdown?(): Promise<void> {}
|
|
};
|
|
|
|
registerValidator(validator);
|
|
|
|
// Extraction will fail if content < 100 chars
|
|
const result = await extractFile("document.pdf");
|
|
```
|
|
|
|
### Custom OCR Backends
|
|
|
|
Implement custom OCR engines by registering an OCR backend. This allows integration with proprietary or specialized OCR solutions.
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import register_ocr_backend
|
|
|
|
class CustomOcrBackend:
|
|
def name(self) -> str:
|
|
return "custom_ocr"
|
|
|
|
def supported_languages(self) -> list[str]:
|
|
return ["eng", "deu", "fra", "spa"]
|
|
|
|
def process_image(self, image_bytes: bytes, language: str) -> dict:
|
|
# image_bytes: raw image data
|
|
# language: ISO 639-3 code (e.g., "eng", "deu")
|
|
|
|
# Call your OCR engine here
|
|
# text = my_ocr_engine.recognize(image_bytes, language)
|
|
|
|
return {
|
|
"content": "Extracted text from image",
|
|
"metadata": {"confidence": 0.95, "language": language},
|
|
"tables": []
|
|
}
|
|
|
|
def process_file(self, path: str, language: str) -> dict:
|
|
# Optional: custom file processing
|
|
# Called when extracting OCR from a file path
|
|
with open(path, "rb") as f:
|
|
image_bytes = f.read()
|
|
return self.process_image(image_bytes, language)
|
|
|
|
def initialize(self) -> None:
|
|
# Load models, initialize engine
|
|
pass
|
|
|
|
def shutdown(self) -> None:
|
|
# Clean up resources
|
|
pass
|
|
|
|
def version(self) -> str:
|
|
return "1.0.0"
|
|
|
|
register_ocr_backend(CustomOcrBackend())
|
|
|
|
# Use in extraction config
|
|
from kreuzberg import ExtractionConfig, OcrConfig, extract_file_sync
|
|
|
|
config = ExtractionConfig(
|
|
ocr=OcrConfig(backend="custom_ocr", language="eng")
|
|
)
|
|
result = extract_file_sync("scanned.pdf", config=config)
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import { registerOcrBackend, ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
|
|
const backend = {
|
|
name(): string {
|
|
return "custom_ocr";
|
|
},
|
|
|
|
supportedLanguages(): string[] {
|
|
return ["eng", "deu", "fra", "spa"];
|
|
},
|
|
|
|
async processImage(
|
|
imageBytes: Uint8Array | string,
|
|
language: string
|
|
): Promise<{
|
|
content: string;
|
|
mime_type: string;
|
|
metadata: Record<string, unknown>;
|
|
tables: unknown[];
|
|
}> {
|
|
const buffer = typeof imageBytes === "string"
|
|
? Buffer.from(imageBytes, "base64")
|
|
: Buffer.from(imageBytes);
|
|
|
|
// Call your OCR engine
|
|
// const text = await myOcrEngine.recognize(buffer, language);
|
|
|
|
return {
|
|
content: "Extracted text from image",
|
|
mime_type: "text/plain",
|
|
metadata: { confidence: 0.95, language },
|
|
tables: []
|
|
};
|
|
},
|
|
|
|
async initialize?(): Promise<void> {
|
|
// Load models, initialize engine
|
|
},
|
|
|
|
async shutdown?(): Promise<void> {
|
|
// Clean up resources
|
|
}
|
|
};
|
|
|
|
registerOcrBackend(backend);
|
|
|
|
// Use in extraction config
|
|
const config: ExtractionConfig = {
|
|
ocr: { backend: "custom_ocr", language: "eng" }
|
|
};
|
|
const result = await extractFile("scanned.pdf", null, config);
|
|
```
|
|
|
|
## Per-File Configuration in Batch Operations
|
|
|
|
Use `FileExtractionConfig` to override extraction settings for individual files within a batch. This is useful for mixed-format batches where different documents need different OCR, output, or processing settings.
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import (
|
|
batch_extract_files_sync,
|
|
ExtractionConfig, FileExtractionConfig, OcrConfig,
|
|
)
|
|
|
|
config = ExtractionConfig(output_format="markdown")
|
|
paths = ["report.pdf", "scan.tiff"]
|
|
file_configs = [
|
|
None, # use batch defaults
|
|
FileExtractionConfig(
|
|
force_ocr=True,
|
|
ocr=OcrConfig(backend="tesseract", language="deu"),
|
|
),
|
|
]
|
|
results = batch_extract_files_sync(paths, config, file_configs=file_configs)
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import { batchExtractFilesSync } from '@kreuzberg/node';
|
|
|
|
const results = batchExtractFilesSync(
|
|
['report.pdf', 'scan.tiff'],
|
|
{ outputFormat: 'markdown' },
|
|
[null, { forceOcr: true, ocr: { backend: 'tesseract', language: 'deu' } }],
|
|
);
|
|
```
|
|
|
|
All `ExtractionConfig` fields except batch-level concerns (`max_concurrent_extractions`, `use_cache`, `acceleration`, `security_limits`) can be overridden per file. Fields left as `None`/`null` inherit from the batch default, and a `None`/`null` entry in the per-file list uses the batch defaults for that file.
|
|
|
|
## Embeddings
|
|
|
|
Generate vector embeddings for text chunks using ONNX-based models. Embeddings enable semantic search, clustering, and similarity operations on extracted content.
|
|
|
|
**Requirements:** ONNX Runtime 1.22.x or later
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import (
|
|
ExtractionConfig, ChunkingConfig, EmbeddingConfig,
|
|
EmbeddingModelType, list_embedding_presets,
|
|
get_embedding_preset, extract_file_sync
|
|
)
|
|
|
|
# List available embedding presets
|
|
presets = list_embedding_presets()
|
|
print(f"Available presets: {presets}") # ['balanced', 'compact', 'large']
|
|
|
|
# Get details about a preset
|
|
preset_info = get_embedding_preset("balanced")
|
|
print(f"Model: {preset_info.model_name}")
|
|
print(f"Dimensions: {preset_info.dimensions}")
|
|
print(f"Recommended chunk size: {preset_info.chunk_size}")
|
|
|
|
# Method 1: Use preset (recommended)
|
|
config = ExtractionConfig(
|
|
chunking=ChunkingConfig(
|
|
max_chars=512,
|
|
max_overlap=100,
|
|
embedding=EmbeddingConfig(
|
|
model=EmbeddingModelType.preset("balanced"),
|
|
normalize=True,
|
|
batch_size=32
|
|
)
|
|
)
|
|
)
|
|
|
|
# Method 2: Use specific fastembed model
|
|
config = ExtractionConfig(
|
|
chunking=ChunkingConfig(
|
|
embedding=EmbeddingConfig(
|
|
model=EmbeddingModelType.fastembed(
|
|
model="BAAI/bge-small-en-v1.5",
|
|
dimensions=384
|
|
),
|
|
normalize=True
|
|
)
|
|
)
|
|
)
|
|
|
|
# Method 3: Use custom ONNX model from HuggingFace
|
|
config = ExtractionConfig(
|
|
chunking=ChunkingConfig(
|
|
embedding=EmbeddingConfig(
|
|
model=EmbeddingModelType.custom(
|
|
model_id="sentence-transformers/all-MiniLM-L6-v2",
|
|
dimensions=384
|
|
),
|
|
cache_dir="/path/to/model/cache"
|
|
)
|
|
)
|
|
)
|
|
|
|
result = extract_file_sync("document.pdf", config=config)
|
|
|
|
# Access embeddings in chunks
|
|
for chunk in result.chunks:
|
|
embedding = chunk.embedding # list[float] or None
|
|
print(f"Chunk: {chunk.content[:50]}...")
|
|
print(f"Embedding dimensions: {len(embedding) if embedding else 0}")
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import {
|
|
ExtractionConfig, ChunkingConfig,
|
|
listEmbeddingPresets, getEmbeddingPreset,
|
|
extractFile
|
|
} from '@kreuzberg/node';
|
|
|
|
// List available embedding presets
|
|
const presets = listEmbeddingPresets();
|
|
console.log(`Available presets: ${presets}`); // ['balanced', 'compact', 'large']
|
|
|
|
// Get details about a preset
|
|
const preset = getEmbeddingPreset("balanced");
|
|
console.log(`Model: ${preset.modelName}`);
|
|
console.log(`Dimensions: ${preset.dimensions}`);
|
|
console.log(`Recommended chunk size: ${preset.chunkSize}`);
|
|
|
|
// Method 1: Use preset (recommended)
|
|
const config: ExtractionConfig = {
|
|
chunking: {
|
|
maxChars: 512,
|
|
maxOverlap: 100,
|
|
embedding: {
|
|
model: { type: 'preset', name: 'balanced' },
|
|
normalize: true,
|
|
batchSize: 32
|
|
}
|
|
}
|
|
};
|
|
|
|
// Method 2: Use specific fastembed model
|
|
const config2: ExtractionConfig = {
|
|
chunking: {
|
|
embedding: {
|
|
model: {
|
|
type: 'fastembed',
|
|
model: 'BAAI/bge-small-en-v1.5',
|
|
dimensions: 384
|
|
},
|
|
normalize: true
|
|
}
|
|
}
|
|
};
|
|
|
|
// Method 3: Use custom ONNX model
|
|
const config3: ExtractionConfig = {
|
|
chunking: {
|
|
embedding: {
|
|
model: {
|
|
type: 'custom',
|
|
modelId: 'sentence-transformers/all-MiniLM-L6-v2',
|
|
dimensions: 384
|
|
},
|
|
cacheDir: '/path/to/model/cache'
|
|
}
|
|
}
|
|
};
|
|
|
|
const result = await extractFile("document.pdf", null, config);
|
|
|
|
// Access embeddings in chunks
|
|
if (result.chunks) {
|
|
for (const chunk of result.chunks) {
|
|
const embedding = chunk.embedding; // number[] | null
|
|
console.log(`Chunk: ${chunk.content.substring(0, 50)}...`);
|
|
console.log(`Embedding dimensions: ${embedding?.length ?? 0}`);
|
|
}
|
|
}
|
|
```
|
|
|
|
## Keyword Extraction
|
|
|
|
Extract important keywords and phrases from documents using either the YAKE (Yet Another Keyword Extractor) or the RAKE (Rapid Automatic Keyword Extraction) algorithm.
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import (
|
|
ExtractionConfig, KeywordConfig, KeywordAlgorithm,
|
|
YakeParams, RakeParams, extract_file_sync
|
|
)
|
|
|
|
# YAKE algorithm (unsupervised, good for general use)
|
|
config = ExtractionConfig(
|
|
keywords=KeywordConfig(
|
|
algorithm=KeywordAlgorithm.Yake,
|
|
max_keywords=15,
|
|
min_score=0.1,
|
|
ngram_range=(1, 3),
|
|
language="en",
|
|
yake_params=YakeParams(window_size=2)
|
|
)
|
|
)
|
|
|
|
# RAKE algorithm (co-occurrence based)
|
|
config = ExtractionConfig(
|
|
keywords=KeywordConfig(
|
|
algorithm=KeywordAlgorithm.Rake,
|
|
max_keywords=10,
|
|
min_score=0.0,
|
|
language="en",
|
|
rake_params=RakeParams(
|
|
min_word_length=3,
|
|
max_words_per_phrase=3
|
|
)
|
|
)
|
|
)
|
|
|
|
result = extract_file_sync("document.pdf", config=config)
|
|
|
|
# Access extracted keywords
|
|
if result.keywords:
|
|
for keyword in result.keywords:
|
|
print(f"Text: {keyword.text}")
|
|
print(f"Score: {keyword.score}")
|
|
print(f"Algorithm: {keyword.algorithm}")
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import {
|
|
ExtractionConfig, KeywordConfig,
|
|
extractFile
|
|
} from '@kreuzberg/node';
|
|
|
|
// YAKE algorithm
|
|
const config: ExtractionConfig = {
|
|
keywords: {
|
|
algorithm: "yake",
|
|
maxKeywords: 15,
|
|
minScore: 0.1,
|
|
ngramRange: [1, 3],
|
|
language: "en",
|
|
yakeParams: {
|
|
windowSize: 2
|
|
}
|
|
}
|
|
};
|
|
|
|
// RAKE algorithm
|
|
const config2: ExtractionConfig = {
|
|
keywords: {
|
|
algorithm: "rake",
|
|
maxKeywords: 10,
|
|
minScore: 0.0,
|
|
language: "en",
|
|
rakeParams: {
|
|
minWordLength: 3,
|
|
maxWordsPerPhrase: 3
|
|
}
|
|
}
|
|
};
|
|
|
|
const result = await extractFile("document.pdf", null, config);
|
|
|
|
// Access extracted keywords
|
|
if (result.keywords) {
|
|
for (const keyword of result.keywords) {
|
|
console.log(`Text: ${keyword.text}`);
|
|
console.log(`Score: ${keyword.score}`);
|
|
console.log(`Algorithm: ${keyword.algorithm}`);
|
|
}
|
|
}
|
|
```
|
|
|
|
## Language Detection
|
|
|
|
Automatically detect the language(s) of a document. Detected languages are reported as ISO 639-1 codes (for example, `en`, `de`, `fr`).
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import (
|
|
ExtractionConfig, LanguageDetectionConfig,
|
|
extract_file_sync
|
|
)
|
|
|
|
# Enable language detection
|
|
config = ExtractionConfig(
|
|
language_detection=LanguageDetectionConfig(
|
|
enabled=True,
|
|
min_confidence=0.8,
|
|
detect_multiple=False
|
|
)
|
|
)
|
|
|
|
result = extract_file_sync("multilingual.pdf", config=config)
|
|
|
|
# Access detected languages
|
|
if result.detected_languages:
|
|
for lang_code in result.detected_languages:
|
|
print(f"Detected language: {lang_code}") # e.g., "en", "de", "fr"
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import {
|
|
ExtractionConfig, LanguageDetectionConfig,
|
|
extractFile
|
|
} from '@kreuzberg/node';
|
|
|
|
const config: ExtractionConfig = {
|
|
languageDetection: {
|
|
enabled: true,
|
|
minConfidence: 0.8,
|
|
detectMultiple: false
|
|
}
|
|
};
|
|
|
|
const result = await extractFile("multilingual.pdf", null, config);
|
|
|
|
// Access detected languages
|
|
if (result.detectedLanguages) {
|
|
for (const langCode of result.detectedLanguages) {
|
|
console.log(`Detected language: ${langCode}`); // e.g., "en", "de", "fr"
|
|
}
|
|
}
|
|
```
|
|
|
|
## Token Reduction
|
|
|
|
Reduce the number of tokens in extracted content to lower costs when working with LLM APIs. More aggressive modes remove more tokens but may also lose more information.
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import (
|
|
ExtractionConfig, TokenReductionConfig,
|
|
extract_file_sync
|
|
)
|
|
|
|
# Light token reduction
|
|
config = ExtractionConfig(
|
|
token_reduction=TokenReductionConfig(
|
|
mode="light",
|
|
preserve_important_words=True
|
|
)
|
|
)
|
|
|
|
# Moderate reduction
|
|
config = ExtractionConfig(
|
|
token_reduction=TokenReductionConfig(
|
|
mode="moderate",
|
|
preserve_important_words=True
|
|
)
|
|
)
|
|
|
|
# Aggressive reduction
|
|
config = ExtractionConfig(
|
|
token_reduction=TokenReductionConfig(
|
|
mode="aggressive",
|
|
preserve_important_words=True
|
|
)
|
|
)
|
|
|
|
# Maximum reduction
|
|
config = ExtractionConfig(
|
|
token_reduction=TokenReductionConfig(
|
|
mode="maximum",
|
|
preserve_important_words=True
|
|
)
|
|
)
|
|
|
|
result = extract_file_sync("document.pdf", config=config)
|
|
print(f"Reduced content length: {len(result.content)}")
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import {
|
|
ExtractionConfig, TokenReductionConfig,
|
|
extractFile
|
|
} from '@kreuzberg/node';
|
|
|
|
const config: ExtractionConfig = {
|
|
tokenReduction: {
|
|
mode: "moderate",
|
|
preserveImportantWords: true
|
|
}
|
|
};
|
|
|
|
const result = await extractFile("document.pdf", null, config);
|
|
console.log(`Reduced content length: ${result.content.length}`);
|
|
```
|
|
|
|
**Token Reduction Modes:**
|
|
|
|
- `off`: No reduction (default)
|
|
- `light`: Remove extra whitespace and redundant punctuation
|
|
- `moderate`: Also remove common filler words and some formatting
|
|
- `aggressive`: Also remove longer stopwords and collapse similar phrases
|
|
- `maximum`: Maximum reduction while preserving semantic content
|
|
|
|
## Page Extraction
|
|
|
|
Extract and track per-page content separately. Useful for multi-page documents where you need page-level granularity.
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import (
|
|
ExtractionConfig, PageConfig,
|
|
extract_file_sync
|
|
)
|
|
|
|
config = ExtractionConfig(
|
|
pages=PageConfig(
|
|
extract_pages=True,
|
|
insert_page_markers=True,
|
|
marker_format="\n\n<!-- PAGE {page_num} -->\n\n"
|
|
)
|
|
)
|
|
|
|
result = extract_file_sync("multi_page.pdf", config=config)
|
|
|
|
# Access per-page content
|
|
if result.pages:
|
|
for page in result.pages:
|
|
print(f"Page {page.page_number}:")
|
|
print(f"Content: {page.content[:100]}...")
|
|
print(f"Tables: {len(page.tables)}")
|
|
print(f"Images: {len(page.images)}")
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import {
|
|
ExtractionConfig, PageExtractionConfig,
|
|
extractFile
|
|
} from '@kreuzberg/node';
|
|
|
|
const config: ExtractionConfig = {
|
|
pages: {
|
|
extractPages: true,
|
|
insertPageMarkers: true,
|
|
markerFormat: "\n\n<!-- PAGE {page_num} -->\n\n"
|
|
}
|
|
};
|
|
|
|
const result = await extractFile("multi_page.pdf", null, config);
|
|
|
|
// Access per-page content
|
|
if (result.pages) {
|
|
for (const page of result.pages) {
|
|
console.log(`Page ${page.pageNumber}:`);
|
|
console.log(`Content: ${page.content.substring(0, 100)}...`);
|
|
console.log(`Tables: ${page.tables.length}`);
|
|
console.log(`Images: ${page.images.length}`);
|
|
}
|
|
}
|
|
```
|
|
|
|
## Element-Based Output
|
|
|
|
Extract semantic elements instead of unified content. This format is compatible with the Unstructured library and provides structured access to different content types (titles, headings, text, tables, images, etc.).
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import ExtractionConfig, ResultFormat, extract_file_sync
|
|
|
|
config = ExtractionConfig(
|
|
result_format="element_based"
|
|
)
|
|
|
|
result = extract_file_sync("document.pdf", config=config)
|
|
|
|
# Access semantic elements
|
|
if result.elements:
|
|
for element in result.elements:
|
|
print(f"Type: {element.element_type}") # title, heading, narrative_text, etc.
|
|
print(f"Text: {element.text}")
|
|
if element.metadata.get("page_number"):
|
|
print(f"Page: {element.metadata['page_number']}")
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
|
|
const config: ExtractionConfig = {
|
|
resultFormat: "element_based"
|
|
};
|
|
|
|
const result = await extractFile("document.pdf", null, config);
|
|
|
|
// Access semantic elements
|
|
if (result.elements) {
|
|
for (const element of result.elements) {
|
|
console.log(`Type: ${element.elementType}`);
|
|
console.log(`Text: ${element.text}`);
|
|
if (element.metadata.pageNumber) {
|
|
console.log(`Page: ${element.metadata.pageNumber}`);
|
|
}
|
|
}
|
|
}
|
|
```
|
|
|
|
**Element Types:**
|
|
|
|
- `title`: Document or section title
|
|
- `heading`: Section headings
|
|
- `narrative_text`: Regular paragraph text
|
|
- `list_item`: Items in bullet/numbered lists
|
|
- `table`: Table structures
|
|
- `image`: Images or figures
|
|
- `page_break`: Page boundaries
|
|
- `code_block`: Code snippets
|
|
- `block_quote`: Quoted text
|
|
- `footer`: Footer content
|
|
- `header`: Header content
|
|
|
|
## Djot Content
|
|
|
|
Output extracted content in Djot markup format (a lighter alternative to Markdown with enhanced structure).
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import ExtractionConfig, OutputFormat, extract_file_sync
|
|
|
|
config = ExtractionConfig(
|
|
output_format="djot"
|
|
)
|
|
|
|
result = extract_file_sync("document.pdf", config=config)
|
|
print(result.content) # Djot-formatted content
|
|
|
|
# Access structured Djot content
|
|
if result.djot_content:
|
|
print(f"Plain text: {result.djot_content['plain_text']}")
|
|
print(f"Blocks: {result.djot_content['blocks']}")
|
|
print(f"Links: {result.djot_content['links']}")
|
|
print(f"Images: {result.djot_content['images']}")
|
|
print(f"Footnotes: {result.djot_content['footnotes']}")
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
|
|
const config: ExtractionConfig = {
|
|
outputFormat: "djot"
|
|
};
|
|
|
|
const result = await extractFile("document.pdf", null, config);
|
|
console.log(result.content); // Djot-formatted content
|
|
|
|
// Access structured Djot content (if available)
|
|
if (result.djotContent) {
|
|
console.log(`Plain text: ${result.djotContent.plain_text}`);
|
|
console.log(`Blocks: ${result.djotContent.blocks}`);
|
|
console.log(`Links: ${result.djotContent.links}`);
|
|
console.log(`Images: ${result.djotContent.images}`);
|
|
console.log(`Footnotes: ${result.djotContent.footnotes}`);
|
|
}
|
|
```
|
|
|
|
## API Server
|
|
|
|
Run Kreuzberg as an HTTP API server for integration with external services.
|
|
|
|
```bash
|
|
# Start server on default port 8000
|
|
kreuzberg serve
|
|
|
|
# Custom host and port
|
|
kreuzberg serve --host 0.0.0.0 --port 9000
|
|
|
|
# Enable CORS and other options
|
|
kreuzberg serve --host localhost --port 8000
|
|
```
|
|
|
|
**API Endpoints:**
|
|
|
|
- `POST /extract` - Extract from uploaded file
|
|
- `POST /batch` - Batch extraction
|
|
- `POST /detect` - Detect MIME type
|
|
|
|
**Example:**
|
|
|
|
```bash
|
|
curl -X POST -F "file=@document.pdf" http://localhost:8000/extract
|
|
```
|
|
|
|
## MCP Server
|
|
|
|
Run Kreuzberg as a Model Context Protocol server for integration with Claude and other AI models.
|
|
|
|
```bash
|
|
# Start MCP server with stdio transport
|
|
kreuzberg mcp --transport stdio
|
|
|
|
# Start MCP server with HTTP transport
|
|
kreuzberg mcp --transport http --host 127.0.0.1 --port 8001
|
|
```
|
|
|
|
The MCP server exposes extraction functions to AI models, allowing them to process documents directly.
|
|
|
|
## Security Limits
|
|
|
|
Set resource limits to prevent abuse and to bound memory use and input size.
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import ExtractionConfig, extract_file_sync
|
|
|
|
config = ExtractionConfig(
|
|
security_limits={
|
|
"max_file_size": 100_000_000, # 100 MB
|
|
"max_archive_files": 1000,
|
|
"max_text_length": 10_000_000, # 10 MB of text
|
|
"max_pages": 10000,
|
|
"max_concurrent_extractions": 4
|
|
}
|
|
)
|
|
|
|
result = extract_file_sync("document.pdf", config=config)
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
|
|
const config: ExtractionConfig = {
|
|
securityLimits: {
|
|
max_file_size: 100_000_000, // 100 MB
|
|
max_archive_files: 1000,
|
|
max_text_length: 10_000_000, // 10 MB of text
|
|
max_pages: 10000,
|
|
max_concurrent_extractions: 4
|
|
}
|
|
};
|
|
|
|
const result = await extractFile("document.pdf", null, config);
|
|
```
|
|
|
|
**Common Limits:**
|
|
|
|
- `max_file_size`: Maximum input file size in bytes
|
|
- `max_archive_files`: Maximum files in archives (zip, tar, etc.)
|
|
- `max_text_length`: Maximum extracted text length
|
|
- `max_pages`: Maximum number of pages to process
|
|
- `max_concurrent_extractions`: Maximum concurrent extraction operations
|
|
|
|
## Caching
|
|
|
|
Extraction results are cached by default to improve performance on repeated extractions of identical documents. Control caching behavior through configuration.
|
|
|
|
=== "Python"
|
|
|
|
```python
|
|
from kreuzberg import ExtractionConfig, extract_file_sync
|
|
|
|
# Enable caching (default)
|
|
config = ExtractionConfig(use_cache=True)
|
|
result = extract_file_sync("document.pdf", config=config)
|
|
|
|
# Disable caching for a specific extraction
|
|
config = ExtractionConfig(use_cache=False)
|
|
result = extract_file_sync("document.pdf", config=config)
|
|
```
|
|
|
|
=== "TypeScript"
|
|
|
|
```typescript
|
|
import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
|
|
|
// Enable caching (default)
|
|
const config: ExtractionConfig = { useCache: true };
|
|
const result = await extractFile("document.pdf", null, config);
|
|
|
|
// Disable caching
|
|
const config2: ExtractionConfig = { useCache: false };
|
|
const result2 = await extractFile("document.pdf", null, config2);
|
|
```
|
|
|
|
**CLI Cache Management:**
|
|
|
|
```bash
|
|
# View cache statistics
|
|
kreuzberg cache stats
|
|
|
|
# Clear all cached results
|
|
kreuzberg cache clear
|
|
```
|
|
|
|
Caching is transparent and automatic—extracting the same input again returns the cached result instead of re-processing the document.
|