This commit is contained in:
975
skills/kreuzberg/references/advanced-features.md
Normal file
975
skills/kreuzberg/references/advanced-features.md
Normal file
@@ -0,0 +1,975 @@
|
||||
# Advanced Features Reference
|
||||
|
||||
Kreuzberg provides powerful advanced features for customization, semantic processing, and integration with external systems.
|
||||
|
||||
## Plugin System
|
||||
|
||||
The plugin system allows you to extend Kreuzberg's extraction pipeline with custom post-processors, validators, and OCR backends. Plugins run within the extraction pipeline and have direct access to extraction results.
|
||||
|
||||
### Custom Post-Processors
|
||||
|
||||
Post-processors enrich extraction results after document parsing. They run non-destructively—if a post-processor fails, the extraction succeeds anyway (errors are logged).
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import register_post_processor, ExtractionResult
|
||||
|
||||
class MetadataEnricher:
|
||||
def name(self) -> str:
|
||||
return "metadata_enricher"
|
||||
|
||||
def process(self, result: ExtractionResult) -> ExtractionResult:
|
||||
result.metadata["processed_by"] = "metadata_enricher"
|
||||
result.metadata["char_count"] = len(result.content)
|
||||
return result
|
||||
|
||||
def processing_stage(self) -> str:
|
||||
# "early", "middle", or "late"
|
||||
return "middle"
|
||||
|
||||
def initialize(self) -> None:
|
||||
print("Initializing metadata enricher")
|
||||
|
||||
def shutdown(self) -> None:
|
||||
print("Shutting down metadata enricher")
|
||||
|
||||
register_post_processor(MetadataEnricher())
|
||||
|
||||
# Now use extraction with the registered processor
|
||||
from kreuzberg import extract_file_sync
|
||||
result = extract_file_sync("document.pdf")
|
||||
print(result.metadata["char_count"])
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import { registerPostProcessor, ExtractionResult, extractFile } from '@kreuzberg/node';
|
||||
|
||||
const enricher = {
|
||||
name(): string {
|
||||
return "metadata_enricher";
|
||||
},
|
||||
|
||||
async process(result: ExtractionResult): Promise<ExtractionResult> {
|
||||
result.metadata.processed_by = "metadata_enricher";
|
||||
result.metadata.char_count = result.content.length;
|
||||
return result;
|
||||
},
|
||||
|
||||
processingStage?(): "early" | "middle" | "late" {
|
||||
return "middle";
|
||||
},
|
||||
|
||||
async initialize?(): Promise<void> {
|
||||
console.log("Initializing metadata enricher");
|
||||
},
|
||||
|
||||
async shutdown?(): Promise<void> {
|
||||
console.log("Shutting down metadata enricher");
|
||||
}
|
||||
};
|
||||
|
||||
registerPostProcessor(enricher);
|
||||
|
||||
// Now use extraction with the registered processor
|
||||
const result = await extractFile("document.pdf");
|
||||
console.log(result.metadata.char_count);
|
||||
```
|
||||
|
||||
### Custom Validators
|
||||
|
||||
Validators perform quality checks on extraction results. Unlike a post-processor failure, which is only logged, a validator failure causes the entire extraction to fail. Use validators to enforce quality standards.
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import register_validator, ExtractionResult, ValidationError, extract_file_sync
|
||||
|
||||
class MinimumContentValidator:
|
||||
def name(self) -> str:
|
||||
return "min_content_validator"
|
||||
|
||||
def validate(self, result: ExtractionResult) -> None:
|
||||
if len(result.content) < 100:
|
||||
raise ValidationError("Extracted content too short (< 100 chars)")
|
||||
|
||||
def priority(self) -> int:
|
||||
# Higher priority runs first (0-1000, default 50)
|
||||
return 100
|
||||
|
||||
def should_validate(self, result: ExtractionResult) -> bool:
|
||||
# Only validate PDFs
|
||||
return "pdf" in result.mime_type.lower()
|
||||
|
||||
def initialize(self) -> None:
|
||||
pass
|
||||
|
||||
def shutdown(self) -> None:
|
||||
pass
|
||||
|
||||
register_validator(MinimumContentValidator())
|
||||
|
||||
# Extraction will fail if content < 100 chars
|
||||
result = extract_file_sync("document.pdf")
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import { registerValidator, ExtractionResult, extractFile } from '@kreuzberg/node';
|
||||
|
||||
const validator = {
|
||||
name(): string {
|
||||
return "min_content_validator";
|
||||
},
|
||||
|
||||
async validate(result: ExtractionResult): Promise<void> {
|
||||
if (result.content.length < 100) {
|
||||
throw new Error("Extracted content too short (< 100 chars)");
|
||||
}
|
||||
},
|
||||
|
||||
priority?(): number {
|
||||
return 100;
|
||||
},
|
||||
|
||||
shouldValidate?(result: ExtractionResult): boolean {
|
||||
return result.mimeType.toLowerCase().includes("pdf");
|
||||
},
|
||||
|
||||
async initialize?(): Promise<void> {},
|
||||
|
||||
async shutdown?(): Promise<void> {}
|
||||
};
|
||||
|
||||
registerValidator(validator);
|
||||
|
||||
// Extraction will fail if content < 100 chars
|
||||
const result = await extractFile("document.pdf");
|
||||
```
|
||||
|
||||
### Custom OCR Backends
|
||||
|
||||
Implement custom OCR engines by registering an OCR backend. This allows integration with proprietary or specialized OCR solutions.
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import register_ocr_backend
|
||||
|
||||
class CustomOcrBackend:
|
||||
def name(self) -> str:
|
||||
return "custom_ocr"
|
||||
|
||||
def supported_languages(self) -> list[str]:
|
||||
return ["eng", "deu", "fra", "spa"]
|
||||
|
||||
def process_image(self, image_bytes: bytes, language: str) -> dict:
|
||||
# image_bytes: raw image data
|
||||
# language: ISO 639-3 code (e.g., "eng", "deu")
|
||||
|
||||
# Call your OCR engine here
|
||||
# text = my_ocr_engine.recognize(image_bytes, language)
|
||||
|
||||
return {
|
||||
"content": "Extracted text from image",
|
||||
"metadata": {"confidence": 0.95, "language": language},
|
||||
"tables": []
|
||||
}
|
||||
|
||||
def process_file(self, path: str, language: str) -> dict:
|
||||
# Optional: custom file processing
|
||||
# Called when extracting OCR from a file path
|
||||
with open(path, "rb") as f:
|
||||
image_bytes = f.read()
|
||||
return self.process_image(image_bytes, language)
|
||||
|
||||
def initialize(self) -> None:
|
||||
# Load models, initialize engine
|
||||
pass
|
||||
|
||||
def shutdown(self) -> None:
|
||||
# Clean up resources
|
||||
pass
|
||||
|
||||
def version(self) -> str:
|
||||
return "1.0.0"
|
||||
|
||||
register_ocr_backend(CustomOcrBackend())
|
||||
|
||||
# Use in extraction config
|
||||
from kreuzberg import ExtractionConfig, OcrConfig, extract_file_sync
|
||||
|
||||
config = ExtractionConfig(
|
||||
ocr=OcrConfig(backend="custom_ocr", language="eng")
|
||||
)
|
||||
result = extract_file_sync("scanned.pdf", config=config)
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import { registerOcrBackend, ExtractionConfig, extractFile } from '@kreuzberg/node';
|
||||
|
||||
const backend = {
|
||||
name(): string {
|
||||
return "custom_ocr";
|
||||
},
|
||||
|
||||
supportedLanguages(): string[] {
|
||||
return ["eng", "deu", "fra", "spa"];
|
||||
},
|
||||
|
||||
async processImage(
|
||||
imageBytes: Uint8Array | string,
|
||||
language: string
|
||||
): Promise<{
|
||||
content: string;
|
||||
mime_type: string;
|
||||
metadata: Record<string, unknown>;
|
||||
tables: unknown[];
|
||||
}> {
|
||||
const buffer = typeof imageBytes === "string"
|
||||
? Buffer.from(imageBytes, "base64")
|
||||
: Buffer.from(imageBytes);
|
||||
|
||||
// Call your OCR engine
|
||||
// const text = await myOcrEngine.recognize(buffer, language);
|
||||
|
||||
return {
|
||||
content: "Extracted text from image",
|
||||
mime_type: "text/plain",
|
||||
metadata: { confidence: 0.95, language },
|
||||
tables: []
|
||||
};
|
||||
},
|
||||
|
||||
async initialize?(): Promise<void> {
|
||||
// Load models, initialize engine
|
||||
},
|
||||
|
||||
async shutdown?(): Promise<void> {
|
||||
// Clean up resources
|
||||
}
|
||||
};
|
||||
|
||||
registerOcrBackend(backend);
|
||||
|
||||
// Use in extraction config
|
||||
const config: ExtractionConfig = {
|
||||
ocr: { backend: "custom_ocr", language: "eng" }
|
||||
};
|
||||
const result = await extractFile("scanned.pdf", null, config);
|
||||
```
|
||||
|
||||
## Per-File Configuration in Batch Operations
|
||||
|
||||
Use `FileExtractionConfig` to override extraction settings for individual files within a batch. This is useful for mixed-format batches where different documents need different OCR, output, or processing settings.
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import (
|
||||
batch_extract_files_sync,
|
||||
ExtractionConfig, FileExtractionConfig, OcrConfig,
|
||||
)
|
||||
|
||||
config = ExtractionConfig(output_format="markdown")
|
||||
paths = ["report.pdf", "scan.tiff"]
|
||||
file_configs = [
|
||||
None, # use batch defaults
|
||||
FileExtractionConfig(
|
||||
force_ocr=True,
|
||||
ocr=OcrConfig(backend="tesseract", language="deu"),
|
||||
),
|
||||
]
|
||||
results = batch_extract_files_sync(paths, config, file_configs=file_configs)
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import { batchExtractFilesSync } from '@kreuzberg/node';
|
||||
|
||||
const results = batchExtractFilesSync(
|
||||
['report.pdf', 'scan.tiff'],
|
||||
{ outputFormat: 'markdown' },
|
||||
[null, { forceOcr: true, ocr: { backend: 'tesseract', language: 'deu' } }],
|
||||
);
|
||||
```
|
||||
|
||||
All `ExtractionConfig` fields except batch-level concerns (`max_concurrent_extractions`, `use_cache`, `acceleration`, `security_limits`) can be overridden. `None`/`null` fields inherit from the batch default.
|
||||
|
||||
## Embeddings
|
||||
|
||||
Generate vector embeddings for text chunks using ONNX-based models. Embeddings enable semantic search, clustering, and similarity operations on extracted content.
|
||||
|
||||
**Requirements:** ONNX Runtime 1.22.x or later
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import (
|
||||
ExtractionConfig, ChunkingConfig, EmbeddingConfig,
|
||||
EmbeddingModelType, list_embedding_presets,
|
||||
get_embedding_preset, extract_file_sync
|
||||
)
|
||||
|
||||
# List available embedding presets
|
||||
presets = list_embedding_presets()
|
||||
print(f"Available presets: {presets}") # ['balanced', 'compact', 'large']
|
||||
|
||||
# Get details about a preset
|
||||
preset_info = get_embedding_preset("balanced")
|
||||
print(f"Model: {preset_info.model_name}")
|
||||
print(f"Dimensions: {preset_info.dimensions}")
|
||||
print(f"Recommended chunk size: {preset_info.chunk_size}")
|
||||
|
||||
# Method 1: Use preset (recommended)
|
||||
config = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
max_chars=512,
|
||||
max_overlap=100,
|
||||
embedding=EmbeddingConfig(
|
||||
model=EmbeddingModelType.preset("balanced"),
|
||||
normalize=True,
|
||||
batch_size=32
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Method 2: Use specific fastembed model
|
||||
config = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
embedding=EmbeddingConfig(
|
||||
model=EmbeddingModelType.fastembed(
|
||||
model="BAAI/bge-small-en-v1.5",
|
||||
dimensions=384
|
||||
),
|
||||
normalize=True
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Method 3: Use custom ONNX model from HuggingFace
|
||||
config = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
embedding=EmbeddingConfig(
|
||||
model=EmbeddingModelType.custom(
|
||||
model_id="sentence-transformers/all-MiniLM-L6-v2",
|
||||
dimensions=384
|
||||
),
|
||||
cache_dir="/path/to/model/cache"
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
# Access embeddings in chunks
|
||||
for chunk in result.chunks:
|
||||
embedding = chunk.embedding # list[float] or None
|
||||
print(f"Chunk: {chunk.content[:50]}...")
|
||||
print(f"Embedding dimensions: {len(embedding) if embedding else 0}")
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import {
|
||||
ExtractionConfig, ChunkingConfig,
|
||||
listEmbeddingPresets, getEmbeddingPreset,
|
||||
extractFile
|
||||
} from '@kreuzberg/node';
|
||||
|
||||
// List available embedding presets
|
||||
const presets = listEmbeddingPresets();
|
||||
console.log(`Available presets: ${presets}`); // ['balanced', 'compact', 'large']
|
||||
|
||||
// Get details about a preset
|
||||
const preset = getEmbeddingPreset("balanced");
|
||||
console.log(`Model: ${preset.modelName}`);
|
||||
console.log(`Dimensions: ${preset.dimensions}`);
|
||||
console.log(`Recommended chunk size: ${preset.chunkSize}`);
|
||||
|
||||
// Method 1: Use preset (recommended)
|
||||
const config: ExtractionConfig = {
|
||||
chunking: {
|
||||
maxChars: 512,
|
||||
maxOverlap: 100,
|
||||
embedding: {
|
||||
model: { type: 'preset', name: 'balanced' },
|
||||
normalize: true,
|
||||
batchSize: 32
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Method 2: Use specific fastembed model
|
||||
const config2: ExtractionConfig = {
|
||||
chunking: {
|
||||
embedding: {
|
||||
model: {
|
||||
type: 'fastembed',
|
||||
model: 'BAAI/bge-small-en-v1.5',
|
||||
dimensions: 384
|
||||
},
|
||||
normalize: true
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Method 3: Use custom ONNX model
|
||||
const config3: ExtractionConfig = {
|
||||
chunking: {
|
||||
embedding: {
|
||||
model: {
|
||||
type: 'custom',
|
||||
modelId: 'sentence-transformers/all-MiniLM-L6-v2',
|
||||
dimensions: 384
|
||||
},
|
||||
cacheDir: '/path/to/model/cache'
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
|
||||
// Access embeddings in chunks
|
||||
if (result.chunks) {
|
||||
for (const chunk of result.chunks) {
|
||||
const embedding = chunk.embedding; // number[] | null
|
||||
console.log(`Chunk: ${chunk.content.substring(0, 50)}...`);
|
||||
console.log(`Embedding dimensions: ${embedding?.length ?? 0}`);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Keyword Extraction
|
||||
|
||||
Extract important keywords and phrases from documents using YAKE (Yet Another Keyword Extractor) or RAKE (Rapid Automatic Keyword Extraction) algorithms.
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import (
|
||||
ExtractionConfig, KeywordConfig, KeywordAlgorithm,
|
||||
YakeParams, RakeParams, extract_file_sync
|
||||
)
|
||||
|
||||
# YAKE algorithm (unsupervised, good for general use)
|
||||
config = ExtractionConfig(
|
||||
keywords=KeywordConfig(
|
||||
algorithm=KeywordAlgorithm.Yake,
|
||||
max_keywords=15,
|
||||
min_score=0.1,
|
||||
ngram_range=(1, 3),
|
||||
language="en",
|
||||
yake_params=YakeParams(window_size=2)
|
||||
)
|
||||
)
|
||||
|
||||
# RAKE algorithm (co-occurrence based)
|
||||
config = ExtractionConfig(
|
||||
keywords=KeywordConfig(
|
||||
algorithm=KeywordAlgorithm.Rake,
|
||||
max_keywords=10,
|
||||
min_score=0.0,
|
||||
language="en",
|
||||
rake_params=RakeParams(
|
||||
min_word_length=3,
|
||||
max_words_per_phrase=3
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
# Access extracted keywords
|
||||
if result.keywords:
|
||||
for keyword in result.keywords:
|
||||
print(f"Text: {keyword.text}")
|
||||
print(f"Score: {keyword.score}")
|
||||
print(f"Algorithm: {keyword.algorithm}")
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import {
|
||||
ExtractionConfig, KeywordConfig,
|
||||
extractFile
|
||||
} from '@kreuzberg/node';
|
||||
|
||||
// YAKE algorithm
|
||||
const config: ExtractionConfig = {
|
||||
keywords: {
|
||||
algorithm: "yake",
|
||||
maxKeywords: 15,
|
||||
minScore: 0.1,
|
||||
ngramRange: [1, 3],
|
||||
language: "en",
|
||||
yakeParams: {
|
||||
windowSize: 2
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// RAKE algorithm
|
||||
const config2: ExtractionConfig = {
|
||||
keywords: {
|
||||
algorithm: "rake",
|
||||
maxKeywords: 10,
|
||||
minScore: 0.0,
|
||||
language: "en",
|
||||
rakeParams: {
|
||||
minWordLength: 3,
|
||||
maxWordsPerPhrase: 3
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
|
||||
// Access extracted keywords
|
||||
if (result.keywords) {
|
||||
for (const keyword of result.keywords) {
|
||||
console.log(`Text: ${keyword.text}`);
|
||||
console.log(`Score: ${keyword.score}`);
|
||||
console.log(`Algorithm: ${keyword.algorithm}`);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Language Detection
|
||||
|
||||
Automatically detect the language(s) of a document. Detected languages are reported as ISO 639-1 codes (e.g., `en`, `de`, `fr`).
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import (
|
||||
ExtractionConfig, LanguageDetectionConfig,
|
||||
extract_file_sync
|
||||
)
|
||||
|
||||
# Enable language detection
|
||||
config = ExtractionConfig(
|
||||
language_detection=LanguageDetectionConfig(
|
||||
enabled=True,
|
||||
min_confidence=0.8,
|
||||
detect_multiple=False
|
||||
)
|
||||
)
|
||||
|
||||
result = extract_file_sync("multilingual.pdf", config=config)
|
||||
|
||||
# Access detected languages
|
||||
if result.detected_languages:
|
||||
for lang_code in result.detected_languages:
|
||||
print(f"Detected language: {lang_code}") # e.g., "en", "de", "fr"
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import {
|
||||
ExtractionConfig, LanguageDetectionConfig,
|
||||
extractFile
|
||||
} from '@kreuzberg/node';
|
||||
|
||||
const config: ExtractionConfig = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.8,
|
||||
detectMultiple: false
|
||||
}
|
||||
};
|
||||
|
||||
const result = await extractFile("multilingual.pdf", null, config);
|
||||
|
||||
// Access detected languages
|
||||
if (result.detectedLanguages) {
|
||||
for (const langCode of result.detectedLanguages) {
|
||||
console.log(`Detected language: ${langCode}`); // e.g., "en", "de", "fr"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Token Reduction
|
||||
|
||||
Reduce the number of tokens in extracted content to lower costs when working with LLM APIs. Each successive mode (`light` → `moderate` → `aggressive` → `maximum`) removes more text, so it saves more tokens but may lose more information.
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import (
|
||||
ExtractionConfig, TokenReductionConfig,
|
||||
extract_file_sync
|
||||
)
|
||||
|
||||
# Light token reduction
|
||||
config = ExtractionConfig(
|
||||
token_reduction=TokenReductionConfig(
|
||||
mode="light",
|
||||
preserve_important_words=True
|
||||
)
|
||||
)
|
||||
|
||||
# Moderate reduction
|
||||
config = ExtractionConfig(
|
||||
token_reduction=TokenReductionConfig(
|
||||
mode="moderate",
|
||||
preserve_important_words=True
|
||||
)
|
||||
)
|
||||
|
||||
# Aggressive reduction
|
||||
config = ExtractionConfig(
|
||||
token_reduction=TokenReductionConfig(
|
||||
mode="aggressive",
|
||||
preserve_important_words=True
|
||||
)
|
||||
)
|
||||
|
||||
# Maximum reduction
|
||||
config = ExtractionConfig(
|
||||
token_reduction=TokenReductionConfig(
|
||||
mode="maximum",
|
||||
preserve_important_words=True
|
||||
)
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
print(f"Reduced content length: {len(result.content)}")
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import {
|
||||
ExtractionConfig, TokenReductionConfig,
|
||||
extractFile
|
||||
} from '@kreuzberg/node';
|
||||
|
||||
const config: ExtractionConfig = {
|
||||
tokenReduction: {
|
||||
mode: "moderate",
|
||||
preserveImportantWords: true
|
||||
}
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(`Reduced content length: ${result.content.length}`);
|
||||
```
|
||||
|
||||
**Token Reduction Modes:**
|
||||
|
||||
- `off`: No reduction (default)
|
||||
- `light`: Remove extra whitespace and redundant punctuation
|
||||
- `moderate`: Also remove common filler words and some formatting
|
||||
- `aggressive`: Also remove longer stopwords and collapse similar phrases
|
||||
- `maximum`: Maximum reduction while preserving semantic content
|
||||
|
||||
## Page Extraction
|
||||
|
||||
Extract and track per-page content separately. Useful for multi-page documents where you need page-level granularity.
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import (
|
||||
ExtractionConfig, PageConfig,
|
||||
extract_file_sync
|
||||
)
|
||||
|
||||
config = ExtractionConfig(
|
||||
pages=PageConfig(
|
||||
extract_pages=True,
|
||||
insert_page_markers=True,
|
||||
marker_format="\n\n<!-- PAGE {page_num} -->\n\n"
|
||||
)
|
||||
)
|
||||
|
||||
result = extract_file_sync("multi_page.pdf", config=config)
|
||||
|
||||
# Access per-page content
|
||||
if result.pages:
|
||||
for page in result.pages:
|
||||
print(f"Page {page.page_number}:")
|
||||
print(f"Content: {page.content[:100]}...")
|
||||
print(f"Tables: {len(page.tables)}")
|
||||
print(f"Images: {len(page.images)}")
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import {
|
||||
ExtractionConfig, PageExtractionConfig,
|
||||
extractFile
|
||||
} from '@kreuzberg/node';
|
||||
|
||||
const config: ExtractionConfig = {
|
||||
pages: {
|
||||
extractPages: true,
|
||||
insertPageMarkers: true,
|
||||
markerFormat: "\n\n<!-- PAGE {page_num} -->\n\n"
|
||||
}
|
||||
};
|
||||
|
||||
const result = await extractFile("multi_page.pdf", null, config);
|
||||
|
||||
// Access per-page content
|
||||
if (result.pages) {
|
||||
for (const page of result.pages) {
|
||||
console.log(`Page ${page.pageNumber}:`);
|
||||
console.log(`Content: ${page.content.substring(0, 100)}...`);
|
||||
console.log(`Tables: ${page.tables.length}`);
|
||||
console.log(`Images: ${page.images.length}`);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Element-Based Output
|
||||
|
||||
Extract semantic elements instead of unified content. This format is compatible with the Unstructured library and provides structured access to different content types (titles, headings, text, tables, images, etc.).
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import ExtractionConfig, ResultFormat, extract_file_sync
|
||||
|
||||
config = ExtractionConfig(
|
||||
result_format="element_based"
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
# Access semantic elements
|
||||
if result.elements:
|
||||
for element in result.elements:
|
||||
print(f"Type: {element.element_type}") # title, heading, narrative_text, etc.
|
||||
print(f"Text: {element.text}")
|
||||
if element.metadata.get("page_number"):
|
||||
print(f"Page: {element.metadata['page_number']}")
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
||||
|
||||
const config: ExtractionConfig = {
|
||||
resultFormat: "element_based"
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
|
||||
// Access semantic elements
|
||||
if (result.elements) {
|
||||
for (const element of result.elements) {
|
||||
console.log(`Type: ${element.elementType}`);
|
||||
console.log(`Text: ${element.text}`);
|
||||
if (element.metadata.pageNumber) {
|
||||
console.log(`Page: ${element.metadata.pageNumber}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Element Types:**
|
||||
|
||||
- `title`: Document or section title
|
||||
- `heading`: Section headings
|
||||
- `narrative_text`: Regular paragraph text
|
||||
- `list_item`: Items in bullet/numbered lists
|
||||
- `table`: Table structures
|
||||
- `image`: Images or figures
|
||||
- `page_break`: Page boundaries
|
||||
- `code_block`: Code snippets
|
||||
- `block_quote`: Quoted text
|
||||
- `footer`: Footer content
|
||||
- `header`: Header content
|
||||
|
||||
## Djot Content
|
||||
|
||||
Output extracted content in Djot markup format (a lighter alternative to Markdown with enhanced structure).
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import ExtractionConfig, OutputFormat, extract_file_sync
|
||||
|
||||
config = ExtractionConfig(
|
||||
output_format="djot"
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
print(result.content) # Djot-formatted content
|
||||
|
||||
# Access structured Djot content
|
||||
if result.djot_content:
|
||||
print(f"Plain text: {result.djot_content['plain_text']}")
|
||||
print(f"Blocks: {result.djot_content['blocks']}")
|
||||
print(f"Links: {result.djot_content['links']}")
|
||||
print(f"Images: {result.djot_content['images']}")
|
||||
print(f"Footnotes: {result.djot_content['footnotes']}")
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
||||
|
||||
const config: ExtractionConfig = {
|
||||
outputFormat: "djot"
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log(result.content); // Djot-formatted content
|
||||
|
||||
// Access structured Djot content (if available)
|
||||
if (result.djotContent) {
|
||||
console.log(`Plain text: ${result.djotContent.plain_text}`);
|
||||
console.log(`Blocks: ${result.djotContent.blocks}`);
|
||||
console.log(`Links: ${result.djotContent.links}`);
|
||||
console.log(`Images: ${result.djotContent.images}`);
|
||||
console.log(`Footnotes: ${result.djotContent.footnotes}`);
|
||||
}
|
||||
```
|
||||
|
||||
## API Server
|
||||
|
||||
Run Kreuzberg as an HTTP API server for integration with external services.
|
||||
|
||||
```bash
|
||||
# Start server on default port 8000
|
||||
kreuzberg serve
|
||||
|
||||
# Custom host and port
|
||||
kreuzberg serve --host 0.0.0.0 --port 9000
|
||||
|
||||
# Bind explicitly to localhost on port 8000
|
||||
kreuzberg serve --host localhost --port 8000
|
||||
```
|
||||
|
||||
**API Endpoints:**
|
||||
|
||||
- `POST /extract` - Extract from uploaded file
|
||||
- `POST /batch` - Batch extraction
|
||||
- `POST /detect` - Detect MIME type
|
||||
|
||||
**Example:**
|
||||
|
||||
```bash
|
||||
curl -X POST -F "file=@document.pdf" http://localhost:8000/extract
|
||||
```
|
||||
|
||||
## MCP Server
|
||||
|
||||
Run Kreuzberg as a Model Context Protocol server for integration with Claude and other AI models.
|
||||
|
||||
```bash
|
||||
# Start MCP server with stdio transport
|
||||
kreuzberg mcp --transport stdio
|
||||
|
||||
# Start MCP server with HTTP transport
|
||||
kreuzberg mcp --transport http --host 127.0.0.1 --port 8001
|
||||
```
|
||||
|
||||
The MCP server exposes extraction functions to AI models, allowing them to process documents directly.
|
||||
|
||||
## Security Limits
|
||||
|
||||
Set resource limits to prevent abuse and to bound memory usage and input size.
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import ExtractionConfig, extract_file_sync
|
||||
|
||||
config = ExtractionConfig(
|
||||
security_limits={
|
||||
"max_file_size": 100_000_000, # 100 MB
|
||||
"max_archive_files": 1000,
|
||||
"max_text_length": 10_000_000, # 10 MB of text
|
||||
"max_pages": 10000,
|
||||
"max_concurrent_extractions": 4
|
||||
}
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
||||
|
||||
const config: ExtractionConfig = {
|
||||
securityLimits: {
|
||||
max_file_size: 100_000_000, // 100 MB
|
||||
max_archive_files: 1000,
|
||||
max_text_length: 10_000_000, // 10 MB of text
|
||||
max_pages: 10000,
|
||||
max_concurrent_extractions: 4
|
||||
}
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
```
|
||||
|
||||
**Common Limits:**
|
||||
|
||||
- `max_file_size`: Maximum input file size in bytes
|
||||
- `max_archive_files`: Maximum files in archives (zip, tar, etc.)
|
||||
- `max_text_length`: Maximum extracted text length
|
||||
- `max_pages`: Maximum number of pages to process
|
||||
- `max_concurrent_extractions`: Maximum concurrent extraction operations
|
||||
|
||||
## Caching
|
||||
|
||||
Extraction results are cached by default to improve performance on repeated extractions of identical documents. Control caching behavior through configuration.
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
from kreuzberg import ExtractionConfig, extract_file_sync
|
||||
|
||||
# Enable caching (default)
|
||||
config = ExtractionConfig(use_cache=True)
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
# Disable caching for a specific extraction
|
||||
config = ExtractionConfig(use_cache=False)
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
import { ExtractionConfig, extractFile } from '@kreuzberg/node';
|
||||
|
||||
// Enable caching (default)
|
||||
const config: ExtractionConfig = { useCache: true };
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
|
||||
// Disable caching
|
||||
const config2: ExtractionConfig = { useCache: false };
|
||||
const result2 = await extractFile("document.pdf", null, config2);
|
||||
```
|
||||
|
||||
**CLI Cache Management:**
|
||||
|
||||
```bash
|
||||
# View cache statistics
|
||||
kreuzberg cache stats
|
||||
|
||||
# Clear all cached results
|
||||
kreuzberg cache clear
|
||||
```
|
||||
|
||||
Caching is transparent and automatic: extracting the same input again returns the cached result instead of re-running the extraction.
|
||||
449
skills/kreuzberg/references/cli-reference.md
Normal file
449
skills/kreuzberg/references/cli-reference.md
Normal file
@@ -0,0 +1,449 @@
|
||||
# Kreuzberg CLI Reference
|
||||
|
||||
Comprehensive command-line interface for the Kreuzberg document intelligence library.
|
||||
|
||||
## Installation
|
||||
|
||||
Install from crates.io:
|
||||
|
||||
```bash
|
||||
cargo install kreuzberg-cli
|
||||
```
|
||||
|
||||
Or download pre-built binaries from [GitHub Releases](https://github.com/kreuzberg-dev/kreuzberg/releases).
|
||||
|
||||
## Commands
|
||||
|
||||
### extract
|
||||
|
||||
Extract text and structure from a single document.
|
||||
|
||||
```bash
|
||||
kreuzberg extract <path> [FLAGS]
|
||||
```
|
||||
|
||||
**Positional Arguments**
|
||||
|
||||
- `<path>` — Path to the document file
|
||||
|
||||
**Flags**
|
||||
|
||||
- `-c, --config <path>` — Path to config file (TOML, YAML, or JSON). Auto-discovers `kreuzberg.{toml,yaml,json}` in current and parent directories if omitted.
|
||||
- `--config-json <json>` — Inline JSON configuration (merged after config file, before CLI flags).
|
||||
- `--config-json-base64 <base64>` — Base64-encoded JSON configuration.
|
||||
- `-m, --mime-type <type>` — MIME type hint (auto-detected if not provided).
|
||||
- `-f, --format <text|json>` — CLI output format (default: `text`). Controls how results display, not extraction content format.
|
||||
- `--content-format <plain|markdown|djot|html>` — Extraction content format (default: `plain`). Controls format of extracted content. (Note: `--output-format` is a deprecated alias.)
|
||||
- `--ocr <bool>` — Enable OCR processing.
|
||||
- `--ocr-backend <BACKEND>` — OCR backend: `tesseract`, `paddle-ocr`, `easyocr`.
|
||||
- `--ocr-language <LANG>` — OCR language code.
|
||||
- `--ocr-auto-rotate <bool>` — Auto-rotate images before OCR.
|
||||
- `--force-ocr <bool>` — Force OCR even if text extraction succeeds.
|
||||
- `--disable-ocr <bool>` — Disable OCR entirely (even for images).
|
||||
- `--no-cache <bool>` — Disable caching.
|
||||
- `--chunk <bool>` — Enable text chunking.
|
||||
- `--chunk-size <n>` — Chunk size in characters.
|
||||
- `--chunk-overlap <n>` — Chunk overlap in characters.
|
||||
- `--chunking-tokenizer <model>` — Tokenizer model for token-based sizing.
|
||||
- `--include-structure <bool>` — Include hierarchical document structure.
|
||||
- `--quality <bool>` — Enable quality processing.
|
||||
- `--detect-language <bool>` — Enable language detection.
|
||||
- `--layout` — Enable layout detection (RT-DETR v2). Use `--layout false` to disable.
|
||||
- `--layout-confidence <float>` — Layout confidence threshold (0.0-1.0).
|
||||
- `--layout-table-model <model>` — Table structure model: `tatr`, `slanet_wired`, `slanet_wireless`, `slanet_plus`, `slanet_auto`, `disabled`.
|
||||
- `--acceleration <provider>` — ONNX execution provider: `auto`, `cpu`, `coreml`, `cuda`, `tensorrt`.
|
||||
- `--extract-pages <bool>` — Extract pages as separate array.
|
||||
- `--page-markers <bool>` — Insert page marker comments.
|
||||
- `--extract-images <bool>` — Enable image extraction.
|
||||
- `--target-dpi <n>` — Target DPI for images (36-2400).
|
||||
- `--pdf-password <pass>` — Password for encrypted PDFs (repeatable).
|
||||
- `--pdf-extract-images <bool>` — Extract images from PDF pages.
|
||||
- `--pdf-extract-metadata <bool>` — Extract PDF metadata.
|
||||
- `--token-reduction <level>` — Token reduction: `off`, `light`, `moderate`, `aggressive`, `maximum`.
|
||||
- `--msg-codepage <n>` — Windows codepage fallback for MSG files.
|
||||
- `--max-concurrent <n>` — Max parallel extractions in batch mode.
|
||||
- `--max-threads <n>` — Cap all internal thread pools.
|
||||
- `--cache-namespace <name>` — Cache namespace for tenant isolation.
|
||||
- `--cache-ttl-secs <n>` — Per-request cache TTL in seconds.
|
||||
|
||||
**Examples**
|
||||
|
||||
```bash
|
||||
# Extract with default settings
|
||||
kreuzberg extract document.pdf
|
||||
|
||||
# Extract with OCR enabled
|
||||
kreuzberg extract scanned.pdf --ocr true
|
||||
|
||||
# Extract with specific output format
|
||||
kreuzberg extract doc.docx --content-format markdown
|
||||
|
||||
# Extract with inline JSON config
|
||||
kreuzberg extract file.pdf --config-json '{"ocr":{"backend":"tesseract"}}'
|
||||
|
||||
# Extract with base64-encoded config
|
||||
kreuzberg extract file.pdf --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
|
||||
|
||||
# Extract and output as JSON
|
||||
kreuzberg extract doc.pdf --format json
|
||||
|
||||
# Extract with chunking
|
||||
kreuzberg extract large-doc.pdf --chunk true --chunk-size 2000 --chunk-overlap 200
|
||||
|
||||
# Layout-aware markdown extraction
|
||||
kreuzberg extract document.pdf --layout --content-format markdown
|
||||
|
||||
# With custom confidence threshold
|
||||
kreuzberg extract document.pdf --layout-confidence 0.7 --content-format markdown
|
||||
```
|
||||
|
||||
### batch
|
||||
|
||||
Batch extract from multiple documents in parallel.
|
||||
|
||||
```bash
|
||||
kreuzberg batch <paths...> [FLAGS]
|
||||
```
|
||||
|
||||
**Positional Arguments**
|
||||
|
||||
- `<paths...>` — One or more document file paths
|
||||
|
||||
**Flags**
|
||||
|
||||
- `-c, --config <path>` — Path to config file (TOML, YAML, or JSON). Auto-discovers `kreuzberg.{toml,yaml,json}` in current and parent directories if omitted.
|
||||
- `--config-json <json>` — Inline JSON configuration (merged after config file, before CLI flags).
|
||||
- `--config-json-base64 <base64>` — Base64-encoded JSON configuration.
|
||||
- `-f, --format <text|json>` — CLI output format (default: `json`). Controls how results display, not extraction content format.
|
||||
- Most extraction override flags from `extract` are also supported (e.g., `--content-format`, `--ocr`, `--layout`, `--force-ocr`, `--no-cache`, `--quality`, `--acceleration`, etc.). See the `extract` command flags for the full list; `--mime-type` and `--detect-language` are not available in batch mode.
|
||||
|
||||
**Notes**
|
||||
|
||||
- Batch command defaults to JSON output format (unlike `extract` which defaults to text).
|
||||
- Does not support `--mime-type` or `--detect-language` flags.
|
||||
|
||||
**Examples**
|
||||
|
||||
```bash
|
||||
# Batch extract multiple PDFs
|
||||
kreuzberg batch document1.pdf document2.pdf document3.pdf
|
||||
|
||||
# Batch extract with glob patterns (shell expansion)
|
||||
kreuzberg batch *.pdf
|
||||
|
||||
# Batch extract with custom output format
|
||||
kreuzberg batch doc1.pdf doc2.pdf --content-format markdown
|
||||
|
||||
# Batch extract with OCR
|
||||
kreuzberg batch scanned*.pdf --ocr true
|
||||
|
||||
# Batch extract with text output format
|
||||
kreuzberg batch files*.docx --format text
|
||||
```
|
||||
|
||||
### detect
|
||||
|
||||
Identify MIME type of a file.
|
||||
|
||||
```bash
|
||||
kreuzberg detect <path> [FLAGS]
|
||||
```
|
||||
|
||||
**Positional Arguments**
|
||||
|
||||
- `<path>` — Path to the file
|
||||
|
||||
**Flags**
|
||||
|
||||
- `-f, --format <text|json>` — Output format (default: `text`)
|
||||
|
||||
**Examples**
|
||||
|
||||
```bash
|
||||
# Detect MIME type (text output)
|
||||
kreuzberg detect unknown-file.bin
|
||||
|
||||
# Detect MIME type (JSON output)
|
||||
kreuzberg detect file.xyz --format json
|
||||
```
|
||||
|
||||
### version
|
||||
|
||||
Display version information.
|
||||
|
||||
```bash
|
||||
kreuzberg version [FLAGS]
|
||||
```
|
||||
|
||||
**Flags**
|
||||
|
||||
- `-f, --format <text|json>` — Output format (default: `text`)
|
||||
|
||||
**Examples**
|
||||
|
||||
```bash
|
||||
# Show version as text
|
||||
kreuzberg version
|
||||
|
||||
# Show version as JSON
|
||||
kreuzberg version --format json
|
||||
```
|
||||
|
||||
### cache
|
||||
|
||||
Manage extraction cache.
|
||||
|
||||
#### cache stats
|
||||
|
||||
Display cache statistics.
|
||||
|
||||
```bash
|
||||
kreuzberg cache stats [FLAGS]
|
||||
```
|
||||
|
||||
**Flags**
|
||||
|
||||
- `--cache-dir <path>` — Cache directory (default: `.kreuzberg` in current directory)
|
||||
- `-f, --format <text|json>` — Output format (default: `text`)
|
||||
|
||||
**Examples**
|
||||
|
||||
```bash
|
||||
# Show cache stats
|
||||
kreuzberg cache stats
|
||||
|
||||
# Show cache stats as JSON
|
||||
kreuzberg cache stats --format json
|
||||
|
||||
# Show stats for specific cache directory
|
||||
kreuzberg cache stats --cache-dir /tmp/my-cache
|
||||
```
|
||||
|
||||
#### cache clear
|
||||
|
||||
Clear all cached extractions.
|
||||
|
||||
```bash
|
||||
kreuzberg cache clear [FLAGS]
|
||||
```
|
||||
|
||||
**Flags**
|
||||
|
||||
- `--cache-dir <path>` — Cache directory (default: `.kreuzberg` in current directory)
|
||||
- `-f, --format <text|json>` — Output format (default: `text`)
|
||||
|
||||
**Examples**
|
||||
|
||||
```bash
|
||||
# Clear cache
|
||||
kreuzberg cache clear
|
||||
|
||||
# Clear specific cache directory
|
||||
kreuzberg cache clear --cache-dir /tmp/my-cache
|
||||
```
|
||||
|
||||
### serve
|
||||
|
||||
Start the API server (requires `api` feature).
|
||||
|
||||
```bash
|
||||
kreuzberg serve [FLAGS]
|
||||
```
|
||||
|
||||
**Flags**
|
||||
|
||||
- `-H, --host <host>` — Host to bind to (e.g., `127.0.0.1` or `0.0.0.0`). CLI arg overrides config file and environment variables.
|
||||
- `-p, --port <port>` — Port to bind to. CLI arg overrides config file and environment variables.
|
||||
- `-c, --config <path>` — Path to config file (TOML, YAML, or JSON). Auto-discovers `kreuzberg.{toml,yaml,json}` in current and parent directories if omitted.
|
||||
|
||||
**Configuration Precedence**
|
||||
|
||||
1. CLI arguments (`--host`, `--port`)
|
||||
2. Environment variables (`KREUZBERG_HOST`, `KREUZBERG_PORT`)
|
||||
3. Config file (`[server]` section)
|
||||
4. Built-in defaults (`127.0.0.1:8000`)
|
||||
|
||||
**Examples**
|
||||
|
||||
```bash
|
||||
# Start server with defaults
|
||||
kreuzberg serve
|
||||
|
||||
# Start server on specific host and port
|
||||
kreuzberg serve --host 0.0.0.0 --port 3000
|
||||
|
||||
# Start server with config file
|
||||
kreuzberg serve --config kreuzberg.toml
|
||||
|
||||
# Start server (environment variables override defaults)
|
||||
KREUZBERG_HOST=192.168.1.100 KREUZBERG_PORT=8080 kreuzberg serve
|
||||
```
|
||||
|
||||
### mcp
|
||||
|
||||
Start the Model Context Protocol (MCP) server (requires `mcp` feature).
|
||||
|
||||
```bash
|
||||
kreuzberg mcp [FLAGS]
|
||||
```
|
||||
|
||||
**Flags**
|
||||
|
||||
- `-c, --config <path>` — Path to config file (TOML, YAML, or JSON). Auto-discovers `kreuzberg.{toml,yaml,json}` in current and parent directories if omitted.
|
||||
- `--transport <stdio|http>` — Transport mode (default: `stdio`)
|
||||
- `--host <host>` — HTTP host for http transport (default: `127.0.0.1`)
|
||||
- `--port <port>` — HTTP port for http transport (default: `8001`)
|
||||
|
||||
**Examples**
|
||||
|
||||
```bash
|
||||
# Start MCP server with stdio transport
|
||||
kreuzberg mcp
|
||||
|
||||
# Start MCP server with HTTP transport
|
||||
kreuzberg mcp --transport http
|
||||
|
||||
# Start MCP server on custom HTTP host/port
|
||||
kreuzberg mcp --transport http --host 0.0.0.0 --port 9000
|
||||
|
||||
# Start MCP server with config file
|
||||
kreuzberg mcp --config kreuzberg.toml
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### File Format
|
||||
|
||||
Configuration files support three formats with automatic detection:
|
||||
|
||||
- **TOML** — `.toml` extension (recommended)
|
||||
- **YAML** — `.yaml` or `.yml` extension
|
||||
- **JSON** — `.json` extension
|
||||
|
||||
### Configuration Precedence
|
||||
|
||||
Settings are applied in order from highest to lowest priority:
|
||||
|
||||
1. **Individual CLI flags** (e.g., `--ocr true`, `--content-format markdown`)
|
||||
2. **Inline JSON config** (`--config-json` or `--config-json-base64`)
|
||||
3. **Config file** (explicit `--config path.toml` or auto-discovered)
|
||||
4. **Default values** (built-in library defaults)
|
||||
|
||||
### Auto-Discovery
|
||||
|
||||
When no config file is specified, Kreuzberg searches for configuration in this order:
|
||||
|
||||
1. `kreuzberg.toml` in current directory
|
||||
2. `kreuzberg.yaml` in current directory
|
||||
3. `kreuzberg.json` in current directory
|
||||
4. Parent directories (same search pattern, up to filesystem root)
|
||||
|
||||
### Example Configuration
|
||||
|
||||
```toml
|
||||
# Top-level extraction options
|
||||
use_cache = true
|
||||
enable_quality_processing = true
|
||||
force_ocr = false
|
||||
output_format = "markdown"
|
||||
|
||||
# OCR settings
|
||||
[ocr]
|
||||
backend = "tesseract"
|
||||
language = "eng"
|
||||
|
||||
# Chunking settings
|
||||
[chunking]
|
||||
max_chars = 2000
|
||||
max_overlap = 200
|
||||
|
||||
# Language detection
|
||||
[language_detection]
|
||||
enabled = true
|
||||
|
||||
# Server configuration (for serve command)
|
||||
[server]
|
||||
host = "127.0.0.1"
|
||||
port = 8000
|
||||
```
|
||||
|
||||
## Exit Codes
|
||||
|
||||
- `0` — Success
|
||||
- Non-zero — Error (see stderr for details)
|
||||
|
||||
## Error Handling
|
||||
|
||||
The CLI validates input and provides clear error messages:
|
||||
|
||||
- **File not found** — Verify path exists and is readable
|
||||
- **Invalid MIME type** — Ensure file is accessible and format is supported
|
||||
- **Invalid JSON** — Check `--config-json` syntax
|
||||
- **Invalid config file** — Verify TOML/YAML/JSON format
|
||||
- **Invalid chunk parameters** — Ensure chunk-size > 0 and overlap < chunk-size
|
||||
|
||||
## Environment Variables
|
||||
|
||||
- `RUST_LOG` — Set logging level (e.g., `RUST_LOG=debug`)
|
||||
- `KREUZBERG_HOST` — Server bind host (used by `serve` command)
|
||||
- `KREUZBERG_PORT` — Server bind port (used by `serve` command)
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Extract with Custom Configuration
|
||||
|
||||
```bash
|
||||
kreuzberg extract document.pdf \
|
||||
--content-format markdown \
|
||||
--ocr true \
|
||||
--quality true
|
||||
```
|
||||
|
||||
### Batch Process with Config File
|
||||
|
||||
```bash
|
||||
kreuzberg batch *.pdf --config extraction-config.toml
|
||||
```
|
||||
|
||||
### CI/CD Integration
|
||||
|
||||
```bash
|
||||
# Extract to JSON for downstream processing
|
||||
kreuzberg extract file.pdf --format json | jq '.content'
|
||||
|
||||
# Batch process with error handling
|
||||
kreuzberg batch docs/*.pdf --format json || exit 1
|
||||
```
|
||||
|
||||
### Performance Tuning
|
||||
|
||||
```bash
|
||||
# Disable cache for temporary processing
|
||||
kreuzberg extract file.pdf --no-cache true
|
||||
|
||||
# Enable chunking for large documents
|
||||
kreuzberg extract large-file.pdf \
|
||||
--chunk true \
|
||||
--chunk-size 5000 \
|
||||
--chunk-overlap 500
|
||||
```
|
||||
|
||||
## Debugging
|
||||
|
||||
Enable detailed logging:
|
||||
|
||||
```bash
|
||||
RUST_LOG=debug kreuzberg extract document.pdf
|
||||
```
|
||||
|
||||
Check cache statistics:
|
||||
|
||||
```bash
|
||||
kreuzberg cache stats --format json
|
||||
```
|
||||
|
||||
Detect file MIME type:
|
||||
|
||||
```bash
|
||||
kreuzberg detect unknown-file --format json
|
||||
```
|
||||
417
skills/kreuzberg/references/configuration.md
Normal file
417
skills/kreuzberg/references/configuration.md
Normal file
@@ -0,0 +1,417 @@
|
||||
# Configuration Reference
|
||||
|
||||
Kreuzberg uses a hierarchical configuration system supporting multiple formats and auto-discovery mechanisms. This reference covers all available configuration options, field names across programming languages, and loading strategies.
|
||||
|
||||
## Supported Formats
|
||||
|
||||
Kreuzberg configurations can be defined in three formats:
|
||||
|
||||
- **TOML** (recommended): `kreuzberg.toml`
|
||||
- **YAML**: `kreuzberg.yaml`
|
||||
- **JSON**: `kreuzberg.json`
|
||||
|
||||
All formats support the same schema and configuration options.
|
||||
|
||||
## Auto-Discovery
|
||||
|
||||
When no configuration file is explicitly specified, Kreuzberg searches for configuration files in the following order:
|
||||
|
||||
1. Current working directory: `kreuzberg.toml`, `kreuzberg.yaml`, `kreuzberg.json`
|
||||
2. Parent directories (recursively up the tree, same file name pattern)
|
||||
|
||||
The first matching configuration file is loaded.
|
||||
|
||||
## Programmatic Loading
|
||||
|
||||
### Python
|
||||
|
||||
```python
|
||||
from kreuzberg import ExtractionConfig
|
||||
|
||||
# Load from explicit path
|
||||
config = ExtractionConfig.from_file("kreuzberg.toml")
|
||||
|
||||
# Auto-discover configuration
|
||||
config = ExtractionConfig.discover()
|
||||
```
|
||||
|
||||
### Node.js / TypeScript
|
||||
|
||||
```typescript
|
||||
import { ExtractionConfig } from "@kreuzberg/node";
|
||||
|
||||
// Load from explicit path
|
||||
const config = ExtractionConfig.fromFile("kreuzberg.toml");
|
||||
|
||||
// Auto-discover configuration
|
||||
const discoveredConfig = ExtractionConfig.discover();
|
||||
```
|
||||
|
||||
### CLI
|
||||
|
||||
```bash
|
||||
# Explicit configuration file
|
||||
kreuzberg extract --config kreuzberg.toml document.pdf
|
||||
|
||||
# Auto-discovery (searches default locations)
|
||||
kreuzberg extract document.pdf
|
||||
```
|
||||
|
||||
## Configuration Schema
|
||||
|
||||
The complete TOML schema with all available sections and options:
|
||||
|
||||
### Top-Level Options
|
||||
|
||||
```toml
|
||||
use_cache = true
|
||||
enable_quality_processing = true
|
||||
force_ocr = false
|
||||
output_format = "markdown"
|
||||
result_format = "text"
|
||||
max_concurrent_extractions = 4
|
||||
```
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
| ---------------------------- | ------- | ------------ | ----------------------------------------------------------------------------------- |
|
||||
| `use_cache` | boolean | `true` | Enable caching of extraction results |
|
||||
| `enable_quality_processing` | boolean | `true` | Enable post-processing for output quality |
|
||||
| `force_ocr` | boolean | `false` | Force OCR processing even for searchable PDFs |
|
||||
| `disable_ocr` | boolean | `false` | Disable OCR entirely — image files return empty content instead of errors (v4.7.0+) |
|
||||
| `output_format` | string | `"markdown"` | Output format (markdown, html, text) |
|
||||
| `result_format` | string | `"text"` | Result format for structured output |
|
||||
| `max_concurrent_extractions` | integer | `4` | Maximum concurrent document extractions |
|
||||
|
||||
### OCR Configuration
|
||||
|
||||
```toml
|
||||
[ocr]
|
||||
backend = "tesseract"
|
||||
language = "eng"
|
||||
```
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
| ---------- | ------ | ------------- | --------------------------------------------- |
|
||||
| `backend` | string | `"tesseract"` | OCR backend (currently tesseract) |
|
||||
| `language` | string | `"eng"` | ISO 639-3 language code (eng, deu, fra, etc.) |
|
||||
|
||||
#### Tesseract Configuration
|
||||
|
||||
```toml
|
||||
[ocr.tesseract_config]
|
||||
psm = 3
|
||||
oem = 3
|
||||
min_confidence = 0.0
|
||||
output_format = "text"
|
||||
enable_table_detection = false
|
||||
table_min_confidence = 0.5
|
||||
table_column_threshold = 50
|
||||
table_row_threshold_ratio = 0.5
|
||||
use_cache = true
|
||||
```
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
| --------------------------- | ------- | -------- | ------------------------------------------ |
|
||||
| `psm` | integer | `3` | Page Segmentation Mode (0-13) |
|
||||
| `oem` | integer | `3` | OCR Engine Mode (0-3) |
|
||||
| `min_confidence` | float | `0.0` | Minimum OCR confidence threshold (0.0-1.0) |
|
||||
| `output_format` | string | `"text"` | Output format from OCR |
|
||||
| `enable_table_detection` | boolean | `false` | Enable table detection during OCR |
|
||||
| `table_min_confidence` | float | `0.5` | Minimum confidence for table cells |
|
||||
| `table_column_threshold` | integer | `50` | Pixel threshold for column detection |
|
||||
| `table_row_threshold_ratio` | float | `0.5` | Row height ratio threshold |
|
||||
| `use_cache` | boolean | `true` | Cache OCR results |
|
||||
|
||||
#### Tesseract Preprocessing
|
||||
|
||||
```toml
|
||||
[ocr.tesseract_config.preprocessing]
|
||||
target_dpi = 300
|
||||
auto_rotate = true
|
||||
deskew = true
|
||||
denoise = true
|
||||
contrast_enhance = true
|
||||
binarization_method = "otsu"
|
||||
invert_colors = false
|
||||
```
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
| --------------------- | ------- | -------- | ---------------------------------------------- |
|
||||
| `target_dpi` | integer | `300` | Target DPI for preprocessing |
|
||||
| `auto_rotate` | boolean | `true` | Automatically detect and correct page rotation |
|
||||
| `deskew` | boolean | `true` | Correct skewed pages |
|
||||
| `denoise` | boolean | `true` | Remove noise from images |
|
||||
| `contrast_enhance` | boolean | `true` | Enhance image contrast |
|
||||
| `binarization_method` | string | `"otsu"` | Method for image binarization |
|
||||
| `invert_colors` | boolean | `false` | Invert image colors if needed |
|
||||
|
||||
### PDF Options
|
||||
|
||||
```toml
|
||||
[pdf_options]
|
||||
extract_images = true
|
||||
extract_metadata = true
|
||||
|
||||
[pdf_options.hierarchy]
|
||||
enabled = true
|
||||
k_clusters = 6
|
||||
include_bbox = true
|
||||
ocr_coverage_threshold = 0.5
|
||||
```
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
| ---------------------------------- | ------- | ------- | ---------------------------------------------- |
|
||||
| `extract_images` | boolean | `true` | Extract images from PDF documents |
|
||||
| `extract_metadata` | boolean | `true` | Extract PDF metadata |
|
||||
| `hierarchy.enabled` | boolean | `true` | Enable PDF hierarchy extraction (v4.0.0+) |
|
||||
| `hierarchy.k_clusters` | integer | `6` | Number of clusters for hierarchy detection |
|
||||
| `hierarchy.include_bbox` | boolean | `true` | Include bounding boxes in hierarchy |
|
||||
| `hierarchy.ocr_coverage_threshold` | float | `0.5` | OCR coverage threshold for hierarchy (0.0-1.0) |
|
||||
|
||||
### Image Processing
|
||||
|
||||
```toml
|
||||
[images]
|
||||
extract_images = true
|
||||
target_dpi = 300
|
||||
max_image_dimension = 4096
|
||||
auto_adjust_dpi = true
|
||||
min_dpi = 72
|
||||
max_dpi = 600
|
||||
```
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
| --------------------- | ------- | ------- | -------------------------------------------- |
|
||||
| `extract_images` | boolean | `true` | Extract images from documents |
|
||||
| `target_dpi` | integer | `300` | Target DPI for image processing |
|
||||
| `max_image_dimension` | integer | `4096` | Maximum image dimension in pixels |
|
||||
| `auto_adjust_dpi` | boolean | `true` | Automatically adjust DPI based on image size |
|
||||
| `min_dpi` | integer | `72` | Minimum DPI threshold |
|
||||
| `max_dpi` | integer | `600` | Maximum DPI threshold |
|
||||
|
||||
### Chunking Configuration
|
||||
|
||||
```toml
|
||||
[chunking]
|
||||
max_chars = 1000
|
||||
max_overlap = 200
|
||||
|
||||
[chunking.embedding]
|
||||
batch_size = 32
|
||||
normalize = true
|
||||
show_download_progress = true
|
||||
cache_dir = "~/.cache/kreuzberg/embeddings"
|
||||
|
||||
[chunking.embedding.model]
|
||||
type = "preset"
|
||||
name = "balanced"
|
||||
```
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
| ---------------------------------- | ------- | --------------------------------- | ---------------------------------------------------------- |
|
||||
| `max_chars` | integer | `1000` | Maximum characters per chunk |
|
||||
| `max_overlap` | integer | `200` | Overlap between consecutive chunks |
|
||||
| `embedding.batch_size` | integer | `32` | Batch size for embedding generation |
|
||||
| `embedding.normalize` | boolean | `true` | Normalize embeddings to unit length |
|
||||
| `embedding.show_download_progress` | boolean | `true` | Show progress when downloading models |
|
||||
| `embedding.cache_dir` | string | `"~/.cache/kreuzberg/embeddings"` | Directory for caching embeddings |
|
||||
| `embedding.model.type` | string | `"preset"` | Model type: preset, fastembed, or custom |
|
||||
| `embedding.model.name` | string | `"balanced"` | Preset model name (balanced, fast, accurate, multilingual) |
|
||||
| `embedding.model.model` | string | | FastEmbed model identifier |
|
||||
| `embedding.model.model_id` | string | | Custom HuggingFace model ID |
|
||||
| `embedding.model.dimensions` | integer | | Embedding dimensions |
|
||||
|
||||
### Keywords Configuration
|
||||
|
||||
```toml
|
||||
[keywords]
|
||||
algorithm = "yake"
|
||||
max_keywords = 10
|
||||
min_score = 0.0
|
||||
ngram_range = [1, 3]
|
||||
language = "en"
|
||||
```
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
| -------------- | ------- | -------- | ------------------------------------------- |
|
||||
| `algorithm` | string | `"yake"` | Keyword extraction algorithm (yake or rake) |
|
||||
| `max_keywords` | integer | `10` | Maximum keywords to extract |
|
||||
| `min_score` | float | `0.0` | Minimum relevance score for keywords |
|
||||
| `ngram_range` | array | `[1, 3]` | N-gram size range [min, max] |
|
||||
| `language` | string | `"en"` | Language code for keyword extraction |
|
||||
|
||||
### Token Reduction
|
||||
|
||||
```toml
|
||||
[token_reduction]
|
||||
mode = "off"
|
||||
preserve_important_words = true
|
||||
```
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
| -------------------------- | ------- | ------- | ----------------------------------------- |
|
||||
| `mode`                     | string  | `"off"` | Mode: off, light, moderate, aggressive, maximum |
|
||||
| `preserve_important_words` | boolean | `true` | Preserve important words during reduction |
|
||||
|
||||
### Language Detection
|
||||
|
||||
```toml
|
||||
[language_detection]
|
||||
enabled = true
|
||||
min_confidence = 0.8
|
||||
detect_multiple = false
|
||||
```
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
| ----------------- | ------- | ------- | ------------------------------------------ |
|
||||
| `enabled` | boolean | `true` | Enable automatic language detection |
|
||||
| `min_confidence` | float | `0.8` | Minimum confidence threshold for detection |
|
||||
| `detect_multiple` | boolean | `false` | Detect multiple languages in document |
|
||||
|
||||
### Post-Processor
|
||||
|
||||
```toml
|
||||
[postprocessor]
|
||||
enabled = true
|
||||
```
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
| --------- | ------- | ------- | ------------------------------------------- |
|
||||
| `enabled` | boolean | `true` | Enable post-processing of extracted content |
|
||||
|
||||
## FileExtractionConfig (Per-File Overrides)
|
||||
|
||||
`FileExtractionConfig` is passed as an optional parameter to `batch_extract_file` / `batch_extract_bytes` (and their sync variants) to override settings per file in a batch. All fields are optional; `None` means the batch default is used. The separate `_with_configs` functions were removed in v4.5.0.
|
||||
|
||||
**Overridable fields:** `enable_quality_processing`, `ocr`, `force_ocr`, `chunking`, `images`, `pdf_options`, `token_reduction`, `language_detection`, `pages`, `keywords`, `postprocessor`, `html_options`, `result_format`, `output_format`, `include_document_structure`, `layout`.
|
||||
|
||||
**Batch-level only (not overridable):** `max_concurrent_extractions`, `use_cache`, `acceleration`, `security_limits`.
|
||||
|
||||
**Merge semantics:** For each file, `FileExtractionConfig` fields are overlaid on the batch `ExtractionConfig`. `None` falls through to batch default; `Some(value)` replaces the batch default for that file.
|
||||
|
||||
```toml
|
||||
# FileExtractionConfig cannot be specified in config files —
|
||||
# it is a programmatic API for per-file overrides at runtime.
|
||||
```
|
||||
|
||||
## Naming Conventions
|
||||
|
||||
Kreuzberg uses consistent naming conventions across different contexts:
|
||||
|
||||
| Context | Convention | Example |
|
||||
| -------------------- | ---------- | --------------------------------------------- |
|
||||
| Python | snake_case | `max_chars`, `pdf_options`, `use_cache` |
|
||||
| Node.js / TypeScript | camelCase | `maxChars`, `pdfOptions`, `useCache` |
|
||||
| Rust | snake_case | `max_chars`, `pdf_options`, `use_cache` |
|
||||
| TOML / YAML / JSON | snake_case | `max_chars`, `pdf_options`, `use_cache` |
|
||||
| CLI flags            | kebab-case | `--chunk-size`, `--force-ocr`, `--no-cache`   |
|
||||
|
||||
When switching between languages, apply the appropriate conversion:
|
||||
|
||||
- Python → Node.js: `snake_case` to `camelCase`
|
||||
- CLI → Python: `kebab-case` to `snake_case`
|
||||
- TOML → Python: No conversion needed (both use `snake_case`)
|
||||
|
||||
## Environment Variables
|
||||
|
||||
The following environment variables can override configuration:
|
||||
|
||||
| Variable | Purpose | Example |
|
||||
| ---------------- | ----------------------------------- | ----------- |
|
||||
| `KREUZBERG_HOST` | Server bind address (serve command) | `127.0.0.1` |
|
||||
| `KREUZBERG_PORT` | Server port (serve command) | `8080` |
|
||||
|
||||
## Configuration Merging
|
||||
|
||||
Configuration sources are merged in priority order (highest to lowest):
|
||||
|
||||
1. **CLI flags** (highest priority)
|
||||
2. **Inline JSON configuration** (programmatic)
|
||||
3. **Configuration file** (lowest priority)
|
||||
|
||||
Higher-priority sources override lower-priority ones. For example, the CLI flag `--chunk-size 2000` overrides `max_chars = 1000` in the configuration file.
|
||||
|
||||
## Example Configurations
|
||||
|
||||
### Minimal Configuration
|
||||
|
||||
```toml
|
||||
use_cache = true
|
||||
enable_quality_processing = true
|
||||
|
||||
[ocr]
|
||||
backend = "tesseract"
|
||||
language = "eng"
|
||||
```
|
||||
|
||||
### High-Quality PDF Extraction
|
||||
|
||||
```toml
|
||||
use_cache = true
|
||||
enable_quality_processing = true
|
||||
force_ocr = false
|
||||
|
||||
[ocr]
|
||||
backend = "tesseract"
|
||||
language = "eng"
|
||||
|
||||
[ocr.tesseract_config]
|
||||
psm = 3
|
||||
oem = 3
|
||||
enable_table_detection = true
|
||||
table_min_confidence = 0.7
|
||||
|
||||
[pdf_options]
|
||||
extract_images = true
|
||||
extract_metadata = true
|
||||
|
||||
[pdf_options.hierarchy]
|
||||
enabled = true
|
||||
k_clusters = 6
|
||||
|
||||
[images]
|
||||
extract_images = true
|
||||
target_dpi = 300
|
||||
```
|
||||
|
||||
### Semantic Search Configuration
|
||||
|
||||
```toml
|
||||
[chunking]
|
||||
max_chars = 800
|
||||
max_overlap = 150
|
||||
|
||||
[chunking.embedding]
|
||||
batch_size = 32
|
||||
normalize = true
|
||||
cache_dir = "~/.cache/kreuzberg/embeddings"
|
||||
|
||||
[chunking.embedding.model]
|
||||
type = "preset"
|
||||
name = "accurate"
|
||||
|
||||
[keywords]
|
||||
algorithm = "yake"
|
||||
max_keywords = 15
|
||||
```
|
||||
|
||||
## Field Name Reference
|
||||
|
||||
Critical field names to use in configuration files:
|
||||
|
||||
- `max_chars` (NOT `max_characters`)
|
||||
- `max_overlap` (NOT `overlap`)
|
||||
- `table_min_confidence`
|
||||
- `table_column_threshold`
|
||||
- `table_row_threshold_ratio`
|
||||
- `ocr_coverage_threshold`
|
||||
- `k_clusters`
|
||||
- `include_bbox`
|
||||
- `enable_table_detection`
|
||||
- `auto_rotate`
|
||||
- `auto_adjust_dpi`
|
||||
- `show_download_progress`
|
||||
- `min_confidence`
|
||||
- `detect_multiple`
|
||||
|
||||
Always verify field names against the source configuration file when adding new options.
|
||||
1380
skills/kreuzberg/references/nodejs-api.md
Normal file
1380
skills/kreuzberg/references/nodejs-api.md
Normal file
File diff suppressed because it is too large
Load Diff
212
skills/kreuzberg/references/other-bindings.md
Normal file
212
skills/kreuzberg/references/other-bindings.md
Normal file
@@ -0,0 +1,212 @@
|
||||
# Language Bindings Reference
|
||||
|
||||
Kreuzberg provides native bindings for multiple programming languages, each with precompiled binaries for x86_64 and aarch64 on Linux and macOS. This reference covers installation and basic usage for each binding.
|
||||
|
||||
## Go
|
||||
|
||||
**Installation:**
|
||||
|
||||
```bash
|
||||
go get github.com/kreuzberg-dev/kreuzberg/packages/go/v5
|
||||
```
|
||||
|
||||
**Basic Extraction:**
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5/kreuzberg"
|
||||
)
|
||||
|
||||
func main() {
|
||||
ctx := context.Background()
|
||||
result, err := kreuzberg.ExtractFile(ctx, "document.pdf", nil)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Println(result.Content)
|
||||
}
|
||||
```
|
||||
|
||||
See the [Go binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go) for complete API reference.
|
||||
|
||||
## Ruby
|
||||
|
||||
**Installation:**
|
||||
|
||||
```bash
|
||||
gem install kreuzberg
|
||||
```
|
||||
|
||||
Or in your Gemfile:
|
||||
|
||||
```ruby
|
||||
gem 'kreuzberg'
|
||||
```
|
||||
|
||||
**Basic Extraction:**
|
||||
|
||||
```ruby
|
||||
require 'kreuzberg'
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf')
|
||||
puts result.content
|
||||
```
|
||||
|
||||
See the [Ruby binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/ruby) for complete API reference.
|
||||
|
||||
## Java
|
||||
|
||||
**Installation:**
|
||||
Add to your Maven `pom.xml`:
|
||||
|
||||
```xml
|
||||
<dependency>
|
||||
<groupId>dev.kreuzberg</groupId>
|
||||
<artifactId>kreuzberg</artifactId>
|
||||
<version>4.2.x</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
**Basic Extraction:**
|
||||
|
||||
```java
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
|
||||
public class Example {
|
||||
public static void main(String[] args) throws Exception {
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
|
||||
System.out.println(result.getContent());
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
See the [Java binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/java) for complete API reference.
|
||||
|
||||
## C#
|
||||
|
||||
**Installation:**
|
||||
|
||||
```bash
|
||||
dotnet add package Kreuzberg
|
||||
```
|
||||
|
||||
**Basic Extraction:**
|
||||
|
||||
```csharp
|
||||
using Kreuzberg;
|
||||
|
||||
var result = KreuzbergClient.ExtractFileSync("document.pdf");
|
||||
Console.WriteLine(result.Content);
|
||||
```
|
||||
|
||||
See the [C# binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/csharp) for complete API reference.
|
||||
|
||||
## PHP
|
||||
|
||||
**Installation:**
|
||||
|
||||
```bash
|
||||
composer require kreuzberg/kreuzberg
|
||||
```
|
||||
|
||||
**Basic Extraction:**
|
||||
|
||||
```php
|
||||
<?php
|
||||
require 'vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
$kreuzberg = new Kreuzberg();
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
echo $result->content;
|
||||
```
|
||||
|
||||
See the [PHP binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/php) for complete API reference.
|
||||
|
||||
## Elixir
|
||||
|
||||
**Installation:**
|
||||
Add to your `mix.exs` dependencies:
|
||||
|
||||
```elixir
|
||||
def deps do
|
||||
[
|
||||
kreuzberg: "~> 4.2"
|
||||
]
|
||||
end
|
||||
```
|
||||
|
||||
**Basic Extraction:**
|
||||
|
||||
```elixir
|
||||
{:ok, result} = Kreuzberg.extract_file("document.pdf")
|
||||
IO.puts(result.content)
|
||||
```
|
||||
|
||||
See the [Elixir binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/elixir) for complete API reference.
|
||||
|
||||
## WebAssembly (WASM)
|
||||
|
||||
**Installation:**
|
||||
|
||||
```bash
|
||||
npm install @kreuzberg/wasm
|
||||
```
|
||||
|
||||
**Basic Extraction:**
|
||||
|
||||
```typescript
|
||||
import { extractBytes } from "@kreuzberg/wasm";
|
||||
|
||||
const fileData = await fs.promises.readFile("document.pdf");
|
||||
const result = await extractBytes(fileData, "application/pdf");
|
||||
console.log(result.content);
|
||||
```
|
||||
|
||||
Supports browsers, Deno, and Cloudflare Workers.
|
||||
|
||||
See the [WASM binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/typescript) for complete API reference.
|
||||
|
||||
## Docker
|
||||
|
||||
**Installation:**
|
||||
Pull the official image from GitHub Container Registry:
|
||||
|
||||
```bash
|
||||
docker pull ghcr.io/kreuzberg-dev/kreuzberg
|
||||
```
|
||||
|
||||
**API Server Mode:**
|
||||
|
||||
```bash
|
||||
docker run -p 8000:8000 ghcr.io/kreuzberg-dev/kreuzberg serve --host 0.0.0.0
|
||||
```
|
||||
|
||||
**CLI Mode:**
|
||||
|
||||
```bash
|
||||
docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg extract /data/document.pdf
|
||||
```
|
||||
|
||||
**MCP Server Mode:**
|
||||
|
||||
```bash
|
||||
docker run -i ghcr.io/kreuzberg-dev/kreuzberg mcp
|
||||
```
|
||||
|
||||
Image sizes:
|
||||
|
||||
- Core image: 1.0-1.3GB
|
||||
- Full image: ~1.5-2.1GB
|
||||
|
||||
See the [Docker guide](https://docs.kreuzberg.dev/guides/docker/) for deployment details.
|
||||
|
||||
## Platform Support
|
||||
|
||||
All language bindings include precompiled binaries for x86_64 and aarch64 on Linux and macOS. Windows support varies by binding. Refer to the main [README](https://github.com/kreuzberg-dev/kreuzberg) for platform compatibility matrix.
|
||||
1440
skills/kreuzberg/references/python-api.md
Normal file
1440
skills/kreuzberg/references/python-api.md
Normal file
File diff suppressed because it is too large
Load Diff
866
skills/kreuzberg/references/rust-api.md
Normal file
866
skills/kreuzberg/references/rust-api.md
Normal file
@@ -0,0 +1,866 @@
|
||||
# Kreuzberg Rust API Reference
|
||||
|
||||
Complete API reference for the Kreuzberg document extraction library in Rust.
|
||||
|
||||
## Setup
|
||||
|
||||
Add to your `Cargo.toml`:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
kreuzberg = { version = "4", features = [
|
||||
"tokio-runtime",
|
||||
"pdf",
|
||||
"ocr",
|
||||
"chunking",
|
||||
"embeddings",
|
||||
"language-detection",
|
||||
"keywords-yake",
|
||||
"keywords-rake",
|
||||
"api",
|
||||
"mcp"
|
||||
] }
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
```
|
||||
|
||||
### Core Features
|
||||
|
||||
- **tokio-runtime**: Enables async/sync extraction (default). Required for `extract_file_sync`, `batch_extract_file_sync`, `batch_extract_file`
|
||||
- **pdf**: PDF extraction with PDFium
|
||||
- **ocr**: Tesseract-based OCR for scanned documents
|
||||
- **chunking**: Text chunking for RAG pipelines
|
||||
- **embeddings**: Vector embeddings generation
|
||||
- **language-detection**: Detect document language
|
||||
- **keywords-yake** / **keywords-rake**: Extract keywords using YAKE or RAKE
|
||||
- **api**: HTTP API with Axum
|
||||
- **mcp**: Model Context Protocol support
|
||||
|
||||
---
|
||||
|
||||
## Core Extraction Functions
|
||||
|
||||
### `extract_file` (async)
|
||||
|
||||
Extract content from a file path.
|
||||
|
||||
```rust
|
||||
pub async fn extract_file(
|
||||
path: impl AsRef<Path>,
|
||||
mime_type: Option<&str>,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<ExtractionResult>
|
||||
```
|
||||
|
||||
**Always available.** Requires async context (`#[tokio::main]`, `tokio::spawn`, etc.).
|
||||
|
||||
```rust
|
||||
use kreuzberg::{extract_file, ExtractionConfig};
|
||||
use std::path::Path;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_file("document.pdf", None, &config).await?;
|
||||
println!("Content: {}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
### `extract_bytes` (async)
|
||||
|
||||
Extract content from byte data.
|
||||
|
||||
```rust
|
||||
pub async fn extract_bytes(
|
||||
data: &[u8],
|
||||
mime_type: &str,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<ExtractionResult>
|
||||
```
|
||||
|
||||
**Always available.** Requires async context.
|
||||
|
||||
```rust
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let pdf_bytes = std::fs::read("document.pdf")?;
|
||||
let result = extract_bytes(&pdf_bytes, "application/pdf", &config).await?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
### `extract_file_sync` (sync)
|
||||
|
||||
Synchronous wrapper around `extract_file`.
|
||||
|
||||
```rust
|
||||
pub fn extract_file_sync(
|
||||
path: impl AsRef<Path>,
|
||||
mime_type: Option<&str>,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<ExtractionResult>
|
||||
```
|
||||
|
||||
**Requires tokio-runtime feature.** Blocks the current thread using a global Tokio runtime.
|
||||
|
||||
```rust
|
||||
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
||||
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
println!("Content: {}", result.content);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
### `extract_bytes_sync` (sync)
|
||||
|
||||
Synchronous wrapper around `extract_bytes`.
|
||||
|
||||
```rust
|
||||
pub fn extract_bytes_sync(
|
||||
content: &[u8],
|
||||
mime_type: &str,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<ExtractionResult>
|
||||
```
|
||||
|
||||
**Always available.** Works in sync and async contexts.
|
||||
|
||||
```rust
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let bytes = b"Hello, world!";
|
||||
let result = extract_bytes_sync(bytes, "text/plain", &config)?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
### `batch_extract_file` (async, parallel)
|
||||
|
||||
Extract multiple files concurrently.
|
||||
|
||||
```rust
|
||||
pub async fn batch_extract_file(
|
||||
paths: Vec<impl AsRef<Path>>,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<Vec<ExtractionResult>>
|
||||
```
|
||||
|
||||
**Requires tokio-runtime feature.** Processes files in parallel with automatic concurrency management (defaults to `num_cpus * 1.5`).
|
||||
|
||||
```rust
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let paths = vec!["doc1.pdf", "doc2.pdf", "doc3.pdf"];
|
||||
let results = batch_extract_file(paths, &config).await?;
|
||||
println!("Processed {} files", results.len());
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
### `batch_extract_bytes` (async, parallel)
|
||||
|
||||
Extract multiple byte arrays concurrently.
|
||||
|
||||
```rust
|
||||
pub async fn batch_extract_bytes(
|
||||
contents: Vec<(Vec<u8>, String)>,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<Vec<ExtractionResult>>
|
||||
```
|
||||
|
||||
**Requires tokio-runtime feature.** Each tuple is `(bytes, mime_type)`.
|
||||
|
||||
```rust
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let contents = vec![
|
||||
(b"PDF content".to_vec(), "application/pdf".to_string()),
|
||||
(b"Text content".to_vec(), "text/plain".to_string()),
|
||||
];
|
||||
let results = batch_extract_bytes(contents, &config).await?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
### `batch_extract_file_sync` (sync, parallel)
|
||||
|
||||
Synchronous wrapper for batch file extraction.
|
||||
|
||||
```rust
|
||||
pub fn batch_extract_file_sync(
|
||||
paths: Vec<impl AsRef<Path>>,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<Vec<ExtractionResult>>
|
||||
```
|
||||
|
||||
**Requires tokio-runtime feature.** Uses global runtime for concurrency.
|
||||
|
||||
```rust
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let paths = vec!["doc1.pdf", "doc2.pdf"];
|
||||
let results = batch_extract_file_sync(paths, &config)?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
### `batch_extract_bytes_sync` (sync, parallel)
|
||||
|
||||
Synchronous wrapper for batch byte extraction.
|
||||
|
||||
```rust
|
||||
pub fn batch_extract_bytes_sync(
|
||||
contents: Vec<(Vec<u8>, String)>,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<Vec<ExtractionResult>>
|
||||
```
|
||||
|
||||
**Always available.** Each tuple is `(bytes, mime_type)`.
|
||||
|
||||
```rust
|
||||
fn main() -> kreuzberg::Result<()> {
|
||||
let config = ExtractionConfig::default();
|
||||
let contents = vec![
|
||||
(b"content 1".to_vec(), "text/plain".to_string()),
|
||||
(b"content 2".to_vec(), "text/plain".to_string()),
|
||||
];
|
||||
let results = batch_extract_bytes_sync(contents, &config)?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
### `FileExtractionConfig`
|
||||
|
||||
Per-file overrides for batch operations, passed as an optional parameter to `batch_extract_file` / `batch_extract_bytes` (and their sync variants). All fields are `Option<T>`; a value of `None` means the batch default is used for that field.
|
||||
|
||||
> **Note (v4.5.0):** The separate `batch_extract_file_with_configs` / `batch_extract_bytes_with_configs` functions have been removed. Per-file configs are now an optional parameter on the unified batch functions.
|
||||
|
||||
```rust
|
||||
pub struct FileExtractionConfig {
|
||||
pub enable_quality_processing: Option<bool>,
|
||||
pub ocr: Option<OcrConfig>,
|
||||
pub force_ocr: Option<bool>,
|
||||
pub chunking: Option<ChunkingConfig>,
|
||||
pub images: Option<ImageExtractionConfig>,
|
||||
pub pdf_options: Option<PdfConfig>,
|
||||
pub token_reduction: Option<TokenReductionConfig>,
|
||||
pub language_detection: Option<LanguageDetectionConfig>,
|
||||
pub pages: Option<PageConfig>,
|
||||
pub postprocessor: Option<PostProcessorConfig>,
|
||||
pub output_format: Option<OutputFormat>,
|
||||
pub include_document_structure: Option<bool>,
|
||||
}
|
||||
```
|
||||
|
||||
Excluded batch-level fields: `max_concurrent_extractions`, `use_cache`, `acceleration`, `security_limits`.
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
### `ExtractionConfig`
|
||||
|
||||
Main configuration struct for all extraction operations.
|
||||
|
||||
```rust
|
||||
pub struct ExtractionConfig {
|
||||
/// Enable caching (default: true)
|
||||
pub use_cache: bool,
|
||||
|
||||
/// Enable quality post-processing (default: true)
|
||||
pub enable_quality_processing: bool,
|
||||
|
||||
/// OCR configuration (None = OCR disabled)
|
||||
pub ocr: Option<OcrConfig>,
|
||||
|
||||
/// Force OCR even for searchable PDFs (default: false)
|
||||
pub force_ocr: bool,
|
||||
|
||||
/// Text chunking configuration (None = disabled)
|
||||
pub chunking: Option<ChunkingConfig>,
|
||||
|
||||
/// Image extraction configuration (None = disabled)
|
||||
pub images: Option<ImageExtractionConfig>,
|
||||
|
||||
/// PDF-specific options (requires pdf feature)
|
||||
#[cfg(feature = "pdf")]
|
||||
pub pdf_options: Option<PdfConfig>,
|
||||
|
||||
/// Token reduction configuration (None = disabled)
|
||||
pub token_reduction: Option<TokenReductionConfig>,
|
||||
|
||||
/// Language detection configuration (None = disabled)
|
||||
pub language_detection: Option<LanguageDetectionConfig>,
|
||||
|
||||
/// Page extraction configuration (None = disabled)
|
||||
pub pages: Option<PageConfig>,
|
||||
|
||||
/// Keyword extraction configuration (requires keywords-yake or keywords-rake)
|
||||
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
||||
pub keywords: Option<KeywordConfig>,
|
||||
|
||||
/// Post-processor configuration (None = use defaults)
|
||||
pub postprocessor: Option<PostProcessorConfig>,
|
||||
|
||||
/// HTML to Markdown conversion options (requires html feature)
|
||||
#[cfg(feature = "html")]
|
||||
pub html_options: Option<ConversionOptions>,
|
||||
|
||||
/// Maximum concurrent extractions in batch (None = num_cpus * 1.5)
|
||||
pub max_concurrent_extractions: Option<usize>,
|
||||
|
||||
/// Result structure format (default: Unified)
|
||||
/// Uses types::OutputFormat (Unified | ElementBased)
|
||||
pub result_format: types::OutputFormat,
|
||||
|
||||
/// Security limits for archives (requires archives feature)
|
||||
#[cfg(feature = "archives")]
|
||||
pub security_limits: Option<SecurityLimits>,
|
||||
|
||||
/// Content output format (default: Plain)
|
||||
/// Uses config::OutputFormat (Plain | Markdown | Djot | Html)
|
||||
pub output_format: OutputFormat,
|
||||
}
|
||||
```
|
||||
|
||||
#### Creating Configs
|
||||
|
||||
```rust
|
||||
use kreuzberg::{ExtractionConfig, OcrConfig, ChunkingConfig, OutputFormat};
|
||||
|
||||
// Default configuration
|
||||
let config = ExtractionConfig::default();
|
||||
|
||||
// With OCR
|
||||
let config = ExtractionConfig {
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// With chunking
|
||||
let config = ExtractionConfig {
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 512,
|
||||
overlap: 50,
|
||||
..Default::default()
|
||||
}),
|
||||
output_format: OutputFormat::Markdown,
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Output Formats
|
||||
|
||||
There are two separate enums both named `OutputFormat` in different modules:
|
||||
|
||||
### Content `OutputFormat` (`core::config::formats::OutputFormat`)
|
||||
|
||||
Controls the format of the `content` field text. Used by `ExtractionConfig::output_format`.
|
||||
|
||||
```rust
|
||||
pub enum OutputFormat {
|
||||
/// Plain text (default)
|
||||
Plain,
|
||||
/// Markdown formatted
|
||||
Markdown,
|
||||
/// Djot markup format
|
||||
Djot,
|
||||
/// HTML format
|
||||
Html,
|
||||
}
|
||||
```
|
||||
|
||||
### Result `OutputFormat` (`types::extraction::OutputFormat`)
|
||||
|
||||
Controls the result structure. Used by `ExtractionConfig::result_format`.
|
||||
|
||||
```rust
|
||||
pub enum OutputFormat {
|
||||
/// Unified format with all content in `content` field (default)
|
||||
Unified,
|
||||
/// Element-based format with semantic element extraction
|
||||
ElementBased,
|
||||
}
|
||||
```
|
||||
|
||||
```rust
|
||||
use kreuzberg::{ExtractionConfig, OutputFormat};
|
||||
|
||||
let config = ExtractionConfig {
|
||||
output_format: OutputFormat::Markdown, // content format (Plain/Markdown/Djot/Html)
|
||||
// result_format uses types::OutputFormat (Unified/ElementBased) — defaults to Unified
|
||||
..Default::default()
|
||||
};
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Extraction Result
|
||||
|
||||
### `ExtractionResult`
|
||||
|
||||
Result returned by all extraction functions.
|
||||
|
||||
```rust
|
||||
pub struct ExtractionResult {
|
||||
/// Main extracted content
|
||||
pub content: String,
|
||||
|
||||
/// Document MIME type
|
||||
pub mime_type: Cow<'static, str>,
|
||||
|
||||
/// Metadata about extraction
|
||||
pub metadata: Metadata,
|
||||
|
||||
/// Extracted tables (HTML/Markdown)
|
||||
pub tables: Vec<Table>,
|
||||
|
||||
/// Detected languages (if language-detection enabled)
|
||||
pub detected_languages: Option<Vec<String>>,
|
||||
|
||||
/// Text chunks (if chunking enabled)
|
||||
pub chunks: Option<Vec<Chunk>>,
|
||||
|
||||
/// Extracted images (if image extraction enabled)
|
||||
pub images: Option<Vec<ExtractedImage>>,
|
||||
|
||||
/// Per-page content (if page extraction enabled)
|
||||
pub pages: Option<Vec<PageContent>>,
|
||||
|
||||
/// Semantic elements (if element-based format enabled)
|
||||
pub elements: Option<Vec<Element>>,
|
||||
|
||||
/// Djot document structure (if extracting Djot)
|
||||
pub djot_content: Option<DjotContent>,
|
||||
|
||||
/// Extracted keywords with relevance scores (if keyword extraction enabled)
|
||||
pub extracted_keywords: Option<Vec<ExtractedKeyword>>,
|
||||
|
||||
/// Quality score for extraction result (0.0-1.0)
|
||||
pub quality_score: Option<f64>,
|
||||
|
||||
/// Non-fatal warnings during processing pipeline
|
||||
pub processing_warnings: Vec<ProcessingWarning>,
|
||||
}
|
||||
```
|
||||
|
||||
### `ExtractedKeyword`
|
||||
|
||||
Extracted keyword with relevance score and position information.
|
||||
|
||||
```rust
|
||||
pub struct ExtractedKeyword {
|
||||
/// Keyword text
|
||||
pub text: String,
|
||||
|
||||
/// Relevance score (0.0-1.0)
|
||||
pub score: f32,
|
||||
|
||||
/// Algorithm used for extraction ("tfidf", "textrank", "yake", etc.)
|
||||
pub algorithm: String,
|
||||
|
||||
/// Character positions in content (if available)
|
||||
pub positions: Option<Vec<usize>>,
|
||||
}
|
||||
```
|
||||
|
||||
### `ProcessingWarning`
|
||||
|
||||
Non-fatal warning encountered during document processing.
|
||||
|
||||
```rust
|
||||
pub struct ProcessingWarning {
|
||||
/// Component that generated the warning
|
||||
pub source: String,
|
||||
|
||||
/// Warning message describing the issue
|
||||
pub message: String,
|
||||
}
|
||||
```
|
||||
|
||||
### `Chunk`
|
||||
|
||||
Text chunk with optional embedding.
|
||||
|
||||
```rust
|
||||
pub struct Chunk {
|
||||
/// Chunk text content
|
||||
pub content: String,
|
||||
|
||||
/// Optional embedding vector
|
||||
pub embedding: Option<Vec<f32>>,
|
||||
|
||||
/// Chunk metadata
|
||||
pub metadata: ChunkMetadata,
|
||||
}
|
||||
|
||||
pub struct ChunkMetadata {
|
||||
pub byte_start: usize,
|
||||
pub byte_end: usize,
|
||||
pub token_count: Option<usize>,
|
||||
pub chunk_index: usize,
|
||||
pub total_chunks: usize,
|
||||
pub first_page: Option<usize>,
|
||||
pub last_page: Option<usize>,
|
||||
}
|
||||
```
|
||||
|
||||
### `ExtractedImage`
|
||||
|
||||
Image extracted from document.
|
||||
|
||||
```rust
|
||||
pub struct ExtractedImage {
|
||||
/// Raw image bytes
|
||||
pub data: Bytes,
|
||||
|
||||
/// Format: "jpeg", "png", "webp", etc.
|
||||
pub format: Cow<'static, str>,
|
||||
|
||||
/// Zero-indexed position
|
||||
pub image_index: usize,
|
||||
|
||||
/// Page number (1-indexed)
|
||||
pub page_number: Option<usize>,
|
||||
|
||||
/// Image dimensions
|
||||
pub width: Option<u32>,
|
||||
pub height: Option<u32>,
|
||||
|
||||
/// Colorspace: "RGB", "CMYK", "Gray"
|
||||
pub colorspace: Option<String>,
|
||||
|
||||
/// Bits per component
|
||||
pub bits_per_component: Option<u32>,
|
||||
|
||||
/// Whether this is a mask image
|
||||
pub is_mask: bool,
|
||||
|
||||
/// Image description
|
||||
pub description: Option<String>,
|
||||
|
||||
/// Nested OCR result (if OCRed)
|
||||
pub ocr_result: Option<Box<ExtractionResult>>,
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Error Handling
|
||||
|
||||
### `KreuzbergError` enum
|
||||
|
||||
```rust
|
||||
pub enum KreuzbergError {
|
||||
/// File system errors (always bubble up)
|
||||
Io(std::io::Error),
|
||||
|
||||
/// Document parsing errors
|
||||
Parsing {
|
||||
message: String,
|
||||
source: Option<Box<dyn std::error::Error + Send + Sync>>,
|
||||
},
|
||||
|
||||
/// OCR processing errors
|
||||
Ocr {
|
||||
message: String,
|
||||
source: Option<Box<dyn std::error::Error + Send + Sync>>,
|
||||
},
|
||||
|
||||
/// Configuration/input validation errors
|
||||
Validation {
|
||||
message: String,
|
||||
source: Option<Box<dyn std::error::Error + Send + Sync>>,
|
||||
},
|
||||
|
||||
/// Cache operation errors
|
||||
Cache {
|
||||
message: String,
|
||||
source: Option<Box<dyn std::error::Error + Send + Sync>>,
|
||||
},
|
||||
|
||||
/// Image processing errors
|
||||
ImageProcessing {
|
||||
message: String,
|
||||
source: Option<Box<dyn std::error::Error + Send + Sync>>,
|
||||
},
|
||||
|
||||
/// Serialization errors (JSON, MessagePack)
|
||||
Serialization {
|
||||
message: String,
|
||||
source: Option<Box<dyn std::error::Error + Send + Sync>>,
|
||||
},
|
||||
|
||||
/// Missing system dependency (e.g. Tesseract)
|
||||
MissingDependency(String),
|
||||
|
||||
/// Plugin-specific errors
|
||||
Plugin {
|
||||
message: String,
|
||||
plugin_name: String,
|
||||
},
|
||||
|
||||
/// Mutex/RwLock poisoning
|
||||
LockPoisoned(String),
|
||||
|
||||
/// Unsupported MIME type or format
|
||||
UnsupportedFormat(String),
|
||||
|
||||
/// Other errors
|
||||
Other(String),
|
||||
}
|
||||
```
|
||||
|
||||
#### Error Constructors
|
||||
|
||||
```rust
|
||||
use kreuzberg::KreuzbergError;
|
||||
|
||||
// Create errors
|
||||
let err = KreuzbergError::parsing("invalid PDF");
|
||||
let err = KreuzbergError::ocr("Tesseract failed");
|
||||
let err = KreuzbergError::validation("config invalid");
|
||||
let err = KreuzbergError::unsupported_format("application/unknown");
|
||||
let err = KreuzbergError::missing_dependency("tesseract");
|
||||
|
||||
// With source
|
||||
let source = std::io::Error::new(std::io::ErrorKind::NotFound, "file missing");
|
||||
let err = KreuzbergError::parsing_with_source("corrupt PDF", source);
|
||||
```
|
||||
|
||||
#### Handling Errors
|
||||
|
||||
```rust
|
||||
use kreuzberg::extract_file;
|
||||
|
||||
match extract_file("doc.pdf", None, &config).await {
|
||||
Ok(result) => println!("Success: {}", result.content),
|
||||
Err(kreuzberg::KreuzbergError::Io(e)) => {
|
||||
println!("File error: {}", e);
|
||||
}
|
||||
Err(kreuzberg::KreuzbergError::UnsupportedFormat(fmt)) => {
|
||||
println!("Unsupported: {}", fmt);
|
||||
}
|
||||
Err(e) => println!("Other error: {}", e),
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## MIME Type Detection
|
||||
|
||||
### `detect_mime_type`
|
||||
|
||||
Detect MIME type from file path.
|
||||
|
||||
```rust
|
||||
pub fn detect_mime_type(path: impl AsRef<Path>) -> Result<String>
|
||||
```
|
||||
|
||||
```rust
|
||||
use kreuzberg::detect_mime_type;
|
||||
|
||||
let mime = detect_mime_type("document.pdf")?;
|
||||
assert_eq!(mime, "application/pdf");
|
||||
```
|
||||
|
||||
### `detect_mime_type_from_bytes`
|
||||
|
||||
Detect MIME type from byte data.
|
||||
|
||||
```rust
|
||||
pub fn detect_mime_type_from_bytes(data: &[u8]) -> Result<String>
|
||||
```
|
||||
|
||||
### `validate_mime_type`
|
||||
|
||||
Check if a MIME type is supported.
|
||||
|
||||
```rust
|
||||
pub fn validate_mime_type(mime_type: &str) -> Result<()>
|
||||
```
|
||||
|
||||
```rust
|
||||
use kreuzberg::validate_mime_type;
|
||||
|
||||
validate_mime_type("application/pdf")?; // OK
|
||||
validate_mime_type("application/unknown")?; // Error
|
||||
```
|
||||
|
||||
### `get_extensions_for_mime`
|
||||
|
||||
Get file extensions for a MIME type.
|
||||
|
||||
```rust
|
||||
pub fn get_extensions_for_mime(mime_type: &str) -> Vec<String>
|
||||
```
|
||||
|
||||
```rust
|
||||
use kreuzberg::get_extensions_for_mime;
|
||||
|
||||
let exts = get_extensions_for_mime("application/pdf");
|
||||
// ["pdf"]
|
||||
|
||||
let exts = get_extensions_for_mime("text/plain");
|
||||
// ["txt", "text"]
|
||||
```
|
||||
|
||||
### MIME Type Constants
|
||||
|
||||
```rust
|
||||
use kreuzberg::{
|
||||
PDF_MIME_TYPE,
|
||||
PLAIN_TEXT_MIME_TYPE,
|
||||
HTML_MIME_TYPE,
|
||||
MARKDOWN_MIME_TYPE,
|
||||
JSON_MIME_TYPE,
|
||||
XML_MIME_TYPE,
|
||||
DOCX_MIME_TYPE,
|
||||
POWER_POINT_MIME_TYPE,
|
||||
EXCEL_MIME_TYPE,
|
||||
};
|
||||
|
||||
assert_eq!(PDF_MIME_TYPE, "application/pdf");
|
||||
assert_eq!(PLAIN_TEXT_MIME_TYPE, "text/plain");
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Plugin Registry
|
||||
|
||||
Access the registries for document extractors, OCR backends, post-processors, and validators.
|
||||
|
||||
### `get_document_extractor_registry`
|
||||
|
||||
Get all available document extractors.
|
||||
|
||||
```rust
|
||||
pub fn get_document_extractor_registry() -> Arc<RwLock<DocumentExtractorRegistry>>
|
||||
```
|
||||
|
||||
### `get_ocr_backend_registry`
|
||||
|
||||
Get all available OCR backends.
|
||||
|
||||
```rust
|
||||
pub fn get_ocr_backend_registry() -> Arc<RwLock<OcrBackendRegistry>>
|
||||
```
|
||||
|
||||
### `get_post_processor_registry`
|
||||
|
||||
Get all available post-processors.
|
||||
|
||||
```rust
|
||||
pub fn get_post_processor_registry() -> Arc<RwLock<PostProcessorRegistry>>
|
||||
```
|
||||
|
||||
### `get_validator_registry`
|
||||
|
||||
Get all available validators.
|
||||
|
||||
```rust
|
||||
pub fn get_validator_registry() -> Arc<RwLock<ValidatorRegistry>>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Complete Example
|
||||
|
||||
```rust
|
||||
use kreuzberg::{
|
||||
extract_file, ExtractionConfig, OutputFormat,
|
||||
ChunkingConfig, OcrConfig, LanguageDetectionConfig,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> kreuzberg::Result<()> {
|
||||
// Configure extraction
|
||||
let config = ExtractionConfig {
|
||||
output_format: OutputFormat::Markdown,
|
||||
chunking: Some(ChunkingConfig {
|
||||
max_characters: 512,
|
||||
overlap: 50,
|
||||
..Default::default()
|
||||
}),
|
||||
language_detection: Some(LanguageDetectionConfig::default()),
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
force_ocr: false,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Extract from file
|
||||
let result = extract_file("document.pdf", None, &config).await?;
|
||||
|
||||
// Use results
|
||||
println!("Content:\n{}", result.content);
|
||||
println!("MIME: {}", result.mime_type);
|
||||
|
||||
if let Some(langs) = result.detected_languages {
|
||||
println!("Languages: {:?}", langs);
|
||||
}
|
||||
|
||||
if let Some(chunks) = result.chunks {
|
||||
println!("Chunks: {}", chunks.len());
|
||||
for chunk in chunks {
|
||||
println!(" - {}", &chunk.content[..50.min(chunk.content.len())]);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(images) = result.images {
|
||||
println!("Images: {}", images.len());
|
||||
}
|
||||
|
||||
if let Some(pages) = result.pages {
|
||||
println!("Pages: {}", pages.len());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Result Type Alias
|
||||
|
||||
```rust
|
||||
pub type Result<T> = std::result::Result<T, KreuzbergError>;
|
||||
```
|
||||
|
||||
All fallible operations return `Result<T>` where errors are `KreuzbergError`.
|
||||
|
||||
---
|
||||
|
||||
## Feature Flags Summary
|
||||
|
||||
| Feature | Availability | Dependencies |
|
||||
| ------------------ | ------------ | ---------------------------------------------- |
|
||||
| tokio-runtime | Default | Tokio runtime for async/sync |
|
||||
| pdf | Default | PDFium |
|
||||
| ocr | Optional | Tesseract |
|
||||
| chunking | Optional | text-splitter |
|
||||
| embeddings | Optional | FastEmbed, requires tokio-runtime |
|
||||
| language-detection | Optional | whatlang |
|
||||
| keywords-yake | Optional | yake-rust |
|
||||
| keywords-rake | Optional | rake |
|
||||
| api | Optional | Axum, requires tokio-runtime |
|
||||
| mcp | Optional | Model Context Protocol, requires tokio-runtime |
|
||||
|
||||
---
|
||||
|
||||
## Version
|
||||
|
||||
This reference is for Kreuzberg 4.x.
|
||||
223
skills/kreuzberg/references/supported-formats.md
Normal file
223
skills/kreuzberg/references/supported-formats.md
Normal file
@@ -0,0 +1,223 @@
|
||||
# Supported Formats Reference
|
||||
|
||||
Kreuzberg supports 91+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction. All formats support text and metadata extraction. Additional capabilities like OCR and table detection are noted per format.
|
||||
|
||||
## Office Documents
|
||||
|
||||
### Word Processing
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ------------------ | ------------------------ | ------------------------------------------------------------------------- | --------------------------------------------------------------- |
|
||||
| Microsoft Word | `.docx` | `application/vnd.openxmlformats-officedocument.wordprocessingml.document` | Full text extraction, tables, embedded images, metadata, styles |
|
||||
| Word Macro-Enabled | `.docm` | `application/vnd.ms-word.document.macroEnabled.12` | Macro-enabled document extraction, metadata |
|
||||
| Word Template | `.dotx`, `.dotm`, `.dot` | Various Word template MIME types | Template document extraction, metadata |
|
||||
| OpenDocument Text | `.odt` | `application/vnd.oasis.opendocument.text` | Full text extraction, tables, embedded images, metadata, styles |
|
||||
|
||||
### Spreadsheets
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ------------------------ | ---------- | ---------------------------------------------------------------------- | -------------------------------------------------------- |
|
||||
| Excel Workbook | `.xlsx` | `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet` | Sheet data, cell values, formulas, cell metadata, charts |
|
||||
| Excel Macro-Enabled | `.xlsm` | `application/vnd.ms-excel.sheet.macroEnabled.12` | Sheet data, formulas, macros (text only), metadata |
|
||||
| Excel Binary | `.xlsb` | `application/vnd.ms-excel.sheet.binary.macroEnabled.12` | Binary sheet data extraction, metadata |
|
||||
| Excel Legacy | `.xls` | `application/vnd.ms-excel` | Legacy sheet data extraction, metadata |
|
||||
| Excel Add-in | `.xla` | `application/vnd.ms-excel` | Add-in data extraction |
|
||||
| Excel Macro Add-in | `.xlam` | `application/vnd.ms-excel.addin.macroEnabled.12` | Macro add-in metadata |
|
||||
| Excel Template | `.xltm` | `application/vnd.ms-excel.template.macroEnabled.12` | Template data and metadata |
|
||||
| Excel Template (XML) | `.xltx` | `application/vnd.openxmlformats-officedocument.spreadsheetml.template` | XML template data and metadata |
|
||||
| Excel Template (Legacy) | `.xlt` | `application/vnd.ms-excel` | Legacy template data extraction |
|
||||
| OpenDocument Spreadsheet | `.ods` | `application/vnd.oasis.opendocument.spreadsheet` | Sheet data, cell values, formulas, metadata |
|
||||
|
||||
### Presentations
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ----------------------- | ------------------------ | --------------------------------------------------------------------------- | ---------------------------------------------------- |
|
||||
| PowerPoint Presentation | `.pptx` | `application/vnd.openxmlformats-officedocument.presentationml.presentation` | Slide text, speaker notes, embedded images, metadata |
|
||||
| PowerPoint Legacy | `.ppt` | `application/vnd.ms-powerpoint` | Legacy slide text extraction, metadata |
|
||||
| PowerPoint Slideshow | `.ppsx` | `application/vnd.openxmlformats-officedocument.presentationml.slideshow` | Slideshow content, speaker notes, metadata |
|
||||
| PowerPoint Template | `.potx`, `.potm`, `.pot` | Various PowerPoint template MIME types | Template slide extraction, metadata |
|
||||
|
||||
### PDF
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ------------------------ | ---------- | ----------------- | -------------------------------------------------------------------------------------------------- |
|
||||
| Portable Document Format | `.pdf` | `application/pdf` | Text extraction, tables, embedded images, metadata, OCR (when needed), password protection support |
|
||||
|
||||
### eBooks
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ----------- | ---------- | ------------------------------- | ------------------------------------------------------ |
|
||||
| EPUB | `.epub` | `application/epub+zip` | Chapter text, metadata, embedded resources, navigation |
|
||||
| FictionBook | `.fb2` | `application/x-fictionbook+xml` | Book content, metadata, chapter structure |
|
||||
|
||||
### Database
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ------ | ---------- | ------------------- | ----------------------------------------------------- |
|
||||
| dBASE | `.dbf` | `application/x-dbf` | Table data extraction as markdown, field type support |
|
||||
|
||||
### Hangul
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| --------------------- | --------------- | ----------------------------------------------- | --------------------------------------- |
|
||||
| Hangul Word Processor | `.hwp`, `.hwpx` | `application/x-hwp`, `application/haansofthwpx` | Korean document format, text extraction |
|
||||
|
||||
## Images (OCR-Enabled)
|
||||
|
||||
### Raster Images
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ------ | --------------- | ------------ | ---------------------------------------------------------------------------- |
|
||||
| PNG | `.png` | `image/png` | OCR text extraction, table detection, EXIF metadata, dimensions, color space |
|
||||
| JPEG | `.jpg`, `.jpeg` | `image/jpeg` | OCR text extraction, table detection, EXIF metadata, color profile |
|
||||
| GIF | `.gif` | `image/gif` | OCR text extraction, animation metadata, dimensions |
|
||||
| WebP | `.webp` | `image/webp` | OCR text extraction, metadata, lossy/lossless detection |
|
||||
| Bitmap | `.bmp` | `image/bmp` | OCR text extraction, dimensions, color depth |
|
||||
| TIFF | `.tiff`, `.tif` | `image/tiff` | OCR text extraction, multi-page support, EXIF metadata, compression info |
|
||||
|
||||
### Advanced Image Formats
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ------------------ | ------------------------------ | ------------------------- | -------------------------------------------------------------------------------- |
|
||||
| JPEG 2000 | `.jp2` | `image/jp2` | OCR via pure Rust decoder (hayro-jpeg2000), table detection, resolution metadata |
|
||||
| JPEG 2000 Extended | `.jpx` | `image/jpx` | Advanced JPEG 2000 features, high-resolution content, metadata |
|
||||
| JPEG 2000 Compound | `.jpm` | `image/jpm` | Compound image support, mixed content |
|
||||
| Motion JPEG 2000 | `.mj2` | `video/mj2` | JPEG 2000 video/sequence metadata |
|
||||
| JBIG2 | `.jbig2`, `.jb2` | `image/jbig2` | Bi-level image OCR, high compression, technical documents |
|
||||
| Portable PixMap | `.pnm`, `.pbm`, `.pgm`, `.ppm` | `image/x-portable-pixmap` | OCR for plain image formats, raw pixel data |
|
||||
|
||||
### Vector Graphics
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ------------------------ | ---------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| Scalable Vector Graphics | `.svg` | `image/svg+xml` | DOM parsing, embedded text extraction, graphics metadata, vector elements |
|
||||
|
||||
## Web & Data
|
||||
|
||||
### Markup & Structured Text
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ---------------- | --------------- | ----------------------- | ---------------------------------------------------------------------------------- |
|
||||
| HyperText Markup | `.html`, `.htm` | `text/html` | DOM parsing, text extraction, metadata (Open Graph, Twitter Card), link extraction |
|
||||
| XHTML | `.xhtml` | `application/xhtml+xml` | XHTML parsing, metadata extraction, semantic structure |
|
||||
| XML | `.xml` | `application/xml` | DOM parsing, namespace handling, text extraction, structure analysis |
|
||||
|
||||
### Structured Data Formats
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ------ | --------------- | --------------------------- | ---------------------------------------------------------- |
|
||||
| JSON | `.json` | `application/json` | Schema detection, nested structure parsing, validation |
|
||||
| YAML | `.yaml`, `.yml` | `application/x-yaml` | Hierarchical data parsing, custom tags, nested structures |
|
||||
| TOML | `.toml` | `application/toml` | Configuration parsing, table structures, type preservation |
|
||||
| CSV | `.csv` | `text/csv` | Delimiter detection, header inference, type detection |
|
||||
| TSV | `.tsv` | `text/tab-separated-values` | Tab-separated value parsing, header detection |
|
||||
|
||||
### Text & Markup Languages
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ---------------- | ------------------ | ----------------- | ------------------------------------------------- |
|
||||
| Plain Text | `.txt` | `text/plain` | Raw text extraction, encoding detection |
|
||||
| Markdown | `.md`, `.markdown` | `text/markdown` | CommonMark parsing, GFM extensions, front matter |
|
||||
| Djot | `.djot` | `text/djot` | Djot format parsing, semantic structure |
|
||||
| reStructuredText | `.rst` | `text/x-rst` | RST parsing, directive handling, role extraction |
|
||||
| Org Mode | `.org` | `text/org` | Org mode structure, outline parsing, metadata |
|
||||
| Rich Text Format | `.rtf` | `application/rtf` | Text with formatting extraction, font information |
|
||||
|
||||
## Email & Archives
|
||||
|
||||
### Email Formats
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ----------------- | ---------- | ---------------------------- | -------------------------------------------------------------------------------------- |
|
||||
| Email Message | `.eml` | `message/rfc822` | Headers (from, to, subject, date), body (HTML/plain text), attachments, threading info |
|
||||
| Microsoft Outlook | `.msg` | `application/vnd.ms-outlook` | Outlook headers, body content, attachments, recipient metadata |
|
||||
|
||||
### Archive Formats
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ----------- | ---------- | ----------------------------- | ---------------------------------------------------------- |
|
||||
| ZIP Archive | `.zip` | `application/zip` | File listing, nested archive support, compression metadata |
|
||||
| Tar Archive | `.tar` | `application/x-tar` | File listing, permission metadata, nested archives |
|
||||
| Gzip Tar | `.tgz` | `application/gzip` | Compressed archive listing, metadata |
|
||||
| Gzip | `.gz` | `application/gzip` | Compressed file metadata |
|
||||
| 7-Zip | `.7z` | `application/x-7z-compressed` | File listing, compression info, nested archives |
|
||||
|
||||
## Academic & Scientific
|
||||
|
||||
### Citation Formats
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ----------------------- | ----------- | ---------------------------------------- | ------------------------------------------------- |
|
||||
| BibTeX | `.bib` | `text/bibtex` | Structured parsing, entry types, field extraction |
|
||||
| BibLaTeX | `.biblatex` | `text/bibtex` | Extended BibTeX format, advanced field support |
|
||||
| RIS | `.ris` | `application/x-research-info-systems` | Structured RIS format parsing, type detection |
|
||||
| NIH RIS | `.nbib` | `application/x-research-info-systems` | NIH/PubMed format, structured citation data |
|
||||
| EndNote | `.enw` | `application/x-endnote` | EndNote XML format, citation metadata |
|
||||
| Citation Style Language | `.csl` | `application/vnd.citationstyles.csl+xml` | CSL JSON/XML parsing, style definitions |
|
||||
|
||||
### Scientific & Technical Formats
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ---------------- | ---------------- | -------------------------- | ----------------------------------------------------------- |
|
||||
| LaTeX | `.tex`, `.latex` | `application/x-latex` | LaTeX source parsing, commands, document structure |
|
||||
| Typst | `.typ` | `text/plain` | Typst markup parsing, document structure |
|
||||
| JATS XML | `.jats` | `application/xml` | PubMed JATS parsing, article structure, metadata |
|
||||
| Jupyter Notebook | `.ipynb` | `application/x-ipynb+json` | Cell extraction (code + markdown), output parsing, metadata |
|
||||
| DocBook | `.docbook` | `application/docbook+xml` | DocBook XML parsing, semantic structure |
|
||||
|
||||
### Documentation Formats
|
||||
|
||||
| Format | Extensions | MIME Type | Capabilities |
|
||||
| ----------- | ---------- | ------------------------ | ----------------------------------------------- |
|
||||
| OPML | `.opml` | `application/x-opml+xml` | Outline parsing, hierarchy extraction, metadata |
|
||||
| Perl POD | `.pod` | `text/x-pod` | Perl documentation parsing, section extraction |
|
||||
| Manual Page | `.mdoc` | `text/plain` | UNIX manual page parsing, section structure |
|
||||
| Troff/Groff | `.troff` | `text/troff` | Typesetting markup parsing, document structure |
|
||||
|
||||
## Format Capabilities Summary
|
||||
|
||||
### Text Extraction
|
||||
|
||||
All 91+ supported formats provide full or partial text extraction. Document structure and encoding are automatically detected.
|
||||
|
||||
### Metadata Support
|
||||
|
||||
Comprehensive metadata extraction includes:
|
||||
|
||||
- Document properties (title, author, subject, creation date, modification date)
|
||||
- Format-specific metadata (page count, dimensions, encoding, language)
|
||||
- EXIF data (for images)
|
||||
- Document statistics (word count, character count)
|
||||
|
||||
### OCR (Optical Character Recognition)
|
||||
|
||||
OCR is available for the image formats below and for PDFs when needed (for example, scanned pages), using a configurable backend:
|
||||
|
||||
- **Raster Images**: PNG, JPEG, GIF, WebP, BMP, TIFF
|
||||
- **Advanced Formats**: JPEG 2000, JBIG2, PNM/PBM/PGM/PPM
|
||||
- **Configurable Backends**: Tesseract (all languages), EasyOCR, PaddleOCR (Python), Guten (Node.js)
|
||||
|
||||
### Table Detection
|
||||
|
||||
Smart table detection and reconstruction available for:
|
||||
|
||||
- PDF documents (native tables and scanned content with OCR)
|
||||
- Office documents (Excel, Word)
|
||||
- Images (via OCR backends)
|
||||
- HTML/XML (from markup structure)
|
||||
|
||||
### Archive & Nested Document Support
|
||||
|
||||
Archives and nested formats support file listing and sequential extraction:
|
||||
|
||||
- ZIP, TAR, TGZ, 7Z archives
|
||||
- Email attachments
|
||||
- Nested archives within archives
|
||||
|
||||
## Getting Started
|
||||
|
||||
For language-specific examples and detailed API documentation, see the [API Reference](https://docs.kreuzberg.dev/reference/api-python/).
|
||||
|
||||
For OCR configuration and backend selection, see the [OCR Backends Guide](https://docs.kreuzberg.dev/guides/ocr/).
|
||||
|
||||
For comprehensive format details and format detection, see the [Complete Format Reference](https://docs.kreuzberg.dev/reference/formats/).
|
||||
Reference in New Issue
Block a user