Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,29 @@
```python title="Python"
from kreuzberg import (
extract_file_sync,
ExtractionConfig,
OcrConfig,
ChunkingConfig,
TokenReductionConfig,
LanguageDetectionConfig,
)
# Build each feature's settings first, then assemble the extraction config.
ocr_settings = OcrConfig(backend="tesseract", language="eng+deu")
chunk_settings = ChunkingConfig(max_chars=1000, max_overlap=100)
reduction_settings = TokenReductionConfig(mode="light")
language_settings = LanguageDetectionConfig(enabled=True, detect_multiple=True)
config = ExtractionConfig(
    ocr=ocr_settings,
    chunking=chunk_settings,
    token_reduction=reduction_settings,
    language_detection=language_settings,
    use_cache=True,
    enable_quality_processing=True,
)
result = extract_file_sync("document.pdf", config=config)

# Fall back to an empty list when `result.chunks` is falsy.
chunks = result.chunks or []
for chunk in chunks:
    preview = chunk.content[:100]
    print(f"Chunk: {preview}")

languages = result.detected_languages
if languages:
    print(f"Languages: {languages}")
```

View File

@@ -0,0 +1,18 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig
chunking = ChunkingConfig(max_characters=500, overlap=50)
config = ExtractionConfig(chunking=chunking)
result = extract_file_sync("document.pdf", config=config)

# Print a preview of every chunk together with the page span it covers.
for chunk in result.chunks or []:
    meta = chunk.metadata
    first, last = meta.first_page, meta.last_page
    if first is None:
        # Chunks without a first page are skipped.
        continue
    if first == last:
        page_range = f"Page {first}"
    else:
        page_range = f"Pages {first}-{last}"
    print(f"Chunk: {chunk.content[:50]}... ({page_range})")
```

View File

@@ -0,0 +1,59 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with chunking enabled and print each chunk's length."""
    chunking = ChunkingConfig(max_characters=1000, overlap=200)
    result = await extract_file(
        "document.pdf",
        config=ExtractionConfig(chunking=chunking),
    )
    # Fall back to an empty list when `result.chunks` is falsy.
    chunks = result.chunks or []
    for chunk in chunks:
        size = len(chunk.content)
        print(f"Length: {size}")
asyncio.run(main())
```
```python title="Python - Semantic"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with ``chunker_type="semantic"`` and preview each chunk."""
    semantic_chunking = ChunkingConfig(chunker_type="semantic")
    config = ExtractionConfig(chunking=semantic_chunking)
    result = await extract_file("document.pdf", config=config)
    for chunk in result.chunks or []:
        preview = chunk.content[:100]
        print(f"Content: {preview}...")
asyncio.run(main())
```
```python title="Python - Prepend Heading Context"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Extract ``document.md`` with markdown chunking and ``prepend_heading_context`` enabled."""
    chunking = ChunkingConfig(
        chunker_type="markdown",
        max_characters=500,
        overlap=50,
        prepend_heading_context=True,
    )
    result = await extract_file(
        "document.md",
        config=ExtractionConfig(chunking=chunking),
    )
    chunks = result.chunks or []
    for chunk in chunks:
        # Each chunk's content is prefixed with its heading breadcrumb
        preview = chunk.content[:100]
        print(f"Content: {preview}...")
asyncio.run(main())
```

View File

@@ -0,0 +1,39 @@
```python title="Python"
import asyncio
from kreuzberg import (
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
extract_file,
)
async def main() -> None:
    """Chunk ``research_paper.pdf`` with embeddings and print details of every chunk."""
    embedding = EmbeddingConfig(
        model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
        normalize=True,
    )
    chunking = ChunkingConfig(max_characters=500, overlap=50, embedding=embedding)
    result = await extract_file(
        "research_paper.pdf",
        config=ExtractionConfig(chunking=chunking),
    )
    for chunk in result.chunks or []:
        meta = chunk.metadata
        # The index is shifted by one for display.
        position = meta.chunk_index + 1
        print(f"Chunk {position}/{meta.total_chunks}")
        print(f"Position: {meta.byte_start}-{meta.byte_end}")
        print(f"Content: {chunk.content[:100]}...")
        vector = chunk.embedding
        if vector is not None:
            print(f"Embedding: {len(vector)} dimensions")
asyncio.run(main())
```

View File

@@ -0,0 +1,34 @@
```python title="Python"
import asyncio
from kreuzberg import (
extract_file,
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
LanguageDetectionConfig,
TokenReductionConfig,
)
async def main() -> None:
    """Extract ``document.pdf`` with several features enabled and print a short summary."""
    embedding = EmbeddingConfig(
        model=EmbeddingModelType.preset("balanced"),
        normalize=True,
    )
    config = ExtractionConfig(
        enable_quality_processing=True,
        language_detection=LanguageDetectionConfig(enabled=True),
        token_reduction=TokenReductionConfig(mode="moderate"),
        chunking=ChunkingConfig(max_chars=512, max_overlap=50, embedding=embedding),
    )
    result = await extract_file("document.pdf", config=config)

    # A falsy quality score is reported as 0.
    quality = result.quality_score or 0
    print(f"Quality: {quality:.2f}")
    print(f"Languages: {result.detected_languages}")
    chunks = result.chunks
    if chunks:
        print(f"Chunks: {len(chunks)}")
asyncio.run(main())
```

View File

@@ -0,0 +1,36 @@
```python title="Python"
import asyncio
from kreuzberg import (
extract_file,
ExtractionConfig,
OcrConfig,
TesseractConfig,
PdfConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
async def main() -> None:
    """Extract ``document.pdf`` with OCR, PDF and chunking options and print a content preview."""
    ocr = OcrConfig(
        backend="tesseract",
        language="eng+fra",
        tesseract_config=TesseractConfig(psm=3),
    )
    pdf_options = PdfConfig(extract_images=True)
    chunking = ChunkingConfig(
        max_chars=1000,
        max_overlap=200,
        embedding=EmbeddingConfig(
            model=EmbeddingModelType.preset("all-MiniLM-L6-v2"),
        ),
    )
    config = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True,
        ocr=ocr,
        pdf_options=pdf_options,
        chunking=chunking,
    )
    result = await extract_file("document.pdf", config=config)
    preview = result.content[:100]
    print(f"Content: {preview}")
asyncio.run(main())
```

View File

@@ -0,0 +1,21 @@
```python title="Python"
from kreuzberg import (
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
# Embedding settings attached to the chunking configuration below.
embedding_settings = EmbeddingConfig(
    model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
    normalize=True,
    batch_size=32,
    show_download_progress=False,
)
config: ExtractionConfig = ExtractionConfig(
    chunking=ChunkingConfig(
        max_characters=1024,
        overlap=100,
        embedding=embedding_settings,
    ),
)
```

View File

@@ -0,0 +1,13 @@
```python title="Python"
from kreuzberg import ExtractionConfig, KeywordConfig, KeywordAlgorithm
# Keyword extraction settings using the YAKE algorithm.
keyword_settings = KeywordConfig(
    algorithm=KeywordAlgorithm.YAKE,
    max_keywords=10,
    min_score=0.3,
    ngram_range=[1, 3],
    language="en",
)
config: ExtractionConfig = ExtractionConfig(keywords=keyword_settings)
```

View File

@@ -0,0 +1,27 @@
```python title="Python"
import asyncio
from kreuzberg import (
ExtractionConfig,
KeywordConfig,
KeywordAlgorithm,
extract_file,
)
async def main() -> None:
    """Extract keywords from ``research_paper.pdf`` and print each with its score."""
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.3,
    )
    result = await extract_file(
        "research_paper.pdf",
        config=ExtractionConfig(keywords=keyword_settings),
    )
    # Fall back to an empty list when no keywords are present.
    keywords = result.extracted_keywords or []
    for keyword in keywords:
        print(f"{keyword.text}: {keyword.score:.3f}")
asyncio.run(main())
```

View File

@@ -0,0 +1,11 @@
```python title="Python"
from kreuzberg import ExtractionConfig, LanguageDetectionConfig
# Language detection with a 0.8 confidence floor and multi-language detection off.
detection_settings = LanguageDetectionConfig(
    enabled=True,
    min_confidence=0.8,
    detect_multiple=False,
)
config: ExtractionConfig = ExtractionConfig(language_detection=detection_settings)
```

View File

@@ -0,0 +1,21 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
async def main() -> None:
    """Run language detection on ``multilingual_document.pdf`` and print the result."""
    detection = LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.8,
        detect_multiple=True,
    )
    config = ExtractionConfig(language_detection=detection)
    result = await extract_file("multilingual_document.pdf", config=config)
    languages = result.detected_languages
    print(f"Detected languages: {languages}")
asyncio.run(main())
```

View File

@@ -0,0 +1,7 @@
```python title="Python"
from kreuzberg import ExtractionConfig
# Only the quality-processing flag is set; no other option is passed.
config: ExtractionConfig = ExtractionConfig(enable_quality_processing=True)
```

View File

@@ -0,0 +1,21 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, extract_file
async def main() -> None:
    """Extract ``scanned_document.pdf`` and report its quality score, warning below 0.5."""
    config = ExtractionConfig(enable_quality_processing=True)
    result = await extract_file("scanned_document.pdf", config=config)

    score = result.quality_score
    if score is None:
        # Nothing to report when no score is available.
        return
    if score < 0.5:
        print(f"Warning: Low quality extraction ({score:.2f})")
    else:
        print(f"Quality score: {score:.2f}")
asyncio.run(main())
```

View File

@@ -0,0 +1,12 @@
```python title="Python"
from kreuzberg import ExtractionConfig, TokenReductionConfig, ReductionLevel
# Token reduction at the MODERATE level with markdown and code preservation enabled.
reduction_settings = TokenReductionConfig(
    level=ReductionLevel.MODERATE,
    preserve_markdown=True,
    preserve_code=True,
    language_hint="eng",
)
config: ExtractionConfig = ExtractionConfig(token_reduction=reduction_settings)
```

View File

@@ -0,0 +1,25 @@
```python title="Python"
import asyncio
from kreuzberg import (
ExtractionConfig,
TokenReductionConfig,
ReductionLevel,
extract_file,
)
async def main() -> None:
    """Extract ``verbose_document.pdf`` with token reduction and print the resulting length."""
    reduction = TokenReductionConfig(
        level=ReductionLevel.MODERATE,
        preserve_markdown=True,
    )
    config = ExtractionConfig(token_reduction=reduction)
    result = await extract_file("verbose_document.pdf", config=config)
    reduced_length = len(result.content)
    print(f"Reduced content length: {reduced_length} chars")
asyncio.run(main())
```

View File

@@ -0,0 +1,59 @@
```python title="Python"
import asyncio
from dataclasses import dataclass, field
from kreuzberg import (
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
extract_file,
)
@dataclass
class VectorRecord:
    """One embedded chunk, shaped as a record for a vector store."""

    # Unique identifier of the record.
    id: str
    # Text of the chunk.
    content: str
    # Embedding vector of the chunk.
    embedding: list[float]
    # Free-form string metadata; every instance gets its own fresh dict.
    metadata: dict[str, str] = field(default_factory=dict)
async def extract_and_vectorize(
    document_path: str,
    document_id: str,
) -> list[VectorRecord]:
    """Extract a document and turn its embedded chunks into vector records.

    Args:
        document_path: Path of the document passed to ``extract_file``.
        document_id: Identifier used in record ids and metadata.

    Returns:
        One ``VectorRecord`` per chunk that has an embedding. Chunks whose
        embedding is ``None`` are skipped, but still count towards the index.
    """
    embedding = EmbeddingConfig(
        model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
        normalize=True,
        batch_size=32,
    )
    chunking = ChunkingConfig(max_characters=512, overlap=50, embedding=embedding)
    result = await extract_file(
        document_path,
        config=ExtractionConfig(chunking=chunking),
    )
    return [
        VectorRecord(
            id=f"{document_id}_chunk_{index}",
            content=chunk.content,
            embedding=chunk.embedding,
            metadata={
                "document_id": document_id,
                "chunk_index": str(index),
                "content_length": str(len(chunk.content)),
            },
        )
        for index, chunk in enumerate(result.chunks or [])
        if chunk.embedding is not None
    ]
asyncio.run(extract_and_vectorize("document.pdf", "doc_001"))
```

View File

@@ -0,0 +1,13 @@
```python title="Python"
from kreuzberg import batch_extract_bytes_sync, BatchBytesItem, ExtractionConfig
# Pair each in-memory payload with its MIME type.
payloads = (
    (b"PDF content", "application/pdf"),
    (b"<html>...</html>", "text/html"),
)
items = [
    BatchBytesItem(content=data, mime_type=mime_type)
    for data, mime_type in payloads
]
results = batch_extract_bytes_sync(items, ExtractionConfig())
for i, result in enumerate(results):
    extracted_chars = len(result.content)
    print(f"Item {i}: {extracted_chars} chars extracted")
```

View File

@@ -0,0 +1,14 @@
```python title="Python"
from kreuzberg import batch_extract_files_sync, BatchFileItem, ExtractionConfig
# One batch item per input file.
paths = ("doc1.pdf", "doc2.docx", "doc3.html")
items = [BatchFileItem(path=path) for path in paths]
results = batch_extract_files_sync(items, ExtractionConfig())
for i, result in enumerate(results):
    char_count = len(result.content)
    table_count = len(result.tables)
    print(f"Document {i}: {char_count} chars, {table_count} tables")
```

View File

@@ -0,0 +1,29 @@
```python title="Python"
import asyncio
import httpx
async def main() -> None:
    """POST a text payload to the local ``/chunk`` endpoint and print the chunks.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    payload = {
        "text": "Your long text content here...",
        "chunker_type": "text",
        "config": {
            "max_characters": 1000,
            "overlap": 50,
            "trim": True,
        },
    }
    async with httpx.AsyncClient() as client:
        response = await client.post("http://localhost:8000/chunk", json=payload)
        # Fail loudly on an error status instead of indexing into an error
        # body as if it were a successful chunking result.
        response.raise_for_status()
        result = response.json()
        print(f"Created {result['chunk_count']} chunks")
        for chunk in result["chunks"]:
            preview = chunk["content"][:50]
            print(f"Chunk {chunk['chunk_index']}: {preview}...")
asyncio.run(main())
```

View File

@@ -0,0 +1,19 @@
```python title="Python"
import asyncio
import json
import httpx
async def main() -> None:
    """Upload ``document.pdf`` to the local ``/extract`` endpoint and pretty-print the JSON reply.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    async with httpx.AsyncClient() as client:
        # `open()` returns a synchronous context manager, so it cannot be listed
        # in the `async with` header; it needs its own plain `with` statement.
        with open("document.pdf", "rb") as f:
            response = await client.post(
                "http://localhost:8000/extract",
                files={"files": f},
            )
        # Surface HTTP errors before trying to decode the body as JSON.
        response.raise_for_status()
        data = response.json()
        print(json.dumps(data, indent=2))
asyncio.run(main())
```

View File

@@ -0,0 +1,52 @@
```python title="Python"
from kreuzberg import (
ExtractionConfig,
OcrConfig,
ChunkingConfig,
ChunkerType,
ImageExtractionConfig,
OutputFormat,
extract_file_sync,
)
# OCR: extract text from images, fallback to Tesseract
ocr_settings = OcrConfig(
    enabled=True,
    backend="tesseract",
    language="eng",
)
# Chunking: semantic markdown chunks of ~800 chars, 100-char overlap
chunk_settings = ChunkingConfig(
    max_characters=800,
    overlap=100,
    chunker_type=ChunkerType.Markdown,
    prepend_heading_context=True,
)
# Images: extract embedded images
image_settings = ImageExtractionConfig(extract_images=True)

config = ExtractionConfig(
    ocr=ocr_settings,
    chunking=chunk_settings,
    # Output: Markdown format with document structure preserved
    output_format=OutputFormat.Markdown,
    include_document_structure=True,
    images=image_settings,
    # Cache extracted results on disk
    use_cache=True,
)
result = extract_file_sync("report.pdf", config=config)

content = result.content
print(f"Content ({len(content)} chars):")
print(content[:200])

# Optional parts of the result are printed only when they are truthy.
chunks = result.chunks
if chunks:
    print(f"\nChunks: {len(chunks)}")
print(f"Tables: {len(result.tables)}")
languages = result.detected_languages
if languages:
    print(f"Languages: {languages}")
method = result.extraction_method
if method:
    print(f"Extraction method: {method}")
```

View File

@@ -0,0 +1,11 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, KreuzbergError
config = ExtractionConfig()
try:
    result = extract_file_sync("missing.pdf", config=config)
except KreuzbergError as error:
    # Report the failure, then let the exception propagate to the caller.
    print(f"Extraction failed: {error}")
    raise
```

View File

@@ -0,0 +1,27 @@
```python title="Python"
from kreuzberg import (
batch_extract_files_sync,
BatchFileItem,
ExtractionConfig,
KreuzbergError,
)
paths = ("doc1.pdf", "doc2.docx", "missing.html")
items = [BatchFileItem(path=path) for path in paths]
config = ExtractionConfig()
try:
    results = batch_extract_files_sync(items, config=config)
    for i, result in enumerate(results):
        # A truthy `metadata.error` marks a per-document failure.
        error = result.metadata.error
        if error:
            print(f"Document {i}: ERROR - {error}")
            continue
        print(f"Document {i}: {len(result.content)} chars, {len(result.tables)} tables")
except KreuzbergError as exc:
    # Report the batch-level failure, then re-raise it.
    print(f"Batch extraction failed: {exc}")
    raise
```

View File

@@ -0,0 +1,14 @@
```python title="Python"
import asyncio
from kreuzberg import extract_bytes, ExtractionConfig
async def main() -> None:
    """Read ``document.pdf`` into memory, extract it from bytes and print a short summary."""
    with open("document.pdf", "rb") as pdf_file:
        content = pdf_file.read()
    config = ExtractionConfig()
    result = await extract_bytes(content, "application/pdf", config=config)
    preview = result.content[:200]
    print(preview)
    print(f"Tables: {len(result.tables)}")
asyncio.run(main())
```

View File

@@ -0,0 +1,11 @@
```python title="Python"
from kreuzberg import extract_bytes_sync, ExtractionConfig
# Load the whole file into memory, then extract from the raw bytes.
with open("document.pdf", "rb") as pdf_file:
    content = pdf_file.read()
result = extract_bytes_sync(
    content,
    "application/pdf",
    config=ExtractionConfig(),
)
preview = result.content[:200]
print(preview)
print(f"Tables: {len(result.tables)}")
```

View File

@@ -0,0 +1,12 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file, ExtractionConfig
async def main() -> None:
    """Extract ``document.pdf`` with a default config and print a short summary."""
    config = ExtractionConfig()
    result = await extract_file("document.pdf", config=config)
    preview = result.content[:200]
    print(preview)
    print(f"Tables: {len(result.tables)}")
    print(f"Format: {result.metadata.format_type}")
asyncio.run(main())
```

View File

@@ -0,0 +1,9 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig
# Synchronous extraction with a default config, followed by a short summary.
result = extract_file_sync("document.pdf", config=ExtractionConfig())
preview = result.content[:200]
print(preview)
table_count = len(result.tables)
print(f"Tables: {table_count}")
print(f"Format: {result.metadata.format_type}")
```

View File

@@ -0,0 +1,51 @@
```python title="simple_benchmark.py"
import time
import asyncio
from pathlib import Path
from kreuzberg import Kreuzberg, ExtractionConfig
async def benchmark_extractions():
    """Time sequential, gathered and cached extractions of ``document.pdf`` and print the results."""
    file_path = "document.pdf"
    num_runs = 10
    kreuzberg = Kreuzberg(ExtractionConfig(use_cache=False))

    # Phase 1: run the extraction `num_runs` times, one after another.
    phase_start = time.perf_counter()
    for _ in range(num_runs):
        kreuzberg.extract_file(file_path)
    sync_duration = time.perf_counter() - phase_start
    avg_sync = sync_duration / num_runs
    print(f"Sync extraction ({num_runs} runs):")
    print(f" - Total time: {sync_duration:.3f}s")
    print(f" - Average: {avg_sync:.3f}s per extraction")

    # Phase 2: start all runs up front and await them together.
    phase_start = time.perf_counter()
    pending = [kreuzberg.extract_file_async(file_path) for _ in range(num_runs)]
    await asyncio.gather(*pending)
    async_duration = time.perf_counter() - phase_start
    print(f"\nAsync extraction ({num_runs} parallel runs):")
    print(f" - Total time: {async_duration:.3f}s")
    print(f" - Average: {async_duration / num_runs:.3f}s per extraction")
    print(f" - Speedup: {sync_duration / async_duration:.1f}x")

    # Phase 3: with caching on, time the same extraction twice in a row.
    kreuzberg_cached = Kreuzberg(ExtractionConfig(use_cache=True))
    print("\nFirst extraction (populates cache)...")
    phase_start = time.perf_counter()
    await kreuzberg_cached.extract_file_async(file_path)
    first_duration = time.perf_counter() - phase_start
    print(f" - Time: {first_duration:.3f}s")

    print("Second extraction (from cache)...")
    phase_start = time.perf_counter()
    await kreuzberg_cached.extract_file_async(file_path)
    cached_duration = time.perf_counter() - phase_start
    print(f" - Time: {cached_duration:.3f}s")
    print(f" - Cache speedup: {first_duration / cached_duration:.1f}x")
if __name__ == "__main__":
    # Run the benchmark only when executed as a script.
    asyncio.run(benchmark_extractions())
```

View File

@@ -0,0 +1,37 @@
```python title="disk_cache.py"
from pathlib import Path
from kreuzberg import Kreuzberg, ExtractionConfig, CacheConfig
def _print_result(result):
    """Print the content length and the ``was_cached`` metadata flag of a result."""
    print(f" - Content length: {len(result.content)}")
    print(f" - Cached: {result.metadata.get('was_cached', False)}")


# Keep the cache under the user's home directory, creating it if needed.
cache_dir = Path.home() / ".cache" / "kreuzberg"
cache_dir.mkdir(parents=True, exist_ok=True)

cache_settings = CacheConfig(
    cache_path=str(cache_dir),
    max_cache_size=500 * 1024 * 1024,
    cache_ttl_seconds=7 * 86400,
    enable_compression=True,
)
config = ExtractionConfig(use_cache=True, cache_config=cache_settings)
kreuzberg = Kreuzberg(config)

print("First extraction (will be cached)...")
result1 = kreuzberg.extract_file("document.pdf")
_print_result(result1)

print("\nSecond extraction (from cache)...")
result2 = kreuzberg.extract_file("document.pdf")
_print_result(result2)

same_content = result1.content == result2.content
print(f"\nResults are identical: {same_content}")

cache_stats = kreuzberg.get_cache_stats()
print(f"\nCache Statistics:")
print(f" - Total entries: {cache_stats.get('total_entries', 0)}")
# The byte count is converted to megabytes for display.
size_mb = cache_stats.get("cache_size_bytes", 0) / 1024 / 1024
print(f" - Cache size: {size_mb:.1f} MB")
print(f" - Hit rate: {cache_stats.get('hit_rate', 0):.1%}")
```

View File

@@ -0,0 +1,28 @@
```python title="basic_cli.py"
import subprocess
import json
import sys
from pathlib import Path
def extract_with_cli(file_path: str, output_format: str = "text") -> str:
result = subprocess.run(
["kreuzberg", "extract", file_path, "--format", output_format],
capture_output=True,
text=True,
)
if result.returncode != 0:
print(f"Error: {result.stderr}", file=sys.stderr)
sys.exit(1)
if output_format == "json":
return json.loads(result.stdout)
return result.stdout
document = "document.pdf"

# Plain-text run: report how many characters came back.
text_output = extract_with_cli(document, "text")
print(f"Extracted: {len(text_output)} characters")

# JSON run: the parsed result is queried like a mapping.
json_output = extract_with_cli(document, "json")
detected_format = json_output.get("format", "unknown")
print(f"Format: {detected_format}")
```

View File

@@ -0,0 +1,29 @@
```python title="cli_with_config.py"
import subprocess
import json
import sys
from pathlib import Path
def extract_with_config(file_path: str, config_path: str) -> dict:
    """Run ``kreuzberg extract`` with a config file and return the parsed JSON output.

    Prints the CLI's stderr and exits with status 1 when the command
    returns a non-zero exit code.
    """
    command = [
        "kreuzberg",
        "extract",
        file_path,
        "--config",
        config_path,
        "--format",
        "json",
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode == 0:
        return json.loads(completed.stdout)
    print(f"Error: {completed.stderr}", file=sys.stderr)
    sys.exit(1)
config_file = Path("kreuzberg.toml")
document = "document.pdf"
print(f"Extracting {document} with config {config_file}")

# Both arguments are passed to the helper as plain strings.
result = extract_with_config(str(document), str(config_file))
content_length = len(result.get("content", ""))
print(f"Content length: {content_length}")
print(f"Format: {result.get('format')}")
print(f"Languages: {result.get('languages')}")
```

View File

@@ -0,0 +1,60 @@
```python title="Python"
import asyncio
from kreuzberg import (
extract_file,
ExtractionConfig,
OcrConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
LanguageDetectionConfig,
TokenReductionOptions,
PostProcessorConfig,
KeywordConfig,
)
async def main() -> None:
    """Extract ``document.pdf`` with many features configured and print a short summary."""
    ocr = OcrConfig(backend="tesseract", language="eng")
    chunking = ChunkingConfig(
        max_characters=1000,
        overlap=200,
        embedding=EmbeddingConfig(
            model=EmbeddingModelType.preset("balanced"),
            batch_size=32,
            normalize=True,
        ),
    )
    language_detection = LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.8,
        detect_multiple=False,
    )
    keywords = KeywordConfig(
        algorithm="yake",
        max_keywords=10,
        min_score=0.1,
        ngram_range=[1, 3],
        language="en",
    )
    token_reduction = TokenReductionOptions(
        mode="moderate",
        preserve_important_words=True,
    )
    config = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True,
        ocr=ocr,
        chunking=chunking,
        language_detection=language_detection,
        keywords=keywords,
        token_reduction=token_reduction,
        postprocessor=PostProcessorConfig(enabled=True),
    )
    result = await extract_file("document.pdf", config=config)

    print(f"Content: {result.content[:100]}")
    # Languages and chunks are reported only when they are truthy.
    languages = result.detected_languages
    if languages:
        print(f"Languages: {languages}")
    chunks = result.chunks
    if chunks:
        print(f"Chunks: {len(chunks)}")
asyncio.run(main())
```

View File

@@ -0,0 +1,80 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with chunking and print the chunk count and each chunk's length."""
    chunking = ChunkingConfig(max_chars=1000, max_overlap=200)
    result = await extract_file(
        "document.pdf",
        config=ExtractionConfig(chunking=chunking),
    )
    # Fall back to an empty list when `result.chunks` is falsy.
    chunks = result.chunks or []
    print(f"Chunks: {len(chunks)}")
    for chunk in chunks:
        print(f"Length: {len(chunk.content)}")
asyncio.run(main())
```
```python title="Python - Markdown with Heading Context"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Chunk ``document.md`` with the markdown chunker and print each chunk's headings and preview."""
    chunking = ChunkingConfig(
        chunker_type="markdown",
        max_chars=500,
        max_overlap=50,
        sizing_type="tokenizer",
        sizing_model="Xenova/gpt-4o",
    )
    result = await extract_file(
        "document.md",
        config=ExtractionConfig(chunking=chunking),
    )
    for chunk in result.chunks or []:
        # Heading context is optional metadata; skip the listing when absent.
        heading_context = chunk.metadata.get("heading_context")
        if heading_context:
            for heading in heading_context.get("headings", []):
                print(f"Heading L{heading['level']}: {heading['text']}")
        print(f"Content: {chunk.content[:100]}...")
asyncio.run(main())
```
```python title="Python - Semantic"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with ``chunker_type="semantic"`` and preview each chunk."""
    semantic_chunking = ChunkingConfig(chunker_type="semantic")
    config = ExtractionConfig(chunking=semantic_chunking)
    result = await extract_file("document.pdf", config=config)
    for chunk in result.chunks or []:
        preview = chunk.content[:100]
        print(f"Content: {preview}...")
asyncio.run(main())
```
```python title="Python - Prepend Heading Context"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Extract ``document.md`` with markdown chunking and ``prepend_heading_context`` enabled."""
    chunking = ChunkingConfig(
        chunker_type="markdown",
        max_chars=500,
        max_overlap=50,
        prepend_heading_context=True,
    )
    result = await extract_file(
        "document.md",
        config=ExtractionConfig(chunking=chunking),
    )
    chunks = result.chunks or []
    for chunk in chunks:
        # Each chunk's content is prefixed with its heading breadcrumb
        preview = chunk.content[:100]
        print(f"Content: {preview}...")
asyncio.run(main())
```

View File

@@ -0,0 +1,14 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file, ExtractionConfig
async def main() -> None:
    """Extract ``document.pdf`` with caching and quality processing enabled and print the content."""
    options = {"use_cache": True, "enable_quality_processing": True}
    config = ExtractionConfig(**options)
    result = await extract_file("document.pdf", config=config)
    print(result.content)
asyncio.run(main())
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with a default config and print a preview and the total length."""
    result = await extract_file("document.pdf", config=ExtractionConfig())
    text = result.content
    # Only the first 100 characters are shown as a preview.
    print(f"Content preview: {text[:100]}")
    print(f"Total length: {len(text)}")
asyncio.run(main())
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with Tesseract OCR (``eng+fra``, ``psm=3``) and print the content."""
    tesseract = TesseractConfig(psm=3)
    ocr = OcrConfig(
        backend="tesseract",
        language="eng+fra",
        tesseract_config=tesseract,
    )
    result = await extract_file("document.pdf", config=ExtractionConfig(ocr=ocr))
    print(result.content)
asyncio.run(main())
```

View File

@@ -0,0 +1,18 @@
```python title="Python"
from kreuzberg import (
extract_file_sync,
ExtractionConfig,
OcrConfig,
ChunkingConfig,
)
# Caching, OCR and chunking combined in a single extraction config.
ocr_settings = OcrConfig(backend="tesseract", language="eng")
chunk_settings = ChunkingConfig(max_chars=1000, max_overlap=200)
config: ExtractionConfig = ExtractionConfig(
    use_cache=True,
    ocr=ocr_settings,
    chunking=chunk_settings,
)
result = extract_file_sync("document.pdf", config=config)
content_length: int = len(result.content)
print(f"Content length: {content_length}")
```

View File

@@ -0,0 +1,15 @@
```python title="Document Structure Config (Python)"
from kreuzberg import extract_file_sync, ExtractionConfig
# Ask the extractor to include the document structure in the result.
config = ExtractionConfig(include_document_structure=True)
result = extract_file_sync("document.pdf", config=config)

# Walk the document tree, if one was produced, and print each node.
document = result.document
if document:
    for node in document["nodes"]:
        node_content = node["content"]
        node_type = node_content["node_type"]
        # Nodes without text are printed with an empty string.
        text = node_content.get("text", "")
        print(f"[{node_type}] {text[:80]}")
```

View File

@@ -0,0 +1,29 @@
```python title="Element-Based Output (Python)"
from kreuzberg import extract_file_sync, ExtractionConfig
# Request the element-based result format.
config = ExtractionConfig(result_format="element_based")

# Run the extraction.
result = extract_file_sync("document.pdf", config=config)

# Print every element, with page and coordinates when they are present.
for element in result.elements:
    meta = element.metadata
    print(f"Type: {element.element_type}")
    print(f"Text: {element.text[:100]}")
    if meta.page_number:
        print(f"Page: {meta.page_number}")
    coords = meta.coordinates
    if coords:
        print(f"Coords: ({coords.left}, {coords.top}) - ({coords.right}, {coords.bottom})")
    print("---")

# Keep only the title elements and print each with its level.
titles = [e for e in result.elements if e.element_type == "title"]
for title in titles:
    # Titles without a "level" entry are labelled "unknown".
    level = title.metadata.additional.get("level", "unknown")
    print(f"[{level}] {title.text}")
```

View File

@@ -0,0 +1,15 @@
```python title="Python"
from kreuzberg import ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType
# Embedding settings attached to the chunking configuration below.
embedding_settings = EmbeddingConfig(
    model=EmbeddingModelType.preset("all-mpnet-base-v2"),
    batch_size=16,
    normalize=True,
    show_download_progress=True,
)
config = ExtractionConfig(
    chunking=ChunkingConfig(
        max_chars=1000,
        embedding=embedding_settings,
    ),
)
```

View File

@@ -0,0 +1,83 @@
from kreuzberg import EmbeddingConfig, EmbeddingModelType, ChunkingConfig, ExtractionConfig
# Example 1: Preset model (recommended)
# Fast, balanced, or quality preset configurations optimized for common use cases.
embedding_config = EmbeddingConfig(
    model=EmbeddingModelType.preset("balanced"),
    batch_size=32,
    normalize=True,
    show_download_progress=True,
    cache_dir="~/.cache/kreuzberg/embeddings",
)

# Available presets:
# - "fast" (384 dims): Quick prototyping, development, resource-constrained
# - "balanced" (768 dims): Production, general-purpose RAG, English documents
# - "quality" (1024 dims): Complex documents, maximum accuracy
# - "multilingual" (768 dims): International documents, 100+ languages

# Example 2: Custom ONNX model (requires embeddings feature)
# Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
small_model = EmbeddingModelType.custom(
    model_id="BAAI/bge-small-en-v1.5",
    dimensions=384,
)
embedding_config = EmbeddingConfig(
    model=small_model,
    batch_size=32,
    normalize=True,
    show_download_progress=True,
    cache_dir=None,  # Uses default: .kreuzberg/embeddings/
)

# Popular ONNX-compatible models:
# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support

# Example 3: Alternative Custom Model
# For advanced users wanting alternative ONNX embedding models.
mpnet_model = EmbeddingModelType.custom(
    model_id="sentence-transformers/all-mpnet-base-v2",
    dimensions=768,
)
embedding_config = EmbeddingConfig(
    model=mpnet_model,
    batch_size=16,  # Larger model requires smaller batch size
    normalize=True,
    show_download_progress=True,
    cache_dir="/var/cache/embeddings",
)

# Integration with ChunkingConfig
# Add embeddings to your chunking configuration:
chunking_with_embeddings = ChunkingConfig(
    max_chars=1024,
    max_overlap=100,
    preset="balanced",
    embedding=EmbeddingConfig(),  # Uses balanced preset
)
extraction_config = ExtractionConfig(chunking=chunking_with_embeddings)
# Key parameter explanations:
#
# batch_size: Number of texts to embed at once (32-128 typical)
# - Larger batches are faster but use more memory
# - Smaller batches for resource-constrained environments
#
# normalize: Whether to normalize vectors (L2 norm)
# - True (recommended): Enables cosine similarity in vector DBs
# - False: Raw embedding values
#
# cache_dir: Where to store downloaded models
# - None: Uses .kreuzberg/embeddings/ in current directory
# - String path: Custom directory for model storage
#
# show_download_progress: Display download progress bar
# - Useful for monitoring large model downloads

View File

@@ -0,0 +1,72 @@
from kreuzberg import PdfConfig, HierarchyConfig, ExtractionConfig, extract_file_sync
# Example 1: Basic hierarchy extraction
# Enabled with default k_clusters=6 for standard H1-H6 heading hierarchy.
# Extract bounding box information for spatial layout awareness.
hierarchy_config_basic = HierarchyConfig(
    enabled=True,
    # Default: creates 6 font size clusters (H1-H6 structure)
    k_clusters=6,
    # Include bounding box coordinates
    include_bbox=True,
    # No OCR coverage threshold
    ocr_coverage_threshold=None,
)
pdf_config_basic = PdfConfig(hierarchy=hierarchy_config_basic)
extraction_config_basic = ExtractionConfig(pdf_options=pdf_config_basic)
result = extract_file_sync("document.pdf", config=extraction_config_basic)

# Example 2: Custom k_clusters for minimal structure
# Use 3 clusters for simpler hierarchy with minimal structure.
# Useful when you only need major section divisions (Main, Subsection, Detail).
hierarchy_config_minimal = HierarchyConfig(
    enabled=True,
    # Minimal clustering: just 3 levels
    k_clusters=3,
    include_bbox=True,
    ocr_coverage_threshold=None,
)
pdf_config_minimal = PdfConfig(hierarchy=hierarchy_config_minimal)
extraction_config_minimal = ExtractionConfig(pdf_options=pdf_config_minimal)
result = extract_file_sync("document.pdf", config=extraction_config_minimal)

# Example 3: With OCR coverage threshold
# Trigger OCR if less than 50% of text has font data.
# Useful for documents with mixed digital and scanned content.
hierarchy_config_ocr = HierarchyConfig(
    enabled=True,
    k_clusters=6,
    include_bbox=True,
    # Trigger OCR if text coverage < 50%
    ocr_coverage_threshold=0.5,
)
pdf_config_ocr = PdfConfig(hierarchy=hierarchy_config_ocr)
extraction_config_ocr = ExtractionConfig(pdf_options=pdf_config_ocr)
result = extract_file_sync("document.pdf", config=extraction_config_ocr)
# Field descriptions:
#
# enabled: bool (default: True)
# - Enable or disable hierarchy extraction
# - When False, hierarchy structure is not analyzed
#
# k_clusters: int (default: 6, valid: 1-7)
# - Number of font size clusters for hierarchy levels
# - 6 provides H1-H6 heading levels with body text
# - Higher values create more fine-grained hierarchy
# - Lower values create simpler structure
#
# include_bbox: bool (default: True)
# - Include bounding box coordinates in hierarchy blocks
# - Required for spatial layout awareness and document structure
# - Set to False only if space optimization is critical
#
# ocr_coverage_threshold: float | None (default: None)
# - Range: 0.0 to 1.0
# - Triggers OCR when text block coverage falls below this fraction
# - Example: 0.5 means "run OCR if less than 50% of page has text data"
# - None means no OCR coverage-based triggering

View File

@@ -0,0 +1,17 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with HTML output enabled and print the result."""
    config = ExtractionConfig(
        output_format="html",
        html_output={
            "theme": "github",
            "embed_css": True,
        },
    )
    result = await extract_file("document.pdf", config=config)
    print(result.content)  # HTML with kb-* classes and GitHub theme

asyncio.run(main())
```

View File

@@ -0,0 +1,75 @@
import asyncio
from kreuzberg import (
ExtractionConfig,
KeywordConfig,
KeywordAlgorithm,
YakeParams,
RakeParams,
extract_file,
)
# Example 1: Basic YAKE configuration
# Uses YAKE algorithm with default parameters and English stopword filtering
async def basic_yake() -> None:
    """Run keyword extraction with YAKE and no algorithm-specific overrides."""
    config = ExtractionConfig(
        keywords=KeywordConfig(
            algorithm=KeywordAlgorithm.YAKE,
            max_keywords=10,
            min_score=0.0,
            ngram_range=(1, 3),
            language="en",
            # Both parameter objects are left unset in this basic example.
            yake_params=None,
            rake_params=None,
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Keywords: {result.keywords}")

# Example 2: Advanced YAKE with custom parameters
# Fine-tunes YAKE with custom window size for co-occurrence analysis
async def advanced_yake() -> None:
    """Run keyword extraction with YAKE and an explicit ``YakeParams`` object."""
    config = ExtractionConfig(
        keywords=KeywordConfig(
            algorithm=KeywordAlgorithm.YAKE,
            max_keywords=15,
            min_score=0.1,
            ngram_range=(1, 2),
            language="en",
            yake_params=YakeParams(
                window_size=1,
            ),
            rake_params=None,
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Keywords: {result.keywords}")

# Example 3: RAKE configuration
# Uses RAKE algorithm for rapid keyword extraction with phrase constraints
async def rake_config() -> None:
    """Run keyword extraction with RAKE and an explicit ``RakeParams`` object."""
    config = ExtractionConfig(
        keywords=KeywordConfig(
            algorithm=KeywordAlgorithm.RAKE,
            max_keywords=10,
            min_score=5.0,
            ngram_range=(1, 3),
            language="en",
            yake_params=None,
            rake_params=RakeParams(
                min_word_length=1,
                max_words_per_phrase=3,
            ),
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Keywords: {result.keywords}")

if __name__ == "__main__":
    # Only the basic YAKE example is executed; call advanced_yake() or
    # rake_config() here instead to try the other configurations.
    asyncio.run(basic_yake())

View File

@@ -0,0 +1,24 @@
```python title="Python"
import asyncio
from kreuzberg import (
ExtractionConfig,
KeywordConfig,
KeywordAlgorithm,
extract_file,
)
async def main() -> None:
    """Extract ``document.pdf`` with a YAKE keyword config and print the content size."""
    config: ExtractionConfig = ExtractionConfig(
        keywords=KeywordConfig(
            algorithm=KeywordAlgorithm.YAKE,
            max_keywords=10,
            min_score=0.3,
            ngram_range=(1, 3),
            language="en",
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Content extracted: {len(result.content)} chars")

asyncio.run(main())
```

View File

@@ -0,0 +1,19 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with language detection and print what was found."""
    config: ExtractionConfig = ExtractionConfig(
        language_detection=LanguageDetectionConfig(
            enabled=True,
            min_confidence=0.85,
            detect_multiple=False,
        )
    )
    result = await extract_file("document.pdf", config=config)
    # detected_languages may be empty/None, so guard before indexing.
    if result.detected_languages:
        print(f"Primary language: {result.detected_languages[0]}")
    print(f"Content length: {len(result.content)} chars")

asyncio.run(main())
```

View File

@@ -0,0 +1,26 @@
```python title="Python"
from kreuzberg import (
extract_file_sync,
ExtractionConfig,
OcrConfig,
TesseractConfig,
ImagePreprocessingConfig,
)
config: ExtractionConfig = ExtractionConfig(
ocr=OcrConfig(
backend="tesseract",
tesseract_config=TesseractConfig(
preprocessing=ImagePreprocessingConfig(target_dpi=300),
),
),
)
result = extract_file_sync("scanned.pdf", config=config)
content_length: int = len(result.content)
table_count: int = len(result.tables)
print(f"Content length: {content_length} characters")
print(f"Tables detected: {table_count}")
```

View File

@@ -0,0 +1,18 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, PdfConfig, HierarchyConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with PDF-specific options and print a preview."""
    config: ExtractionConfig = ExtractionConfig(
        pdf_options=PdfConfig(
            extract_images=True,
            extract_metadata=True,
            passwords=["password1", "password2"],
            hierarchy=HierarchyConfig(enabled=True, k_clusters=6),
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")

asyncio.run(main())
```

View File

@@ -0,0 +1,22 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig
config: ExtractionConfig = ExtractionConfig(
pdf_options=PdfConfig(
extract_metadata=True,
hierarchy=HierarchyConfig(
enabled=True,
k_clusters=6,
include_bbox=True,
ocr_coverage_threshold=0.8
)
)
)
result = extract_file_sync("document.pdf", config=config)
# Access hierarchy information
# `result.pages or []` keeps the loop safe when no pages were returned.
for page in result.pages or []:
    print(f"Page {page.page_number}:")
    print(f" Content: {page.content[:100]}...")
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, PostProcessorConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with the "deduplication" post-processor enabled."""
    config: ExtractionConfig = ExtractionConfig(
        postprocessor=PostProcessorConfig(
            enabled=True,
            enabled_processors=["deduplication"],
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")

asyncio.run(main())
```

View File

@@ -0,0 +1,15 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with quality processing on and print the score."""
    config: ExtractionConfig = ExtractionConfig(
        enable_quality_processing=True
    )
    result = await extract_file("document.pdf", config=config)
    # A missing (falsy) score is reported as 0.0.
    quality_score: float = result.quality_score or 0.0
    print(f"Quality score: {quality_score:.2f}")

asyncio.run(main())
```

View File

@@ -0,0 +1,21 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with a tuned Tesseract configuration."""
    config: ExtractionConfig = ExtractionConfig(
        ocr=OcrConfig(
            language="eng+fra+deu",
            tesseract_config=TesseractConfig(
                psm=6,
                oem=1,
                min_confidence=0.8,
                enable_table_detection=True,
            ),
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")

asyncio.run(main())
```

View File

@@ -0,0 +1,10 @@
```python title="Python"
from kreuzberg import ExtractionConfig, TokenReductionConfig
config: ExtractionConfig = ExtractionConfig(
token_reduction=TokenReductionConfig(
mode="moderate",
preserve_important_words=True,
)
)
```

View File

@@ -0,0 +1,58 @@
```python title="usage.py"
import subprocess
import httpx
import json
from pathlib import Path
class DockerKreuzbergClient:
    """Start a Kreuzberg container with the Docker CLI and call its HTTP API.

    Attributes:
        container_name: Name passed to ``docker run --name`` and later to
            ``docker stop`` / ``docker rm``.
        port: Host port that is mapped to container port 8000.
        api_url: Extraction endpoint derived from ``port``.
    """

    def __init__(self, container_name: str = "kreuzberg-api", port: int = 8000):
        self.container_name = container_name
        self.port = port
        self.api_url = f"http://localhost:{port}/api/extract"

    def start_container(self, image: str = "kreuzberg:latest"):
        """Run ``image`` detached; raises ``CalledProcessError`` if docker fails."""
        print("Starting Kreuzberg Docker container...")
        subprocess.run(
            [
                "docker", "run", "-d",
                "--name", self.container_name,
                "-p", f"{self.port}:8000",
                image,
            ],
            check=True,
        )
        print(f"Container started on http://localhost:{self.port}")

    async def extract_file(self, file_path: str) -> str:
        """Upload ``file_path`` as multipart form data and return the "content" field.

        Returns an empty string when the JSON response has no "content" key.
        Raises ``httpx.HTTPStatusError`` on a non-2xx response.
        """
        file_bytes = Path(file_path).read_bytes()
        files = {"file": (Path(file_path).name, file_bytes)}
        async with httpx.AsyncClient() as client:
            response = await client.post(self.api_url, files=files)
            response.raise_for_status()
            result = response.json()
            return result.get("content", "")

    def stop_container(self):
        """Stop and then remove the container; each step raises if docker fails."""
        print("Stopping Kreuzberg Docker container...")
        subprocess.run(["docker", "stop", self.container_name], check=True)
        subprocess.run(["docker", "rm", self.container_name], check=True)
        print("Container stopped and removed")

async def main():
    """Start the container, extract ``document.pdf`` through it, then tear it down."""
    import asyncio

    docker_client = DockerKreuzbergClient()
    try:
        docker_client.start_container()
        # Fixed pause before the first request — presumably to let the API
        # inside the container finish starting; adjust if it needs longer.
        await asyncio.sleep(2)
        content = await docker_client.extract_file("document.pdf")
        print(f"Extracted content:\n{content}")
    finally:
        # Always attempt cleanup, even when extraction failed.
        docker_client.stop_container()

if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
```

View File

@@ -0,0 +1,14 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file, ExtractionConfig
async def main() -> None:
    """Extract ``document.pdf`` with caching and quality processing turned on."""
    config = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True,
    )
    result = await extract_file("document.pdf", config=config)
    print(result.content)

asyncio.run(main())
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
import asyncio
from pathlib import Path
from kreuzberg import extract_file
async def main() -> None:
    """Extract a file given as a ``Path`` and print content, format and table count."""
    file_path: Path = Path("document.pdf")
    result = await extract_file(file_path)
    print(f"Content: {result.content}")
    # metadata.format may be unset, in which case None is printed.
    print(f"Format: {result.metadata.format.format_type if result.metadata.format else None}")
    print(f"Tables: {len(result.tables)}")

asyncio.run(main())
```

View File

@@ -0,0 +1,19 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file, ExtractionConfig, OcrConfig, TesseractConfig
async def main() -> None:
    """Extract ``scanned.pdf`` with ``force_ocr`` set and the Tesseract backend."""
    config = ExtractionConfig(
        force_ocr=True,
        ocr=OcrConfig(
            backend="tesseract",
            language="eng",
            tesseract_config=TesseractConfig(psm=3),
        ),
    )
    result = await extract_file("scanned.pdf", config=config)
    print(result.content)
    print(f"Detected Languages: {result.detected_languages}")

asyncio.run(main())
```

View File

@@ -0,0 +1,10 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file
async def main() -> None:
    """Extract ``document.pdf`` with the default configuration and print its text."""
    result = await extract_file("document.pdf")
    print(result.content)

asyncio.run(main())
```

View File

@@ -0,0 +1,12 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file, __version__
async def main() -> None:
    """Print the installed Kreuzberg version and whether an extraction produced text."""
    print(f"Kreuzberg version: {__version__}")
    result = await extract_file("document.pdf")
    print(f"Extraction successful: {len(result.content) > 0}")

asyncio.run(main())
```

View File

@@ -0,0 +1,17 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file
async def main() -> None:
    """Extract ``document.pdf`` and summarise content length, tables and format."""
    result = await extract_file("document.pdf")
    content: str = result.content
    tables: int = len(result.tables)
    # metadata.format may be unset; fall back to None in that case.
    format_type: str | None = result.metadata.format.format_type if result.metadata.format else None
    print(f"Content length: {len(content)} characters")
    print(f"Tables found: {tables}")
    print(f"Format: {format_type}")

asyncio.run(main())
```

View File

@@ -0,0 +1,27 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file, ExtractionConfig, StructuredExtractionConfig, LlmConfig
async def main() -> None:
    """Extract ``paper.pdf`` with a JSON-schema-driven structured extraction config."""
    config = ExtractionConfig(
        structured_extraction=StructuredExtractionConfig(
            # JSON Schema describing the object to produce.
            schema={
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "authors": {"type": "array", "items": {"type": "string"}},
                    "date": {"type": "string"},
                },
                "required": ["title", "authors", "date"],
                "additionalProperties": False,
            },
            llm=LlmConfig(model="openai/gpt-4o-mini"),
            strict=True,
        ),
    )
    result = await extract_file("paper.pdf", config=config)
    print(result.structured_output)
    # {"title": "...", "authors": ["..."], "date": "..."}

asyncio.run(main())
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
import asyncio
from kreuzberg import embed, EmbeddingConfig, EmbeddingModelType, LlmConfig
async def main() -> None:
    """Embed one string through an LLM-backed embedding model and print its length."""
    config = EmbeddingConfig(
        model=EmbeddingModelType.llm(
            LlmConfig(model="openai/text-embedding-3-small")
        ),
        normalize=True,
    )
    embeddings = await embed(["Hello world"], config=config)
    print(len(embeddings[0]))  # 1536

asyncio.run(main())
```

View File

@@ -0,0 +1,17 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file, ExtractionConfig, OcrConfig, LlmConfig
async def main() -> None:
    """Extract ``scan.pdf`` with ``force_ocr`` set and the "vlm" OCR backend."""
    config = ExtractionConfig(
        force_ocr=True,
        ocr=OcrConfig(
            backend="vlm",
            vlm_config=LlmConfig(model="openai/gpt-4o-mini"),
        ),
    )
    result = await extract_file("scan.pdf", config=config)
    print(result.content)

asyncio.run(main())
```

View File

@@ -0,0 +1,23 @@
```python title="Python"
import asyncio
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
async def main() -> None:
    """Launch ``kreuzberg mcp`` over stdio, list its tools and call ``extract_file``."""
    server_params: StdioServerParameters = StdioServerParameters(
        command="kreuzberg", args=["mcp"]
    )
    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            # The session must be initialized before any other request.
            await session.initialize()
            tools = await session.list_tools()
            tool_names: list[str] = [t.name for t in tools.tools]
            print(f"Available tools: {tool_names}")
            result = await session.call_tool(
                "extract_file", arguments={"path": "document.pdf", "async": True}
            )
            print(result)

asyncio.run(main())
```

View File

@@ -0,0 +1,36 @@
```python title="Python"
from langchain.agents import initialize_agent, AgentType
from langchain.tools import Tool
from langchain_openai import ChatOpenAI
import subprocess
import json
mcp_process = subprocess.Popen(
["kreuzberg", "mcp"],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
def extract_file(path: str) -> str:
    """Send a ``tools/call`` request for ``path`` to the MCP subprocess and return its content.

    Writes one newline-terminated JSON line to the process's stdin, then blocks
    reading a single line from its stdout.

    NOTE(review): the request has no JSON-RPC envelope ("jsonrpc"/"id") and no
    initialize handshake is performed first — confirm the server accepts this
    before relying on it outside a demo.
    """
    request: dict = {
        "method": "tools/call",
        "params": {
            "name": "extract_file",
            "arguments": {"path": path, "async": True},
        },
    }
    mcp_process.stdin.write(json.dumps(request).encode() + b"\n")
    mcp_process.stdin.flush()  # the pipe is buffered; push the request out now
    response = mcp_process.stdout.readline()
    return json.loads(response)["result"]["content"]

tools: list[Tool] = [
Tool(name="extract_document", func=extract_file, description="Extract")
]
llm = ChatOpenAI(temperature=0)
agent = initialize_agent(
tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION
)
```

View File

@@ -0,0 +1,17 @@
```python title="Python"
import subprocess
import time
from typing import Optional
mcp_process: subprocess.Popen = subprocess.Popen(
["python", "-m", "kreuzberg", "mcp"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
pid: Optional[int] = mcp_process.pid
print(f"MCP server started with PID: {pid}")
time.sleep(1)
print("Server is running, listening for connections")
```

View File

@@ -0,0 +1,13 @@
```python title="Python"
from kreuzberg import ExtractionConfig, LanguageDetectionConfig
config = ExtractionConfig(
language_detection=LanguageDetectionConfig(
enabled=True,
min_confidence=0.9,
detect_multiple=True,
),
)
print(config.language_detection)
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, LanguageDetectionConfig
config = ExtractionConfig(
language_detection=LanguageDetectionConfig(
enabled=True,
min_confidence=0.8,
detect_multiple=True,
),
)
result = extract_file_sync("multilingual_document.pdf", config=config)
# Only print when at least one language was reported.
if result.detected_languages:
    print(f"Detected languages: {', '.join(result.detected_languages)}")
```

View File

@@ -0,0 +1,80 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig
result = extract_file_sync("document.pdf", config=ExtractionConfig())
# Metadata is flat — format-specific fields are at the top level
metadata = result.metadata
if metadata.get("page_count"):
    print(f"Pages: {metadata['page_count']}")
if metadata.get("title"):
    print(f"Title: {metadata['title']}")
if metadata.get("authors"):
    print(f"Authors: {', '.join(metadata['authors'])}")

# Second document: an HTML page, to show the HTML-specific fields.
result = extract_file_sync("page.html", config=ExtractionConfig())
metadata = result.metadata
if metadata.get("title"):
    print(f"Title: {metadata['title']}")
if metadata.get("description"):
    print(f"Description: {metadata['description']}")

# Access keywords as array
keywords = metadata.get('keywords', [])
if keywords:
    print(f"Keywords: {', '.join(keywords)}")

# Access canonical URL (renamed from canonical)
canonical_url = metadata.get('canonical_url')
if canonical_url:
    print(f"Canonical URL: {canonical_url}")

# Access Open Graph fields from map
open_graph = metadata.get('open_graph', {})
if open_graph:
    if 'image' in open_graph:
        print(f"Open Graph Image: {open_graph['image']}")
    if 'title' in open_graph:
        print(f"Open Graph Title: {open_graph['title']}")
    if 'type' in open_graph:
        print(f"Open Graph Type: {open_graph['type']}")

# Access Twitter Card fields from map
twitter_card = metadata.get('twitter_card', {})
if twitter_card:
    if 'card' in twitter_card:
        print(f"Twitter Card Type: {twitter_card['card']}")
    if 'creator' in twitter_card:
        print(f"Twitter Creator: {twitter_card['creator']}")

# Access new fields
language = metadata.get('language')
if language:
    print(f"Language: {language}")
text_direction = metadata.get('text_direction')
if text_direction:
    print(f"Text Direction: {text_direction}")

# Access headers
headers = metadata.get('headers', [])
if headers:
    print(f"Headers: {', '.join([h['text'] for h in headers])}")

# Access links
links = metadata.get('links', [])
if links:
    for link in links:
        print(f"Link: {link.get('href')} ({link.get('text')})")

# Access images
images = metadata.get('images', [])
if images:
    for image in images:
        print(f"Image: {image.get('src')}")

# Access structured data
structured_data = metadata.get('structured_data', [])
if structured_data:
    print(f"Structured data items: {len(structured_data)}")
```

View File

@@ -0,0 +1,17 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig
result = extract_file_sync("document.pdf", config=ExtractionConfig())
if result.metadata.pages and result.metadata.pages.boundaries:
    boundaries = result.metadata.pages.boundaries
    # Boundaries are byte offsets, so slice the UTF-8 encoding, not the str.
    content_bytes = result.content.encode("utf-8")
    for boundary in boundaries[:3]:  # preview the first three pages only
        page_bytes = content_bytes[boundary.byte_start:boundary.byte_end]
        page_text = page_bytes.decode("utf-8")
        print(f"Page {boundary.page_number}:")
        print(f" Byte range: {boundary.byte_start}-{boundary.byte_end}")
        print(f" Preview: {page_text[:100]}...")
```

View File

@@ -0,0 +1,14 @@
from kreuzberg import extract_file_sync, ExtractionConfig, PageConfig

# Ask the extractor to return per-page results in addition to the full text.
config = ExtractionConfig(
    pages=PageConfig(extract_pages=True)
)

result = extract_file_sync("document.pdf", config=config)

# result.pages may be empty/None, so guard before iterating.
if result.pages:
    for page in result.pages:
        print(f"Page {page.page_number}:")
        print(f" Content: {len(page.content)} chars")
        print(f" Tables: {len(page.tables)}")
        print(f" Images: {len(page.images)}")

View File

@@ -0,0 +1,39 @@
```python title="Python"
from kreuzberg import register_post_processor, ExtractionResult
import logging
logger = logging.getLogger(__name__)
class PdfMetadataExtractor:
    """Post-processor plugin that flags PDF results in their metadata.

    Keeps a running count of processed results, logged on shutdown.
    """

    def __init__(self):
        self.processed_count: int = 0

    def name(self) -> str:
        return "pdf_metadata_extractor"

    def version(self) -> str:
        return "1.0.0"

    def description(self) -> str:
        return "Extracts and enriches PDF metadata"

    def processing_stage(self) -> str:
        return "early"

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only for results whose MIME type is application/pdf."""
        return result.mime_type == "application/pdf"

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Mark ``result`` as processed (mutates its metadata) and return it."""
        self.processed_count += 1
        result.metadata["pdf_processed"] = True
        return result

    def initialize(self) -> None:
        logger.info("PDF metadata extractor initialized")

    def shutdown(self) -> None:
        logger.info(f"Processed {self.processed_count} PDFs")

processor: PdfMetadataExtractor = PdfMetadataExtractor()
register_post_processor(processor)
```

View File

@@ -0,0 +1,35 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig, EmbeddingConfig
config = ExtractionConfig(
chunking=ChunkingConfig(
max_characters=512,
overlap=50,
embedding=EmbeddingConfig(
normalize=True,
batch_size=32,
preset="balanced",
),
),
)
result = extract_file_sync("document.pdf", config=config)
records: list[dict] = []
if result.chunks:
    for index, chunk in enumerate(result.chunks):
        # Chunks without an embedding cannot be stored as vectors; skip them.
        if chunk.embedding is None:
            continue
        records.append({
            "id": f"document_chunk_{index}",
            "content": chunk.content,
            "embedding": chunk.embedding,
            "metadata": {
                "document_id": "document.pdf",
                "chunk_index": index,
                "content_length": len(chunk.content),
            },
        })
print(f"Prepared {len(records)} vector records")
```

View File

@@ -0,0 +1,37 @@
```python title="Python"
from kreuzberg import register_ocr_backend
import httpx
class CloudOcrBackend:
    """Example OCR backend plugin that forwards images to a remote HTTP API."""

    def __init__(self, api_key: str):
        # NOTE(review): api_key is stored but never sent with the request;
        # add whatever auth header your provider expects.
        self.api_key: str = api_key
        self.langs: list[str] = ["eng", "deu", "fra"]

    def name(self) -> str:
        return "cloud-ocr"

    def version(self) -> str:
        return "1.0.0"

    def supported_languages(self) -> list[str]:
        return self.langs

    def process_image(self, image_bytes: bytes, config: dict) -> dict:
        """Upload ``image_bytes`` and return the recognised text as a result dict.

        Args:
            image_bytes: Raw image data, sent as the multipart field "image".
            config: Options mapping; only "language" is read (default "eng").

        Returns:
            ``{"content": <text>, "mime_type": "text/plain"}``.

        Raises:
            httpx.HTTPStatusError: If the API answers with a non-2xx status.
        """
        with httpx.Client() as client:
            response = client.post(
                "https://api.example.com/ocr",
                files={"image": image_bytes},
                # Form fields must go in `data=`: httpx ignores `json=` when
                # `files=` is present, so the language would never be sent.
                data={"language": config.get("language", "eng")},
            )
            response.raise_for_status()
            text: str = response.json()["text"]
            return {"content": text, "mime_type": "text/plain"}

    def initialize(self) -> None:
        pass

    def shutdown(self) -> None:
        pass

backend: CloudOcrBackend = CloudOcrBackend(api_key="your-api-key")
register_ocr_backend(backend)
```

View File

@@ -0,0 +1,17 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, ImageExtractionConfig
config: ExtractionConfig = ExtractionConfig(
images=ImageExtractionConfig(
extract_images=True,
target_dpi=200,
max_image_dimension=2048,
inject_placeholders=True, # set to False to extract images without markdown references
auto_adjust_dpi=True,
)
)
result = extract_file_sync("document.pdf", config=config)
print(f"Content length: {len(result.content)} characters")
```

View File

@@ -0,0 +1,29 @@
```python title="Python"
from kreuzberg import (
extract_file_sync,
ExtractionConfig,
ImagePreprocessingConfig,
OcrConfig,
TesseractConfig,
)
preprocessing: ImagePreprocessingConfig = ImagePreprocessingConfig(
target_dpi=300,
denoise=True,
deskew=True,
contrast_enhance=True,
binarization_method="otsu",
)
config: ExtractionConfig = ExtractionConfig(
ocr=OcrConfig(
backend="tesseract",
language="eng",
tesseract_config=TesseractConfig(preprocessing=preprocessing),
)
)
result = extract_file_sync("document.pdf", config=config)
print(f"Content length: {len(result.content)} characters")
```

View File

@@ -0,0 +1,18 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
config: ExtractionConfig = ExtractionConfig(
ocr=OcrConfig(backend="easyocr", language="en")
)
# EasyOCR-specific options (use_gpu, beam_width, etc.) go in easyocr_kwargs,
# not in OcrConfig — OcrConfig only accepts backend, language, and backend-specific configs.
result = extract_file_sync("scanned.pdf", config=config, easyocr_kwargs={"use_gpu": True})
content: str = result.content
preview: str = content[:100]
total_length: int = len(content)
print(f"Extracted content (preview): {preview}")
print(f"Total characters: {total_length}")
```

View File

@@ -0,0 +1,18 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
config: ExtractionConfig = ExtractionConfig(
ocr=OcrConfig(backend="paddleocr", language="en")
)
result = extract_file_sync("scanned.pdf", config=config)
if result.ocr_elements:
    for element in result.ocr_elements:
        print(f"Text: {element.text}")
        print(f"Confidence: {element.confidence.recognition:.2f}")
        print(f"Geometry: {element.geometry}")
        # Rotation is optional; print it only when present.
        if element.rotation:
            print(f"Rotation: {element.rotation.angle}°")
        print()
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
config: ExtractionConfig = ExtractionConfig(
ocr=OcrConfig(backend="tesseract", language="eng")
)
result = extract_file_sync("scanned.pdf", config=config)
content: str = result.content
preview: str = content[:100]
total_length: int = len(content)
print(f"Extracted content (preview): {preview}")
print(f"Total characters: {total_length}")
```

View File

@@ -0,0 +1,17 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
config: ExtractionConfig = ExtractionConfig(
ocr=OcrConfig(backend="tesseract"),
force_ocr=True,
)
result = extract_file_sync("document.pdf", config=config)
content: str = result.content
preview: str = content[:100]
total_length: int = len(content)
print(f"Extracted content (preview): {preview}")
print(f"Total characters: {total_length}")
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
config: ExtractionConfig = ExtractionConfig(
ocr=OcrConfig(backend="tesseract", language="eng+deu+fra")
)
result = extract_file_sync("multilingual.pdf", config=config)
content: str = result.content
preview: str = content[:100]
total_length: int = len(content)
print(f"Extracted content (preview): {preview}")
print(f"Total characters: {total_length}")
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
config: ExtractionConfig = ExtractionConfig(
ocr=OcrConfig(backend="paddleocr", language="en") # model_tier="server" for max accuracy
)
result = extract_file_sync("scanned.pdf", config=config)
content: str = result.content
preview: str = content[:100]
total_length: int = len(content)
print(f"Extracted content (preview): {preview}")
print(f"Total characters: {total_length}")
```

View File

@@ -0,0 +1,15 @@
```python title="Python"
from kreuzberg import (
clear_document_extractors,
clear_post_processors,
clear_ocr_backends,
clear_validators,
)
clear_post_processors()
clear_validators()
clear_ocr_backends()
clear_document_extractors()
print("All plugins cleared")
```

View File

@@ -0,0 +1,44 @@
```python title="Python"
from kreuzberg import register_embedding_backend, EmbeddingConfig, embed_texts
from sentence_transformers import SentenceTransformer
# Wrap an already-loaded embedder (e.g. sentence-transformers, llama-cpp-python,
# or a tuned ONNX session) so kreuzberg can call back into it during chunking
# and standalone embed requests.
class MyEmbedder:
    """Embedding backend plugin wrapping a sentence-transformers model."""

    def __init__(self):
        # The model is loaded once, when the backend object is created.
        self._model = SentenceTransformer("BAAI/bge-base-en-v1.5")

    # Plugin trait hooks
    def name(self) -> str:
        return "my-embedder"

    def version(self) -> str:
        return "1.0.0"

    def initialize(self) -> None:
        # Optional warm-up; runs once at registration before dimensions() is cached.
        pass

    def shutdown(self) -> None:
        pass

    # EmbeddingBackend hooks
    def dimensions(self) -> int:
        # Captured once at registration; the dispatcher uses this for shape validation.
        return self._model.get_sentence_embedding_dimension()

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Encode ``texts`` with the wrapped model and return plain nested lists."""
        return self._model.encode(texts).tolist()

# Register once at startup. Reference by name in config.
register_embedding_backend(MyEmbedder())
config: EmbeddingConfig = {
"model": {"type": "plugin", "name": "my-embedder"},
# Optional: bound the wait on a hung backend (default: 60s; None disables)
"max_embed_duration_secs": 30,
}
vectors = embed_texts(["Hello, world!", "Second text"], config)
```

View File

@@ -0,0 +1,14 @@
```python title="Python"
from kreuzberg import register_document_extractor
class CustomExtractor:
    """Smallest possible extractor plugin: only identifies itself."""

    def name(self) -> str:
        return "custom"

    def version(self) -> str:
        return "1.0.0"

extractor = CustomExtractor()
register_document_extractor(extractor)
print("Extractor registered")
```

View File

@@ -0,0 +1,18 @@
```python title="Python"
from kreuzberg import (
list_document_extractors,
list_post_processors,
list_ocr_backends,
list_validators,
)
extractors: list[str] = list_document_extractors()
processors: list[str] = list_post_processors()
ocr_backends: list[str] = list_ocr_backends()
validators: list[str] = list_validators()
print(f"Extractors: {extractors}")
print(f"Processors: {processors}")
print(f"OCR backends: {ocr_backends}")
print(f"Validators: {validators}")
```

View File

@@ -0,0 +1,33 @@
```python title="Python"
from kreuzberg import register_validator, ExtractionResult, ValidationError
class MinLengthValidator:
    """Validator plugin that rejects results whose content is too short."""

    def __init__(self, min_length: int = 100):
        self.min_length: int = min_length

    def name(self) -> str:
        return "min_length_validator"

    def version(self) -> str:
        return "1.0.0"

    def priority(self) -> int:
        return 100

    def validate(self, result: ExtractionResult) -> None:
        """Raise ``ValidationError`` when content has fewer than ``min_length`` characters."""
        content_len: int = len(result.content)
        if content_len < self.min_length:
            raise ValidationError(f"Content too short: {content_len}")

    def should_validate(self, result: ExtractionResult) -> bool:
        # This validator applies to every result.
        return True

    def initialize(self) -> None:
        pass

    def shutdown(self) -> None:
        pass

validator: MinLengthValidator = MinLengthValidator(min_length=100)
register_validator(validator)
```

View File

@@ -0,0 +1,42 @@
```python title="Python"
from kreuzberg import register_post_processor, ExtractionResult
import logging
logger = logging.getLogger(__name__)
class PdfMetadataExtractor:
    """Post-processor plugin that adds bookkeeping fields to PDF results."""

    def __init__(self):
        self.processed_count: int = 0

    def name(self) -> str:
        return "pdf_metadata_extractor"

    def version(self) -> str:
        return "1.0.0"

    def description(self) -> str:
        return "Extracts and enriches PDF metadata"

    def processing_stage(self) -> str:
        return "early"

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only for results whose MIME type is application/pdf."""
        return result.mime_type == "application/pdf"

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Annotate ``result.metadata`` in place and return the same result.

        Adds a processed flag, the 1-based order in which this instance saw
        the result, the content length, and the processor version.
        """
        self.processed_count += 1
        result.metadata["pdf_processed"] = True
        result.metadata["pdf_order"] = self.processed_count
        result.metadata["content_length"] = len(result.content)
        result.metadata["pdf_processor_version"] = "1.0.0"
        return result

    def initialize(self) -> None:
        logger.info("PDF metadata extractor initialized")

    def shutdown(self) -> None:
        logger.info(f"Processed {self.processed_count} PDFs")

processor: PdfMetadataExtractor = PdfMetadataExtractor()
register_post_processor(processor)
```

View File

@@ -0,0 +1,19 @@
```python title="Python"
from kreuzberg import ExtractionResult, register_post_processor
class PdfOnlyProcessor:
    """Pass-through post-processor that opts in for PDF results only."""

    def name(self) -> str:
        return "pdf-only-processor"

    def version(self) -> str:
        return "1.0.0"

    def process(self, result: ExtractionResult) -> ExtractionResult:
        # No transformation; the example is about should_process().
        return result

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only for results whose MIME type is application/pdf."""
        return result.mime_type == "application/pdf"

processor: PdfOnlyProcessor = PdfOnlyProcessor()
register_post_processor(processor)
```

View File

@@ -0,0 +1,42 @@
```python title="Python"
from kreuzberg import register_document_extractor, ExtractionResult
import json
class CustomJsonExtractor:
    """Extractor plugin that flattens every string value in a JSON document."""

    def name(self) -> str:
        return "custom-json-extractor"

    def version(self) -> str:
        return "1.0.0"

    def supported_mime_types(self) -> list[str]:
        return ["application/json"]

    def priority(self) -> int:
        return 50

    def extract_bytes(
        self, content: bytes, mime_type: str, config: dict
    ) -> ExtractionResult:
        """Parse ``content`` as JSON and return its string values as text.

        ``mime_type`` and ``config`` are accepted but not used. The value
        returned is a plain dict with "content" and "mime_type" keys.
        Raises ``json.JSONDecodeError`` for input that is not valid JSON.
        """
        data: dict = json.loads(content)
        text: str = self._extract_text(data)
        return {"content": text, "mime_type": "application/json"}

    def _extract_text(self, obj: object) -> str:
        """Recursively collect strings, one per line, in document order.

        Dict keys, numbers, booleans and nulls contribute nothing.
        """
        if isinstance(obj, str):
            return f"{obj}\n"
        if isinstance(obj, list):
            return "".join(self._extract_text(item) for item in obj)
        if isinstance(obj, dict):
            return "".join(self._extract_text(v) for v in obj.values())
        return ""

    def initialize(self) -> None:
        pass

    def shutdown(self) -> None:
        pass

extractor: CustomJsonExtractor = CustomJsonExtractor()
register_document_extractor(extractor)
```

View File

@@ -0,0 +1,27 @@
```python title="Python"
import logging
logger = logging.getLogger(__name__)


class MyPlugin:
    """Skeleton plugin showing where to log during the plugin lifecycle."""

    def name(self) -> str:
        return "my-plugin"

    def version(self) -> str:
        return "1.0.0"

    def initialize(self) -> None:
        logger.info(f"Initializing plugin: {self.name()}")

    def shutdown(self) -> None:
        logger.info(f"Shutting down plugin: {self.name()}")

    def extract_bytes(
        self, content: bytes, mime_type: str, config: dict
    ) -> dict:
        """Log the request and return a placeholder result with empty content.

        The result always has empty content here, so the warning below is
        always emitted; a real plugin would fill in "content" first.
        """
        logger.info(f"Extracting {mime_type} ({len(content)} bytes)")
        result: dict = {"content": "", "mime_type": mime_type}
        if not result["content"]:
            logger.warning("Extraction resulted in empty content")
        return result

```

View File

@@ -0,0 +1,14 @@
```python title="Python"
import pytest
from kreuzberg import ExtractionResult
def test_custom_extractor() -> None:
    """Check that CustomJsonExtractor pulls string values out of a JSON payload."""
    extractor = CustomJsonExtractor()
    json_data: bytes = b'{"message": "Hello, world!"}'
    config: dict = {}
    result: ExtractionResult = extractor.extract_bytes(
        json_data, "application/json", config
    )
    # Calling the plugin directly yields the plain dict it builds, so the
    # fields are read by key rather than by attribute.
    assert "Hello, world!" in result["content"]
    assert result["mime_type"] == "application/json"

```

View File

@@ -0,0 +1,34 @@
```python title="Python"
from kreuzberg import (
ExtractionResult,
ValidationError,
extract_file_sync,
register_validator,
)
class MinLengthValidator:
    """Validator plugin that rejects results with fewer than 50 characters."""

    def name(self) -> str:
        return "min_length"

    def version(self) -> str:
        return "1.0.0"

    def validate(self, result: ExtractionResult) -> None:
        """Raise ``ValidationError`` when the content is shorter than 50 characters."""
        if len(result.content) < 50:
            raise ValidationError(f"Content too short: {len(result.content)}")

    def should_validate(self, result: ExtractionResult) -> bool:
        # This validator applies to every result.
        return True

    def initialize(self) -> None:
        pass

    def shutdown(self) -> None:
        pass

# Register the validator, then run an extraction and report the content length.
# NOTE(review): presumably registered validators are applied during
# extract_file_sync — confirm against the plugin documentation.
validator: MinLengthValidator = MinLengthValidator()
register_validator(validator)
result = extract_file_sync("document.pdf")
print(f"Content length: {len(result.content)}")
```

View File

@@ -0,0 +1,20 @@
```python title="Python"
from kreuzberg import ExtractionResult, ValidationError, register_validator
class QualityValidator:
    """Validator that rejects results whose ``quality_score`` is below 0.5."""

    def name(self) -> str:
        """Return the name this validator reports."""
        return "quality-validator"

    def version(self) -> str:
        """Return the validator's version string."""
        return "1.0.0"

    def validate(self, result: ExtractionResult) -> None:
        """Raise ``ValidationError`` when the metadata quality score is under 0.5.

        A result without a ``"quality_score"`` metadata entry is treated as
        scoring 0.0 and is therefore rejected.
        """
        score: float = result.metadata.get("quality_score", 0.0)
        if score < 0.5:
            message = f"Quality score too low: {score:.2f}"
            raise ValidationError(message)
# Build one validator instance and pass it to register_validator.
validator: QualityValidator = QualityValidator()
register_validator(validator)
```

View File

@@ -0,0 +1,28 @@
```python title="Python"
import threading
from kreuzberg import ExtractionResult
class StatefulPlugin:
    """Plugin that keeps mutable state and updates it while holding a lock."""

    def __init__(self):
        # State touched by process(); every update happens under self.lock.
        self.cache: dict = {}
        self.call_count: int = 0
        self.lock: threading.Lock = threading.Lock()

    def name(self) -> str:
        """Return the name this plugin reports."""
        return "stateful-plugin"

    def version(self) -> str:
        """Return the plugin's version string."""
        return "1.0.0"

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Count the call, remember the result's MIME type, and return it unchanged."""
        with self.lock:
            self.call_count = self.call_count + 1
            self.cache["last_mime"] = result.mime_type
        return result

    def initialize(self) -> None:
        """Lifecycle hook; nothing to set up."""
        return None

    def shutdown(self) -> None:
        """Lifecycle hook; nothing to release."""
        return None
```

View File

@@ -0,0 +1,20 @@
```python title="Python"
from kreuzberg import (
unregister_document_extractor,
unregister_post_processor,
unregister_ocr_backend,
unregister_validator,
)
# Each entry must equal the string returned by the corresponding plugin's
# name() method.
# NOTE(review): the validator entry is aligned with MinLengthValidator.name(),
# which returns "min_length" — confirm no other validator is meant here.
names: list[str] = [
    "custom-json-extractor",
    "word_count",
    "cloud-ocr",
    "min_length",
]

# Unpack into named variables so each call is tied to its plugin kind by name
# rather than by list position.
extractor_name, post_processor_name, ocr_backend_name, validator_name = names

unregister_document_extractor(extractor_name)
unregister_post_processor(post_processor_name)
unregister_ocr_backend(ocr_backend_name)
unregister_validator(validator_name)
```

View File

@@ -0,0 +1,30 @@
```python title="Python"
from kreuzberg import register_post_processor, ExtractionResult
class WordCountProcessor:
    """Post-processor that stores a whitespace-delimited word count in the metadata."""

    def name(self) -> str:
        """Return the name this processor reports."""
        return "word_count"

    def version(self) -> str:
        """Return the processor's version string."""
        return "1.0.0"

    def processing_stage(self) -> str:
        """Return the stage this processor reports itself for."""
        return "early"

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Write ``metadata["word_count"]`` and return the same result object."""
        result.metadata["word_count"] = len(result.content.split())
        return result

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only when the result has non-empty content."""
        return True if result.content else False

    def initialize(self) -> None:
        """Lifecycle hook; nothing to set up."""
        return None

    def shutdown(self) -> None:
        """Lifecycle hook; nothing to release."""
        return None
# Build one processor instance and pass it to register_post_processor.
processor: WordCountProcessor = WordCountProcessor()
register_post_processor(processor)
```

View File

@@ -0,0 +1,18 @@
```python title="Python"
from kreuzberg import (
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
# Extraction config whose chunking section selects an embedding model by
# preset name.
# NOTE(review): other snippets in this docs set call ChunkingConfig with
# max_characters=/overlap= instead of max_chars=/max_overlap= — confirm which
# keyword names the targeted kreuzberg release accepts.
config: ExtractionConfig = ExtractionConfig(
chunking=ChunkingConfig(
max_chars=1500,
max_overlap=200,
embedding=EmbeddingConfig(
model=EmbeddingModelType.preset("all-minilm-l6-v2")
),
)
)
```

View File

@@ -0,0 +1,36 @@
```python title="Python"
import asyncio
from kreuzberg import (
extract_file,
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
async def main() -> None:
    """Extract a PDF with chunk embeddings configured and report how many chunks have one."""
    embedding = EmbeddingConfig(
        model=EmbeddingModelType.preset("balanced"),
        normalize=True,
        batch_size=16,
    )
    chunking = ChunkingConfig(max_chars=500, max_overlap=50, embedding=embedding)
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)

    result = await extract_file("research_paper.pdf", config=config)

    # Keep a short preview and the vector size for every chunk that has an
    # embedding; result.chunks may be None, hence the `or []`.
    chunks_with_embeddings: list = [
        {
            "content": chunk.content[:100],
            "embedding_dims": len(chunk.embedding),
        }
        for chunk in result.chunks or []
        if chunk.embedding
    ]
    print(f"Chunks with embeddings: {len(chunks_with_embeddings)}")


asyncio.run(main())
```

View File

@@ -0,0 +1,21 @@
```python title="Python"
from kreuzberg import (
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
# Extraction config whose chunking section carries explicit embedding options
# (preset model, normalize, batch size, download-progress flag).
# NOTE(review): other snippets in this docs set call ChunkingConfig with
# max_characters=/overlap= instead of max_chars=/max_overlap= — confirm which
# keyword names the targeted kreuzberg release accepts.
config: ExtractionConfig = ExtractionConfig(
chunking=ChunkingConfig(
max_chars=1024,
max_overlap=100,
embedding=EmbeddingConfig(
model=EmbeddingModelType.preset("balanced"),
normalize=True,
batch_size=32,
show_download_progress=False,
),
)
)
```

Some files were not shown because too many files have changed in this diff Show More