Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,60 @@
```python title="Python"
import asyncio

from kreuzberg import (
    ChunkingConfig,
    EmbeddingConfig,
    EmbeddingModelType,
    ExtractionConfig,
    KeywordAlgorithm,
    KeywordConfig,
    LanguageDetectionConfig,
    OcrConfig,
    PostProcessorConfig,
    TokenReductionConfig,
    extract_file,
)


async def main() -> None:
    """Extract ``document.pdf`` with every optional stage configured.

    Builds a single ``ExtractionConfig`` that sets OCR, chunking with
    embeddings, language detection, keyword extraction, token reduction and
    post-processing, runs the extraction, and prints a short summary.
    """
    config: ExtractionConfig = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True,
        ocr=OcrConfig(
            backend="tesseract",
            language="eng",
        ),
        chunking=ChunkingConfig(
            # Keyword names match the other chunking snippets
            # (max_chars / max_overlap).
            max_chars=1000,
            max_overlap=200,
            embedding=EmbeddingConfig(
                model=EmbeddingModelType.preset("balanced"),
                batch_size=32,
                normalize=True,
            ),
        ),
        language_detection=LanguageDetectionConfig(
            enabled=True,
            min_confidence=0.8,
            detect_multiple=False,
        ),
        keywords=KeywordConfig(
            # Enum member and tuple range, as in the dedicated keyword examples.
            algorithm=KeywordAlgorithm.YAKE,
            max_keywords=10,
            min_score=0.1,
            ngram_range=(1, 3),
            language="en",
        ),
        token_reduction=TokenReductionConfig(
            mode="moderate",
            preserve_important_words=True,
        ),
        postprocessor=PostProcessorConfig(enabled=True),
    )
    result = await extract_file("document.pdf", config=config)

    # Only the first 100 characters of the extracted text are shown.
    print(f"Content: {result.content[:100]}")
    # Both fields can be empty/None, so guard before printing them.
    if result.detected_languages:
        print(f"Languages: {result.detected_languages}")
    if result.chunks:
        print(f"Chunks: {len(result.chunks)}")


asyncio.run(main())
```

View File

@@ -0,0 +1,80 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Chunk ``document.pdf`` and report the chunk count and each chunk's length."""
    chunking = ChunkingConfig(max_chars=1000, max_overlap=200)
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)
    extraction = await extract_file("document.pdf", config=config)

    # A falsy ``chunks`` value is treated as "no chunks".
    chunks = extraction.chunks or []
    print(f"Chunks: {len(chunks)}")
    for piece in chunks:
        print(f"Length: {len(piece.content)}")


asyncio.run(main())
```
```python title="Python - Markdown with Heading Context"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Chunk ``document.md`` with the markdown chunker and print heading context.

    For every chunk, prints each heading found under the chunk metadata's
    ``heading_context`` entry (if any), followed by the first 100 characters
    of the chunk content.
    """
    chunking = ChunkingConfig(
        chunker_type="markdown",
        max_chars=500,
        max_overlap=50,
        sizing_type="tokenizer",
        sizing_model="Xenova/gpt-4o",
    )
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)
    extraction = await extract_file("document.md", config=config)

    for piece in extraction.chunks or []:
        context = piece.metadata.get("heading_context")
        # No heading context means there are no headings to list for this chunk.
        headings = context.get("headings", []) if context else []
        for heading in headings:
            print(f"Heading L{heading['level']}: {heading['text']}")
        print(f"Content: {piece.content[:100]}...")


asyncio.run(main())
```
```python title="Python - Semantic"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Chunk ``document.pdf`` with ``chunker_type="semantic"`` and preview each chunk."""
    semantic_chunking = ChunkingConfig(chunker_type="semantic")
    config: ExtractionConfig = ExtractionConfig(chunking=semantic_chunking)
    extraction = await extract_file("document.pdf", config=config)

    # Show the first 100 characters of every chunk; a falsy list means none.
    for piece in extraction.chunks or []:
        print(f"Content: {piece.content[:100]}...")


asyncio.run(main())
```
```python title="Python - Prepend Heading Context"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Chunk ``document.md`` with ``prepend_heading_context=True`` and preview each chunk."""
    chunking = ChunkingConfig(
        chunker_type="markdown",
        max_chars=500,
        max_overlap=50,
        prepend_heading_context=True,
    )
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)
    extraction = await extract_file("document.md", config=config)

    for piece in extraction.chunks or []:
        # With the option above, each chunk's content starts with its
        # heading breadcrumb.
        print(f"Content: {piece.content[:100]}...")


asyncio.run(main())
```

View File

@@ -0,0 +1,14 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file, ExtractionConfig
async def main() -> None:
    """Extract ``document.pdf`` with caching and quality processing on; print the text."""
    settings = ExtractionConfig(enable_quality_processing=True, use_cache=True)
    extraction = await extract_file("document.pdf", config=settings)
    print(extraction.content)


asyncio.run(main())
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with a default config; print a preview and the length."""
    default_config: ExtractionConfig = ExtractionConfig()
    extraction = await extract_file("document.pdf", config=default_config)

    text: str = extraction.content
    # The preview is limited to the first 100 characters.
    print(f"Content preview: {text[:100]}")
    print(f"Total length: {len(text)}")


asyncio.run(main())
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
async def main() -> None:
    """Run Tesseract OCR (``eng+fra``, ``psm=3``) on ``document.pdf`` and print the text."""
    tesseract = TesseractConfig(psm=3)
    ocr = OcrConfig(
        backend="tesseract",
        language="eng+fra",
        tesseract_config=tesseract,
    )
    config: ExtractionConfig = ExtractionConfig(ocr=ocr)
    extraction = await extract_file("document.pdf", config=config)
    print(extraction.content)


asyncio.run(main())
```

View File

@@ -0,0 +1,18 @@
```python title="Python"
from kreuzberg import (
extract_file_sync,
ExtractionConfig,
OcrConfig,
ChunkingConfig,
)
# One config object carrying cache, OCR and chunking settings together.
config: ExtractionConfig = ExtractionConfig(
    chunking=ChunkingConfig(max_overlap=200, max_chars=1000),
    ocr=OcrConfig(language="eng", backend="tesseract"),
    use_cache=True,
)

# Synchronous extraction: no event loop is needed.
result = extract_file_sync("document.pdf", config=config)

content_length: int = len(result.content)
print(f"Content length: {content_length}")
```

View File

@@ -0,0 +1,15 @@
```python title="Document Structure Config (Python)"
from kreuzberg import extract_file_sync, ExtractionConfig
# Request the document structure alongside the extracted text.
config = ExtractionConfig(include_document_structure=True)
result = extract_file_sync("document.pdf", config=config)

# ``result.document`` may be falsy; when present, walk its "nodes" entries and
# print each node's type with up to 80 characters of its text.
for node in result.document["nodes"] if result.document else []:
    node_type = node["content"]["node_type"]
    text = node["content"].get("text", "")
    print(f"[{node_type}] {text[:80]}")
```

View File

@@ -0,0 +1,29 @@
```python title="Element-Based Output (Python)"
from kreuzberg import extract_file_sync, ExtractionConfig
# Select the element-based result format, then extract.
config = ExtractionConfig(result_format="element_based")
result = extract_file_sync("document.pdf", config=config)

# Dump every element: type, first 100 characters of text, and whichever of
# page number / coordinates its metadata carries.
for element in result.elements:
    print(f"Type: {element.element_type}")
    print(f"Text: {element.text[:100]}")

    if element.metadata.page_number:
        print(f"Page: {element.metadata.page_number}")

    if element.metadata.coordinates:
        coords = element.metadata.coordinates
        top_left = f"({coords.left}, {coords.top})"
        bottom_right = f"({coords.right}, {coords.bottom})"
        print(f"Coords: {top_left} - {bottom_right}")

    print("---")

# Keep only the elements typed "title" and print each with its "level"
# entry from the additional metadata ("unknown" when absent).
titles = [item for item in result.elements if item.element_type == "title"]
for title in titles:
    level = title.metadata.additional.get("level", "unknown")
    print(f"[{level}] {title.text}")
```

View File

@@ -0,0 +1,15 @@
```python title="Python"
from kreuzberg import ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType
# Chunking with embeddings generated by a specific Hugging Face model.
# ``EmbeddingModelType.preset`` takes a preset name (e.g. "balanced"); a
# concrete model id such as all-mpnet-base-v2 is selected with
# ``EmbeddingModelType.custom`` together with its vector size.
config = ExtractionConfig(
    chunking=ChunkingConfig(
        max_chars=1000,
        embedding=EmbeddingConfig(
            model=EmbeddingModelType.custom(
                model_id="sentence-transformers/all-mpnet-base-v2",
                dimensions=768,
            ),
            batch_size=16,
            normalize=True,
            show_download_progress=True,
        ),
    )
)
```

View File

@@ -0,0 +1,83 @@
from kreuzberg import EmbeddingConfig, EmbeddingModelType, ChunkingConfig, ExtractionConfig
# Example 1: Preset model (recommended)
# A preset ("fast", "balanced" or "quality") selects a configuration tuned
# for common use cases.
embedding_config = EmbeddingConfig(
    cache_dir="~/.cache/kreuzberg/embeddings",
    show_download_progress=True,
    normalize=True,
    batch_size=32,
    model=EmbeddingModelType.preset("balanced"),
)
# Preset names and when to use them:
# - "fast" (384 dims): quick prototyping, development, resource-constrained
# - "balanced" (768 dims): production, general-purpose RAG, English documents
# - "quality" (1024 dims): complex documents, maximum accuracy
# - "multilingual" (768 dims): international documents, 100+ languages
# Example 2: Custom ONNX model (requires embeddings feature)
# Names a specific ONNX embedding model from HuggingFace and states its
# vector dimensions explicitly.
embedding_config = EmbeddingConfig(
    cache_dir=None,  # Uses default: .kreuzberg/embeddings/
    show_download_progress=True,
    normalize=True,
    batch_size=32,
    model=EmbeddingModelType.custom(
        dimensions=384,
        model_id="BAAI/bge-small-en-v1.5",
    ),
)
# Popular ONNX-compatible models:
# - "BAAI/bge-small-en-v1.5" (384 dims): fast, efficient
# - "BAAI/bge-base-en-v1.5" (768 dims): balanced quality/speed
# - "BAAI/bge-large-en-v1.5" (1024 dims): high quality, slower
# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): multilingual support
# Example 3: Alternative Custom Model
# For advanced users who want a different ONNX embedding model.
embedding_config = EmbeddingConfig(
    cache_dir="/var/cache/embeddings",
    show_download_progress=True,
    normalize=True,
    batch_size=16,  # Larger model, so a smaller batch size is used
    model=EmbeddingModelType.custom(
        dimensions=768,
        model_id="sentence-transformers/all-mpnet-base-v2",
    ),
)
# Integration with ChunkingConfig
# Embeddings are attached to extraction through the chunking configuration.
chunking_with_embeddings = ChunkingConfig(
    embedding=EmbeddingConfig(),  # No arguments: uses balanced preset
    preset="balanced",
    max_overlap=100,
    max_chars=1024,
)
extraction_config = ExtractionConfig(chunking=chunking_with_embeddings)
# Key parameter explanations:
#
# batch_size: Number of texts to embed at once (32-128 typical)
# - Larger batches are faster but use more memory
# - Smaller batches for resource-constrained environments
#
# normalize: Whether to normalize vectors (L2 norm)
# - True (recommended): Enables cosine similarity in vector DBs
# - False: Raw embedding values
#
# cache_dir: Where to store downloaded models
# - None: Uses .kreuzberg/embeddings/ in current directory
# - String path: Custom directory for model storage
#
# show_download_progress: Display download progress bar
# - Useful for monitoring large model downloads

View File

@@ -0,0 +1,72 @@
from kreuzberg import PdfConfig, HierarchyConfig, ExtractionConfig, extract_file_sync
# Example 1: Basic hierarchy extraction
# Hierarchy enabled with the default k_clusters=6 (standard H1-H6 heading
# hierarchy), plus bounding boxes for spatial layout awareness.
hierarchy_config_basic = HierarchyConfig(
    ocr_coverage_threshold=None,  # No OCR coverage threshold
    include_bbox=True,  # Include bounding box coordinates
    k_clusters=6,  # Default: creates 6 font size clusters (H1-H6 structure)
    enabled=True,
)
# The hierarchy settings travel inside PdfConfig, which in turn is passed to
# ExtractionConfig as ``pdf_options``.
pdf_config_basic = PdfConfig(hierarchy=hierarchy_config_basic)
extraction_config_basic = ExtractionConfig(pdf_options=pdf_config_basic)
result = extract_file_sync("document.pdf", config=extraction_config_basic)
# Example 2: Custom k_clusters for minimal structure
# Three clusters give a simpler hierarchy, useful when only the major
# section divisions matter (Main, Subsection, Detail).
hierarchy_config_minimal = HierarchyConfig(
    ocr_coverage_threshold=None,
    include_bbox=True,
    k_clusters=3,  # Minimal clustering: just 3 levels
    enabled=True,
)
pdf_config_minimal = PdfConfig(hierarchy=hierarchy_config_minimal)
extraction_config_minimal = ExtractionConfig(pdf_options=pdf_config_minimal)
result = extract_file_sync("document.pdf", config=extraction_config_minimal)
# Example 3: With OCR coverage threshold
# OCR is triggered if less than 50% of text has font data, which suits
# documents mixing digital and scanned content.
hierarchy_config_ocr = HierarchyConfig(
    ocr_coverage_threshold=0.5,  # Trigger OCR if text coverage < 50%
    include_bbox=True,
    k_clusters=6,
    enabled=True,
)
pdf_config_ocr = PdfConfig(hierarchy=hierarchy_config_ocr)
extraction_config_ocr = ExtractionConfig(pdf_options=pdf_config_ocr)
result = extract_file_sync("document.pdf", config=extraction_config_ocr)
# Field descriptions:
#
# enabled: bool (default: True)
# - Enable or disable hierarchy extraction
# - When False, hierarchy structure is not analyzed
#
# k_clusters: int (default: 6, valid: 1-7)
# - Number of font size clusters for hierarchy levels
# - 6 provides H1-H6 heading levels with body text
# - Higher values create more fine-grained hierarchy
# - Lower values create simpler structure
#
# include_bbox: bool (default: True)
# - Include bounding box coordinates in hierarchy blocks
# - Required for spatial layout awareness and document structure
# - Set to False only if space optimization is critical
#
# ocr_coverage_threshold: float | None (default: None)
# - Range: 0.0 to 1.0
# - Triggers OCR when text block coverage falls below this fraction
# - Example: 0.5 means "run OCR if less than 50% of page has text data"
# - None means no OCR coverage-based triggering

View File

@@ -0,0 +1,17 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` as HTML using the "github" theme with embedded CSS."""
    html_options = {
        "theme": "github",
        "embed_css": True,
    }
    config = ExtractionConfig(output_format="html", html_output=html_options)
    extraction = await extract_file("document.pdf", config=config)
    print(extraction.content)  # HTML with kb-* classes and GitHub theme


asyncio.run(main())
```

View File

@@ -0,0 +1,75 @@
import asyncio
from kreuzberg import (
ExtractionConfig,
KeywordConfig,
KeywordAlgorithm,
YakeParams,
RakeParams,
extract_file,
)
# Example 1: Basic YAKE configuration
# YAKE with no algorithm-specific parameter objects and language "en".
async def basic_yake() -> None:
    """Extract keywords from ``document.pdf`` with a plain YAKE setup and print them."""
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.0,
        ngram_range=(1, 3),
        language="en",
        # Both parameter objects are passed explicitly as None.
        yake_params=None,
        rake_params=None,
    )
    config = ExtractionConfig(keywords=keyword_settings)
    extraction = await extract_file("document.pdf", config=config)
    print(f"Keywords: {extraction.keywords}")
# Example 2: Advanced YAKE with custom parameters
# Passes a YakeParams object to set a custom window size for co-occurrence
# analysis.
async def advanced_yake() -> None:
    """Extract keywords from ``document.pdf`` using YAKE with ``window_size=1``."""
    yake_tuning = YakeParams(window_size=1)
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=15,
        min_score=0.1,
        ngram_range=(1, 2),
        language="en",
        yake_params=yake_tuning,
        rake_params=None,
    )
    config = ExtractionConfig(keywords=keyword_settings)
    extraction = await extract_file("document.pdf", config=config)
    print(f"Keywords: {extraction.keywords}")
# Example 3: RAKE configuration
# RAKE for rapid keyword extraction, with constraints on word length and
# words per phrase supplied through RakeParams.
async def rake_config() -> None:
    """Extract keywords from ``document.pdf`` using RAKE with phrase constraints."""
    rake_tuning = RakeParams(
        min_word_length=1,
        max_words_per_phrase=3,
    )
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.RAKE,
        max_keywords=10,
        min_score=5.0,
        ngram_range=(1, 3),
        language="en",
        yake_params=None,
        rake_params=rake_tuning,
    )
    config = ExtractionConfig(keywords=keyword_settings)
    extraction = await extract_file("document.pdf", config=config)
    print(f"Keywords: {extraction.keywords}")


# Only the basic YAKE example is run when this file is executed directly.
if __name__ == "__main__":
    asyncio.run(basic_yake())

View File

@@ -0,0 +1,24 @@
```python title="Python"
import asyncio
from kreuzberg import (
ExtractionConfig,
KeywordConfig,
KeywordAlgorithm,
extract_file,
)
async def main() -> None:
    """Extract ``document.pdf`` with YAKE keywords configured; print the text length."""
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.3,
        ngram_range=(1, 3),
        language="en",
    )
    config: ExtractionConfig = ExtractionConfig(keywords=keyword_settings)
    extraction = await extract_file("document.pdf", config=config)
    print(f"Content extracted: {len(extraction.content)} chars")


asyncio.run(main())
```

View File

@@ -0,0 +1,19 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with language detection; print the first detected language."""
    detection = LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.85,
        detect_multiple=False,
    )
    config: ExtractionConfig = ExtractionConfig(language_detection=detection)
    extraction = await extract_file("document.pdf", config=config)

    languages = extraction.detected_languages
    # Index 0 is only read when at least one language was reported.
    if languages:
        print(f"Primary language: {languages[0]}")
    print(f"Content length: {len(extraction.content)} chars")


asyncio.run(main())
```

View File

@@ -0,0 +1,26 @@
```python title="Python"
from kreuzberg import (
extract_file_sync,
ExtractionConfig,
OcrConfig,
TesseractConfig,
ImagePreprocessingConfig,
)
# Tesseract OCR whose image preprocessing is configured with target_dpi=300.
config: ExtractionConfig = ExtractionConfig(
    ocr=OcrConfig(
        tesseract_config=TesseractConfig(
            preprocessing=ImagePreprocessingConfig(target_dpi=300),
        ),
        backend="tesseract",
    ),
)
result = extract_file_sync("scanned.pdf", config=config)

# Summarise the result: size of the text and number of tables.
table_count: int = len(result.tables)
content_length: int = len(result.content)
print(f"Content length: {content_length} characters")
print(f"Tables detected: {table_count}")
```

View File

@@ -0,0 +1,18 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, PdfConfig, HierarchyConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with PDF-specific options and print a 100-char preview.

    The PDF options request images and metadata, supply two candidate
    passwords, and enable hierarchy extraction with ``k_clusters=6``.
    """
    hierarchy = HierarchyConfig(enabled=True, k_clusters=6)
    pdf_options = PdfConfig(
        extract_images=True,
        extract_metadata=True,
        passwords=["password1", "password2"],
        hierarchy=hierarchy,
    )
    config: ExtractionConfig = ExtractionConfig(pdf_options=pdf_options)
    extraction = await extract_file("document.pdf", config=config)
    print(f"Content: {extraction.content[:100]}")


asyncio.run(main())
```

View File

@@ -0,0 +1,22 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig
# PDF extraction with metadata and a fully specified hierarchy configuration.
config: ExtractionConfig = ExtractionConfig(
    pdf_options=PdfConfig(
        hierarchy=HierarchyConfig(
            ocr_coverage_threshold=0.8,
            include_bbox=True,
            k_clusters=6,
            enabled=True,
        ),
        extract_metadata=True,
    )
)
result = extract_file_sync("document.pdf", config=config)

# Walk the per-page results (a falsy ``pages`` means nothing to show) and
# print each page number with the first 100 characters of its content.
for page in result.pages or []:
    preview = page.content[:100]
    print(f"Page {page.page_number}:")
    print(f" Content: {preview}...")
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, PostProcessorConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with only the "deduplication" post-processor listed."""
    post = PostProcessorConfig(
        enabled=True,
        enabled_processors=["deduplication"],
    )
    config: ExtractionConfig = ExtractionConfig(postprocessor=post)
    extraction = await extract_file("document.pdf", config=config)
    print(f"Content: {extraction.content[:100]}")


asyncio.run(main())
```

View File

@@ -0,0 +1,15 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with quality processing on and print the quality score."""
    config: ExtractionConfig = ExtractionConfig(enable_quality_processing=True)
    extraction = await extract_file("document.pdf", config=config)

    # A missing (falsy) score is reported as 0.0.
    score: float = extraction.quality_score or 0.0
    print(f"Quality score: {score:.2f}")


asyncio.run(main())
```

View File

@@ -0,0 +1,21 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
async def main() -> None:
    """OCR ``document.pdf`` in three languages with tuned Tesseract settings.

    Prints the first 100 characters of the extracted content.
    """
    tesseract = TesseractConfig(
        psm=6,
        oem=1,
        min_confidence=0.8,
        enable_table_detection=True,
    )
    ocr = OcrConfig(language="eng+fra+deu", tesseract_config=tesseract)
    config: ExtractionConfig = ExtractionConfig(ocr=ocr)
    extraction = await extract_file("document.pdf", config=config)
    print(f"Content: {extraction.content[:100]}")


asyncio.run(main())
```

View File

@@ -0,0 +1,10 @@
```python title="Python"
from kreuzberg import ExtractionConfig, TokenReductionConfig
# Extraction config that sets only token reduction: "moderate" mode with
# important words preserved.
config: ExtractionConfig = ExtractionConfig(
    token_reduction=TokenReductionConfig(
        preserve_important_words=True,
        mode="moderate",
    )
)
```