Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
```python title="Python"
from kreuzberg import (
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
# Build the configuration bottom-up: the embedding settings first, then the
# chunking settings that carry them, then the top-level extraction config.
# NOTE(review): confirm "all-minilm-l6-v2" is a name accepted by preset().
embedding_settings: EmbeddingConfig = EmbeddingConfig(
    model=EmbeddingModelType.preset("all-minilm-l6-v2"),
)
chunking_settings: ChunkingConfig = ChunkingConfig(
    max_chars=1500,
    max_overlap=200,
    embedding=embedding_settings,
)
config: ExtractionConfig = ExtractionConfig(chunking=chunking_settings)
```

View File

@@ -0,0 +1,36 @@
```python title="Python"
import asyncio
from kreuzberg import (
extract_file,
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
async def main() -> None:
    """Extract "research_paper.pdf" with chunk embeddings and count the embedded chunks."""
    embedding_settings = EmbeddingConfig(
        model=EmbeddingModelType.preset("balanced"),
        normalize=True,
        batch_size=16,
    )
    config: ExtractionConfig = ExtractionConfig(
        chunking=ChunkingConfig(
            max_chars=500,
            max_overlap=50,
            embedding=embedding_settings,
        )
    )
    result = await extract_file("research_paper.pdf", config=config)

    # For every chunk with a truthy embedding, keep a 100-character preview
    # and the embedding length. `or []` covers a falsy (e.g. None) chunk list.
    chunks_with_embeddings: list = [
        {
            "content": chunk.content[:100],
            "embedding_dims": len(chunk.embedding),
        }
        for chunk in result.chunks or []
        if chunk.embedding
    ]
    print(f"Chunks with embeddings: {len(chunks_with_embeddings)}")


asyncio.run(main())
```

View File

@@ -0,0 +1,21 @@
```python title="Python"
from kreuzberg import (
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
# Embedding settings are declared on their own so each option is easy to spot,
# then nested into the chunking settings and the top-level extraction config.
embedding_settings: EmbeddingConfig = EmbeddingConfig(
    model=EmbeddingModelType.preset("balanced"),
    normalize=True,
    batch_size=32,
    show_download_progress=False,
)
chunking_settings: ChunkingConfig = ChunkingConfig(
    max_chars=1024,
    max_overlap=100,
    embedding=embedding_settings,
)
config: ExtractionConfig = ExtractionConfig(chunking=chunking_settings)
```

View File

@@ -0,0 +1,33 @@
```python title="Python"
from kreuzberg import extract_file_sync, extract_bytes_sync, ExtractionConfig
from kreuzberg import (
KreuzbergError,
ParsingError,
OCRError,
ValidationError,
)
# --- Extraction from a file path ---
# Handlers are tried top to bottom. KreuzbergError is listed last, so it only
# sees errors the earlier clauses did not match (presumably it is the
# library's base exception -- confirm).
try:
    result = extract_file_sync("document.pdf")
    print(f"Extracted {len(result.content)} characters")
except FileNotFoundError as exc:
    print(f"File not found: {exc}")
except ParsingError as exc:
    print(f"Failed to parse document: {exc}")
except OCRError as exc:
    print(f"OCR processing failed: {exc}")
except KreuzbergError as exc:
    print(f"Extraction error: {exc}")

# --- Extraction from in-memory bytes ---
# The config is constructed inside the try so that a ValidationError raised
# while building it is reported by the handler below.
try:
    config: ExtractionConfig = ExtractionConfig()
    pdf_bytes: bytes = b"%PDF-1.4\n"
    result = extract_bytes_sync(pdf_bytes, "application/pdf", config)
    print(f"Extracted: {result.content[:100]}")
except ValidationError as exc:
    print(f"Invalid configuration: {exc}")
except OCRError as exc:
    print(f"OCR failed: {exc}")
except KreuzbergError as exc:
    print(f"Extraction failed: {exc}")
```

View File

@@ -0,0 +1,19 @@
```python title="Python"
import httpx
# Upload "document.pdf" to the extraction endpoint and report either the
# number of returned documents or the error described by the server.
try:
    # Both the HTTP client and the file handle are closed when the block exits.
    with httpx.Client() as client, open("document.pdf", "rb") as f:
        files: dict = {"files": f}
        response: httpx.Response = client.post(
            "http://localhost:8000/extract", files=files
        )
        # Turn 4xx/5xx responses into httpx.HTTPStatusError.
        response.raise_for_status()
        results: list = response.json()
        print(f"Extracted {len(results)} documents")
except httpx.HTTPStatusError as e:
    # An error response is not guaranteed to carry a JSON object (for example
    # an HTML page from a proxy), so decode defensively instead of letting a
    # second exception escape from the error handler itself.
    try:
        payload = e.response.json()
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        payload = None

    if isinstance(payload, dict):
        error_type: str = payload.get("error_type", "Unknown")
        message: str = payload.get("message", "No message")
    else:
        # Fall back to the raw body text when no structured error is available.
        error_type = "Unknown"
        message = e.response.text or "No message"
    print(f"Error: {error_type}: {message}")
```

View File

@@ -0,0 +1,19 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, ImageExtractionConfig, extract_file
async def main() -> None:
    """Extract "document.pdf" with image extraction configured and print a content preview."""
    image_settings = ImageExtractionConfig(
        extract_images=True,
        target_dpi=200,
        max_image_dimension=2048,
        inject_placeholders=True,  # set to False to extract images without markdown references
        auto_adjust_dpi=True,
    )
    config: ExtractionConfig = ExtractionConfig(images=image_settings)
    result = await extract_file("document.pdf", config=config)
    # Only the first 100 characters of the content are printed.
    print(f"Extracted: {result.content[:100]}")


asyncio.run(main())
```

View File

@@ -0,0 +1,29 @@
```python title="Python"
import asyncio
from kreuzberg import (
ExtractionConfig,
OcrConfig,
TesseractConfig,
ImagePreprocessingConfig,
extract_file,
)
async def main() -> None:
    """Extract "scanned.pdf" using an OCR config with Tesseract image preprocessing."""
    # Assemble the nested configuration from the innermost object outwards.
    preprocessing = ImagePreprocessingConfig(
        target_dpi=300,
        denoise=True,
        deskew=True,
        contrast_enhance=True,
        binarization_method="otsu",
    )
    tesseract = TesseractConfig(preprocessing=preprocessing)
    config: ExtractionConfig = ExtractionConfig(
        ocr=OcrConfig(tesseract_config=tesseract)
    )
    result = await extract_file("scanned.pdf", config=config)
    # Only the first 100 characters of the content are printed.
    print(f"Content: {result.content[:100]}")


asyncio.run(main())
```

View File

@@ -0,0 +1,22 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm
async def main() -> None:
    """Extract "research_paper.pdf" with YAKE keyword settings and print each keyword with its score."""
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.3,
    )
    config: ExtractionConfig = ExtractionConfig(keywords=keyword_settings)
    result = await extract_file("research_paper.pdf", config=config)

    # `or []` covers a falsy (e.g. None) keyword list.
    for kw in result.extracted_keywords or []:
        # Falsy score/text values are replaced by 0.0 and "" before printing.
        score: float = kw.score or 0.0
        text: str = kw.text or ""
        print(f"{text}: {score:.3f}")


asyncio.run(main())
```

View File

@@ -0,0 +1,15 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
async def main() -> None:
    """Extract "document.pdf" with language detection configured and print the detected languages."""
    detection = LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.9,
        detect_multiple=True,
    )
    config: ExtractionConfig = ExtractionConfig(language_detection=detection)
    result = await extract_file("document.pdf", config=config)
    print(f"Languages: {result.detected_languages}")


asyncio.run(main())
```

View File

@@ -0,0 +1,18 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
async def main() -> None:
    """Extract "multilingual_document.pdf" and report how many languages were detected."""
    detection = LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.7,
        detect_multiple=True,
    )
    config: ExtractionConfig = ExtractionConfig(language_detection=detection)
    result = await extract_file("multilingual_document.pdf", config=config)
    # A falsy (e.g. None) result is normalised to an empty list so len() is safe.
    languages: list[str] = result.detected_languages or []
    print(f"Detected {len(languages)} languages: {languages}")


asyncio.run(main())
```

View File

@@ -0,0 +1,14 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig

config = ExtractionConfig(enable_quality_processing=True)

# extract_file is a coroutine function and must be awaited; calling it from
# this synchronous script would yield a coroutine object with no
# `quality_score` attribute. Use the blocking extract_file_sync variant.
result = extract_file_sync("scanned_document.pdf", config=config)

# A falsy (e.g. None) score is treated as 0.0 and therefore reported as low quality.
quality_score = result.quality_score or 0.0
if quality_score < 0.5:
    print(f"Warning: Low quality extraction ({quality_score:.2f})")
    print("Consider re-scanning with higher DPI or adjusting OCR settings")
else:
    print(f"Quality score: {quality_score:.2f}")
```

View File

@@ -0,0 +1,19 @@
```python title="Python"
import asyncio

from kreuzberg import embed_sync, embed, EmbeddingConfig, EmbeddingModelType

# Synchronous
embeddings = embed_sync(
    ["Hello, world!", "Kreuzberg is fast"],
    config=EmbeddingConfig(model=EmbeddingModelType.preset("balanced"), normalize=True),
)
# One vector per input text; this example expects 768 values per vector.
assert len(embeddings) == 2
assert len(embeddings[0]) == 768


# Asynchronous
async def main() -> None:
    """Embed the same two texts through the awaitable `embed` API."""
    embeddings = await embed(
        ["Hello, world!", "Kreuzberg is fast"],
        config=EmbeddingConfig(model=EmbeddingModelType.preset("balanced"), normalize=True),
    )
    assert len(embeddings) == 2


# Actually run the coroutine; without this call the async example (and its
# assertion) would be defined but never executed.
asyncio.run(main())
```

View File

@@ -0,0 +1,12 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, ExtractedTable
# Extract with a default configuration, then walk every table in the result.
default_config = ExtractionConfig()
result = extract_file_sync("document.pdf", config=default_config)

for table in result.tables:
    # The row count is the length of table.cells, which is iterated row by row below.
    row_count: int = len(table.cells)
    print(f"Table with {row_count} rows")
    print(table.markdown)
    for row in table.cells:
        print(row)
```

View File

@@ -0,0 +1,15 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, TokenReductionConfig, extract_file
async def main() -> None:
    """Extract "document.pdf" with token reduction configured and print the content length."""
    reduction = TokenReductionConfig(
        mode="moderate",
        preserve_important_words=True,
    )
    config: ExtractionConfig = ExtractionConfig(token_reduction=reduction)
    result = await extract_file("document.pdf", config=config)
    print(f"Content length: {len(result.content)}")


asyncio.run(main())
```

View File

@@ -0,0 +1,19 @@
```python title="Python"
import asyncio
from kreuzberg import extract_file, ExtractionConfig, TokenReductionConfig
async def main() -> None:
    """Extract "verbose_document.pdf" with token reduction and print the token statistics."""
    reduction = TokenReductionConfig(
        mode="moderate",
        preserve_important_words=True,
    )
    config: ExtractionConfig = ExtractionConfig(token_reduction=reduction)
    result = await extract_file("verbose_document.pdf", config=config)

    # Statistics are read from the result metadata; missing keys fall back to
    # zero so the report below still prints.
    stats = result.metadata
    original: int = stats.get("original_token_count", 0)
    reduced: int = stats.get("token_count", 0)
    ratio: float = stats.get("token_reduction_ratio", 0.0)
    print(f"Reduced from {original} to {reduced} tokens")
    print(f"Reduction: {ratio * 100:.1f}%")


asyncio.run(main())
```

View File

@@ -0,0 +1,28 @@
```python title="Python"
import asyncio
from kreuzberg import (
extract_file,
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
async def main() -> None:
    """Extract "document.pdf" into embedded chunks and print an id and preview for each."""
    embedding_settings = EmbeddingConfig(
        model=EmbeddingModelType.preset("balanced"),
        normalize=True,
    )
    config: ExtractionConfig = ExtractionConfig(
        chunking=ChunkingConfig(
            max_chars=512,
            max_overlap=50,
            embedding=embedding_settings,
        )
    )
    result = await extract_file("document.pdf", config=config)

    # `or []` covers a falsy (e.g. None) chunk list; ids are derived from the
    # chunk's position in that list.
    for i, chunk in enumerate(result.chunks or []):
        chunk_id: str = f"doc_chunk_{i}"
        print(f"Chunk {chunk_id}: {chunk.content[:50]}")


asyncio.run(main())
```