Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,29 @@
```python title="Python"
from kreuzberg import (
extract_file_sync,
ExtractionConfig,
OcrConfig,
ChunkingConfig,
TokenReductionConfig,
LanguageDetectionConfig,
)
# Assemble the per-feature settings first, then combine them into one config.
# NOTE(review): sibling snippets spell these options `max_characters`/`overlap`
# and `TokenReductionConfig(level=...)` — confirm which form the installed
# kreuzberg version accepts.
ocr_settings = OcrConfig(backend="tesseract", language="eng+deu")
chunking_settings = ChunkingConfig(max_chars=1000, max_overlap=100)
reduction_settings = TokenReductionConfig(mode="light")
language_settings = LanguageDetectionConfig(enabled=True, detect_multiple=True)
config = ExtractionConfig(
    ocr=ocr_settings,
    chunking=chunking_settings,
    token_reduction=reduction_settings,
    language_detection=language_settings,
    use_cache=True,
    enable_quality_processing=True,
)

# Synchronous extraction of a single file with the combined config.
result = extract_file_sync("document.pdf", config=config)

# Fall back to an empty list when `chunks` is falsy so the loop is a no-op.
chunks = result.chunks or []
for chunk in chunks:
    print(f"Chunk: {chunk.content[:100]}")

# Report languages only when the result carries a non-empty value.
if result.detected_languages:
    print(f"Languages: {result.detected_languages}")
```

View File

@@ -0,0 +1,18 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig
# Enable chunking so every chunk carries page metadata that can be reported.
config = ExtractionConfig(chunking=ChunkingConfig(max_characters=500, overlap=50))
result = extract_file_sync("document.pdf", config=config)

# A falsy `chunks` value simply yields no iterations.
for chunk in result.chunks or []:
    first = chunk.metadata.first_page
    last = chunk.metadata.last_page
    # Chunks without a first page have no page range to show.
    if first is None:
        continue
    if first == last:
        page_range = f"Page {first}"
    else:
        page_range = f"Pages {first}-{last}"
    print(f"Chunk: {chunk.content[:50]}... ({page_range})")
```

View File

@@ -0,0 +1,59 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with chunking enabled and print each chunk's length."""
    chunking = ChunkingConfig(max_characters=1000, overlap=200)
    settings = ExtractionConfig(chunking=chunking)
    extraction = await extract_file("document.pdf", config=settings)
    # A falsy `chunks` value simply yields no iterations.
    pieces = extraction.chunks or []
    for piece in pieces:
        print(f"Length: {len(piece.content)}")


asyncio.run(main())
```
```python title="Python - Semantic"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with ``chunker_type="semantic"`` and preview each chunk."""
    chunking = ChunkingConfig(chunker_type="semantic")
    settings = ExtractionConfig(chunking=chunking)
    extraction = await extract_file("document.pdf", config=settings)
    pieces = extraction.chunks or []
    for piece in pieces:
        # Preview only the first 100 characters of the chunk.
        print(f"Content: {piece.content[:100]}...")


asyncio.run(main())
```
```python title="Python - Prepend Heading Context"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Extract ``document.md`` with markdown chunking and heading context prepended."""
    chunking = ChunkingConfig(
        chunker_type="markdown",
        max_characters=500,
        overlap=50,
        prepend_heading_context=True,
    )
    settings = ExtractionConfig(chunking=chunking)
    extraction = await extract_file("document.md", config=settings)
    pieces = extraction.chunks or []
    for piece in pieces:
        # Each chunk's content is prefixed with its heading breadcrumb
        print(f"Content: {piece.content[:100]}...")


asyncio.run(main())
```

View File

@@ -0,0 +1,39 @@
```python title="Python"
import asyncio
from kreuzberg import (
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
extract_file,
)
async def main() -> None:
    """Extract ``research_paper.pdf`` and print each chunk's index, byte span, preview and embedding size."""
    # NOTE(review): sibling snippets build the model with
    # `EmbeddingModelType.preset("balanced")` — confirm which form is current.
    embedding = EmbeddingConfig(
        model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
        normalize=True,
    )
    chunking = ChunkingConfig(max_characters=500, overlap=50, embedding=embedding)
    settings = ExtractionConfig(chunking=chunking)
    extraction = await extract_file("research_paper.pdf", config=settings)

    for piece in extraction.chunks or []:
        meta = piece.metadata
        # `chunk_index` is shown with +1 so the counter starts at 1.
        print(f"Chunk {meta.chunk_index + 1}/{meta.total_chunks}")
        print(f"Position: {meta.byte_start}-{meta.byte_end}")
        print(f"Content: {piece.content[:100]}...")
        vector = piece.embedding
        if vector is not None:
            print(f"Embedding: {len(vector)} dimensions")


asyncio.run(main())
```

View File

@@ -0,0 +1,34 @@
```python title="Python"
import asyncio
from kreuzberg import (
extract_file,
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
LanguageDetectionConfig,
TokenReductionConfig,
)
async def main() -> None:
    """Extract ``document.pdf`` with several features enabled and print a short summary."""
    # NOTE(review): sibling snippets spell these options `max_characters`/`overlap`
    # and `TokenReductionConfig(level=...)` — confirm which form is current.
    language_detection = LanguageDetectionConfig(enabled=True)
    token_reduction = TokenReductionConfig(mode="moderate")
    embedding = EmbeddingConfig(
        model=EmbeddingModelType.preset("balanced"),
        normalize=True,
    )
    chunking = ChunkingConfig(max_chars=512, max_overlap=50, embedding=embedding)
    settings = ExtractionConfig(
        enable_quality_processing=True,
        language_detection=language_detection,
        token_reduction=token_reduction,
        chunking=chunking,
    )
    extraction = await extract_file("document.pdf", config=settings)

    # A falsy quality score is shown as 0 so the float format always applies.
    quality = extraction.quality_score or 0
    print(f"Quality: {quality:.2f}")
    print(f"Languages: {extraction.detected_languages}")
    if extraction.chunks:
        print(f"Chunks: {len(extraction.chunks)}")


asyncio.run(main())
```

View File

@@ -0,0 +1,36 @@
```python title="Python"
import asyncio
from kreuzberg import (
extract_file,
ExtractionConfig,
OcrConfig,
TesseractConfig,
PdfConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
async def main() -> None:
    """Extract ``document.pdf`` with OCR, PDF, chunking and embedding options, then preview the content."""
    ocr = OcrConfig(
        backend="tesseract",
        language="eng+fra",
        tesseract_config=TesseractConfig(psm=3),
    )
    pdf_options = PdfConfig(extract_images=True)
    # NOTE(review): "all-MiniLM-L6-v2" is passed as a preset name while sibling
    # snippets use "balanced"; they also spell the chunk options
    # `max_characters`/`overlap` — confirm against the installed version.
    embedding = EmbeddingConfig(model=EmbeddingModelType.preset("all-MiniLM-L6-v2"))
    chunking = ChunkingConfig(max_chars=1000, max_overlap=200, embedding=embedding)
    settings = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True,
        ocr=ocr,
        pdf_options=pdf_options,
        chunking=chunking,
    )
    extraction = await extract_file("document.pdf", config=settings)
    # Preview only the first 100 characters of the extracted content.
    print(f"Content: {extraction.content[:100]}")


asyncio.run(main())
```

View File

@@ -0,0 +1,21 @@
```python title="Python"
from kreuzberg import (
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
# Embedding options are built first and then attached to the chunking config.
# NOTE(review): sibling snippets build the model with
# `EmbeddingModelType.preset("balanced")` — confirm which form is current.
embedding_settings = EmbeddingConfig(
    model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
    normalize=True,
    batch_size=32,
    show_download_progress=False,
)
chunking_settings = ChunkingConfig(
    max_characters=1024,
    overlap=100,
    embedding=embedding_settings,
)
config: ExtractionConfig = ExtractionConfig(chunking=chunking_settings)
```

View File

@@ -0,0 +1,13 @@
```python title="Python"
from kreuzberg import ExtractionConfig, KeywordConfig, KeywordAlgorithm
# Keyword extraction options, built separately and then attached to the config.
keyword_settings = KeywordConfig(
    algorithm=KeywordAlgorithm.YAKE,
    max_keywords=10,
    min_score=0.3,
    ngram_range=[1, 3],
    language="en",
)
config: ExtractionConfig = ExtractionConfig(keywords=keyword_settings)
```

View File

@@ -0,0 +1,27 @@
```python title="Python"
import asyncio
from kreuzberg import (
ExtractionConfig,
KeywordConfig,
KeywordAlgorithm,
extract_file,
)
async def main() -> None:
    """Extract ``research_paper.pdf`` with keyword extraction and print each keyword with its score."""
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.3,
    )
    settings = ExtractionConfig(keywords=keyword_settings)
    extraction = await extract_file("research_paper.pdf", config=settings)
    # A falsy `extracted_keywords` value simply yields no iterations.
    found = extraction.extracted_keywords or []
    for entry in found:
        print(f"{entry.text}: {entry.score:.3f}")


asyncio.run(main())
```

View File

@@ -0,0 +1,11 @@
```python title="Python"
from kreuzberg import ExtractionConfig, LanguageDetectionConfig
# Language detection options, built separately and then attached to the config.
language_settings = LanguageDetectionConfig(
    enabled=True,
    min_confidence=0.8,
    detect_multiple=False,
)
config: ExtractionConfig = ExtractionConfig(language_detection=language_settings)
```

View File

@@ -0,0 +1,21 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
async def main() -> None:
    """Extract ``multilingual_document.pdf`` with language detection and print the detected languages."""
    language_settings = LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.8,
        detect_multiple=True,
    )
    settings = ExtractionConfig(language_detection=language_settings)
    extraction = await extract_file("multilingual_document.pdf", config=settings)
    print(f"Detected languages: {extraction.detected_languages}")


asyncio.run(main())
```

View File

@@ -0,0 +1,7 @@
```python title="Python"
from kreuzberg import ExtractionConfig
# Minimal config: only the quality-processing flag is set.
config: ExtractionConfig = ExtractionConfig(enable_quality_processing=True)
```

View File

@@ -0,0 +1,21 @@
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, extract_file
async def main() -> None:
    """Extract ``scanned_document.pdf`` and report its quality score, warning below 0.5."""
    settings = ExtractionConfig(enable_quality_processing=True)
    extraction = await extract_file("scanned_document.pdf", config=settings)

    # Nothing is printed when no score is available.
    if extraction.quality_score is None:
        return
    if extraction.quality_score < 0.5:
        print(f"Warning: Low quality extraction ({extraction.quality_score:.2f})")
    else:
        print(f"Quality score: {extraction.quality_score:.2f}")


asyncio.run(main())
```

View File

@@ -0,0 +1,12 @@
```python title="Python"
from kreuzberg import ExtractionConfig, TokenReductionConfig, ReductionLevel
# Token reduction options, built separately and then attached to the config.
# NOTE(review): sibling snippets use `TokenReductionConfig(mode="...")` —
# confirm which form the installed kreuzberg version accepts.
reduction_settings = TokenReductionConfig(
    level=ReductionLevel.MODERATE,
    preserve_markdown=True,
    preserve_code=True,
    language_hint="eng",
)
config: ExtractionConfig = ExtractionConfig(token_reduction=reduction_settings)
```

View File

@@ -0,0 +1,25 @@
```python title="Python"
import asyncio
from kreuzberg import (
ExtractionConfig,
TokenReductionConfig,
ReductionLevel,
extract_file,
)
async def main() -> None:
    """Extract ``verbose_document.pdf`` with token reduction and print the resulting content length."""
    # NOTE(review): sibling snippets use `TokenReductionConfig(mode="...")` —
    # confirm which form the installed kreuzberg version accepts.
    reduction_settings = TokenReductionConfig(
        level=ReductionLevel.MODERATE,
        preserve_markdown=True,
    )
    settings = ExtractionConfig(token_reduction=reduction_settings)
    extraction = await extract_file("verbose_document.pdf", config=settings)
    print(f"Reduced content length: {len(extraction.content)} chars")


asyncio.run(main())
```

View File

@@ -0,0 +1,59 @@
```python title="Python"
import asyncio
from dataclasses import dataclass, field
from kreuzberg import (
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
extract_file,
)
@dataclass
class VectorRecord:
    """One chunk of a document paired with its embedding vector."""

    # Identifier for this record.
    id: str
    # Text of the chunk.
    content: str
    # Embedding vector for the chunk.
    embedding: list[float]
    # Extra string key/value pairs; each instance gets its own empty dict.
    metadata: dict[str, str] = field(default_factory=dict)
async def extract_and_vectorize(
    document_path: str,
    document_id: str,
) -> list[VectorRecord]:
    """Extract a document with chunk embeddings and turn the chunks into vector records.

    Args:
        document_path: Path of the document passed to ``extract_file``.
        document_id: Identifier used to build record ids and stored in each
            record's metadata.

    Returns:
        One ``VectorRecord`` per chunk that has an embedding; chunks whose
        embedding is ``None`` are skipped.
    """
    # NOTE(review): sibling snippets build the model with
    # `EmbeddingModelType.preset("balanced")` — confirm which form is current.
    embedding_settings = EmbeddingConfig(
        model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
        normalize=True,
        batch_size=32,
    )
    chunking_settings = ChunkingConfig(
        max_characters=512,
        overlap=50,
        embedding=embedding_settings,
    )
    settings = ExtractionConfig(chunking=chunking_settings)
    extraction = await extract_file(document_path, config=settings)

    # The index counts every chunk, including skipped ones, so record ids
    # keep the chunk's position in the extraction result.
    return [
        VectorRecord(
            id=f"{document_id}_chunk_{position}",
            content=piece.content,
            embedding=piece.embedding,
            metadata={
                "document_id": document_id,
                "chunk_index": str(position),
                "content_length": str(len(piece.content)),
            },
        )
        for position, piece in enumerate(extraction.chunks or [])
        if piece.embedding is not None
    ]


asyncio.run(extract_and_vectorize("document.pdf", "doc_001"))
```