This commit is contained in:
29
docs/snippets/python/advanced/advanced_config.md
Normal file
29
docs/snippets/python/advanced/advanced_config.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
extract_file_sync,
|
||||
ExtractionConfig,
|
||||
OcrConfig,
|
||||
ChunkingConfig,
|
||||
TokenReductionConfig,
|
||||
LanguageDetectionConfig,
|
||||
)
|
||||
|
||||
config = ExtractionConfig(
|
||||
ocr=OcrConfig(backend="tesseract", language="eng+deu"),
|
||||
chunking=ChunkingConfig(max_chars=1000, max_overlap=100),
|
||||
token_reduction=TokenReductionConfig(mode="light"),
|
||||
language_detection=LanguageDetectionConfig(
|
||||
enabled=True, detect_multiple=True
|
||||
),
|
||||
use_cache=True,
|
||||
enable_quality_processing=True,
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
for chunk in result.chunks or []:
|
||||
print(f"Chunk: {chunk.content[:100]}")
|
||||
|
||||
if result.detected_languages:
|
||||
print(f"Languages: {result.detected_languages}")
|
||||
```
|
||||
18
docs/snippets/python/advanced/chunk_page_mapping.md
Normal file
18
docs/snippets/python/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
chunking=ChunkingConfig(max_characters=500, overlap=50),
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
if result.chunks:
|
||||
for chunk in result.chunks:
|
||||
first = chunk.metadata.first_page
|
||||
last = chunk.metadata.last_page
|
||||
if first is None:
|
||||
continue
|
||||
page_range = f"Page {first}" if first == last else f"Pages {first}-{last}"
|
||||
print(f"Chunk: {chunk.content[:50]}... ({page_range})")
|
||||
```
|
||||
59
docs/snippets/python/advanced/chunking_config.md
Normal file
59
docs/snippets/python/advanced/chunking_config.md
Normal file
@@ -0,0 +1,59 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
max_characters=1000,
|
||||
overlap=200,
|
||||
)
|
||||
)
|
||||
result = await extract_file("document.pdf", config=config)
|
||||
for chunk in result.chunks or []:
|
||||
print(f"Length: {len(chunk.content)}")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
```python title="Python - Semantic"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
chunking=ChunkingConfig(chunker_type="semantic")
|
||||
)
|
||||
result = await extract_file("document.pdf", config=config)
|
||||
for chunk in result.chunks or []:
|
||||
print(f"Content: {chunk.content[:100]}...")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
```python title="Python - Prepend Heading Context"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
chunker_type="markdown",
|
||||
max_characters=500,
|
||||
overlap=50,
|
||||
prepend_heading_context=True,
|
||||
)
|
||||
)
|
||||
result = await extract_file("document.md", config=config)
|
||||
for chunk in result.chunks or []:
|
||||
# Each chunk's content is prefixed with its heading breadcrumb
|
||||
print(f"Content: {chunk.content[:100]}...")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
39
docs/snippets/python/advanced/chunking_rag.md
Normal file
39
docs/snippets/python/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
max_characters=500,
|
||||
overlap=50,
|
||||
embedding=EmbeddingConfig(
|
||||
model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
|
||||
normalize=True,
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
result = await extract_file("research_paper.pdf", config=config)
|
||||
|
||||
for chunk in result.chunks or []:
|
||||
print(
|
||||
f"Chunk {chunk.metadata.chunk_index + 1}/{chunk.metadata.total_chunks}"
|
||||
)
|
||||
print(
|
||||
f"Position: {chunk.metadata.byte_start}-{chunk.metadata.byte_end}"
|
||||
)
|
||||
print(f"Content: {chunk.content[:100]}...")
|
||||
if chunk.embedding is not None:
|
||||
print(f"Embedding: {len(chunk.embedding)} dimensions")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
34
docs/snippets/python/advanced/combining_all_features.md
Normal file
34
docs/snippets/python/advanced/combining_all_features.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
extract_file,
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
LanguageDetectionConfig,
|
||||
TokenReductionConfig,
|
||||
)
|
||||
|
||||
async def main() -> None:
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
enable_quality_processing=True,
|
||||
language_detection=LanguageDetectionConfig(enabled=True),
|
||||
token_reduction=TokenReductionConfig(mode="moderate"),
|
||||
chunking=ChunkingConfig(
|
||||
max_chars=512,
|
||||
max_overlap=50,
|
||||
embedding=EmbeddingConfig(
|
||||
model=EmbeddingModelType.preset("balanced"), normalize=True
|
||||
),
|
||||
),
|
||||
)
|
||||
result = await extract_file("document.pdf", config=config)
|
||||
quality = result.quality_score or 0
|
||||
print(f"Quality: {quality:.2f}")
|
||||
print(f"Languages: {result.detected_languages}")
|
||||
if result.chunks:
|
||||
print(f"Chunks: {len(result.chunks)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
36
docs/snippets/python/advanced/complete_example.md
Normal file
36
docs/snippets/python/advanced/complete_example.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
extract_file,
|
||||
ExtractionConfig,
|
||||
OcrConfig,
|
||||
TesseractConfig,
|
||||
PdfConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
)
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with caching, OCR, PDF image extraction and embedded chunks.

    Prints the first 100 characters of the extracted content.
    """
    config: ExtractionConfig = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True,
        ocr=OcrConfig(
            backend="tesseract",
            language="eng+fra",
            tesseract_config=TesseractConfig(psm=3),
        ),
        pdf_options=PdfConfig(extract_images=True),
        chunking=ChunkingConfig(
            max_chars=1000,
            max_overlap=200,
            embedding=EmbeddingConfig(
                # preset() is given a preset *name* everywhere else in these
                # snippets ("balanced"); a raw model id is not a preset name.
                # NOTE(review): "fast" is presumed to be the preset backed by
                # all-MiniLM-L6-v2 - confirm against the embedding preset list.
                model=EmbeddingModelType.preset("fast")
            ),
        ),
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")
|
||||
print(f"Content: {result.content[:100]}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
21
docs/snippets/python/advanced/embedding_with_chunking.md
Normal file
21
docs/snippets/python/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
)
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
max_characters=1024,
|
||||
overlap=100,
|
||||
embedding=EmbeddingConfig(
|
||||
model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
|
||||
normalize=True,
|
||||
batch_size=32,
|
||||
show_download_progress=False,
|
||||
),
|
||||
)
|
||||
)
|
||||
```
|
||||
13
docs/snippets/python/advanced/keyword_extraction_config.md
Normal file
13
docs/snippets/python/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig, KeywordConfig, KeywordAlgorithm
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
keywords=KeywordConfig(
|
||||
algorithm=KeywordAlgorithm.YAKE,
|
||||
max_keywords=10,
|
||||
min_score=0.3,
|
||||
ngram_range=[1, 3],
|
||||
language="en",
|
||||
)
|
||||
)
|
||||
```
|
||||
27
docs/snippets/python/advanced/keyword_extraction_example.md
Normal file
27
docs/snippets/python/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
KeywordConfig,
|
||||
KeywordAlgorithm,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
keywords=KeywordConfig(
|
||||
algorithm=KeywordAlgorithm.YAKE,
|
||||
max_keywords=10,
|
||||
min_score=0.3,
|
||||
)
|
||||
)
|
||||
|
||||
result = await extract_file("research_paper.pdf", config=config)
|
||||
|
||||
for keyword in result.extracted_keywords or []:
|
||||
print(f"{keyword.text}: {keyword.score:.3f}")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
11
docs/snippets/python/advanced/language_detection_config.md
Normal file
11
docs/snippets/python/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig, LanguageDetectionConfig
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
language_detection=LanguageDetectionConfig(
|
||||
enabled=True,
|
||||
min_confidence=0.8,
|
||||
detect_multiple=False,
|
||||
)
|
||||
)
|
||||
```
|
||||
@@ -0,0 +1,21 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
language_detection=LanguageDetectionConfig(
|
||||
enabled=True,
|
||||
min_confidence=0.8,
|
||||
detect_multiple=True,
|
||||
)
|
||||
)
|
||||
|
||||
result = await extract_file("multilingual_document.pdf", config=config)
|
||||
|
||||
print(f"Detected languages: {result.detected_languages}")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
@@ -0,0 +1,7 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
enable_quality_processing=True,
|
||||
)
|
||||
```
|
||||
21
docs/snippets/python/advanced/quality_processing_example.md
Normal file
21
docs/snippets/python/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, extract_file
|
||||
|
||||
|
||||
async def main() -> None:
    """Extract a scanned PDF with quality processing on and report its quality score.

    Nothing is printed when the result carries no score.
    """
    settings: ExtractionConfig = ExtractionConfig(enable_quality_processing=True)
    result = await extract_file("scanned_document.pdf", config=settings)

    score = result.quality_score
    if score is None:
        return

    # Scores below 0.5 are reported as a warning instead of a plain score line.
    if score >= 0.5:
        print(f"Quality score: {score:.2f}")
    else:
        print(f"Warning: Low quality extraction ({score:.2f})")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
12
docs/snippets/python/advanced/token_reduction_config.md
Normal file
12
docs/snippets/python/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig, TokenReductionConfig, ReductionLevel
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
token_reduction=TokenReductionConfig(
|
||||
level=ReductionLevel.MODERATE,
|
||||
preserve_markdown=True,
|
||||
preserve_code=True,
|
||||
language_hint="eng",
|
||||
)
|
||||
)
|
||||
```
|
||||
25
docs/snippets/python/advanced/token_reduction_example.md
Normal file
25
docs/snippets/python/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
TokenReductionConfig,
|
||||
ReductionLevel,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
token_reduction=TokenReductionConfig(
|
||||
level=ReductionLevel.MODERATE,
|
||||
preserve_markdown=True,
|
||||
)
|
||||
)
|
||||
|
||||
result = await extract_file("verbose_document.pdf", config=config)
|
||||
|
||||
print(f"Reduced content length: {len(result.content)} chars")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
59
docs/snippets/python/advanced/vector_database_integration.md
Normal file
59
docs/snippets/python/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,59 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from dataclasses import dataclass, field
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class VectorRecord:
    """Pairs a chunk's text with its embedding vector and string metadata."""

    # Record identifier.
    id: str
    # Text of the chunk.
    content: str
    # Embedding vector computed for ``content``.
    embedding: list[float]
    # String key/value attributes; each instance gets its own fresh dict.
    metadata: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
async def extract_and_vectorize(
    document_path: str,
    document_id: str,
) -> list[VectorRecord]:
    """Extract a document into embedded chunks and wrap each one in a ``VectorRecord``.

    Args:
        document_path: Path handed to ``extract_file``.
        document_id: Prefix for record ids, also stored in each record's metadata.

    Returns:
        One record per chunk that has an embedding; chunks whose embedding
        is ``None`` are left out, but still consume an index position.
    """
    embedding_settings = EmbeddingConfig(
        model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
        normalize=True,
        batch_size=32,
    )
    chunk_settings = ChunkingConfig(
        max_characters=512,
        overlap=50,
        embedding=embedding_settings,
    )
    settings: ExtractionConfig = ExtractionConfig(chunking=chunk_settings)

    extraction = await extract_file(document_path, config=settings)

    return [
        VectorRecord(
            id=f"{document_id}_chunk_{position}",
            content=piece.content,
            embedding=piece.embedding,
            metadata={
                "document_id": document_id,
                "chunk_index": str(position),
                "content_length": str(len(piece.content)),
            },
        )
        for position, piece in enumerate(extraction.chunks or [])
        if piece.embedding is not None
    ]
|
||||
|
||||
|
||||
asyncio.run(extract_and_vectorize("document.pdf", "doc_001"))
|
||||
```
|
||||
13
docs/snippets/python/api/batch_extract_bytes_sync.md
Normal file
13
docs/snippets/python/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import batch_extract_bytes_sync, BatchBytesItem, ExtractionConfig
|
||||
|
||||
items = [
|
||||
BatchBytesItem(content=b"PDF content", mime_type="application/pdf"),
|
||||
BatchBytesItem(content=b"<html>...</html>", mime_type="text/html"),
|
||||
]
|
||||
|
||||
results = batch_extract_bytes_sync(items, ExtractionConfig())
|
||||
|
||||
for i, result in enumerate(results):
|
||||
print(f"Item {i}: {len(result.content)} chars extracted")
|
||||
```
|
||||
14
docs/snippets/python/api/batch_extract_files_sync.md
Normal file
14
docs/snippets/python/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import batch_extract_files_sync, BatchFileItem, ExtractionConfig
|
||||
|
||||
items = [
|
||||
BatchFileItem(path="doc1.pdf"),
|
||||
BatchFileItem(path="doc2.docx"),
|
||||
BatchFileItem(path="doc3.html"),
|
||||
]
|
||||
|
||||
results = batch_extract_files_sync(items, ExtractionConfig())
|
||||
|
||||
for i, result in enumerate(results):
|
||||
print(f"Document {i}: {len(result.content)} chars, {len(result.tables)} tables")
|
||||
```
|
||||
29
docs/snippets/python/api/client_chunk_text.md
Normal file
29
docs/snippets/python/api/client_chunk_text.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
async def main() -> None:
    """POST a text payload to the local ``/chunk`` endpoint and print a preview of each chunk.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    payload = {
        "text": "Your long text content here...",
        "chunker_type": "text",
        "config": {
            "max_characters": 1000,
            "overlap": 50,
            "trim": True,
        },
    }

    async with httpx.AsyncClient() as client:
        response = await client.post("http://localhost:8000/chunk", json=payload)
        # Surface HTTP errors here; otherwise an error body would only show up
        # as a KeyError on "chunk_count" below.
        response.raise_for_status()
        result = response.json()

    print(f"Created {result['chunk_count']} chunks")
    for chunk in result["chunks"]:
        preview = chunk["content"][:50]
        print(f"Chunk {chunk['chunk_index']}: {preview}...")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
19
docs/snippets/python/api/client_extract_single_file.md
Normal file
19
docs/snippets/python/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
async def main() -> None:
    """Upload ``document.pdf`` to the local ``/extract`` endpoint and pretty-print the JSON reply.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    # open() returns a synchronous context manager, so it cannot be an item of
    # ``async with``; it gets its own plain ``with`` around the request.
    with open("document.pdf", "rb") as f:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "http://localhost:8000/extract",
                files={"files": f},
            )

    # Do not print an error payload as if it were an extraction result.
    response.raise_for_status()
    data = response.json()
    print(json.dumps(data, indent=2))
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
52
docs/snippets/python/api/combining_all_features.md
Normal file
52
docs/snippets/python/api/combining_all_features.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
OcrConfig,
|
||||
ChunkingConfig,
|
||||
ChunkerType,
|
||||
ImageExtractionConfig,
|
||||
OutputFormat,
|
||||
extract_file_sync,
|
||||
)
|
||||
|
||||
config = ExtractionConfig(
|
||||
# OCR: extract text from images, fallback to Tesseract
|
||||
ocr=OcrConfig(
|
||||
enabled=True,
|
||||
backend="tesseract",
|
||||
language="eng",
|
||||
),
|
||||
# Chunking: semantic markdown chunks of ~800 chars, 100-char overlap
|
||||
chunking=ChunkingConfig(
|
||||
max_characters=800,
|
||||
overlap=100,
|
||||
chunker_type=ChunkerType.Markdown,
|
||||
prepend_heading_context=True,
|
||||
),
|
||||
# Output: Markdown format with document structure preserved
|
||||
output_format=OutputFormat.Markdown,
|
||||
include_document_structure=True,
|
||||
# Images: extract embedded images
|
||||
images=ImageExtractionConfig(
|
||||
extract_images=True,
|
||||
),
|
||||
# Cache extracted results on disk
|
||||
use_cache=True,
|
||||
)
|
||||
|
||||
result = extract_file_sync("report.pdf", config=config)
|
||||
|
||||
print(f"Content ({len(result.content)} chars):")
|
||||
print(result.content[:200])
|
||||
|
||||
if result.chunks:
|
||||
print(f"\nChunks: {len(result.chunks)}")
|
||||
|
||||
print(f"Tables: {len(result.tables)}")
|
||||
|
||||
if result.detected_languages:
|
||||
print(f"Languages: {result.detected_languages}")
|
||||
|
||||
if result.extraction_method:
|
||||
print(f"Extraction method: {result.extraction_method}")
|
||||
```
|
||||
11
docs/snippets/python/api/error_handling.md
Normal file
11
docs/snippets/python/api/error_handling.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, KreuzbergError
|
||||
|
||||
config = ExtractionConfig()
|
||||
|
||||
try:
|
||||
result = extract_file_sync("missing.pdf", config=config)
|
||||
except KreuzbergError as e:
|
||||
print(f"Extraction failed: {e}")
|
||||
raise
|
||||
```
|
||||
27
docs/snippets/python/api/error_handling_extract.md
Normal file
27
docs/snippets/python/api/error_handling_extract.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
batch_extract_files_sync,
|
||||
BatchFileItem,
|
||||
ExtractionConfig,
|
||||
KreuzbergError,
|
||||
)
|
||||
|
||||
items = [
|
||||
BatchFileItem(path="doc1.pdf"),
|
||||
BatchFileItem(path="doc2.docx"),
|
||||
BatchFileItem(path="missing.html"),
|
||||
]
|
||||
|
||||
config = ExtractionConfig()
|
||||
|
||||
try:
|
||||
results = batch_extract_files_sync(items, config=config)
|
||||
for i, result in enumerate(results):
|
||||
if result.metadata.error:
|
||||
print(f"Document {i}: ERROR - {result.metadata.error}")
|
||||
else:
|
||||
print(f"Document {i}: {len(result.content)} chars, {len(result.tables)} tables")
|
||||
except KreuzbergError as e:
|
||||
print(f"Batch extraction failed: {e}")
|
||||
raise
|
||||
```
|
||||
14
docs/snippets/python/api/extract_bytes_async.md
Normal file
14
docs/snippets/python/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_bytes, ExtractionConfig
|
||||
|
||||
async def main() -> None:
|
||||
with open("document.pdf", "rb") as f:
|
||||
content = f.read()
|
||||
|
||||
result = await extract_bytes(content, "application/pdf", config=ExtractionConfig())
|
||||
print(result.content[:200])
|
||||
print(f"Tables: {len(result.tables)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
11
docs/snippets/python/api/extract_bytes_sync.md
Normal file
11
docs/snippets/python/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_bytes_sync, ExtractionConfig
|
||||
|
||||
with open("document.pdf", "rb") as f:
|
||||
content = f.read()
|
||||
|
||||
result = extract_bytes_sync(content, "application/pdf", config=ExtractionConfig())
|
||||
|
||||
print(result.content[:200])
|
||||
print(f"Tables: {len(result.tables)}")
|
||||
```
|
||||
12
docs/snippets/python/api/extract_file_async.md
Normal file
12
docs/snippets/python/api/extract_file_async.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig
|
||||
|
||||
async def main() -> None:
|
||||
result = await extract_file("document.pdf", config=ExtractionConfig())
|
||||
print(result.content[:200])
|
||||
print(f"Tables: {len(result.tables)}")
|
||||
print(f"Format: {result.metadata.format_type}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
9
docs/snippets/python/api/extract_file_sync.md
Normal file
9
docs/snippets/python/api/extract_file_sync.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig
|
||||
|
||||
result = extract_file_sync("document.pdf", config=ExtractionConfig())
|
||||
|
||||
print(result.content[:200])
|
||||
print(f"Tables: {len(result.tables)}")
|
||||
print(f"Format: {result.metadata.format_type}")
|
||||
```
|
||||
51
docs/snippets/python/benchmarking/simple_benchmark.py
Normal file
51
docs/snippets/python/benchmarking/simple_benchmark.py
Normal file
@@ -0,0 +1,51 @@
|
||||
```python title="simple_benchmark.py"
|
||||
import time
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from kreuzberg import Kreuzberg, ExtractionConfig
|
||||
|
||||
async def benchmark_extractions() -> None:
    """Time sequential, concurrent and cached extraction of ``document.pdf`` and print the results.

    Three phases run in order:
      1. ``num_runs`` back-to-back ``extract_file`` calls with caching off.
      2. ``num_runs`` ``extract_file_async`` calls awaited together via ``asyncio.gather``.
      3. Two ``extract_file_async`` calls on a cache-enabled instance, to
         compare the first call with the repeat call.
    """
    config = ExtractionConfig(use_cache=False)
    kreuzberg = Kreuzberg(config)
    file_path = "document.pdf"
    num_runs = 10

    # Phase 1: sequential calls. Only the elapsed time matters, so the
    # extraction results are deliberately discarded.
    start = time.perf_counter()
    for _ in range(num_runs):
        kreuzberg.extract_file(file_path)
    sync_duration = time.perf_counter() - start
    avg_sync = sync_duration / num_runs

    print(f"Sync extraction ({num_runs} runs):")
    print(f" - Total time: {sync_duration:.3f}s")
    print(f" - Average: {avg_sync:.3f}s per extraction")

    # Phase 2: the same number of extractions, awaited together.
    start = time.perf_counter()
    tasks = [kreuzberg.extract_file_async(file_path) for _ in range(num_runs)]
    await asyncio.gather(*tasks)
    async_duration = time.perf_counter() - start

    print(f"\nAsync extraction ({num_runs} parallel runs):")
    print(f" - Total time: {async_duration:.3f}s")
    print(f" - Average: {async_duration / num_runs:.3f}s per extraction")
    print(f" - Speedup: {sync_duration / async_duration:.1f}x")

    # Phase 3: first call vs. repeat call on a cache-enabled instance.
    cache_config = ExtractionConfig(use_cache=True)
    kreuzberg_cached = Kreuzberg(cache_config)

    print("\nFirst extraction (populates cache)...")
    start = time.perf_counter()
    await kreuzberg_cached.extract_file_async(file_path)
    first_duration = time.perf_counter() - start
    print(f" - Time: {first_duration:.3f}s")

    print("Second extraction (from cache)...")
    start = time.perf_counter()
    await kreuzberg_cached.extract_file_async(file_path)
    cached_duration = time.perf_counter() - start
    print(f" - Time: {cached_duration:.3f}s")
    print(f" - Cache speedup: {first_duration / cached_duration:.1f}x")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(benchmark_extractions())
|
||||
```
|
||||
37
docs/snippets/python/cache/disk_cache.py
vendored
Normal file
37
docs/snippets/python/cache/disk_cache.py
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
```python title="disk_cache.py"
|
||||
from pathlib import Path
|
||||
from kreuzberg import Kreuzberg, ExtractionConfig, CacheConfig
|
||||
|
||||
cache_dir = Path.home() / ".cache" / "kreuzberg"
|
||||
cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
config = ExtractionConfig(
|
||||
use_cache=True,
|
||||
cache_config=CacheConfig(
|
||||
cache_path=str(cache_dir),
|
||||
max_cache_size=500 * 1024 * 1024,
|
||||
cache_ttl_seconds=7 * 86400,
|
||||
enable_compression=True,
|
||||
),
|
||||
)
|
||||
|
||||
kreuzberg = Kreuzberg(config)
|
||||
|
||||
print("First extraction (will be cached)...")
|
||||
result1 = kreuzberg.extract_file("document.pdf")
|
||||
print(f" - Content length: {len(result1.content)}")
|
||||
print(f" - Cached: {result1.metadata.get('was_cached', False)}")
|
||||
|
||||
print("\nSecond extraction (from cache)...")
|
||||
result2 = kreuzberg.extract_file("document.pdf")
|
||||
print(f" - Content length: {len(result2.content)}")
|
||||
print(f" - Cached: {result2.metadata.get('was_cached', False)}")
|
||||
|
||||
print(f"\nResults are identical: {result1.content == result2.content}")
|
||||
|
||||
cache_stats = kreuzberg.get_cache_stats()
|
||||
print(f"\nCache Statistics:")
|
||||
print(f" - Total entries: {cache_stats.get('total_entries', 0)}")
|
||||
print(f" - Cache size: {cache_stats.get('cache_size_bytes', 0) / 1024 / 1024:.1f} MB")
|
||||
print(f" - Hit rate: {cache_stats.get('hit_rate', 0):.1%}")
|
||||
```
|
||||
28
docs/snippets/python/cli/basic_cli.py
Normal file
28
docs/snippets/python/cli/basic_cli.py
Normal file
@@ -0,0 +1,28 @@
|
||||
```python title="basic_cli.py"
|
||||
import subprocess
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
def extract_with_cli(file_path: str, output_format: str = "text") -> str:
|
||||
result = subprocess.run(
|
||||
["kreuzberg", "extract", file_path, "--format", output_format],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
print(f"Error: {result.stderr}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
if output_format == "json":
|
||||
return json.loads(result.stdout)
|
||||
return result.stdout
|
||||
|
||||
document = "document.pdf"
|
||||
text_output = extract_with_cli(document, "text")
|
||||
print(f"Extracted: {len(text_output)} characters")
|
||||
|
||||
json_output = extract_with_cli(document, "json")
|
||||
print(f"Format: {json_output.get('format', 'unknown')}")
|
||||
```
|
||||
29
docs/snippets/python/cli/cli_with_config.py
Normal file
29
docs/snippets/python/cli/cli_with_config.py
Normal file
@@ -0,0 +1,29 @@
|
||||
```python title="cli_with_config.py"
|
||||
import subprocess
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
def extract_with_config(file_path: str, config_path: str) -> dict:
    """Run ``kreuzberg extract`` with a config file and return the parsed JSON output.

    Args:
        file_path: Document path passed to the CLI.
        config_path: Path given to the CLI's ``--config`` option.

    Returns:
        The JSON document the CLI printed on stdout, decoded.

    If the CLI exits with a non-zero status, its stderr is echoed to this
    process's stderr and the process exits with status 1.
    """
    command = [
        "kreuzberg",
        "extract",
        file_path,
        "--config",
        config_path,
        "--format",
        "json",
    ]
    completed = subprocess.run(command, capture_output=True, text=True)

    # Success path first; anything else is reported and ends the process.
    if completed.returncode == 0:
        return json.loads(completed.stdout)

    print(f"Error: {completed.stderr}", file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
config_file = Path("kreuzberg.toml")
|
||||
document = "document.pdf"
|
||||
|
||||
print(f"Extracting {document} with config {config_file}")
|
||||
result = extract_with_config(str(document), str(config_file))
|
||||
|
||||
print(f"Content length: {len(result.get('content', ''))}")
|
||||
print(f"Format: {result.get('format')}")
|
||||
print(f"Languages: {result.get('languages')}")
|
||||
```
|
||||
60
docs/snippets/python/config/advanced_config.md
Normal file
60
docs/snippets/python/config/advanced_config.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
extract_file,
|
||||
ExtractionConfig,
|
||||
OcrConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
LanguageDetectionConfig,
|
||||
TokenReductionOptions,
|
||||
PostProcessorConfig,
|
||||
KeywordConfig,
|
||||
)
|
||||
|
||||
async def main() -> None:
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
use_cache=True,
|
||||
enable_quality_processing=True,
|
||||
ocr=OcrConfig(
|
||||
backend="tesseract",
|
||||
language="eng",
|
||||
),
|
||||
chunking=ChunkingConfig(
|
||||
max_characters=1000,
|
||||
overlap=200,
|
||||
embedding=EmbeddingConfig(
|
||||
model=EmbeddingModelType.preset("balanced"),
|
||||
batch_size=32,
|
||||
normalize=True,
|
||||
),
|
||||
),
|
||||
language_detection=LanguageDetectionConfig(
|
||||
enabled=True,
|
||||
min_confidence=0.8,
|
||||
detect_multiple=False,
|
||||
),
|
||||
keywords=KeywordConfig(
|
||||
algorithm="yake",
|
||||
max_keywords=10,
|
||||
min_score=0.1,
|
||||
ngram_range=[1, 3],
|
||||
language="en",
|
||||
),
|
||||
token_reduction=TokenReductionOptions(
|
||||
mode="moderate",
|
||||
preserve_important_words=True,
|
||||
),
|
||||
postprocessor=PostProcessorConfig(enabled=True),
|
||||
)
|
||||
|
||||
result = await extract_file("document.pdf", config=config)
|
||||
print(f"Content: {result.content[:100]}")
|
||||
if result.detected_languages:
|
||||
print(f"Languages: {result.detected_languages}")
|
||||
if result.chunks:
|
||||
print(f"Chunks: {len(result.chunks)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
80
docs/snippets/python/config/chunking_config.md
Normal file
80
docs/snippets/python/config/chunking_config.md
Normal file
@@ -0,0 +1,80 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
async def main() -> None:
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
max_chars=1000,
|
||||
max_overlap=200,
|
||||
)
|
||||
)
|
||||
result = await extract_file("document.pdf", config=config)
|
||||
print(f"Chunks: {len(result.chunks or [])}")
|
||||
for chunk in result.chunks or []:
|
||||
print(f"Length: {len(chunk.content)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
```python title="Python - Markdown with Heading Context"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
async def main() -> None:
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
chunker_type="markdown",
|
||||
max_chars=500,
|
||||
max_overlap=50,
|
||||
sizing_type="tokenizer",
|
||||
sizing_model="Xenova/gpt-4o",
|
||||
)
|
||||
)
|
||||
result = await extract_file("document.md", config=config)
|
||||
for chunk in result.chunks or []:
|
||||
heading_context = chunk.metadata.get("heading_context")
|
||||
if heading_context:
|
||||
headings = heading_context.get("headings", [])
|
||||
for h in headings:
|
||||
print(f"Heading L{h['level']}: {h['text']}")
|
||||
print(f"Content: {chunk.content[:100]}...")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
```python title="Python - Semantic"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Chunk ``document.pdf`` with the semantic chunker and preview each chunk."""
    semantic_chunking = ChunkingConfig(chunker_type="semantic")
    config: ExtractionConfig = ExtractionConfig(chunking=semantic_chunking)
    result = await extract_file("document.pdf", config=config)

    for piece in result.chunks or []:
        preview = piece.content[:100]
        print(f"Content: {preview}...")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
```python title="Python - Prepend Heading Context"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Chunk ``document.md`` with ``prepend_heading_context`` enabled."""
    chunking = ChunkingConfig(
        chunker_type="markdown",
        max_chars=500,
        max_overlap=50,
        prepend_heading_context=True,
    )
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)
    result = await extract_file("document.md", config=config)

    for piece in result.chunks or []:
        # The heading breadcrumb is prefixed to each chunk's content.
        preview = piece.content[:100]
        print(f"Content: {preview}...")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
14
docs/snippets/python/config/config_basic.md
Normal file
14
docs/snippets/python/config/config_basic.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with caching and quality processing enabled."""
    options = {"use_cache": True, "enable_quality_processing": True}
    config = ExtractionConfig(**options)
    extracted = await extract_file("document.pdf", config=config)
    print(extracted.content)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
16
docs/snippets/python/config/config_discover.md
Normal file
16
docs/snippets/python/config/config_discover.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with a default config and print a short preview."""
    config: ExtractionConfig = ExtractionConfig()
    extracted = await extract_file("document.pdf", config=config)

    text: str = extracted.content

    # Show the first 100 characters followed by the overall size.
    print(f"Content preview: {text[:100]}")
    print(f"Total length: {len(text)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
16
docs/snippets/python/config/config_ocr.md
Normal file
16
docs/snippets/python/config/config_ocr.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` using Tesseract OCR for English and French."""
    tesseract = TesseractConfig(psm=3)
    ocr = OcrConfig(
        backend="tesseract",
        language="eng+fra",
        tesseract_config=tesseract,
    )
    config: ExtractionConfig = ExtractionConfig(ocr=ocr)
    extracted = await extract_file("document.pdf", config=config)
    print(extracted.content)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
18
docs/snippets/python/config/config_programmatic.md
Normal file
18
docs/snippets/python/config/config_programmatic.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
extract_file_sync,
|
||||
ExtractionConfig,
|
||||
OcrConfig,
|
||||
ChunkingConfig,
|
||||
)
|
||||
|
||||
# Build the configuration in code: caching, Tesseract OCR and chunking.
config: ExtractionConfig = ExtractionConfig(
    use_cache=True,
    ocr=OcrConfig(
        backend="tesseract",
        language="eng",
    ),
    chunking=ChunkingConfig(
        max_chars=1000,
        max_overlap=200,
    ),
)

result = extract_file_sync("document.pdf", config=config)

# Report how much text was extracted.
content_length: int = len(result.content)
print(f"Content length: {content_length}")
|
||||
```
|
||||
15
docs/snippets/python/config/document_structure_config.md
Normal file
15
docs/snippets/python/config/document_structure_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```python title="Document Structure Config (Python)"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig
|
||||
|
||||
# Request the structured document tree alongside the extracted text
config = ExtractionConfig(include_document_structure=True)

result = extract_file_sync("document.pdf", config=config)

# Print each node's type with the first 80 characters of its text
if result.document:
    for node in result.document["nodes"]:
        node_content = node["content"]
        node_type = node_content["node_type"]
        text = node_content.get("text", "")
        print(f"[{node_type}] {text[:80]}")
|
||||
```
|
||||
29
docs/snippets/python/config/element_based_output.md
Normal file
29
docs/snippets/python/config/element_based_output.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```python title="Element-Based Output (Python)"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig
|
||||
|
||||
# Ask for element-based results
config = ExtractionConfig(result_format="element_based")

# Run the extraction
result = extract_file_sync("document.pdf", config=config)

# Print type, text preview and optional location details for every element
for element in result.elements:
    print(f"Type: {element.element_type}")
    print(f"Text: {element.text[:100]}")

    element_meta = element.metadata
    if element_meta.page_number:
        print(f"Page: {element_meta.page_number}")

    coords = element_meta.coordinates
    if coords:
        print(f"Coords: ({coords.left}, {coords.top}) - ({coords.right}, {coords.bottom})")

    print("---")

# Keep only title elements and print each with its "level" entry
titles = [candidate for candidate in result.elements if candidate.element_type == "title"]
for title in titles:
    level = title.metadata.additional.get("level", "unknown")
    print(f"[{level}] {title.text}")
|
||||
```
|
||||
15
docs/snippets/python/config/embedding_config.md
Normal file
15
docs/snippets/python/config/embedding_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType
|
||||
|
||||
# Chunk the document and attach embeddings to each chunk.
config = ExtractionConfig(
    chunking=ChunkingConfig(
        max_chars=1000,
        embedding=EmbeddingConfig(
            # preset() takes a preset name ("fast", "balanced", "quality",
            # "multilingual"), not a model id; "balanced" is the 768-dim preset.
            model=EmbeddingModelType.preset("balanced"),
            batch_size=16,
            normalize=True,
            show_download_progress=True,
        ),
    )
)
|
||||
```
|
||||
83
docs/snippets/python/config/embedding_config.py
Normal file
83
docs/snippets/python/config/embedding_config.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from kreuzberg import EmbeddingConfig, EmbeddingModelType, ChunkingConfig, ExtractionConfig
|
||||
|
||||
# Example 1: Preset model (recommended)
# Pick one of the named presets instead of specifying a model by hand.
embedding_config = EmbeddingConfig(
    cache_dir="~/.cache/kreuzberg/embeddings",
    show_download_progress=True,
    normalize=True,
    batch_size=32,
    model=EmbeddingModelType.preset("balanced"),
)
|
||||
|
||||
# Available presets:
|
||||
# - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||||
# - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||||
# - "quality" (1024 dims): Complex documents, maximum accuracy
|
||||
# - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
|
||||
# Example 2: Custom ONNX model (requires embeddings feature)
# Name a specific HuggingFace ONNX model and state its vector dimensions.
embedding_config = EmbeddingConfig(
    model=EmbeddingModelType.custom(model_id="BAAI/bge-small-en-v1.5", dimensions=384),
    batch_size=32,
    normalize=True,
    show_download_progress=True,
    cache_dir=None,  # Falls back to the default: .kreuzberg/embeddings/
)
|
||||
|
||||
# Popular ONNX-compatible models:
|
||||
# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||||
# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||||
# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||||
# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
|
||||
# Example 3: Alternative Custom Model
# Same pattern as above with a different ONNX model and cache location.
embedding_config = EmbeddingConfig(
    model=EmbeddingModelType.custom(
        model_id="sentence-transformers/all-mpnet-base-v2", dimensions=768
    ),
    batch_size=16,  # Smaller batches for the larger model
    normalize=True,
    show_download_progress=True,
    cache_dir="/var/cache/embeddings",
)
|
||||
|
||||
|
||||
# Integration with ChunkingConfig
# Pass an EmbeddingConfig to the chunker so chunks carry embeddings.
chunking_with_embeddings = ChunkingConfig(
    max_chars=1024,
    max_overlap=100,
    preset="balanced",
    embedding=EmbeddingConfig(),  # Uses balanced preset
)

extraction_config = ExtractionConfig(chunking=chunking_with_embeddings)
|
||||
|
||||
|
||||
# Key parameter explanations:
|
||||
#
|
||||
# batch_size: Number of texts to embed at once (32-128 typical)
|
||||
# - Larger batches are faster but use more memory
|
||||
# - Smaller batches for resource-constrained environments
|
||||
#
|
||||
# normalize: Whether to normalize vectors (L2 norm)
|
||||
# - True (recommended): Enables cosine similarity in vector DBs
|
||||
# - False: Raw embedding values
|
||||
#
|
||||
# cache_dir: Where to store downloaded models
|
||||
# - None: Uses .kreuzberg/embeddings/ in current directory
|
||||
# - String path: Custom directory for model storage
|
||||
#
|
||||
# show_download_progress: Display download progress bar
|
||||
# - Useful for monitoring large model downloads
|
||||
72
docs/snippets/python/config/hierarchy_config.py
Normal file
72
docs/snippets/python/config/hierarchy_config.py
Normal file
@@ -0,0 +1,72 @@
|
||||
from kreuzberg import PdfConfig, HierarchyConfig, ExtractionConfig, extract_file_sync
|
||||
|
||||
# Example 1: Basic hierarchy extraction
# Hierarchy enabled with k_clusters=6 and bounding boxes included;
# no OCR coverage threshold is set.
hierarchy_config_basic = HierarchyConfig(
    enabled=True,
    k_clusters=6,  # Six font size clusters
    include_bbox=True,  # Keep bounding box coordinates
    ocr_coverage_threshold=None,  # No coverage-based OCR trigger
)

# Wrap the hierarchy settings in the PDF options of the extraction config.
pdf_config_basic = PdfConfig(hierarchy=hierarchy_config_basic)
extraction_config_basic = ExtractionConfig(pdf_options=pdf_config_basic)

result = extract_file_sync("document.pdf", config=extraction_config_basic)
|
||||
|
||||
|
||||
# Example 2: Custom k_clusters for minimal structure
# Three clusters give a simpler hierarchy, for when only the major
# section divisions matter (Main, Subsection, Detail).
hierarchy_config_minimal = HierarchyConfig(
    enabled=True,
    k_clusters=3,  # Only three levels
    include_bbox=True,
    ocr_coverage_threshold=None,
)

pdf_config_minimal = PdfConfig(hierarchy=hierarchy_config_minimal)
extraction_config_minimal = ExtractionConfig(pdf_options=pdf_config_minimal)

result = extract_file_sync("document.pdf", config=extraction_config_minimal)
|
||||
|
||||
|
||||
# Example 3: With OCR coverage threshold
# Sets a 0.5 coverage threshold for triggering OCR, aimed at documents
# that mix digital and scanned content.
hierarchy_config_ocr = HierarchyConfig(
    enabled=True,
    k_clusters=6,
    include_bbox=True,
    ocr_coverage_threshold=0.5,  # OCR kicks in below 50% text coverage
)

pdf_config_ocr = PdfConfig(hierarchy=hierarchy_config_ocr)
extraction_config_ocr = ExtractionConfig(pdf_options=pdf_config_ocr)

result = extract_file_sync("document.pdf", config=extraction_config_ocr)
|
||||
|
||||
|
||||
# Field descriptions:
|
||||
#
|
||||
# enabled: bool (default: True)
|
||||
# - Enable or disable hierarchy extraction
|
||||
# - When False, hierarchy structure is not analyzed
|
||||
#
|
||||
# k_clusters: int (default: 6, valid: 1-7)
|
||||
# - Number of font size clusters for hierarchy levels
|
||||
# - 6 provides H1-H6 heading levels with body text
|
||||
# - Higher values create more fine-grained hierarchy
|
||||
# - Lower values create simpler structure
|
||||
#
|
||||
# include_bbox: bool (default: True)
|
||||
# - Include bounding box coordinates in hierarchy blocks
|
||||
# - Required for spatial layout awareness and document structure
|
||||
# - Set to False only if space optimization is critical
|
||||
#
|
||||
# ocr_coverage_threshold: float | None (default: None)
|
||||
# - Range: 0.0 to 1.0
|
||||
# - Triggers OCR when text block coverage falls below this fraction
|
||||
# - Example: 0.5 means "run OCR if less than 50% of page has text data"
|
||||
# - None means no OCR coverage-based triggering
|
||||
17
docs/snippets/python/config/html_output.md
Normal file
17
docs/snippets/python/config/html_output.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` as HTML using the "github" theme."""
    html_options = {
        "theme": "github",
        "embed_css": True,
    }
    config = ExtractionConfig(output_format="html", html_output=html_options)
    extracted = await extract_file("document.pdf", config=config)
    print(extracted.content)  # HTML with kb-* classes and GitHub theme
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
75
docs/snippets/python/config/keyword_config.py
Normal file
75
docs/snippets/python/config/keyword_config.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
KeywordConfig,
|
||||
KeywordAlgorithm,
|
||||
YakeParams,
|
||||
RakeParams,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
|
||||
# Example 1: Basic YAKE configuration
|
||||
# Uses YAKE algorithm with default parameters and English stopword filtering
|
||||
async def basic_yake() -> None:
    """Extract keywords from ``document.pdf`` with YAKE and no custom parameters."""
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.0,
        ngram_range=(1, 3),
        language="en",
        yake_params=None,
        rake_params=None,
    )
    config = ExtractionConfig(keywords=keyword_settings)

    extracted = await extract_file("document.pdf", config=config)
    print(f"Keywords: {extracted.keywords}")
|
||||
|
||||
|
||||
# Example 2: Advanced YAKE with custom parameters
|
||||
# Fine-tunes YAKE with custom window size for co-occurrence analysis
|
||||
async def advanced_yake() -> None:
    """Extract keywords with YAKE using an explicit ``window_size``."""
    yake_tuning = YakeParams(window_size=1)
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=15,
        min_score=0.1,
        ngram_range=(1, 2),
        language="en",
        yake_params=yake_tuning,
        rake_params=None,
    )
    config = ExtractionConfig(keywords=keyword_settings)

    extracted = await extract_file("document.pdf", config=config)
    print(f"Keywords: {extracted.keywords}")
|
||||
|
||||
|
||||
# Example 3: RAKE configuration
|
||||
# Uses RAKE algorithm for rapid keyword extraction with phrase constraints
|
||||
async def rake_config() -> None:
    """Extract keywords with RAKE using word-length and phrase-size limits."""
    rake_tuning = RakeParams(min_word_length=1, max_words_per_phrase=3)
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.RAKE,
        max_keywords=10,
        min_score=5.0,
        ngram_range=(1, 3),
        language="en",
        yake_params=None,
        rake_params=rake_tuning,
    )
    config = ExtractionConfig(keywords=keyword_settings)

    extracted = await extract_file("document.pdf", config=config)
    print(f"Keywords: {extracted.keywords}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(basic_yake())
|
||||
24
docs/snippets/python/config/keyword_extraction_config.md
Normal file
24
docs/snippets/python/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
KeywordConfig,
|
||||
KeywordAlgorithm,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with YAKE keyword extraction configured."""
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.3,
        ngram_range=(1, 3),
        language="en",
    )
    config: ExtractionConfig = ExtractionConfig(keywords=keyword_settings)
    extracted = await extract_file("document.pdf", config=config)
    char_count = len(extracted.content)
    print(f"Content extracted: {char_count} chars")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
19
docs/snippets/python/config/language_detection_config.md
Normal file
19
docs/snippets/python/config/language_detection_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with single-language detection enabled."""
    detection = LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.85,
        detect_multiple=False,
    )
    config: ExtractionConfig = ExtractionConfig(language_detection=detection)
    extracted = await extract_file("document.pdf", config=config)

    languages = extracted.detected_languages
    if languages:
        # The first entry is reported as the primary language.
        print(f"Primary language: {languages[0]}")
    print(f"Content length: {len(extracted.content)} chars")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
26
docs/snippets/python/config/ocr_dpi_config.md
Normal file
26
docs/snippets/python/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
extract_file_sync,
|
||||
ExtractionConfig,
|
||||
OcrConfig,
|
||||
TesseractConfig,
|
||||
ImagePreprocessingConfig,
|
||||
)
|
||||
|
||||
# Tesseract OCR with image preprocessing targeting 300 DPI.
config: ExtractionConfig = ExtractionConfig(
    ocr=OcrConfig(
        backend="tesseract",
        tesseract_config=TesseractConfig(
            preprocessing=ImagePreprocessingConfig(
                target_dpi=300,
            ),
        ),
    ),
)

result = extract_file_sync("scanned.pdf", config=config)

# Summarise the amount of text and the number of tables found.
content_length: int = len(result.content)
table_count: int = len(result.tables)

print(f"Content length: {content_length} characters")
print(f"Tables detected: {table_count}")
|
||||
```
|
||||
18
docs/snippets/python/config/pdf_config.md
Normal file
18
docs/snippets/python/config/pdf_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, PdfConfig, HierarchyConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with PDF-specific options and print a preview."""
    pdf_options = PdfConfig(
        extract_images=True,
        extract_metadata=True,
        passwords=["password1", "password2"],
        hierarchy=HierarchyConfig(enabled=True, k_clusters=6),
    )
    config: ExtractionConfig = ExtractionConfig(pdf_options=pdf_options)
    extracted = await extract_file("document.pdf", config=config)
    print(f"Content: {extracted.content[:100]}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
22
docs/snippets/python/config/pdf_hierarchy_config.md
Normal file
22
docs/snippets/python/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig
|
||||
|
||||
# PDF extraction with metadata and hierarchy detection switched on.
config: ExtractionConfig = ExtractionConfig(
    pdf_options=PdfConfig(
        extract_metadata=True,
        hierarchy=HierarchyConfig(
            enabled=True,
            k_clusters=6,
            include_bbox=True,
            ocr_coverage_threshold=0.8,
        ),
    )
)

result = extract_file_sync("document.pdf", config=config)

# Print each page number with a preview of that page's content
for page in result.pages or []:
    page_preview = page.content[:100]
    print(f"Page {page.page_number}:")
    print(f"  Content: {page_preview}...")
|
||||
```
|
||||
16
docs/snippets/python/config/postprocessor_config.md
Normal file
16
docs/snippets/python/config/postprocessor_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, PostProcessorConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with only the "deduplication" post-processor."""
    postprocessing = PostProcessorConfig(
        enabled=True,
        enabled_processors=["deduplication"],
    )
    config: ExtractionConfig = ExtractionConfig(postprocessor=postprocessing)
    extracted = await extract_file("document.pdf", config=config)
    print(f"Content: {extracted.content[:100]}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
15
docs/snippets/python/config/quality_processing_config.md
Normal file
15
docs/snippets/python/config/quality_processing_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with quality processing and print the score."""
    config: ExtractionConfig = ExtractionConfig(enable_quality_processing=True)
    extracted = await extract_file("document.pdf", config=config)

    # A missing (falsy) score is shown as 0.00.
    score: float = extracted.quality_score or 0.0
    print(f"Quality score: {score:.2f}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
21
docs/snippets/python/config/tesseract_config.md
Normal file
21
docs/snippets/python/config/tesseract_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with tuned Tesseract settings for three languages."""
    tesseract = TesseractConfig(
        psm=6,
        oem=1,
        min_confidence=0.8,
        enable_table_detection=True,
    )
    ocr = OcrConfig(language="eng+fra+deu", tesseract_config=tesseract)
    config: ExtractionConfig = ExtractionConfig(ocr=ocr)
    extracted = await extract_file("document.pdf", config=config)
    print(f"Content: {extracted.content[:100]}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
10
docs/snippets/python/config/token_reduction_config.md
Normal file
10
docs/snippets/python/config/token_reduction_config.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig, TokenReductionConfig
|
||||
|
||||
# Moderate token reduction that preserves important words.
config: ExtractionConfig = ExtractionConfig(
    token_reduction=TokenReductionConfig(mode="moderate", preserve_important_words=True)
)
|
||||
```
|
||||
58
docs/snippets/python/docker/usage.py
Normal file
58
docs/snippets/python/docker/usage.py
Normal file
@@ -0,0 +1,58 @@
|
||||
```python title="usage.py"
|
||||
import subprocess
|
||||
import httpx
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
class DockerKreuzbergClient:
    """Run Kreuzberg in a Docker container and call its HTTP extraction API.

    The container is controlled through the ``docker`` CLI via ``subprocess``;
    files are posted with ``httpx`` to ``/api/extract`` on ``localhost``.
    """

    def __init__(self, container_name: str = "kreuzberg-api", port: int = 8000):
        """Store the container name and host port and derive the API URL."""
        self.container_name = container_name
        self.port = port
        self.api_url = f"http://localhost:{port}/api/extract"

    def start_container(self, image: str = "kreuzberg:latest") -> None:
        """Start *image* detached, publishing container port 8000 on ``self.port``.

        Raises:
            subprocess.CalledProcessError: If ``docker run`` exits non-zero.
        """
        print("Starting Kreuzberg Docker container...")
        subprocess.run(
            [
                "docker", "run", "-d",
                "--name", self.container_name,
                "-p", f"{self.port}:8000",
                image,
            ],
            check=True,
        )
        print(f"Container started on http://localhost:{self.port}")

    async def extract_file(self, file_path: str, *, timeout: float = 60.0) -> str:
        """Upload *file_path* to the API and return the ``content`` field.

        Args:
            file_path: Path of the file to upload.
            timeout: Seconds allowed for the HTTP request. An explicit value
                is used instead of the HTTP client's default timeout.

        Returns:
            The ``content`` value of the JSON response, or ``""`` if absent.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        path = Path(file_path)
        files = {"file": (path.name, path.read_bytes())}

        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.post(self.api_url, files=files)
            response.raise_for_status()
            result = response.json()
            return result.get("content", "")

    def stop_container(self) -> None:
        """Stop and remove the container, attempting removal even if stop fails.

        Failures are reported rather than raised, so calling this from a
        ``finally`` block does not replace the exception being handled.
        """
        print("Stopping Kreuzberg Docker container...")
        # check=False: a failed "stop" must not prevent the "rm" attempt.
        stopped = subprocess.run(["docker", "stop", self.container_name], check=False)
        removed = subprocess.run(["docker", "rm", self.container_name], check=False)
        if stopped.returncode == 0 and removed.returncode == 0:
            print("Container stopped and removed")
        else:
            print(f"Warning: could not fully clean up container {self.container_name!r}")
|
||||
|
||||
async def main() -> None:
    """Start the container, extract ``document.pdf``, and clean up afterwards."""
    # Local import: at module level asyncio is only imported under __main__.
    import asyncio

    docker_client = DockerKreuzbergClient()

    # Start outside the ``try`` so cleanup only runs for a container this call
    # actually started. Otherwise a failed start (e.g. the name is already in
    # use) would stop and remove a container this run did not create.
    docker_client.start_container()
    try:
        # Give the API a moment to come up before the first request.
        await asyncio.sleep(2)

        content = await docker_client.extract_file("document.pdf")
        print(f"Extracted content:\n{content}")
    finally:
        docker_client.stop_container()


if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
|
||||
```
|
||||
14
docs/snippets/python/getting-started/basic_usage.md
Normal file
14
docs/snippets/python/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with caching and quality processing, then print it."""
    config = ExtractionConfig(use_cache=True, enable_quality_processing=True)
    extracted = await extract_file("document.pdf", config=config)
    print(extracted.content)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
16
docs/snippets/python/getting-started/extract_file.md
Normal file
16
docs/snippets/python/getting-started/extract_file.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from kreuzberg import extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` and print its content, format type and table count."""
    file_path: Path = Path("document.pdf")

    extracted = await extract_file(file_path)

    # The format entry may be missing; print None in that case.
    format_info = extracted.metadata.format
    format_type = format_info.format_type if format_info else None

    print(f"Content: {extracted.content}")
    print(f"Format: {format_type}")
    print(f"Tables: {len(extracted.tables)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
19
docs/snippets/python/getting-started/extract_with_ocr.md
Normal file
19
docs/snippets/python/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig, OcrConfig, TesseractConfig
|
||||
|
||||
async def main() -> None:
    """Force Tesseract OCR on ``scanned.pdf`` and print text and languages."""
    ocr = OcrConfig(
        backend="tesseract",
        language="eng",
        tesseract_config=TesseractConfig(psm=3),
    )
    config = ExtractionConfig(force_ocr=True, ocr=ocr)
    extracted = await extract_file("scanned.pdf", config=config)
    print(extracted.content)
    print(f"Detected Languages: {extracted.detected_languages}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
10
docs/snippets/python/getting-started/hello_world.md
Normal file
10
docs/snippets/python/getting-started/hello_world.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with default settings and print its text."""
    extracted = await extract_file("document.pdf")
    text = extracted.content
    print(text)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
12
docs/snippets/python/getting-started/install_verify.md
Normal file
12
docs/snippets/python/getting-started/install_verify.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, __version__
|
||||
|
||||
async def main() -> None:
    """Print the installed version and check that an extraction returns text."""
    print(f"Kreuzberg version: {__version__}")

    extracted = await extract_file("document.pdf")
    has_content = len(extracted.content) > 0
    print(f"Extraction successful: {has_content}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
17
docs/snippets/python/getting-started/read_content.md
Normal file
17
docs/snippets/python/getting-started/read_content.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` and summarise its content, tables and format."""
    extracted = await extract_file("document.pdf")

    text: str = extracted.content
    table_count: int = len(extracted.tables)

    # The format entry may be missing; report None in that case.
    format_info = extracted.metadata.format
    format_type: str | None = format_info.format_type if format_info else None

    print(f"Content length: {len(text)} characters")
    print(f"Tables found: {table_count}")
    print(f"Format: {format_type}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
27
docs/snippets/python/llm/structured_extraction.md
Normal file
27
docs/snippets/python/llm/structured_extraction.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig, StructuredExtractionConfig, LlmConfig
|
||||
|
||||
async def main() -> None:
    """Extract title, authors and date from ``paper.pdf`` as structured output."""
    # JSON Schema describing the object the LLM must return.
    paper_schema = {
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "authors": {"type": "array", "items": {"type": "string"}},
            "date": {"type": "string"},
        },
        "required": ["title", "authors", "date"],
        "additionalProperties": False,
    }
    structured = StructuredExtractionConfig(
        schema=paper_schema,
        llm=LlmConfig(model="openai/gpt-4o-mini"),
        strict=True,
    )
    config = ExtractionConfig(structured_extraction=structured)
    extracted = await extract_file("paper.pdf", config=config)
    print(extracted.structured_output)
    # {"title": "...", "authors": ["..."], "date": "..."}
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
16
docs/snippets/python/llm/vlm_embeddings.md
Normal file
16
docs/snippets/python/llm/vlm_embeddings.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import embed, EmbeddingConfig, EmbeddingModelType, LlmConfig
|
||||
|
||||
async def main() -> None:
    """Embed one sentence through an LLM-backed model and print the vector size."""
    llm = LlmConfig(model="openai/text-embedding-3-small")
    config = EmbeddingConfig(model=EmbeddingModelType.llm(llm), normalize=True)
    vectors = await embed(["Hello world"], config=config)
    print(len(vectors[0]))  # 1536
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
17
docs/snippets/python/llm/vlm_ocr.md
Normal file
17
docs/snippets/python/llm/vlm_ocr.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig, OcrConfig, LlmConfig
|
||||
|
||||
async def main() -> None:
    """Force OCR on ``scan.pdf`` using the "vlm" backend and print the text."""
    ocr = OcrConfig(
        backend="vlm",
        vlm_config=LlmConfig(model="openai/gpt-4o-mini"),
    )
    config = ExtractionConfig(force_ocr=True, ocr=ocr)
    extracted = await extract_file("scan.pdf", config=config)
    print(extracted.content)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
23
docs/snippets/python/mcp/mcp_custom_client.md
Normal file
23
docs/snippets/python/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from mcp import ClientSession, StdioServerParameters
|
||||
from mcp.client.stdio import stdio_client
|
||||
|
||||
async def main() -> None:
    """Connect to ``kreuzberg mcp`` over stdio, list its tools and call one."""
    server_params: StdioServerParameters = StdioServerParameters(
        command="kreuzberg", args=["mcp"]
    )

    # Open the stdio transport and the client session in one statement.
    async with (
        stdio_client(server_params) as (read, write),
        ClientSession(read, write) as session,
    ):
        await session.initialize()

        listing = await session.list_tools()
        tool_names: list[str] = [tool.name for tool in listing.tools]
        print(f"Available tools: {tool_names}")

        call_arguments = {"path": "document.pdf", "async": True}
        outcome = await session.call_tool("extract_file", arguments=call_arguments)
        print(outcome)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
36
docs/snippets/python/mcp/mcp_langchain_integration.md
Normal file
36
docs/snippets/python/mcp/mcp_langchain_integration.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```python title="Python"
|
||||
from langchain.agents import initialize_agent, AgentType
|
||||
from langchain.tools import Tool
|
||||
from langchain_openai import ChatOpenAI
|
||||
import subprocess
|
||||
import json
|
||||
|
||||
# Launch the MCP server as a child process with all three streams piped.
mcp_command = ["kreuzberg", "mcp"]
mcp_process = subprocess.Popen(
    mcp_command,
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
|
||||
|
||||
def extract_file(path: str) -> str:
    """Ask the running ``kreuzberg mcp`` process to extract *path*.

    Writes one JSON-RPC 2.0 ``tools/call`` request for the ``extract_file``
    tool to the subprocess's stdin and reads one line of reply from its
    stdout.

    Args:
        path: Path of the document to extract; passed through to the tool.

    Returns:
        The ``text`` of the result's text content items, concatenated. If
        the server returns ``content`` as a plain string it is returned
        unchanged.

    Raises:
        RuntimeError: If the server closes its output stream or replies
            with a JSON-RPC error object.
    """
    import uuid  # local import: only needed to mint a unique request id

    request: dict = {
        # JSON-RPC 2.0 requires the version tag, and a message without an
        # "id" is a notification, which the server must not answer, so the
        # readline() below would block forever.
        "jsonrpc": "2.0",
        "id": uuid.uuid4().hex,
        "method": "tools/call",
        "params": {
            "name": "extract_file",
            "arguments": {"path": path, "async": True},
        },
    }
    # NOTE(review): MCP servers expect an initialize handshake before
    # tools/call; this snippet does not perform one — confirm it happens
    # elsewhere before relying on this function.
    mcp_process.stdin.write(json.dumps(request).encode() + b"\n")
    mcp_process.stdin.flush()

    response = mcp_process.stdout.readline()
    if not response:
        # An empty read means EOF: the server exited or closed stdout.
        raise RuntimeError("MCP server closed its output stream")

    reply = json.loads(response)
    if "error" in reply:
        raise RuntimeError(f"MCP request failed: {reply['error']}")

    content = reply["result"]["content"]
    if isinstance(content, str):
        return content
    # Tool results carry a list of content items; keep only the text ones so
    # the declared ``str`` return type actually holds.
    return "".join(
        item.get("text", "") for item in content if item.get("type") == "text"
    )
|
||||
|
||||
tools: list[Tool] = [
|
||||
Tool(name="extract_document", func=extract_file, description="Extract")
|
||||
]
|
||||
|
||||
llm = ChatOpenAI(temperature=0)
|
||||
agent = initialize_agent(
|
||||
tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION
|
||||
)
|
||||
```
|
||||
17
docs/snippets/python/mcp/mcp_server_start.md
Normal file
17
docs/snippets/python/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```python title="Python"
|
||||
import subprocess
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
mcp_process: subprocess.Popen = subprocess.Popen(
|
||||
["python", "-m", "kreuzberg", "mcp"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
|
||||
pid: Optional[int] = mcp_process.pid
|
||||
print(f"MCP server started with PID: {pid}")
|
||||
|
||||
time.sleep(1)
|
||||
print("Server is running, listening for connections")
|
||||
```
|
||||
13
docs/snippets/python/metadata/language_detection.md
Normal file
13
docs/snippets/python/metadata/language_detection.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig, LanguageDetectionConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
language_detection=LanguageDetectionConfig(
|
||||
enabled=True,
|
||||
min_confidence=0.9,
|
||||
detect_multiple=True,
|
||||
),
|
||||
)
|
||||
|
||||
print(config.language_detection)
|
||||
```
|
||||
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, LanguageDetectionConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
language_detection=LanguageDetectionConfig(
|
||||
enabled=True,
|
||||
min_confidence=0.8,
|
||||
detect_multiple=True,
|
||||
),
|
||||
)
|
||||
|
||||
result = extract_file_sync("multilingual_document.pdf", config=config)
|
||||
|
||||
if result.detected_languages:
|
||||
print(f"Detected languages: {', '.join(result.detected_languages)}")
|
||||
```
|
||||
80
docs/snippets/python/metadata/metadata.md
Normal file
80
docs/snippets/python/metadata/metadata.md
Normal file
@@ -0,0 +1,80 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig
|
||||
|
||||
result = extract_file_sync("document.pdf", config=ExtractionConfig())
|
||||
|
||||
# Metadata is flat — format-specific fields are at the top level
|
||||
metadata = result.metadata
|
||||
if metadata.get("page_count"):
|
||||
print(f"Pages: {metadata['page_count']}")
|
||||
if metadata.get("title"):
|
||||
print(f"Title: {metadata['title']}")
|
||||
if metadata.get("authors"):
|
||||
print(f"Authors: {', '.join(metadata['authors'])}")
|
||||
|
||||
result = extract_file_sync("page.html", config=ExtractionConfig())
|
||||
metadata = result.metadata
|
||||
if metadata.get("title"):
|
||||
print(f"Title: {metadata['title']}")
|
||||
if metadata.get("description"):
|
||||
print(f"Description: {metadata['description']}")
|
||||
|
||||
# Access keywords as array
|
||||
keywords = metadata.get('keywords', [])
|
||||
if keywords:
|
||||
print(f"Keywords: {', '.join(keywords)}")
|
||||
|
||||
# Access canonical URL (renamed from canonical)
|
||||
canonical_url = metadata.get('canonical_url')
|
||||
if canonical_url:
|
||||
print(f"Canonical URL: {canonical_url}")
|
||||
|
||||
# Access Open Graph fields from map
|
||||
open_graph = metadata.get('open_graph', {})
|
||||
if open_graph:
|
||||
if 'image' in open_graph:
|
||||
print(f"Open Graph Image: {open_graph['image']}")
|
||||
if 'title' in open_graph:
|
||||
print(f"Open Graph Title: {open_graph['title']}")
|
||||
if 'type' in open_graph:
|
||||
print(f"Open Graph Type: {open_graph['type']}")
|
||||
|
||||
# Access Twitter Card fields from map
|
||||
twitter_card = metadata.get('twitter_card', {})
|
||||
if twitter_card:
|
||||
if 'card' in twitter_card:
|
||||
print(f"Twitter Card Type: {twitter_card['card']}")
|
||||
if 'creator' in twitter_card:
|
||||
print(f"Twitter Creator: {twitter_card['creator']}")
|
||||
|
||||
# Access new fields
|
||||
language = metadata.get('language')
|
||||
if language:
|
||||
print(f"Language: {language}")
|
||||
|
||||
text_direction = metadata.get('text_direction')
|
||||
if text_direction:
|
||||
print(f"Text Direction: {text_direction}")
|
||||
|
||||
# Access headers
|
||||
headers = metadata.get('headers', [])
|
||||
if headers:
|
||||
print(f"Headers: {', '.join([h['text'] for h in headers])}")
|
||||
|
||||
# Access links
|
||||
links = metadata.get('links', [])
|
||||
if links:
|
||||
for link in links:
|
||||
print(f"Link: {link.get('href')} ({link.get('text')})")
|
||||
|
||||
# Access images
|
||||
images = metadata.get('images', [])
|
||||
if images:
|
||||
for image in images:
|
||||
print(f"Image: {image.get('src')}")
|
||||
|
||||
# Access structured data
|
||||
structured_data = metadata.get('structured_data', [])
|
||||
if structured_data:
|
||||
print(f"Structured data items: {len(structured_data)}")
|
||||
```
|
||||
17
docs/snippets/python/metadata/page_boundaries.md
Normal file
17
docs/snippets/python/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig
|
||||
|
||||
result = extract_file_sync("document.pdf", config=ExtractionConfig())
|
||||
|
||||
if result.metadata.pages and result.metadata.pages.boundaries:
|
||||
boundaries = result.metadata.pages.boundaries
|
||||
content_bytes = result.content.encode("utf-8")
|
||||
|
||||
for boundary in boundaries[:3]:
|
||||
page_bytes = content_bytes[boundary.byte_start:boundary.byte_end]
|
||||
page_text = page_bytes.decode("utf-8")
|
||||
|
||||
print(f"Page {boundary.page_number}:")
|
||||
print(f" Byte range: {boundary.byte_start}-{boundary.byte_end}")
|
||||
print(f" Preview: {page_text[:100]}...")
|
||||
```
|
||||
14
docs/snippets/python/metadata/page_tracking_basic.md
Normal file
14
docs/snippets/python/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,14 @@
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, PageConfig

# Build an extraction config whose page settings have extract_pages enabled.
config = ExtractionConfig(
    pages=PageConfig(extract_pages=True)
)

result = extract_file_sync("document.pdf", config=config)

# Only iterate when the result actually carries per-page entries.
if result.pages:
    for page in result.pages:
        print(f"Page {page.page_number}:")
        print(f" Content: {len(page.content)} chars")
        print(f" Tables: {len(page.tables)}")
        print(f" Images: {len(page.images)}")
|
||||
39
docs/snippets/python/metadata/pdf_metadata_extractor.md
Normal file
39
docs/snippets/python/metadata/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_post_processor, ExtractionResult
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class PdfMetadataExtractor:
    """Post-processor that flags PDF results and counts how many it handled."""

    def __init__(self):
        # Number of results that have gone through process() so far.
        self.processed_count: int = 0

    def initialize(self) -> None:
        """Log that the processor has been initialized."""
        logger.info("PDF metadata extractor initialized")

    def shutdown(self) -> None:
        """Log the total number of results processed."""
        logger.info(f"Processed {self.processed_count} PDFs")

    def name(self) -> str:
        """Return the processor's name."""
        return "pdf_metadata_extractor"

    def version(self) -> str:
        """Return the processor's version string."""
        return "1.0.0"

    def description(self) -> str:
        """Return a one-line description of the processor."""
        return "Extracts and enriches PDF metadata"

    def processing_stage(self) -> str:
        """Return the stage name this processor reports (``"early"``)."""
        return "early"

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only when the result's MIME type is ``application/pdf``."""
        is_pdf = result.mime_type == "application/pdf"
        return is_pdf

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Bump the counter, set ``metadata["pdf_processed"]`` and return *result*."""
        self.processed_count = self.processed_count + 1
        result.metadata["pdf_processed"] = True
        return result
|
||||
|
||||
processor: PdfMetadataExtractor = PdfMetadataExtractor()
|
||||
register_post_processor(processor)
|
||||
```
|
||||
35
docs/snippets/python/metadata/vector_database_integration.md
Normal file
35
docs/snippets/python/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig, EmbeddingConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
max_characters=512,
|
||||
overlap=50,
|
||||
embedding=EmbeddingConfig(
|
||||
normalize=True,
|
||||
batch_size=32,
|
||||
preset="balanced",
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
records: list[dict] = []
|
||||
if result.chunks:
|
||||
for index, chunk in enumerate(result.chunks):
|
||||
if chunk.embedding is None:
|
||||
continue
|
||||
records.append({
|
||||
"id": f"document_chunk_{index}",
|
||||
"content": chunk.content,
|
||||
"embedding": chunk.embedding,
|
||||
"metadata": {
|
||||
"document_id": "document.pdf",
|
||||
"chunk_index": index,
|
||||
"content_length": len(chunk.content),
|
||||
},
|
||||
})
|
||||
|
||||
print(f"Prepared {len(records)} vector records")
|
||||
```
|
||||
37
docs/snippets/python/ocr/cloud_ocr_backend.md
Normal file
37
docs/snippets/python/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_ocr_backend
|
||||
import httpx
|
||||
|
||||
class CloudOcrBackend:
    """Example OCR backend that posts images to a remote HTTP OCR endpoint."""

    def __init__(self, api_key: str):
        # NOTE(review): api_key is stored but not attached to the request
        # below; how the service expects it to be sent is not shown — confirm.
        self.api_key: str = api_key
        self.langs: list[str] = ["eng", "deu", "fra"]

    def name(self) -> str:
        """Return the backend's registration name."""
        return "cloud-ocr"

    def version(self) -> str:
        """Return the backend's version string."""
        return "1.0.0"

    def supported_languages(self) -> list[str]:
        """Return the language codes this backend declares."""
        return self.langs

    def process_image(self, image_bytes: bytes, config: dict) -> dict:
        """Upload *image_bytes* for OCR and return the recognised text.

        Args:
            image_bytes: Raw image data, sent as the ``image`` multipart file.
            config: Options mapping; ``config["language"]`` selects the OCR
                language and defaults to ``"eng"`` when absent.

        Returns:
            A dict with the recognised text under ``"content"`` and
            ``"mime_type"`` set to ``"text/plain"``.

        Raises:
            httpx.HTTPStatusError: If the service answers with a 4xx/5xx status.
        """
        language = config.get("language", "eng")
        with httpx.Client() as client:
            response = client.post(
                "https://api.example.com/ocr",
                files={"image": image_bytes},
                # Extra fields of a multipart upload must go in ``data``;
                # when ``files`` is given, httpx ignores ``json`` entirely,
                # so the language would never reach the server.
                data={"language": language},
            )
            # Fail loudly on an error status instead of a confusing KeyError
            # when the error body has no "text" field.
            response.raise_for_status()
            text: str = response.json()["text"]
        return {"content": text, "mime_type": "text/plain"}

    def initialize(self) -> None:
        """No set-up is required."""
        pass

    def shutdown(self) -> None:
        """No tear-down is required."""
        pass
|
||||
|
||||
backend: CloudOcrBackend = CloudOcrBackend(api_key="your-api-key")
|
||||
register_ocr_backend(backend)
|
||||
```
|
||||
17
docs/snippets/python/ocr/image_extraction.md
Normal file
17
docs/snippets/python/ocr/image_extraction.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, ImageExtractionConfig
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
images=ImageExtractionConfig(
|
||||
extract_images=True,
|
||||
target_dpi=200,
|
||||
max_image_dimension=2048,
|
||||
inject_placeholders=True, # set to False to extract images without markdown references
|
||||
auto_adjust_dpi=True,
|
||||
)
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
print(f"Content length: {len(result.content)} characters")
|
||||
```
|
||||
29
docs/snippets/python/ocr/image_preprocessing.md
Normal file
29
docs/snippets/python/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
extract_file_sync,
|
||||
ExtractionConfig,
|
||||
ImagePreprocessingConfig,
|
||||
OcrConfig,
|
||||
TesseractConfig,
|
||||
)
|
||||
|
||||
preprocessing: ImagePreprocessingConfig = ImagePreprocessingConfig(
|
||||
target_dpi=300,
|
||||
denoise=True,
|
||||
deskew=True,
|
||||
contrast_enhance=True,
|
||||
binarization_method="otsu",
|
||||
)
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
ocr=OcrConfig(
|
||||
backend="tesseract",
|
||||
language="eng",
|
||||
tesseract_config=TesseractConfig(preprocessing=preprocessing),
|
||||
)
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
print(f"Content length: {len(result.content)} characters")
|
||||
```
|
||||
18
docs/snippets/python/ocr/ocr_easyocr.md
Normal file
18
docs/snippets/python/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
ocr=OcrConfig(backend="easyocr", language="en")
|
||||
)
|
||||
|
||||
# EasyOCR-specific options (use_gpu, beam_width, etc.) go in easyocr_kwargs,
|
||||
# not in OcrConfig — OcrConfig only accepts backend, language, and backend-specific configs.
|
||||
result = extract_file_sync("scanned.pdf", config=config, easyocr_kwargs={"use_gpu": True})
|
||||
|
||||
content: str = result.content
|
||||
preview: str = content[:100]
|
||||
total_length: int = len(content)
|
||||
|
||||
print(f"Extracted content (preview): {preview}")
|
||||
print(f"Total characters: {total_length}")
|
||||
```
|
||||
18
docs/snippets/python/ocr/ocr_elements.md
Normal file
18
docs/snippets/python/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
ocr=OcrConfig(backend="paddleocr", language="en")
|
||||
)
|
||||
|
||||
result = extract_file_sync("scanned.pdf", config=config)
|
||||
|
||||
if result.ocr_elements:
|
||||
for element in result.ocr_elements:
|
||||
print(f"Text: {element.text}")
|
||||
print(f"Confidence: {element.confidence.recognition:.2f}")
|
||||
print(f"Geometry: {element.geometry}")
|
||||
if element.rotation:
|
||||
print(f"Rotation: {element.rotation.angle}°")
|
||||
print()
|
||||
```
|
||||
16
docs/snippets/python/ocr/ocr_extraction.md
Normal file
16
docs/snippets/python/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
ocr=OcrConfig(backend="tesseract", language="eng")
|
||||
)
|
||||
|
||||
result = extract_file_sync("scanned.pdf", config=config)
|
||||
|
||||
content: str = result.content
|
||||
preview: str = content[:100]
|
||||
total_length: int = len(content)
|
||||
|
||||
print(f"Extracted content (preview): {preview}")
|
||||
print(f"Total characters: {total_length}")
|
||||
```
|
||||
17
docs/snippets/python/ocr/ocr_force_all_pages.md
Normal file
17
docs/snippets/python/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
ocr=OcrConfig(backend="tesseract"),
|
||||
force_ocr=True,
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
content: str = result.content
|
||||
preview: str = content[:100]
|
||||
total_length: int = len(content)
|
||||
|
||||
print(f"Extracted content (preview): {preview}")
|
||||
print(f"Total characters: {total_length}")
|
||||
```
|
||||
16
docs/snippets/python/ocr/ocr_multi_language.md
Normal file
16
docs/snippets/python/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
ocr=OcrConfig(backend="tesseract", language="eng+deu+fra")
|
||||
)
|
||||
|
||||
result = extract_file_sync("multilingual.pdf", config=config)
|
||||
|
||||
content: str = result.content
|
||||
preview: str = content[:100]
|
||||
total_length: int = len(content)
|
||||
|
||||
print(f"Extracted content (preview): {preview}")
|
||||
print(f"Total characters: {total_length}")
|
||||
```
|
||||
16
docs/snippets/python/ocr/ocr_paddleocr.md
Normal file
16
docs/snippets/python/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
ocr=OcrConfig(backend="paddleocr", language="en") # model_tier="server" for max accuracy
|
||||
)
|
||||
|
||||
result = extract_file_sync("scanned.pdf", config=config)
|
||||
|
||||
content: str = result.content
|
||||
preview: str = content[:100]
|
||||
total_length: int = len(content)
|
||||
|
||||
print(f"Extracted content (preview): {preview}")
|
||||
print(f"Total characters: {total_length}")
|
||||
```
|
||||
15
docs/snippets/python/plugins/clear_plugins.md
Normal file
15
docs/snippets/python/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
clear_document_extractors,
|
||||
clear_post_processors,
|
||||
clear_ocr_backends,
|
||||
clear_validators,
|
||||
)
|
||||
|
||||
clear_post_processors()
|
||||
clear_validators()
|
||||
clear_ocr_backends()
|
||||
clear_document_extractors()
|
||||
|
||||
print("All plugins cleared")
|
||||
```
|
||||
44
docs/snippets/python/plugins/embedding_backend.md
Normal file
44
docs/snippets/python/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_embedding_backend, EmbeddingConfig, embed_texts
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
# Wrap an already-loaded embedder (e.g. sentence-transformers, llama-cpp-python,
|
||||
# or a tuned ONNX session) so kreuzberg can call back into it during chunking
|
||||
# and standalone embed requests.
|
||||
class MyEmbedder:
    """Embedding backend that delegates to a loaded SentenceTransformer model."""

    def __init__(self):
        self._model = SentenceTransformer("BAAI/bge-base-en-v1.5")

    # --- Plugin trait hooks -------------------------------------------------

    def name(self) -> str:
        """Return the name used to reference this backend from config."""
        return "my-embedder"

    def version(self) -> str:
        """Return the backend's version string."""
        return "1.0.0"

    def initialize(self) -> None:
        """Do nothing.

        Optional warm-up; runs once at registration before dimensions() is
        cached.
        """
        pass

    def shutdown(self) -> None:
        """Do nothing on shutdown."""
        pass

    # --- EmbeddingBackend hooks ---------------------------------------------

    def dimensions(self) -> int:
        """Return the model's sentence-embedding dimension.

        Captured once at registration; the dispatcher uses this for shape
        validation.
        """
        model = self._model
        return model.get_sentence_embedding_dimension()

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Encode *texts* with the model and return the vectors as nested lists."""
        encoded = self._model.encode(texts)
        return encoded.tolist()
|
||||
|
||||
|
||||
# Register once at startup. Reference by name in config.
|
||||
register_embedding_backend(MyEmbedder())
|
||||
|
||||
config: EmbeddingConfig = {
|
||||
"model": {"type": "plugin", "name": "my-embedder"},
|
||||
# Optional: bound the wait on a hung backend (default: 60s; None disables)
|
||||
"max_embed_duration_secs": 30,
|
||||
}
|
||||
vectors = embed_texts(["Hello, world!", "Second text"], config)
|
||||
```
|
||||
14
docs/snippets/python/plugins/extractor_registration.md
Normal file
14
docs/snippets/python/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_document_extractor
|
||||
|
||||
class CustomExtractor:
    """Minimal extractor plugin exposing only a name and a version."""

    def version(self) -> str:
        """Return the extractor's version string."""
        return "1.0.0"

    def name(self) -> str:
        """Return the extractor's registration name."""
        return "custom"
|
||||
|
||||
extractor = CustomExtractor()
|
||||
register_document_extractor(extractor)
|
||||
print("Extractor registered")
|
||||
```
|
||||
18
docs/snippets/python/plugins/list_plugins.md
Normal file
18
docs/snippets/python/plugins/list_plugins.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
list_document_extractors,
|
||||
list_post_processors,
|
||||
list_ocr_backends,
|
||||
list_validators,
|
||||
)
|
||||
|
||||
extractors: list[str] = list_document_extractors()
|
||||
processors: list[str] = list_post_processors()
|
||||
ocr_backends: list[str] = list_ocr_backends()
|
||||
validators: list[str] = list_validators()
|
||||
|
||||
print(f"Extractors: {extractors}")
|
||||
print(f"Processors: {processors}")
|
||||
print(f"OCR backends: {ocr_backends}")
|
||||
print(f"Validators: {validators}")
|
||||
```
|
||||
33
docs/snippets/python/plugins/min_length_validator.md
Normal file
33
docs/snippets/python/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_validator, ExtractionResult, ValidationError
|
||||
|
||||
class MinLengthValidator:
    """Validator that rejects results whose content is shorter than a minimum."""

    def __init__(self, min_length: int = 100):
        # Smallest acceptable len(result.content).
        self.min_length: int = min_length

    def initialize(self) -> None:
        """No set-up is required."""
        pass

    def shutdown(self) -> None:
        """No tear-down is required."""
        pass

    def name(self) -> str:
        """Return the validator's registration name."""
        return "min_length_validator"

    def version(self) -> str:
        """Return the validator's version string."""
        return "1.0.0"

    def priority(self) -> int:
        """Return the priority value this validator reports (100)."""
        return 100

    def should_validate(self, result: ExtractionResult) -> bool:
        """Return True: every result is validated."""
        return True

    def validate(self, result: ExtractionResult) -> None:
        """Raise ``ValidationError`` when the content is below ``min_length``."""
        content_len: int = len(result.content)
        too_short = content_len < self.min_length
        if too_short:
            raise ValidationError(f"Content too short: {content_len}")
|
||||
|
||||
validator: MinLengthValidator = MinLengthValidator(min_length=100)
|
||||
register_validator(validator)
|
||||
```
|
||||
42
docs/snippets/python/plugins/pdf_metadata_extractor.md
Normal file
42
docs/snippets/python/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_post_processor, ExtractionResult
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class PdfMetadataExtractor:
    """Post-processor that stamps bookkeeping fields onto PDF results."""

    def __init__(self):
        # Number of results that have gone through process() so far.
        self.processed_count: int = 0

    def initialize(self) -> None:
        """Log that the processor has been initialized."""
        logger.info("PDF metadata extractor initialized")

    def shutdown(self) -> None:
        """Log the total number of results processed."""
        logger.info(f"Processed {self.processed_count} PDFs")

    def name(self) -> str:
        """Return the processor's name."""
        return "pdf_metadata_extractor"

    def version(self) -> str:
        """Return the processor's version string."""
        return "1.0.0"

    def description(self) -> str:
        """Return a one-line description of the processor."""
        return "Extracts and enriches PDF metadata"

    def processing_stage(self) -> str:
        """Return the stage name this processor reports (``"early"``)."""
        return "early"

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only when the result's MIME type is ``application/pdf``."""
        is_pdf = result.mime_type == "application/pdf"
        return is_pdf

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Increment the counter, write four metadata keys, return *result*."""
        self.processed_count += 1
        # Keys are written one by one, in this order.
        stamped_fields = {
            "pdf_processed": True,
            "pdf_order": self.processed_count,
            "content_length": len(result.content),
            "pdf_processor_version": "1.0.0",
        }
        for key, value in stamped_fields.items():
            result.metadata[key] = value
        return result
|
||||
|
||||
processor: PdfMetadataExtractor = PdfMetadataExtractor()
|
||||
register_post_processor(processor)
|
||||
```
|
||||
19
docs/snippets/python/plugins/pdf_only_processor.md
Normal file
19
docs/snippets/python/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionResult, register_post_processor
|
||||
|
||||
class PdfOnlyProcessor:
    """Pass-through post-processor that opts in only for PDF results."""

    def version(self) -> str:
        """Return the processor's version string."""
        return "1.0.0"

    def name(self) -> str:
        """Return the processor's registration name."""
        return "pdf-only-processor"

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only when the result's MIME type is ``application/pdf``."""
        is_pdf = result.mime_type == "application/pdf"
        return is_pdf

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Return *result* unchanged."""
        return result
|
||||
|
||||
processor: PdfOnlyProcessor = PdfOnlyProcessor()
|
||||
register_post_processor(processor)
|
||||
```
|
||||
42
docs/snippets/python/plugins/plugin_extractor.md
Normal file
42
docs/snippets/python/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_document_extractor, ExtractionResult
|
||||
import json
|
||||
|
||||
class CustomJsonExtractor:
    """Extractor plugin that flattens the string values of a JSON document."""

    def name(self) -> str:
        """Return the extractor's registration name."""
        return "custom-json-extractor"

    def version(self) -> str:
        """Return the extractor's version string."""
        return "1.0.0"

    def supported_mime_types(self) -> list[str]:
        """Return the MIME types this extractor declares."""
        return ["application/json"]

    def priority(self) -> int:
        """Return the priority value this extractor reports (50)."""
        return 50

    def extract_bytes(
        self, content: bytes, mime_type: str, config: dict
    ) -> dict:
        """Parse *content* as JSON and collect its string values.

        Args:
            content: JSON-encoded bytes.
            mime_type: Not used; the result always reports
                ``"application/json"``.
            config: Not used.

        Returns:
            A plain dict with the flattened text under ``"content"`` and
            ``"mime_type"`` set to ``"application/json"``.

        Raises:
            json.JSONDecodeError: If *content* is not valid JSON.
        """
        # The top-level JSON value may be an object, array or scalar, so it
        # is not annotated as ``dict``.
        data: object = json.loads(content)
        text: str = self._extract_text(data)
        return {"content": text, "mime_type": "application/json"}

    def _extract_text(self, obj: object) -> str:
        """Return every string found in *obj*, one per line, depth-first.

        Lists are walked in order and dicts by their values; keys, numbers,
        booleans and ``null`` contribute nothing.
        """
        if isinstance(obj, str):
            return f"{obj}\n"
        if isinstance(obj, list):
            return "".join(self._extract_text(item) for item in obj)
        if isinstance(obj, dict):
            return "".join(self._extract_text(v) for v in obj.values())
        return ""

    def initialize(self) -> None:
        """No set-up is required."""
        pass

    def shutdown(self) -> None:
        """No tear-down is required."""
        pass
|
||||
|
||||
extractor: CustomJsonExtractor = CustomJsonExtractor()
|
||||
register_document_extractor(extractor)
|
||||
```
|
||||
27
docs/snippets/python/plugins/plugin_logging.md
Normal file
27
docs/snippets/python/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```python title="Python"
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class MyPlugin:
    """Example plugin that logs its lifecycle and each extraction call."""

    def version(self) -> str:
        """Return the plugin's version string."""
        return "1.0.0"

    def name(self) -> str:
        """Return the plugin's registration name."""
        return "my-plugin"

    def initialize(self) -> None:
        """Log that the plugin is starting up."""
        logger.info(f"Initializing plugin: {self.name()}")

    def shutdown(self) -> None:
        """Log that the plugin is shutting down."""
        logger.info(f"Shutting down plugin: {self.name()}")

    def extract_bytes(
        self, content: bytes, mime_type: str, config: dict
    ) -> dict:
        """Log the call and return a result dict with empty content.

        The MIME type and byte count are logged at INFO level. The content
        is always the empty string here, so the empty-content warning is
        always emitted as well.
        """
        logger.info(f"Extracting {mime_type} ({len(content)} bytes)")
        extracted_text = ""
        if not extracted_text:
            logger.warning("Extraction resulted in empty content")
        return {"content": extracted_text, "mime_type": mime_type}
|
||||
```
|
||||
14
docs/snippets/python/plugins/plugin_testing.md
Normal file
14
docs/snippets/python/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```python title="Python"
|
||||
import pytest
|
||||
from kreuzberg import ExtractionResult
|
||||
|
||||
def test_custom_extractor() -> None:
    """Check that ``CustomJsonExtractor`` surfaces string values from JSON."""
    # NOTE(review): CustomJsonExtractor is not imported in this snippet; it is
    # presumably the class from the custom JSON extractor example — confirm
    # and import it from wherever that class lives.
    extractor = CustomJsonExtractor()
    json_data: bytes = b'{"message": "Hello, world!"}'
    config: dict = {}

    result: dict = extractor.extract_bytes(
        json_data, "application/json", config
    )

    # Calling extract_bytes directly yields the plain dict the extractor
    # builds, so fields are read by key rather than by attribute.
    assert "Hello, world!" in result["content"]
    assert result["mime_type"] == "application/json"
|
||||
```
|
||||
34
docs/snippets/python/plugins/plugin_validator.md
Normal file
34
docs/snippets/python/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
ExtractionResult,
|
||||
ValidationError,
|
||||
extract_file_sync,
|
||||
register_validator,
|
||||
)
|
||||
|
||||
class MinLengthValidator:
    """Validator that rejects results with fewer than 50 characters of content."""

    def initialize(self) -> None:
        """No set-up is required."""
        pass

    def shutdown(self) -> None:
        """No tear-down is required."""
        pass

    def name(self) -> str:
        """Return the validator's registration name."""
        return "min_length"

    def version(self) -> str:
        """Return the validator's version string."""
        return "1.0.0"

    def should_validate(self, result: ExtractionResult) -> bool:
        """Return True: every result is validated."""
        return True

    def validate(self, result: ExtractionResult) -> None:
        """Raise ``ValidationError`` when the content is shorter than 50."""
        too_short = len(result.content) < 50
        if too_short:
            raise ValidationError(f"Content too short: {len(result.content)}")
|
||||
|
||||
validator: MinLengthValidator = MinLengthValidator()
|
||||
register_validator(validator)
|
||||
|
||||
result = extract_file_sync("document.pdf")
|
||||
print(f"Content length: {len(result.content)}")
|
||||
```
|
||||
20
docs/snippets/python/plugins/quality_score_validator.md
Normal file
20
docs/snippets/python/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionResult, ValidationError, register_validator
|
||||
|
||||
class QualityValidator:
    """Validator that rejects results whose ``quality_score`` is below 0.5."""

    def version(self) -> str:
        """Return the validator's version string."""
        return "1.0.0"

    def name(self) -> str:
        """Return the validator's registration name."""
        return "quality-validator"

    def validate(self, result: ExtractionResult) -> None:
        """Raise ``ValidationError`` when the metadata quality score is too low.

        A missing ``quality_score`` entry is treated as ``0.0``.
        """
        metadata = result.metadata
        score: float = metadata.get("quality_score", 0.0)
        if score < 0.5:
            message = f"Quality score too low: {score:.2f}"
            raise ValidationError(message)
|
||||
|
||||
validator: QualityValidator = QualityValidator()
|
||||
register_validator(validator)
|
||||
```
|
||||
28
docs/snippets/python/plugins/stateful_plugin.md
Normal file
28
docs/snippets/python/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```python title="Python"
|
||||
import threading
|
||||
from kreuzberg import ExtractionResult
|
||||
|
||||
class StatefulPlugin:
    """Plugin that keeps a call counter and a small cache behind a lock."""

    def __init__(self):
        # All mutations of call_count and cache happen while holding lock.
        self.lock: threading.Lock = threading.Lock()
        self.call_count: int = 0
        self.cache: dict = {}

    def initialize(self) -> None:
        """No set-up is required."""
        pass

    def shutdown(self) -> None:
        """No tear-down is required."""
        pass

    def name(self) -> str:
        """Return the plugin's registration name."""
        return "stateful-plugin"

    def version(self) -> str:
        """Return the plugin's version string."""
        return "1.0.0"

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Record the call under the lock and return *result* unchanged."""
        with self.lock:
            self.call_count = self.call_count + 1
            self.cache["last_mime"] = result.mime_type
        return result
|
||||
```
|
||||
20
docs/snippets/python/plugins/unregister_plugins.md
Normal file
20
docs/snippets/python/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
unregister_document_extractor,
|
||||
unregister_post_processor,
|
||||
unregister_ocr_backend,
|
||||
unregister_validator,
|
||||
)
|
||||
|
||||
names: list[str] = [
|
||||
"custom-json-extractor",
|
||||
"word_count",
|
||||
"cloud-ocr",
|
||||
"min_length_validator",
|
||||
]
|
||||
|
||||
unregister_document_extractor(names[0])
|
||||
unregister_post_processor(names[1])
|
||||
unregister_ocr_backend(names[2])
|
||||
unregister_validator(names[3])
|
||||
```
|
||||
30
docs/snippets/python/plugins/word_count_processor.md
Normal file
30
docs/snippets/python/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_post_processor, ExtractionResult
|
||||
|
||||
class WordCountProcessor:
    """Post-processor that stores a whitespace-split word count in metadata."""

    def initialize(self) -> None:
        """No set-up is required."""
        pass

    def shutdown(self) -> None:
        """No tear-down is required."""
        pass

    def name(self) -> str:
        """Return the processor's registration name."""
        return "word_count"

    def version(self) -> str:
        """Return the processor's version string."""
        return "1.0.0"

    def processing_stage(self) -> str:
        """Return the stage name this processor reports (``"early"``)."""
        return "early"

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True when the result has non-empty content."""
        has_content = bool(result.content)
        return has_content

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Write ``metadata["word_count"]`` and return *result*."""
        # str.split() with no arguments splits on runs of whitespace.
        words = result.content.split()
        result.metadata["word_count"] = len(words)
        return result
|
||||
|
||||
processor: WordCountProcessor = WordCountProcessor()
|
||||
register_post_processor(processor)
|
||||
```
|
||||
18
docs/snippets/python/utils/chunking.md
Normal file
18
docs/snippets/python/utils/chunking.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
)
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
max_chars=1500,
|
||||
max_overlap=200,
|
||||
embedding=EmbeddingConfig(
|
||||
model=EmbeddingModelType.preset("all-minilm-l6-v2")
|
||||
),
|
||||
)
|
||||
)
|
||||
```
|
||||
36
docs/snippets/python/utils/chunking_rag.md
Normal file
36
docs/snippets/python/utils/chunking_rag.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
extract_file,
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
)
|
||||
|
||||
async def main() -> None:
    """Extract ``research_paper.pdf`` with chunking and count embedded chunks.

    Builds a chunking configuration with an embedding preset, runs the
    extraction, collects a short summary for every chunk that has an
    embedding, and prints how many such chunks there are.
    """
    embedding = EmbeddingConfig(
        model=EmbeddingModelType.preset("balanced"),
        normalize=True,
        batch_size=16,
    )
    chunking = ChunkingConfig(max_chars=500, max_overlap=50, embedding=embedding)
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)

    result = await extract_file("research_paper.pdf", config=config)

    # result.chunks may be falsy, hence the ``or []`` fallback. Chunks with a
    # falsy embedding are skipped.
    chunks_with_embeddings: list = [
        {
            "content": chunk.content[:100],
            "embedding_dims": len(chunk.embedding),
        }
        for chunk in result.chunks or []
        if chunk.embedding
    ]

    print(f"Chunks with embeddings: {len(chunks_with_embeddings)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
21
docs/snippets/python/utils/embedding_with_chunking.md
Normal file
21
docs/snippets/python/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
)
|
||||
|
||||
config: ExtractionConfig = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
max_chars=1024,
|
||||
max_overlap=100,
|
||||
embedding=EmbeddingConfig(
|
||||
model=EmbeddingModelType.preset("balanced"),
|
||||
normalize=True,
|
||||
batch_size=32,
|
||||
show_download_progress=False,
|
||||
),
|
||||
)
|
||||
)
|
||||
```
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user