Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
```python title="Python"
from kreuzberg import batch_extract_bytes_sync, BatchBytesItem, ExtractionConfig

# Each item carries raw document bytes together with their MIME type.
items = [
    BatchBytesItem(content=b"PDF content", mime_type="application/pdf"),
    BatchBytesItem(content=b"<html>...</html>", mime_type="text/html"),
]

# Hand all in-memory items to a single synchronous batch call.
results = batch_extract_bytes_sync(items, ExtractionConfig())

# Report how much text came back for each returned result.
for i, result in enumerate(results):
    print(f"Item {i}: {len(result.content)} chars extracted")
```

View File

@@ -0,0 +1,14 @@
```python title="Python"
from kreuzberg import batch_extract_files_sync, BatchFileItem, ExtractionConfig

# One item per file on disk, identified by its path.
items = [
    BatchFileItem(path="doc1.pdf"),
    BatchFileItem(path="doc2.docx"),
    BatchFileItem(path="doc3.html"),
]

# Hand all files to a single synchronous batch call.
results = batch_extract_files_sync(items, ExtractionConfig())

# Report text length and table count for each returned result.
for i, result in enumerate(results):
    print(f"Document {i}: {len(result.content)} chars, {len(result.tables)} tables")
```

View File

@@ -0,0 +1,29 @@
```python title="Python"
import asyncio

import httpx


async def main() -> None:
    """POST a text payload to the local /chunk endpoint and print the chunks."""
    payload = {
        "text": "Your long text content here...",
        "chunker_type": "text",
        "config": {
            "max_characters": 1000,
            "overlap": 50,
            "trim": True,
        },
    }

    async with httpx.AsyncClient() as client:
        response = await client.post("http://localhost:8000/chunk", json=payload)

    # Surface HTTP errors directly instead of failing later on a missing
    # "chunk_count" / "chunks" key in an error body.
    response.raise_for_status()
    result = response.json()

    print(f"Created {result['chunk_count']} chunks")
    for chunk in result["chunks"]:
        # Only show the first 50 characters of each chunk.
        preview = chunk["content"][:50]
        print(f"Chunk {chunk['chunk_index']}: {preview}...")


asyncio.run(main())
```

View File

@@ -0,0 +1,19 @@
```python title="Python"
import asyncio
import json

import httpx


async def main() -> None:
    """Upload document.pdf to the local /extract endpoint and print the JSON reply."""
    # open() returns a synchronous context manager; it has no
    # __aenter__/__aexit__, so it cannot be listed in an `async with`
    # statement. Use a plain `with` for the file and nest the async client.
    with open("document.pdf", "rb") as f:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "http://localhost:8000/extract",
                files={"files": f},
            )

    data = response.json()
    print(json.dumps(data, indent=2))


asyncio.run(main())
```

View File

@@ -0,0 +1,52 @@
```python title="Python"
from kreuzberg import (
    ExtractionConfig,
    OcrConfig,
    ChunkingConfig,
    ChunkerType,
    ImageExtractionConfig,
    OutputFormat,
    extract_file_sync,
)

config = ExtractionConfig(
    # OCR: extract text from images, fallback to Tesseract
    ocr=OcrConfig(
        enabled=True,
        backend="tesseract",
        language="eng",
    ),
    # Chunking: semantic markdown chunks of ~800 chars, 100-char overlap
    chunking=ChunkingConfig(
        max_characters=800,
        overlap=100,
        chunker_type=ChunkerType.Markdown,
        prepend_heading_context=True,
    ),
    # Output: Markdown format with document structure preserved
    output_format=OutputFormat.Markdown,
    include_document_structure=True,
    # Images: extract embedded images
    images=ImageExtractionConfig(
        extract_images=True,
    ),
    # Cache extracted results on disk
    use_cache=True,
)

result = extract_file_sync("report.pdf", config=config)

# Print the total length and a 200-character preview of the extracted text.
print(f"Content ({len(result.content)} chars):")
print(result.content[:200])

# The remaining fields are printed only when they are non-empty / set.
if result.chunks:
    print(f"\nChunks: {len(result.chunks)}")

print(f"Tables: {len(result.tables)}")

if result.detected_languages:
    print(f"Languages: {result.detected_languages}")

if result.extraction_method:
    print(f"Extraction method: {result.extraction_method}")
```

View File

@@ -0,0 +1,11 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, KreuzbergError

config = ExtractionConfig()

try:
    result = extract_file_sync("missing.pdf", config=config)
except KreuzbergError as e:
    # Report the failure, then re-raise so the caller still sees the error.
    print(f"Extraction failed: {e}")
    raise
```

View File

@@ -0,0 +1,27 @@
```python title="Python"
from kreuzberg import (
    batch_extract_files_sync,
    BatchFileItem,
    ExtractionConfig,
    KreuzbergError,
)

items = [
    BatchFileItem(path="doc1.pdf"),
    BatchFileItem(path="doc2.docx"),
    BatchFileItem(path="missing.html"),
]
config = ExtractionConfig()

try:
    results = batch_extract_files_sync(items, config=config)
    for i, result in enumerate(results):
        # A result with metadata.error set is reported as a failure;
        # every other result is summarized by text length and table count.
        if result.metadata.error:
            print(f"Document {i}: ERROR - {result.metadata.error}")
        else:
            print(f"Document {i}: {len(result.content)} chars, {len(result.tables)} tables")
except KreuzbergError as e:
    # Report the failure, then re-raise so the caller still sees the error.
    print(f"Batch extraction failed: {e}")
    raise
```

View File

@@ -0,0 +1,14 @@
```python title="Python"
import asyncio

from kreuzberg import extract_bytes, ExtractionConfig


async def main() -> None:
    """Read document.pdf into memory, extract it asynchronously, print a summary."""
    # Read the whole file as bytes; the handle is closed before extraction.
    with open("document.pdf", "rb") as f:
        content = f.read()

    result = await extract_bytes(content, "application/pdf", config=ExtractionConfig())

    # Show the first 200 characters of text and the number of tables.
    print(result.content[:200])
    print(f"Tables: {len(result.tables)}")


asyncio.run(main())
```

View File

@@ -0,0 +1,11 @@
```python title="Python"
from kreuzberg import extract_bytes_sync, ExtractionConfig

# Read the whole file as bytes; the handle is closed before extraction.
with open("document.pdf", "rb") as f:
    content = f.read()

result = extract_bytes_sync(content, "application/pdf", config=ExtractionConfig())

# Show the first 200 characters of text and the number of tables.
print(result.content[:200])
print(f"Tables: {len(result.tables)}")
```

View File

@@ -0,0 +1,12 @@
```python title="Python"
import asyncio

from kreuzberg import extract_file, ExtractionConfig


async def main() -> None:
    """Extract document.pdf asynchronously and print a short summary."""
    result = await extract_file("document.pdf", config=ExtractionConfig())

    # Show the first 200 characters of text, the table count and the format.
    print(result.content[:200])
    print(f"Tables: {len(result.tables)}")
    print(f"Format: {result.metadata.format_type}")


asyncio.run(main())
```

View File

@@ -0,0 +1,9 @@
```python title="Python"
from kreuzberg import ExtractionConfig, extract_file_sync

# Build the configuration up front, then run the synchronous extraction.
config = ExtractionConfig()
result = extract_file_sync("document.pdf", config=config)

# Show the first 200 characters of text, the table count and the format.
print(result.content[:200])
print(f"Tables: {len(result.tables)}")
print(f"Format: {result.metadata.format_type}")
```