Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/python/api/batch_extract_bytes_sync.md
+++ b/docs/snippets/python/api/batch_extract_bytes_sync.md
@@ -0,0 +1,13 @@
+```python title="Python"
+from kreuzberg import batch_extract_bytes_sync, BatchBytesItem, ExtractionConfig
+
+# Each item pairs raw document bytes with the MIME type passed alongside them.
+items = [
+    BatchBytesItem(content=b"PDF content", mime_type="application/pdf"),
+    BatchBytesItem(content=b"<html>...</html>", mime_type="text/html"),
+]
+results = batch_extract_bytes_sync(items, ExtractionConfig())
+
+for index, extraction in enumerate(results):
+    print(f"Item {index}: {len(extraction.content)} chars extracted")
+```
--- a/docs/snippets/python/api/batch_extract_files_sync.md
+++ b/docs/snippets/python/api/batch_extract_files_sync.md
@@ -0,0 +1,14 @@
+```python title="Python"
+from kreuzberg import batch_extract_files_sync, BatchFileItem, ExtractionConfig
+
+# Build one batch item per input path, then extract them all in a single call.
+paths = ("doc1.pdf", "doc2.docx", "doc3.html")
+items = [BatchFileItem(path=p) for p in paths]
+
+results = batch_extract_files_sync(items, ExtractionConfig())
+
+for index, doc in enumerate(results):
+    text_len = len(doc.content)
+    table_count = len(doc.tables)
+    print(f"Document {index}: {text_len} chars, {table_count} tables")
+```
--- a/docs/snippets/python/api/client_chunk_text.md
+++ b/docs/snippets/python/api/client_chunk_text.md
@@ -0,0 +1,29 @@
+```python title="Python"
+import asyncio
+
+import httpx
+
+
+async def main() -> None:
+    """POST sample text to the local /chunk endpoint and print a preview of every chunk."""
+    # Chunker options sent under the "config" key of the request body.
+    chunk_options = {"max_characters": 1000, "overlap": 50, "trim": True}
+    request_body = {
+        "text": "Your long text content here...",
+        "chunker_type": "text",
+        "config": chunk_options,
+    }
+
+    async with httpx.AsyncClient() as client:
+        response = await client.post("http://localhost:8000/chunk", json=request_body)
+    body = response.json()
+
+    print(f"Created {body['chunk_count']} chunks")
+    for piece in body["chunks"]:
+        # Only the first 50 characters of each chunk are shown.
+        head = piece["content"][:50]
+        print(f"Chunk {piece['chunk_index']}: {head}...")
+
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/api/client_extract_single_file.md
+++ b/docs/snippets/python/api/client_extract_single_file.md
@@ -0,0 +1,19 @@
+```python title="Python"
+import asyncio
+import json
+
+import httpx
+
+
+async def main() -> None:
+    """Upload document.pdf to the local /extract endpoint and print the JSON response."""
+    async with httpx.AsyncClient() as client:
+        # open() is a plain (sync) context manager, so it cannot share the `async with`.
+        with open("document.pdf", "rb") as f:
+            response = await client.post("http://localhost:8000/extract", files={"files": f})
+    data = response.json()
+    print(json.dumps(data, indent=2))
+
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/api/combining_all_features.md
+++ b/docs/snippets/python/api/combining_all_features.md
@@ -0,0 +1,52 @@
+```python title="Python"
+from kreuzberg import (
+    ExtractionConfig,
+    OcrConfig,
+    ChunkingConfig,
+    ChunkerType,
+    ImageExtractionConfig,
+    OutputFormat,
+    extract_file_sync,
+)
+
+# OCR settings: enabled, "tesseract" backend, language "eng".
+ocr_settings = OcrConfig(enabled=True, backend="tesseract", language="eng")
+
+# Chunking settings: Markdown chunker, max_characters=800 and overlap=100,
+# with prepend_heading_context switched on.
+chunk_settings = ChunkingConfig(
+    max_characters=800,
+    overlap=100,
+    chunker_type=ChunkerType.Markdown,
+    prepend_heading_context=True,
+)
+
+# Image settings: extract embedded images.
+image_settings = ImageExtractionConfig(extract_images=True)
+
+config = ExtractionConfig(
+    ocr=ocr_settings,
+    chunking=chunk_settings,
+    # Markdown output with the document structure included.
+    output_format=OutputFormat.Markdown,
+    include_document_structure=True,
+    images=image_settings,
+    # Cache extracted results on disk.
+    use_cache=True,
+)
+
+result = extract_file_sync("report.pdf", config=config)
+
+content = result.content
+print(f"Content ({len(content)} chars):")
+print(content[:200])
+
+if result.chunks:
+    print(f"\nChunks: {len(result.chunks)}")
+print(f"Tables: {len(result.tables)}")
+
+if result.detected_languages:
+    print(f"Languages: {result.detected_languages}")
+if result.extraction_method:
+    print(f"Extraction method: {result.extraction_method}")
+```
--- a/docs/snippets/python/api/error_handling.md
+++ b/docs/snippets/python/api/error_handling.md
@@ -0,0 +1,11 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, KreuzbergError
+
+# A KreuzbergError raised by the extraction is printed and then re-raised.
+config = ExtractionConfig()
+try:
+    result = extract_file_sync("missing.pdf", config=config)
+except KreuzbergError as exc:
+    print(f"Extraction failed: {exc}")
+    raise
+```
--- a/docs/snippets/python/api/error_handling_extract.md
+++ b/docs/snippets/python/api/error_handling_extract.md
@@ -0,0 +1,27 @@
+```python title="Python"
+from kreuzberg import (
+    batch_extract_files_sync,
+    BatchFileItem,
+    ExtractionConfig,
+    KreuzbergError,
+)
+
+# Build one batch item per input path.
+paths = ("doc1.pdf", "doc2.docx", "missing.html")
+items = [BatchFileItem(path=p) for p in paths]
+config = ExtractionConfig()
+
+try:
+    results = batch_extract_files_sync(items, config=config)
+    for index, doc in enumerate(results):
+        # A truthy metadata.error is reported as a per-document error.
+        failure = doc.metadata.error
+        if not failure:
+            print(f"Document {index}: {len(doc.content)} chars, {len(doc.tables)} tables")
+            continue
+        print(f"Document {index}: ERROR - {failure}")
+except KreuzbergError as exc:
+    # Report the batch-level failure, then re-raise it unchanged.
+    print(f"Batch extraction failed: {exc}")
+    raise
+```
--- a/docs/snippets/python/api/extract_bytes_async.md
+++ b/docs/snippets/python/api/extract_bytes_async.md
@@ -0,0 +1,14 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_bytes, ExtractionConfig
+
+async def main() -> None:
+    """Read document.pdf into memory, extract it, and print a preview and the table count."""
+    with open("document.pdf", "rb") as pdf_file:
+        pdf_bytes = pdf_file.read()
+    extraction = await extract_bytes(pdf_bytes, "application/pdf", config=ExtractionConfig())
+    print(extraction.content[:200])
+    print(f"Tables: {len(extraction.tables)}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/api/extract_bytes_sync.md
+++ b/docs/snippets/python/api/extract_bytes_sync.md
@@ -0,0 +1,11 @@
+```python title="Python"
+from kreuzberg import extract_bytes_sync, ExtractionConfig
+
+# Read the whole file into memory, then pass the bytes together with their MIME type.
+with open("document.pdf", "rb") as pdf_file:
+    content = pdf_file.read()
+result = extract_bytes_sync(content, "application/pdf", config=ExtractionConfig())
+
+print(result.content[:200])
+print(f"Tables: {len(result.tables)}")
+```
--- a/docs/snippets/python/api/extract_file_async.md
+++ b/docs/snippets/python/api/extract_file_async.md
@@ -0,0 +1,12 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig
+
+async def main() -> None:
+    extraction = await extract_file("document.pdf", config=ExtractionConfig())
+    print(extraction.content[:200])  # first 200 characters of the extracted content
+    print(f"Tables: {len(extraction.tables)}")
+    print(f"Format: {extraction.metadata.format_type}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/api/extract_file_sync.md
+++ b/docs/snippets/python/api/extract_file_sync.md
@@ -0,0 +1,9 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig
+
+config = ExtractionConfig()  # default-constructed; no options overridden
+result = extract_file_sync("document.pdf", config=config)
+print(result.content[:200])
+print(f"Tables: {len(result.tables)}")
+print(f"Format: {result.metadata.format_type}")
+```