Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/python/utils/chunking.md
+++ b/docs/snippets/python/utils/chunking.md
@@ -0,0 +1,18 @@
+```python title="Python"
+from kreuzberg import (
+    ExtractionConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+)
+
+config: ExtractionConfig = ExtractionConfig(
+    chunking=ChunkingConfig(
+        max_chars=1500,
+        max_overlap=200,
+        embedding=EmbeddingConfig(
+            model=EmbeddingModelType.preset("all-minilm-l6-v2")
+        ),
+    )
+)
+```
--- a/docs/snippets/python/utils/chunking_rag.md
+++ b/docs/snippets/python/utils/chunking_rag.md
@@ -0,0 +1,36 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    extract_file,
+    ExtractionConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+)
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            max_chars=500,
+            max_overlap=50,
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.preset("balanced"),
+                normalize=True,
+                batch_size=16
+            )
+        )
+    )
+    result = await extract_file("research_paper.pdf", config=config)
+
+    chunks_with_embeddings: list = []
+    for chunk in result.chunks or []:
+        if chunk.embedding:
+            chunks_with_embeddings.append({
+                "content": chunk.content[:100],
+                "embedding_dims": len(chunk.embedding)
+            })
+
+    print(f"Chunks with embeddings: {len(chunks_with_embeddings)}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/utils/embedding_with_chunking.md
+++ b/docs/snippets/python/utils/embedding_with_chunking.md
@@ -0,0 +1,21 @@
+```python title="Python"
+from kreuzberg import (
+    ExtractionConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+)
+
+config: ExtractionConfig = ExtractionConfig(
+    chunking=ChunkingConfig(
+        max_chars=1024,
+        max_overlap=100,
+        embedding=EmbeddingConfig(
+            model=EmbeddingModelType.preset("balanced"),
+            normalize=True,
+            batch_size=32,
+            show_download_progress=False,
+        ),
+    )
+)
+```
--- a/docs/snippets/python/utils/error_handling.md
+++ b/docs/snippets/python/utils/error_handling.md
@@ -0,0 +1,33 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, extract_bytes_sync, ExtractionConfig
+from kreuzberg import (
+    KreuzbergError,
+    ParsingError,
+    OCRError,
+    ValidationError,
+)
+
+try:
+    result = extract_file_sync("document.pdf")
+    print(f"Extracted {len(result.content)} characters")
+except FileNotFoundError as e:
+    print(f"File not found: {e}")
+except ParsingError as e:
+    print(f"Failed to parse document: {e}")
+except OCRError as e:
+    print(f"OCR processing failed: {e}")
+except KreuzbergError as e:
+    print(f"Extraction error: {e}")
+
+try:
+    config: ExtractionConfig = ExtractionConfig()
+    pdf_bytes: bytes = b"%PDF-1.4\n"
+    result = extract_bytes_sync(pdf_bytes, "application/pdf", config)
+    print(f"Extracted: {result.content[:100]}")
+except ValidationError as e:
+    print(f"Invalid configuration: {e}")
+except OCRError as e:
+    print(f"OCR failed: {e}")
+except KreuzbergError as e:
+    print(f"Extraction failed: {e}")
+```
--- a/docs/snippets/python/utils/error_handling_extract.md
+++ b/docs/snippets/python/utils/error_handling_extract.md
@@ -0,0 +1,27 @@
+```python title="Python"
+import httpx
+
+# Post a document to the extraction endpoint and report either the number
+# of extracted documents or the error returned by the server.
+try:
+    # Both the HTTP client and the file handle are closed on exit.
+    with httpx.Client() as client, open("document.pdf", "rb") as f:
+        files: dict = {"files": f}
+        response: httpx.Response = client.post(
+            "http://localhost:8000/extract", files=files
+        )
+        # Raises httpx.HTTPStatusError for 4xx/5xx responses.
+        response.raise_for_status()
+        results: list = response.json()
+        print(f"Extracted {len(results)} documents")
+except httpx.HTTPStatusError as e:
+    try:
+        error: dict = e.response.json()
+    except ValueError:
+        # The error body is not JSON (for example an HTML page from a
+        # proxy); fall back to the raw text so the handler cannot raise.
+        error = {"message": e.response.text}
+    error_type: str = error.get("error_type", "Unknown")
+    message: str = error.get("message", "No message")
+    print(f"Error: {error_type}: {message}")
+```
--- a/docs/snippets/python/utils/image_extraction.md
+++ b/docs/snippets/python/utils/image_extraction.md
@@ -0,0 +1,19 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, ImageExtractionConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        images=ImageExtractionConfig(
+            extract_images=True,
+            target_dpi=200,
+            max_image_dimension=2048,
+            inject_placeholders=True,  # set to False to extract images without markdown references
+            auto_adjust_dpi=True,
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Extracted: {result.content[:100]}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/utils/image_preprocessing.md
+++ b/docs/snippets/python/utils/image_preprocessing.md
@@ -0,0 +1,29 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    ExtractionConfig,
+    OcrConfig,
+    TesseractConfig,
+    ImagePreprocessingConfig,
+    extract_file,
+)
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        ocr=OcrConfig(
+            tesseract_config=TesseractConfig(
+                preprocessing=ImagePreprocessingConfig(
+                    target_dpi=300,
+                    denoise=True,
+                    deskew=True,
+                    contrast_enhance=True,
+                    binarization_method="otsu",
+                )
+            )
+        )
+    )
+    result = await extract_file("scanned.pdf", config=config)
+    print(f"Content: {result.content[:100]}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/utils/keyword_extraction_example.md
+++ b/docs/snippets/python/utils/keyword_extraction_example.md
@@ -0,0 +1,22 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.YAKE,
+            max_keywords=10,
+            min_score=0.3
+        )
+    )
+    result = await extract_file("research_paper.pdf", config=config)
+
+    keywords: list = result.extracted_keywords or []
+    for kw in keywords:
+        score: float = kw.score or 0.0
+        text: str = kw.text or ""
+        print(f"{text}: {score:.3f}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/utils/language_detection.md
+++ b/docs/snippets/python/utils/language_detection.md
@@ -0,0 +1,15 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        language_detection=LanguageDetectionConfig(
+            enabled=True, min_confidence=0.9, detect_multiple=True
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Languages: {result.detected_languages}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/utils/language_detection_multilingual.md
+++ b/docs/snippets/python/utils/language_detection_multilingual.md
@@ -0,0 +1,18 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        language_detection=LanguageDetectionConfig(
+            enabled=True,
+            min_confidence=0.7,
+            detect_multiple=True
+        )
+    )
+    result = await extract_file("multilingual_document.pdf", config=config)
+    languages: list[str] = result.detected_languages or []
+    print(f"Detected {len(languages)} languages: {languages}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/utils/quality_processing_example.md
+++ b/docs/snippets/python/utils/quality_processing_example.md
@@ -0,0 +1,17 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig
+
+config = ExtractionConfig(enable_quality_processing=True)
+# Use the synchronous API: extract_file is a coroutine function and would
+# have to be awaited, which is not possible in top-level script code.
+result = extract_file_sync("scanned_document.pdf", config=config)
+
+# Treat a missing quality score as 0.0.
+quality_score = result.quality_score or 0.0
+
+if quality_score < 0.5:
+    print(f"Warning: Low quality extraction ({quality_score:.2f})")
+    print("Consider re-scanning with higher DPI or adjusting OCR settings")
+else:
+    print(f"Quality score: {quality_score:.2f}")
+```
--- a/docs/snippets/python/utils/standalone_embed.md
+++ b/docs/snippets/python/utils/standalone_embed.md
@@ -0,0 +1,23 @@
+```python title="Python"
+import asyncio
+from kreuzberg import embed_sync, embed, EmbeddingConfig, EmbeddingModelType
+
+# Synchronous: one embedding vector is returned per input text.
+embeddings = embed_sync(
+    ["Hello, world!", "Kreuzberg is fast"],
+    config=EmbeddingConfig(model=EmbeddingModelType.preset("balanced"), normalize=True),
+)
+assert len(embeddings) == 2
+assert len(embeddings[0]) == 768
+
+# Asynchronous: same arguments, awaited inside a coroutine.
+async def main() -> None:
+    embeddings = await embed(
+        ["Hello, world!", "Kreuzberg is fast"],
+        config=EmbeddingConfig(model=EmbeddingModelType.preset("balanced"), normalize=True),
+    )
+    assert len(embeddings) == 2
+
+# Run the coroutine; without this call the async example never executes.
+asyncio.run(main())
+```
--- a/docs/snippets/python/utils/tables.md
+++ b/docs/snippets/python/utils/tables.md
@@ -0,0 +1,12 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, ExtractedTable
+
+result = extract_file_sync("document.pdf", config=ExtractionConfig())
+
+for table in result.tables:
+    row_count: int = len(table.cells)
+    print(f"Table with {row_count} rows")
+    print(table.markdown)
+    for row in table.cells:
+        print(row)
+```
--- a/docs/snippets/python/utils/token_reduction.md
+++ b/docs/snippets/python/utils/token_reduction.md
@@ -0,0 +1,15 @@
+```python title="Python"
+import asyncio
+from kreuzberg import ExtractionConfig, TokenReductionConfig, extract_file
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        token_reduction=TokenReductionConfig(
+            mode="moderate", preserve_important_words=True
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    print(f"Content length: {len(result.content)}")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/utils/token_reduction_example.md
+++ b/docs/snippets/python/utils/token_reduction_example.md
@@ -0,0 +1,19 @@
+```python title="Python"
+import asyncio
+from kreuzberg import extract_file, ExtractionConfig, TokenReductionConfig
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        token_reduction=TokenReductionConfig(
+            mode="moderate", preserve_important_words=True
+        )
+    )
+    result = await extract_file("verbose_document.pdf", config=config)
+    original: int = result.metadata.get("original_token_count", 0)
+    reduced: int = result.metadata.get("token_count", 0)
+    ratio: float = result.metadata.get("token_reduction_ratio", 0.0)
+    print(f"Reduced from {original} to {reduced} tokens")
+    print(f"Reduction: {ratio * 100:.1f}%")
+
+asyncio.run(main())
+```
--- a/docs/snippets/python/utils/vector_database_integration.md
+++ b/docs/snippets/python/utils/vector_database_integration.md
@@ -0,0 +1,28 @@
+```python title="Python"
+import asyncio
+from kreuzberg import (
+    extract_file,
+    ExtractionConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+)
+
+async def main() -> None:
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            max_chars=512,
+            max_overlap=50,
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.preset("balanced"), normalize=True
+            ),
+        )
+    )
+    result = await extract_file("document.pdf", config=config)
+    chunks = result.chunks or []
+    for i, chunk in enumerate(chunks):
+        chunk_id: str = f"doc_chunk_{i}"
+        print(f"Chunk {chunk_id}: {chunk.content[:50]}")
+
+asyncio.run(main())
+```