Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/python/api/combining_all_features.md
+++ b/docs/snippets/python/api/combining_all_features.md
@@ -0,0 +1,52 @@
+```python title="Python"
+from kreuzberg import (
+    ExtractionConfig,
+    OcrConfig,
+    ChunkingConfig,
+    ChunkerType,
+    ImageExtractionConfig,
+    OutputFormat,
+    extract_file_sync,
+)
+
+config = ExtractionConfig(
+    # OCR: extract text from images, fallback to Tesseract
+    ocr=OcrConfig(
+        enabled=True,
+        backend="tesseract",
+        language="eng",
+    ),
+    # Chunking: semantic markdown chunks of ~800 chars, 100-char overlap
+    chunking=ChunkingConfig(
+        max_characters=800,
+        overlap=100,
+        chunker_type=ChunkerType.Markdown,
+        prepend_heading_context=True,
+    ),
+    # Output: Markdown format with document structure preserved
+    output_format=OutputFormat.Markdown,
+    include_document_structure=True,
+    # Images: extract embedded images
+    images=ImageExtractionConfig(
+        extract_images=True,
+    ),
+    # Cache extracted results on disk
+    use_cache=True,
+)
+
+result = extract_file_sync("report.pdf", config=config)
+
+print(f"Content ({len(result.content)} chars):")
+print(result.content[:200])
+
+if result.chunks:
+    print(f"\nChunks: {len(result.chunks)}")
+
+print(f"Tables: {len(result.tables)}")
+
+if result.detected_languages:
+    print(f"Languages: {result.detected_languages}")
+
+if result.extraction_method:
+    print(f"Extraction method: {result.extraction_method}")
+```