Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/python/ocr/cloud_ocr_backend.md
+++ b/docs/snippets/python/ocr/cloud_ocr_backend.md
@@ -0,0 +1,37 @@
+```python title="Python"
+from kreuzberg import register_ocr_backend
+import httpx
+
+class CloudOcrBackend:
+    def __init__(self, api_key: str):
+        self.api_key: str = api_key
+        self.langs: list[str] = ["eng", "deu", "fra"]
+
+    def name(self) -> str:
+        return "cloud-ocr"
+
+    def version(self) -> str:
+        return "1.0.0"
+
+    def supported_languages(self) -> list[str]:
+        return self.langs
+
+    def process_image(self, image_bytes: bytes, config: dict) -> dict:
+        with httpx.Client() as client:
+            response = client.post(
+                "https://api.example.com/ocr",
+                files={"image": image_bytes},
+                json={"language": config.get("language", "eng")},
+            )
+            text: str = response.json()["text"]
+            return {"content": text, "mime_type": "text/plain"}
+
+    def initialize(self) -> None:
+        pass
+
+    def shutdown(self) -> None:
+        pass
+
+backend: CloudOcrBackend = CloudOcrBackend(api_key="your-api-key")
+register_ocr_backend(backend)
+```
--- a/docs/snippets/python/ocr/image_extraction.md
+++ b/docs/snippets/python/ocr/image_extraction.md
@@ -0,0 +1,17 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, ImageExtractionConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    images=ImageExtractionConfig(
+        extract_images=True,
+        target_dpi=200,
+        max_image_dimension=2048,
+        inject_placeholders=True,  # set to False to extract images without markdown references
+        auto_adjust_dpi=True,
+    )
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+print(f"Content length: {len(result.content)} characters")
+```
--- a/docs/snippets/python/ocr/image_preprocessing.md
+++ b/docs/snippets/python/ocr/image_preprocessing.md
@@ -0,0 +1,29 @@
+```python title="Python"
+from kreuzberg import (
+    extract_file_sync,
+    ExtractionConfig,
+    ImagePreprocessingConfig,
+    OcrConfig,
+    TesseractConfig,
+)
+
+preprocessing: ImagePreprocessingConfig = ImagePreprocessingConfig(
+    target_dpi=300,
+    denoise=True,
+    deskew=True,
+    contrast_enhance=True,
+    binarization_method="otsu",
+)
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(
+        backend="tesseract",
+        language="eng",
+        tesseract_config=TesseractConfig(preprocessing=preprocessing),
+    )
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+print(f"Content length: {len(result.content)} characters")
+```
--- a/docs/snippets/python/ocr/ocr_easyocr.md
+++ b/docs/snippets/python/ocr/ocr_easyocr.md
@@ -0,0 +1,18 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(backend="easyocr", language="en")
+)
+
+# EasyOCR-specific options (use_gpu, beam_width, etc.) go in easyocr_kwargs,
+# not in OcrConfig — OcrConfig only accepts backend, language, and backend-specific configs.
+result = extract_file_sync("scanned.pdf", config=config, easyocr_kwargs={"use_gpu": True})
+
+content: str = result.content
+preview: str = content[:100]
+total_length: int = len(content)
+
+print(f"Extracted content (preview): {preview}")
+print(f"Total characters: {total_length}")
+```
--- a/docs/snippets/python/ocr/ocr_elements.md
+++ b/docs/snippets/python/ocr/ocr_elements.md
@@ -0,0 +1,18 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(backend="paddleocr", language="en")
+)
+
+result = extract_file_sync("scanned.pdf", config=config)
+
+if result.ocr_elements:
+    for element in result.ocr_elements:
+        print(f"Text: {element.text}")
+        print(f"Confidence: {element.confidence.recognition:.2f}")
+        print(f"Geometry: {element.geometry}")
+        if element.rotation:
+            print(f"Rotation: {element.rotation.angle}°")
+        print()
+```
--- a/docs/snippets/python/ocr/ocr_extraction.md
+++ b/docs/snippets/python/ocr/ocr_extraction.md
@@ -0,0 +1,16 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(backend="tesseract", language="eng")
+)
+
+result = extract_file_sync("scanned.pdf", config=config)
+
+content: str = result.content
+preview: str = content[:100]
+total_length: int = len(content)
+
+print(f"Extracted content (preview): {preview}")
+print(f"Total characters: {total_length}")
+```
--- a/docs/snippets/python/ocr/ocr_force_all_pages.md
+++ b/docs/snippets/python/ocr/ocr_force_all_pages.md
@@ -0,0 +1,17 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(backend="tesseract"),
+    force_ocr=True,
+)
+
+result = extract_file_sync("document.pdf", config=config)
+
+content: str = result.content
+preview: str = content[:100]
+total_length: int = len(content)
+
+print(f"Extracted content (preview): {preview}")
+print(f"Total characters: {total_length}")
+```
--- a/docs/snippets/python/ocr/ocr_multi_language.md
+++ b/docs/snippets/python/ocr/ocr_multi_language.md
@@ -0,0 +1,16 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(backend="tesseract", language="eng+deu+fra")
+)
+
+result = extract_file_sync("multilingual.pdf", config=config)
+
+content: str = result.content
+preview: str = content[:100]
+total_length: int = len(content)
+
+print(f"Extracted content (preview): {preview}")
+print(f"Total characters: {total_length}")
+```
--- a/docs/snippets/python/ocr/ocr_paddleocr.md
+++ b/docs/snippets/python/ocr/ocr_paddleocr.md
@@ -0,0 +1,16 @@
+```python title="Python"
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
+
+config: ExtractionConfig = ExtractionConfig(
+    ocr=OcrConfig(backend="paddleocr", language="en")  # model_tier="server" for max accuracy
+)
+
+result = extract_file_sync("scanned.pdf", config=config)
+
+content: str = result.content
+preview: str = content[:100]
+total_length: int = len(content)
+
+print(f"Extracted content (preview): {preview}")
+print(f"Total characters: {total_length}")
+```