Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/php/configuration/ocr_config.php
+++ b/docs/snippets/php/configuration/ocr_config.php
@@ -0,0 +1,205 @@
+```php title="ocr_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * OCR Configuration
+ *
+ * This example demonstrates how to configure OCR (Optical Character Recognition)
+ * for extracting text from scanned documents and images.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\OcrConfig;
+use Kreuzberg\Config\TesseractConfig;
+
+// Example 1: the minimal OCR setup - pick a backend and a single language.
+echo "Example 1: Basic OCR Configuration\n";
+echo "==================================\n";
+
+$config1 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+    ),
+);
+
+// The configuration is handed to the Kreuzberg constructor; the file is then extracted with it.
+$kreuzberg = new Kreuzberg($config1);
+$result = $kreuzberg->extractFile('scanned_document.pdf');
+
+// mb_strlen() counts characters, whereas strlen() counts bytes and would
+// over-report the length whenever the extracted text contains multi-byte characters.
+echo "Extracted text length: " . mb_strlen($result->content) . " characters\n\n";
+
+// Example 2: recognise several languages in one pass by joining their codes with '+'.
+echo "Example 2: Multi-Language OCR\n",
+    "=============================\n";
+
+$multiLanguageOcr = new OcrConfig(
+    backend: 'tesseract',
+    language: 'eng+fra+deu',
+);
+$config2 = new ExtractionConfig(ocr: $multiLanguageOcr);
+
+echo "Configured for languages: English, French, German\n",
+    "Use this for multilingual documents\n\n";
+
+// Example 3: one single-language configuration per target language.
+echo "Example 3: Language-Specific OCR\n",
+    "================================\n";
+
+// Spanish
+$config3a = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'spa',
+    ),
+);
+
+// French
+$config3b = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'fra',
+    ),
+);
+
+// German
+$config3c = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'deu',
+    ),
+);
+
+// Chinese (Simplified)
+$config3d = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'chi_sim',
+    ),
+);
+
+// Chinese (Traditional)
+$config3e = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'chi_tra',
+    ),
+);
+
+// Japanese
+$config3f = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'jpn',
+    ),
+);
+
+// Korean
+$config3g = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'kor',
+    ),
+);
+
+// Arabic
+$config3h = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'ara',
+    ),
+);
+
+// Quick reference of the language codes printed for the reader.
+echo "Common Tesseract Language Codes:\n",
+    "- eng: English\n",
+    "- fra: French\n",
+    "- deu: German\n",
+    "- spa: Spanish\n",
+    "- ita: Italian\n",
+    "- por: Portuguese\n",
+    "- rus: Russian\n",
+    "- chi_sim: Chinese (Simplified)\n",
+    "- chi_tra: Chinese (Traditional)\n",
+    "- jpn: Japanese\n",
+    "- kor: Korean\n",
+    "- ara: Arabic\n\n";
+
+// Example 4: tune Tesseract itself through a nested TesseractConfig.
+echo "Example 4: Advanced Tesseract Configuration\n";
+echo "==========================================\n";
+
+$config4 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            psm: 6,                      // page segmentation mode: single uniform block of text
+            oem: 3,                      // engine mode: default, based on what is available
+            enableTableDetection: true,
+        ),
+    ),
+);
+
+// OEM 3 selects the default engine based on what is available; "LSTM only"
+// is OEM 1, so the summary below describes mode 3 accordingly.
+echo "Tesseract Configuration:\n";
+echo "- PSM (Page Segmentation Mode): 6 (uniform text block)\n";
+echo "- OEM (OCR Engine Mode): 3 (default, based on what is available)\n";
+echo "- Table Detection: Enabled\n\n";
+
+// Example 5: forms and invoices - table detection plus a whitelist of
+// digits, ASCII letters and a few punctuation characters.
+echo "Example 5: OCR for Forms and Invoices\n",
+    "=====================================\n";
+
+$formsTesseract = new TesseractConfig(
+    psm: 6,
+    oem: 3,
+    enableTableDetection: true,
+    tesseditCharWhitelist: '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz$.,- ',
+);
+
+$config5 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: $formsTesseract,
+    ),
+);
+
+echo "Optimized for forms and invoices:\n",
+    "- Table detection enabled\n",
+    "- Character whitelist for common form characters\n\n";
+
+// Example 6: numeric documents - the whitelist is limited to digits,
+// '$', '.', ',', '-' and the space character.
+echo "Example 6: OCR for Numeric Documents\n",
+    "====================================\n";
+
+$numericTesseract = new TesseractConfig(
+    psm: 6,
+    oem: 3,
+    tesseditCharWhitelist: '0123456789$.,- ',
+);
+
+$config6 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: $numericTesseract,
+    ),
+);
+
+echo "Character whitelist: '0123456789$.,- '\n",
+    "Best for: Invoices, receipts, financial documents\n\n";
+
+// Example 7: the inverse of a whitelist - list the characters to exclude.
+echo "Example 7: OCR with Character Blacklist\n",
+    "=======================================\n";
+
+$blacklistTesseract = new TesseractConfig(
+    psm: 6,
+    oem: 3,
+    tesseditCharBlacklist: '|!@#%^&*()',
+);
+
+$config7 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: $blacklistTesseract,
+    ),
+);
+
+echo "Character blacklist: '|!@#%^&*()'\n",
+    "Use to exclude problematic characters\n\n";
+
+// Reference: the page segmentation mode (PSM) values.
+echo "\nPage Segmentation Modes (PSM):\n",
+    "==============================\n",
+    "0  = Orientation and script detection (OSD) only\n",
+    "1  = Automatic page segmentation with OSD\n",
+    "2  = Automatic page segmentation (no OSD or OCR)\n",
+    "3  = Fully automatic page segmentation (default)\n",
+    "4  = Assume a single column of text of variable sizes\n",
+    "5  = Assume a single uniform block of vertically aligned text\n",
+    "6  = Assume a single uniform block of text (recommended for most)\n",
+    "7  = Treat the image as a single text line\n",
+    "8  = Treat the image as a single word\n",
+    "9  = Treat the image as a single word in a circle\n",
+    "10 = Treat the image as a single character\n",
+    "11 = Sparse text. Find as much text as possible\n",
+    "12 = Sparse text with OSD\n",
+    "13 = Raw line. Treat the image as a single text line\n";
+
+// Reference: the OCR engine mode (OEM) values.
+echo "\n\nOCR Engine Modes (OEM):\n",
+    "======================\n",
+    "0 = Legacy engine only\n",
+    "1 = Neural nets LSTM engine only\n",
+    "2 = Legacy + LSTM engines\n",
+    "3 = Default, based on what is available (recommended)\n";
+
+// Closing checklist summarising the recommendations from the examples.
+echo "\n\nBest Practices:\n",
+    "===============\n",
+    "- Use PSM 6 for general documents\n",
+    "- Use PSM 11 for sparse text (screenshots, signs)\n",
+    "- Use OEM 3 (default) for best results\n",
+    "- Enable table detection for structured documents\n",
+    "- Use character whitelists for forms/invoices\n",
+    "- Combine multiple languages with '+' separator\n",
+    "- Preprocess images for better accuracy (see image_preprocessing.php)\n";
+```