```php title="tesseract_config.php"
// NOTE(review): this excerpt opens mid-statement. The opening PHP tag, the
// imports, Examples 1-3 and the receiver/assignment of the call below are not
// visible here; `$result` is presumably assigned from this call - confirm
// against the complete file before running.
extractFile('scanned_invoice.pdf');

echo "Table detection enabled\n";
echo "Best for: Forms, invoices, spreadsheets, reports\n";

// Print every extracted table as markdown, numbered from 1.
$tableCount = count($result->tables);
if ($tableCount > 0) {
    echo "\nExtracted tables: " . $tableCount . "\n";

    foreach ($result->tables as $i => $table) {
        echo "\nTable " . ($i + 1) . ":\n";
        echo $table->markdown . "\n";
    }
}

echo "\n\n";

// ---------------------------------------------------------------------------
// Example 4: character whitelists (three variants, all with PSM 6).
// ---------------------------------------------------------------------------
echo "Example 4: Character Whitelisting\n";
echo "=================================\n";

// 4a: digits only.
$config4a = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            tesseditCharWhitelist: '0123456789',
        ),
    ),
);

echo "Whitelist: '0123456789' (digits only)\n";
echo "Best for: Serial numbers, IDs, numeric codes\n\n";

// 4b: ASCII letters and digits.
$config4b = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            tesseditCharWhitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
        ),
    ),
);

echo "Whitelist: Letters and numbers only\n";
echo "Best for: Product codes, alphanumeric IDs\n\n";

// 4c: digits, currency symbols and basic punctuation.
$config4c = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            tesseditCharWhitelist: '0123456789$€£¥.,- ',
        ),
    ),
);

// The dollar sign must be escaped here: inside a double-quoted string PHP
// would otherwise parse `$€£¥` as a variable name (bytes 0x80-0xFF are valid
// identifier characters), dropping the symbols and raising a warning.
echo "Whitelist: '0123456789\$€£¥.,- ' (financial data)\n";
echo "Best for: Invoices, receipts, price lists\n\n";

// ---------------------------------------------------------------------------
// Example 5: character blacklist.
// ---------------------------------------------------------------------------
echo "Example 5: Character Blacklisting\n";
echo "=================================\n";

$config5 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            tesseditCharBlacklist: '|!@#%^&*()',
        ),
    ),
);

echo "Blacklist: '|!@#%^&*()'\n";
echo "Use to: Exclude problematic characters that cause OCR errors\n\n";

// ---------------------------------------------------------------------------
// Example 6: OCR engine modes (OEM 0, 1 and 3).
// ---------------------------------------------------------------------------
echo "Example 6: OCR Engine Modes\n";
echo "===========================\n";

$config6a = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(oem: 0),
    ),
);

echo "OEM 0 - Legacy engine:\n";
echo "- Older, simpler algorithm\n";
echo "- Sometimes better for very low-quality scans\n\n";

$config6b = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(oem: 1),
    ),
);

echo "OEM 1 - LSTM neural network:\n";
echo "- Modern deep learning approach\n";
echo "- Better accuracy for most documents\n\n";

$config6c = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(oem: 3),
    ),
);

echo "OEM 3 - Default (recommended):\n";
echo "- Chooses best available engine\n";
echo "- Use this unless you have specific needs\n\n";

// ---------------------------------------------------------------------------
// Example 7: invoice preset - PSM 6, OEM 3, table detection and a whitelist
// of alphanumerics, currency symbols and common separators.
// ---------------------------------------------------------------------------
echo "Example 7: Complete Invoice Processing Configuration\n";
echo "====================================================\n";

$config7 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            oem: 3,
            enableTableDetection: true,
            tesseditCharWhitelist: '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz$€£.,- :#/',
        ),
    ),
);

echo "Invoice-optimized configuration:\n";
echo "- PSM 6: Structured text\n";
echo "- Table detection: Enabled\n";
echo "- Character whitelist: Alphanumeric + currency + common symbols\n";
echo "- Best for: Invoices, receipts, financial documents\n\n";

// ---------------------------------------------------------------------------
// Example 8: form preset - PSM 6, OEM 3, table detection and a whitelist of
// alphanumerics plus `.,- @`.
// ---------------------------------------------------------------------------
echo "Example 8: Complete Form Processing Configuration\n";
echo "=================================================\n";

$config8 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            oem: 3,
            enableTableDetection: true,
            tesseditCharWhitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- @',
        ),
    ),
);

echo "Form-optimized configuration:\n";
echo "- PSM 6: Structured text\n";
echo "- Table detection: Enabled\n";
echo "- Character whitelist: Alphanumeric + common form characters\n";
echo "- Best for: Forms, applications, surveys\n\n";

// ---------------------------------------------------------------------------
// Example 9: sparse text preset - PSM 11 with OEM 3.
// ---------------------------------------------------------------------------
echo "Example 9: Sparse Text Configuration\n";
echo "====================================\n";

$config9 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 11,
            oem: 3,
        ),
    ),
);

echo "Sparse text configuration:\n";
echo "- PSM 11: Find scattered text\n";
echo "- Best for: Screenshots, signs, posters, sparse documents\n\n";

// ---------------------------------------------------------------------------
// Reference: page segmentation mode values.
// ---------------------------------------------------------------------------
echo "\nAll Page Segmentation Modes:\n";
echo "============================\n";
echo "0 = OSD only (orientation and script detection)\n";
echo "1 = Automatic page segmentation with OSD\n";
echo "2 = Automatic page segmentation (no OSD or OCR)\n";
echo "3 = Fully automatic page segmentation (default)\n";
echo "4 = Single column of variable-sized text\n";
echo "5 = Single uniform block of vertically aligned text\n";
echo "6 = Single uniform block of text (RECOMMENDED)\n";
echo "7 = Single text line\n";
echo "8 = Single word\n";
echo "9 = Single word in a circle\n";
echo "10 = Single character\n";
echo "11 = Sparse text (RECOMMENDED for screenshots)\n";
echo "12 = Sparse text with OSD\n";
echo "13 = Raw line\n";

// ---------------------------------------------------------------------------
// Reference: OCR engine mode values.
// ---------------------------------------------------------------------------
echo "\n\nOCR Engine Modes:\n";
echo "=================\n";
echo "0 = Legacy engine only\n";
echo "1 = LSTM neural network only\n";
echo "2 = Legacy + LSTM\n";
echo "3 = Default (RECOMMENDED)\n";

// ---------------------------------------------------------------------------
// Closing summary of recommendations.
// ---------------------------------------------------------------------------
echo "\n\nBest Practices:\n";
echo "===============\n";
echo "1. Start with PSM 6 and OEM 3 (defaults)\n";
echo "2. Use PSM 11 for sparse/scattered text\n";
echo "3. Enable table detection for structured documents\n";
echo "4. Use character whitelists for constrained input\n";
echo "5. Use blacklists to exclude problem characters\n";
echo "6. Test different PSM values if accuracy is poor\n";
echo "7. Combine with image preprocessing for better results\n";
```