Files
fil/docs/snippets/php/configuration/ocr_config.php
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

206 lines
6.1 KiB
PHP

```php title="ocr_config.php"
<?php
declare(strict_types=1);
/**
* OCR Configuration
*
* This example demonstrates how to configure OCR (Optical Character Recognition)
* for extracting text from scanned documents and images.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\TesseractConfig;
// Example 1: build an extraction config that uses the Tesseract backend with
// English language data, run it against a scanned PDF and report the size of
// the extracted text.
echo "Example 1: Basic OCR Configuration\n";
echo "==================================\n";

$config1 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
);

$kreuzberg = new Kreuzberg($config1);
$result = $kreuzberg->extractFile('scanned_document.pdf');

// The label says "characters", so count code points rather than bytes:
// strlen() would over-report for any multi-byte text in the OCR output.
// NOTE(review): assumes $result->content is UTF-8 and ext-mbstring is loaded.
echo "Extracted text length: " . mb_strlen($result->content, 'UTF-8') . " characters\n\n";
// Example 2: request several Tesseract languages at once by joining their
// codes with '+'. The config is only constructed here; no file is extracted.
echo "Example 2: Multi-Language OCR\n", "=============================\n";

$config2 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng+fra+deu',
    ),
);

echo "Configured for languages: English, French, German\n",
    "Use this for multilingual documents\n\n";
// Example 3: one single-language config per target language, followed by a
// printed legend of common Tesseract language codes.
echo "Example 3: Language-Specific OCR\n", "================================\n";

// Every config in this example differs only by its language code, so build
// them through one small factory instead of repeating the constructor nesting.
$tesseractFor = static fn (string $language): ExtractionConfig => new ExtractionConfig(
    ocr: new OcrConfig(backend: 'tesseract', language: $language),
);

$config3a = $tesseractFor('spa');     // Spanish
$config3b = $tesseractFor('fra');     // French
$config3c = $tesseractFor('deu');     // German
$config3d = $tesseractFor('chi_sim'); // Chinese (Simplified)
$config3e = $tesseractFor('chi_tra'); // Chinese (Traditional)
$config3f = $tesseractFor('jpn');     // Japanese
$config3g = $tesseractFor('kor');     // Korean
$config3h = $tesseractFor('ara');     // Arabic

// Legend lines are kept as complete strings and written in one go.
$languageLegend = [
    "Common Tesseract Language Codes:\n",
    "- eng: English\n",
    "- fra: French\n",
    "- deu: German\n",
    "- spa: Spanish\n",
    "- ita: Italian\n",
    "- por: Portuguese\n",
    "- rus: Russian\n",
    "- chi_sim: Chinese (Simplified)\n",
    "- chi_tra: Chinese (Traditional)\n",
    "- jpn: Japanese\n",
    "- kor: Korean\n",
    "- ara: Arabic\n\n",
];
echo implode('', $languageLegend);
// Example 4: tune Tesseract itself by nesting a TesseractConfig inside the
// OcrConfig, then print a summary of the chosen settings.
echo "Example 4: Advanced Tesseract Configuration\n";
echo "==========================================\n";

$config4 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6, // page segmentation mode: single uniform block of text
            oem: 3, // engine mode: default, based on what is available
            enableTableDetection: true,
        ),
    ),
);

echo "Tesseract Configuration:\n";
echo "- PSM (Page Segmentation Mode): 6 (uniform text block)\n";
// OEM 3 is the "default, based on what is available" mode; "LSTM only" is
// OEM 1, as listed in the OEM reference printed at the end of this script.
echo "- OEM (OCR Engine Mode): 3 (default, based on what is available)\n";
echo "- Table Detection: Enabled\n\n";
// Example 5: a config aimed at forms and invoices. Table detection is turned
// on and recognition is limited to an explicit character whitelist.
echo "Example 5: OCR for Forms and Invoices\n", "=====================================\n";

// Digits, ASCII letters, and the punctuation listed at the end of the string.
$formTesseract = new TesseractConfig(
    psm: 6,
    oem: 3,
    enableTableDetection: true,
    tesseditCharWhitelist: '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz$.,- ',
);

$config5 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: $formTesseract,
    ),
);

echo "Optimized for forms and invoices:\n",
    "- Table detection enabled\n",
    "- Character whitelist for common form characters\n\n";
// Example 6: restrict recognition to digits and a handful of currency and
// punctuation characters for number-heavy documents.
echo "Example 6: OCR for Numeric Documents\n", "====================================\n";

$numericTesseract = new TesseractConfig(
    psm: 6,
    oem: 3,
    tesseditCharWhitelist: '0123456789$.,- ',
);

$config6 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: $numericTesseract,
    ),
);

// The '$' below is followed by '.', which cannot start a variable name, so
// PHP prints it literally even inside a double-quoted string.
echo "Character whitelist: '0123456789$.,- '\n",
    "Best for: Invoices, receipts, financial documents\n\n";
// Example 7: the inverse of a whitelist. List the characters that should be
// excluded from the recognised output.
echo "Example 7: OCR with Character Blacklist\n", "=======================================\n";

$blacklistTesseract = new TesseractConfig(
    psm: 6,
    oem: 3,
    tesseditCharBlacklist: '|!@#%^&*()',
);

$config7 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: $blacklistTesseract,
    ),
);

echo "Character blacklist: '|!@#%^&*()'\n",
    "Use to exclude problematic characters\n\n";
// Reference table: print every Page Segmentation Mode (PSM) value with a
// one-line description. Lines are kept as complete strings and emitted in order.
$psmReference = [
    "\nPage Segmentation Modes (PSM):\n",
    "==============================\n",
    "0 = Orientation and script detection (OSD) only\n",
    "1 = Automatic page segmentation with OSD\n",
    "2 = Automatic page segmentation (no OSD or OCR)\n",
    "3 = Fully automatic page segmentation (default)\n",
    "4 = Assume a single column of text of variable sizes\n",
    "5 = Assume a single uniform block of vertically aligned text\n",
    "6 = Assume a single uniform block of text (recommended for most)\n",
    "7 = Treat the image as a single text line\n",
    "8 = Treat the image as a single word\n",
    "9 = Treat the image as a single word in a circle\n",
    "10 = Treat the image as a single character\n",
    "11 = Sparse text. Find as much text as possible\n",
    "12 = Sparse text with OSD\n",
    "13 = Raw line. Treat the image as a single text line\n",
];

foreach ($psmReference as $psmLine) {
    echo $psmLine;
}
// Reference table: print the four OCR Engine Mode (OEM) values and what each
// one selects.
echo "\n\nOCR Engine Modes (OEM):\n",
    "======================\n",
    "0 = Legacy engine only\n",
    "1 = Neural nets LSTM engine only\n",
    "2 = Legacy + LSTM engines\n",
    "3 = Default, based on what is available (recommended)\n";
// Closing summary: print the recommended settings covered by the examples in
// this script as a bullet list.
$bestPractices = [
    "\n\nBest Practices:\n",
    "===============\n",
    "- Use PSM 6 for general documents\n",
    "- Use PSM 11 for sparse text (screenshots, signs)\n",
    "- Use OEM 3 (default) for best results\n",
    "- Enable table detection for structured documents\n",
    "- Use character whitelists for forms/invoices\n",
    "- Combine multiple languages with '+' separator\n",
    "- Preprocess images for better accuracy (see image_preprocessing.php)\n",
];
echo implode('', $bestPractices);
```