docs/snippets/php/ocr/advanced_ocr.php

```php title="advanced_ocr.php"
<?php

declare(strict_types=1);

/**
 * Advanced OCR Configuration
 *
 * Fine-tune OCR performance with Tesseract configuration, image preprocessing,
 * and page segmentation modes.
 */

require_once __DIR__ . '/vendor/autoload.php';

use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\TesseractConfig;
use Kreuzberg\Config\ImagePreprocessingConfig;

// Example: OCR a scanned report with table detection switched on.
// PSM 6 = "Assume a single uniform block of text".
$tableTesseract = new TesseractConfig(
    psm: 6,
    enableTableDetection: true,
);
$tableOcr = new OcrConfig(
    backend: 'tesseract',
    language: 'eng',
    tesseractConfig: $tableTesseract,
);
$config = new ExtractionConfig(ocr: $tableOcr);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('financial_report_scan.pdf');

// Report how many tables were detected, then print each one as Markdown.
echo "OCR with Table Detection:\n", str_repeat('=', 60), "\n";
echo "Tables found: ", count($result->tables), "\n\n";

foreach ($result->tables as $index => $table) {
    // Table numbers shown to the reader are the iteration key plus one.
    echo "Table ", $index + 1, ":\n";
    echo $table->markdown, "\n\n";
}

// Example: invoice scan with a character whitelist of digits, separators
// and currency symbols, so that only numeric content is recognised.
// PSM 6 = "Assume a single uniform block of text".
$invoiceTesseract = new TesseractConfig(
    psm: 6,
    tesseditCharWhitelist: '0123456789.,€$£¥-/',
);
$invoiceOcr = new OcrConfig(
    backend: 'tesseract',
    language: 'eng',
    tesseractConfig: $invoiceTesseract,
);
$invoiceConfig = new ExtractionConfig(ocr: $invoiceOcr);

$kreuzberg = new Kreuzberg($invoiceConfig);
$result = $kreuzberg->extractFile('invoice_scan.pdf');

// Print the extracted text under a heading and a 60-character rule.
echo "Invoice OCR (numbers only):\n", str_repeat('=', 60), "\n";
echo $result->content, "\n\n";

// Example: run image preprocessing on a poor-quality scan before OCR.
$preprocessedConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
        ),
        tesseractConfig: new TesseractConfig(
            psm: 3, // Fully automatic page segmentation (default)
        ),
    ),
);

$kreuzberg = new Kreuzberg($preprocessedConfig);
$result = $kreuzberg->extractFile('poor_quality_scan.pdf');

echo "OCR with Image Preprocessing:\n";
echo str_repeat('=', 60) . "\n";
// Count and slice by character rather than by byte: strlen()/substr() would
// report a byte count and could cut a multi-byte character in half, leaving
// invalid UTF-8 at the end of the preview. Assumes the content is UTF-8.
echo "Extracted " . mb_strlen($result->content, 'UTF-8') . " characters\n";
echo "Preview: " . mb_substr($result->content, 0, 200, 'UTF-8') . "...\n\n";

// Human-readable descriptions of Tesseract page segmentation modes (PSM),
// keyed by the numeric mode passed to TesseractConfig.
$psmModes = [
    0 => 'Orientation and script detection (OSD) only',
    1 => 'Automatic page segmentation with OSD',
    2 => 'Automatic page segmentation, but no OSD or OCR (not implemented)',
    3 => 'Fully automatic page segmentation (default)',
    4 => 'Assume a single column of text',
    5 => 'Assume a single uniform block of vertically aligned text',
    6 => 'Assume a single uniform block of text',
    7 => 'Treat the image as a single text line',
    8 => 'Treat the image as a single word',
    9 => 'Treat the image as a single word in a circle',
    10 => 'Treat the image as a single character',
    11 => 'Sparse text - find as much text as possible',
    12 => 'Sparse text with OSD',
    13 => 'Raw line - treat as a single text line',
];

// Compare several PSM modes on the same document. The comparison is skipped
// entirely when the sample file is not present.
$testFile = 'various_layouts.pdf';
if (file_exists($testFile)) {
    echo "Testing different PSM modes:\n";
    echo str_repeat('=', 60) . "\n";

    foreach ([3, 4, 6, 11] as $psm) {
        $config = new ExtractionConfig(
            ocr: new OcrConfig(
                backend: 'tesseract',
                language: 'eng',
                tesseractConfig: new TesseractConfig(psm: $psm),
            ),
        );

        $kreuzberg = new Kreuzberg($config);

        // hrtime() is monotonic, so the measurement is not skewed by
        // wall-clock adjustments; it reports nanoseconds.
        $start = hrtime(true);
        $result = $kreuzberg->extractFile($testFile);
        $elapsed = (hrtime(true) - $start) / 1e9;

        echo "PSM $psm - {$psmModes[$psm]}:\n";
        echo "  Time: " . number_format($elapsed, 3) . "s\n";
        // Multibyte-aware length and slicing: the byte-wise variants would
        // miscount non-ASCII text and could split a character in the preview.
        // Assumes the content is UTF-8.
        echo "  Characters: " . mb_strlen($result->content, 'UTF-8') . "\n";
        echo "  Preview: " . mb_substr($result->content, 0, 80, 'UTF-8') . "...\n\n";
    }
}

// Example: book page scan. PSM 4 = "Assume a single column of text".
$columnTesseract = new TesseractConfig(psm: 4);
$columnPreprocessing = new ImagePreprocessingConfig(
    targetDpi: 300,
    denoise: true,
);
$singleColumnConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: $columnTesseract,
        imagePreprocessing: $columnPreprocessing,
    ),
);

$kreuzberg = new Kreuzberg($singleColumnConfig);
$result = $kreuzberg->extractFile('book_scan.pdf');

// Print the full extracted text under its heading.
echo "Single-column OCR:\n", $result->content, "\n\n";

// Example: receipt image. PSM 11 = "Sparse text - find as much text as possible".
$sparseTesseract = new TesseractConfig(psm: 11);
$sparsePreprocessing = new ImagePreprocessingConfig(
    targetDpi: 300,
    denoise: true,
    sharpen: true,
);
$sparseConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: $sparseTesseract,
        imagePreprocessing: $sparsePreprocessing,
    ),
);

$kreuzberg = new Kreuzberg($sparseConfig);
$result = $kreuzberg->extractFile('receipt.jpg');

// Print the extracted text under a heading and a 60-character rule.
echo "Sparse text OCR (receipt):\n", str_repeat('=', 60), "\n";
echo $result->content, "\n\n";

// Example: highest-effort configuration for a legal document scan. Combines
// table detection with every preprocessing option used in this file, at a
// higher target DPI (400) than the other examples.
$highAccuracyConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 3, // Fully automatic page segmentation (default)
            enableTableDetection: true,
        ),
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 400,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
            removeBackground: true,
        ),
    ),
);

$kreuzberg = new Kreuzberg($highAccuracyConfig);
$result = $kreuzberg->extractFile('legal_document_scan.pdf');

echo "High-accuracy OCR:\n";
// mb_strlen() counts characters; strlen() would report bytes and overstate
// the figure for non-ASCII text. Assumes the content is UTF-8.
echo "Characters: " . mb_strlen($result->content, 'UTF-8') . "\n";
echo "Tables: " . count($result->tables) . "\n";
```
Nomad changes 2026-06-01 23:40:55 +02:00

```php title="advanced_ocr.php"
<?php

declare(strict_types=1);

/**
 * Advanced OCR Configuration
 *
 * Fine-tune OCR performance with Tesseract configuration, image preprocessing,
 * and page segmentation modes.
 */

require_once __DIR__ . '/vendor/autoload.php';

use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\TesseractConfig;
use Kreuzberg\Config\ImagePreprocessingConfig;

$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            enableTableDetection: true
        )
    )
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('financial_report_scan.pdf');

echo "OCR with Table Detection:\n";
echo str_repeat('=', 60) . "\n";
echo "Tables found: " . count($result->tables) . "\n\n";

foreach ($result->tables as $index => $table) {
    echo "Table " . ($index + 1) . ":\n";
    echo $table->markdown . "\n\n";
}

$invoiceConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            tesseditCharWhitelist: '0123456789.,€$£¥-/'
        )
    )
);

$kreuzberg = new Kreuzberg($invoiceConfig);
$result = $kreuzberg->extractFile('invoice_scan.pdf');

echo "Invoice OCR (numbers only):\n";
echo str_repeat('=', 60) . "\n";
echo $result->content . "\n\n";

$preprocessedConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true
        ),
        tesseractConfig: new TesseractConfig(
            psm: 3
        )
    )
);

$kreuzberg = new Kreuzberg($preprocessedConfig);
$result = $kreuzberg->extractFile('poor_quality_scan.pdf');

echo "OCR with Image Preprocessing:\n";
echo str_repeat('=', 60) . "\n";
echo "Extracted " . strlen($result->content) . " characters\n";
echo "Preview: " . substr($result->content, 0, 200) . "...\n\n";

$psmModes = [
    0 => 'Orientation and script detection (OSD) only',
    1 => 'Automatic page segmentation with OSD',
    3 => 'Fully automatic page segmentation (default)',
    4 => 'Assume a single column of text',
    5 => 'Assume a single uniform block of vertically aligned text',
    6 => 'Assume a single uniform block of text',
    7 => 'Treat the image as a single text line',
    8 => 'Treat the image as a single word',
    9 => 'Treat the image as a single word in a circle',
    10 => 'Treat the image as a single character',
    11 => 'Sparse text - find as much text as possible',
    13 => 'Raw line - treat as a single text line',
];

$testFile = 'various_layouts.pdf';
if (file_exists($testFile)) {
    echo "Testing different PSM modes:\n";
    echo str_repeat('=', 60) . "\n";

    foreach ([3, 4, 6, 11] as $psm) {
        $config = new ExtractionConfig(
            ocr: new OcrConfig(
                backend: 'tesseract',
                language: 'eng',
                tesseractConfig: new TesseractConfig(psm: $psm)
            )
        );

        $kreuzberg = new Kreuzberg($config);
        $start = microtime(true);
        $result = $kreuzberg->extractFile($testFile);
        $elapsed = microtime(true) - $start;

        echo "PSM $psm - {$psmModes[$psm]}:\n";
        echo "  Time: " . number_format($elapsed, 3) . "s\n";
        echo "  Characters: " . strlen($result->content) . "\n";
        echo "  Preview: " . substr($result->content, 0, 80) . "...\n\n";
    }
}

$singleColumnConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 4
        ),
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true
        )
    )
);

$kreuzberg = new Kreuzberg($singleColumnConfig);
$result = $kreuzberg->extractFile('book_scan.pdf');

echo "Single-column OCR:\n";
echo $result->content . "\n\n";

$sparseConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 11
        ),
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true,
            sharpen: true
        )
    )
);

$kreuzberg = new Kreuzberg($sparseConfig);
$result = $kreuzberg->extractFile('receipt.jpg');

echo "Sparse text OCR (receipt):\n";
echo str_repeat('=', 60) . "\n";
echo $result->content . "\n\n";

$highAccuracyConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 3,
            enableTableDetection: true
        ),
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 400,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
            removeBackground: true
        )
    )
);

$kreuzberg = new Kreuzberg($highAccuracyConfig);
$result = $kreuzberg->extractFile('legal_document_scan.pdf');

echo "High-accuracy OCR:\n";
echo "Characters: " . strlen($result->content) . "\n";
echo "Tables: " . count($result->tables) . "\n";
```