Files
fil/docs/snippets/php/ocr/advanced_ocr.php

196 lines
5.4 KiB
PHP
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
```php title="advanced_ocr.php"
<?php
declare(strict_types=1);
/**
* Advanced OCR Configuration
*
* Fine-tune OCR performance with Tesseract configuration, image preprocessing,
* and page segmentation modes.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\TesseractConfig;
use Kreuzberg\Config\ImagePreprocessingConfig;
// OCR with table detection: Tesseract runs with PSM 6 and table detection
// switched on, then every table found in the result is printed as markdown.
$tableTesseract = new TesseractConfig(
    psm: 6,
    enableTableDetection: true,
);
$tableOcr = new OcrConfig(
    backend: 'tesseract',
    language: 'eng',
    tesseractConfig: $tableTesseract,
);
$config = new ExtractionConfig(ocr: $tableOcr);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('financial_report_scan.pdf');

// Header followed by a 60-character rule.
echo "OCR with Table Detection:\n", str_repeat('=', 60), "\n";
echo "Tables found: ", count($result->tables), "\n\n";

foreach ($result->tables as $position => $table) {
    // Keys are shown 1-based in the output.
    $number = $position + 1;
    echo "Table {$number}:\n";
    echo $table->markdown, "\n\n";
}
// Invoice OCR: recognition is limited to the characters in the whitelist
// (digits, '.', ',', '-', '/' and the currency signs listed in the literal).
$invoiceTesseract = new TesseractConfig(
    psm: 6,
    tesseditCharWhitelist: '0123456789.,€$£¥-/',
);
$invoiceConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: $invoiceTesseract,
    ),
);

$kreuzberg = new Kreuzberg($invoiceConfig);
$result = $kreuzberg->extractFile('invoice_scan.pdf');

// Header, 60-character rule, then the raw extracted text.
echo "Invoice OCR (numbers only):\n", str_repeat('=', 60), "\n";
echo $result->content, "\n\n";
// OCR with image preprocessing: the scan is passed through the preprocessing
// options below (300 target DPI, denoise, sharpen, auto-rotate, deskew) and
// recognised with PSM 3.
$preprocessedConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
        ),
        tesseractConfig: new TesseractConfig(
            psm: 3,
        ),
    ),
);
$kreuzberg = new Kreuzberg($preprocessedConfig);
$result = $kreuzberg->extractFile('poor_quality_scan.pdf');

echo "OCR with Image Preprocessing:\n";
echo str_repeat('=', 60) . "\n";
// Count and cut by character, not by byte: strlen()/substr() would report the
// byte length and could split a multibyte sequence in the middle of the
// preview, emitting invalid text. Requires ext-mbstring.
// NOTE(review): assumes $result->content is UTF-8 — confirm against the library.
echo "Extracted " . mb_strlen($result->content, 'UTF-8') . " characters\n";
echo "Preview: " . mb_substr($result->content, 0, 200, 'UTF-8') . "...\n\n";
// Human-readable descriptions of the page segmentation modes, keyed by the
// integer passed as `psm`. Used only to label the comparison output below.
$psmModes = [
    0 => 'Orientation and script detection (OSD) only',
    1 => 'Automatic page segmentation with OSD',
    3 => 'Fully automatic page segmentation (default)',
    4 => 'Assume a single column of text',
    5 => 'Assume a single uniform block of vertically aligned text',
    6 => 'Assume a single uniform block of text',
    7 => 'Treat the image as a single text line',
    8 => 'Treat the image as a single word',
    9 => 'Treat the image as a single word in a circle',
    10 => 'Treat the image as a single character',
    11 => 'Sparse text - find as much text as possible',
    13 => 'Raw line - treat as a single text line',
];

// Compare a handful of modes on the same document; skipped entirely when the
// sample file is not present.
$testFile = 'various_layouts.pdf';
if (file_exists($testFile)) {
    echo "Testing different PSM modes:\n";
    echo str_repeat('=', 60) . "\n";

    foreach ([3, 4, 6, 11] as $psm) {
        $config = new ExtractionConfig(
            ocr: new OcrConfig(
                backend: 'tesseract',
                language: 'eng',
                tesseractConfig: new TesseractConfig(psm: $psm),
            ),
        );
        $kreuzberg = new Kreuzberg($config);

        // hrtime() is monotonic, so the measured duration is not affected by
        // system clock adjustments the way microtime() is. It returns
        // nanoseconds; convert to seconds for display.
        $start = hrtime(true);
        $result = $kreuzberg->extractFile($testFile);
        $elapsed = (hrtime(true) - $start) / 1e9;

        echo "PSM $psm - {$psmModes[$psm]}:\n";
        echo " Time: " . number_format($elapsed, 3) . "s\n";
        // Character-based length and preview: strlen()/substr() operate on
        // bytes and could cut a multibyte character in half. Requires
        // ext-mbstring.
        // NOTE(review): assumes $result->content is UTF-8 — confirm against the library.
        echo " Characters: " . mb_strlen($result->content, 'UTF-8') . "\n";
        echo " Preview: " . mb_substr($result->content, 0, 80, 'UTF-8') . "...\n\n";
    }
}
// Single-column OCR: PSM 4 combined with light preprocessing (300 target DPI
// and denoising), then the extracted text is printed as-is.
$columnTesseract = new TesseractConfig(psm: 4);
$columnPreprocessing = new ImagePreprocessingConfig(
    targetDpi: 300,
    denoise: true,
);
$singleColumnConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: $columnTesseract,
        imagePreprocessing: $columnPreprocessing,
    ),
);

$kreuzberg = new Kreuzberg($singleColumnConfig);
$result = $kreuzberg->extractFile('book_scan.pdf');

echo "Single-column OCR:\n", $result->content, "\n\n";
// Sparse-text OCR for a receipt image: PSM 11 with denoising and sharpening
// at a 300 DPI target.
$sparseTesseract = new TesseractConfig(psm: 11);
$sparsePreprocessing = new ImagePreprocessingConfig(
    targetDpi: 300,
    denoise: true,
    sharpen: true,
);
$sparseConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: $sparseTesseract,
        imagePreprocessing: $sparsePreprocessing,
    ),
);

$kreuzberg = new Kreuzberg($sparseConfig);
$result = $kreuzberg->extractFile('receipt.jpg');

// Header, 60-character rule, then the raw extracted text.
echo "Sparse text OCR (receipt):\n", str_repeat('=', 60), "\n";
echo $result->content, "\n\n";
// High-accuracy OCR: PSM 3 with table detection, plus every preprocessing
// option used in this file (400 target DPI, denoise, sharpen, auto-rotate,
// deskew, background removal). Only summary counts are printed.
$highAccuracyConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 3,
            enableTableDetection: true,
        ),
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 400,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
            removeBackground: true,
        ),
    ),
);
$kreuzberg = new Kreuzberg($highAccuracyConfig);
$result = $kreuzberg->extractFile('legal_document_scan.pdf');

echo "High-accuracy OCR:\n";
// mb_strlen() counts characters; strlen() would report the byte length, which
// overstates the count for any non-ASCII text. Requires ext-mbstring.
// NOTE(review): assumes $result->content is UTF-8 — confirm against the library.
echo "Characters: " . mb_strlen($result->content, 'UTF-8') . "\n";
echo "Tables: " . count($result->tables) . "\n";
```