Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,187 @@
```php title="image_preprocessing.php"
<?php
declare(strict_types=1);
/**
* Image Preprocessing for OCR
*
* Configure image preprocessing settings to improve OCR accuracy on scanned documents.
* Demonstrates various preprocessing techniques like denoising, deskewing, and contrast enhancement.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\TesseractConfig;
use Kreuzberg\Config\ImagePreprocessingConfig;
$config = new ExtractionConfig(
ocr: new OcrConfig(
tesseractConfig: new TesseractConfig(
preprocessing: new ImagePreprocessingConfig(
targetDpi: 300,
denoise: true,
deskew: true,
contrastEnhance: true,
binarizationMethod: 'otsu'
)
)
)
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned.pdf');
echo "OCR with Image Preprocessing:\n";
echo str_repeat('=', 60) . "\n";
echo "Content extracted: " . strlen($result->content) . " characters\n";
echo "Preview: " . substr($result->content, 0, 100) . "...\n\n";
$advancedConfig = new ExtractionConfig(
ocr: new OcrConfig(
backend: 'tesseract',
language: 'eng',
tesseractConfig: new TesseractConfig(
preprocessing: new ImagePreprocessingConfig(
targetDpi: 600,
denoise: true,
deskew: true,
contrastEnhance: true,
binarizationMethod: 'adaptive',
sharpen: true,
removeBackground: true
),
pageSegmentationMode: 3,
engineMode: 3
)
)
);
$kreuzberg = new Kreuzberg($advancedConfig);
$result = $kreuzberg->extractFile('poor_quality_scan.pdf');
echo "Advanced Preprocessing Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Content length: " . strlen($result->content) . " characters\n";
if (isset($result->metadata)) {
$qualityScore = $result->qualityScore ?? null;
$confidence = $result->metadata['ocr_confidence'] ?? null;
if ($qualityScore !== null) {
echo "Quality score: " . number_format($qualityScore, 2) . "\n";
if ($qualityScore < 0.5) {
echo "Warning: Low quality extraction detected\n";
echo "Recommendations:\n";
echo " - Increase target DPI (current: 600)\n";
echo " - Try different binarization method\n";
echo " - Consider rescanning the original document\n";
}
}
if ($confidence !== null) {
echo "OCR confidence: " . number_format($confidence * 100, 1) . "%\n";
}
}
echo "\n";
$preprocessingProfiles = [
'basic' => new ImagePreprocessingConfig(
targetDpi: 300,
denoise: false,
deskew: false,
contrastEnhance: false
),
'balanced' => new ImagePreprocessingConfig(
targetDpi: 300,
denoise: true,
deskew: true,
contrastEnhance: true,
binarizationMethod: 'otsu'
),
'aggressive' => new ImagePreprocessingConfig(
targetDpi: 600,
denoise: true,
deskew: true,
contrastEnhance: true,
binarizationMethod: 'adaptive',
sharpen: true,
removeBackground: true
),
];
echo "Preprocessing Profile Comparison:\n";
echo str_repeat('=', 60) . "\n";
foreach ($preprocessingProfiles as $profileName => $preprocessing) {
$profileConfig = new ExtractionConfig(
ocr: new OcrConfig(
tesseractConfig: new TesseractConfig(
preprocessing: $preprocessing
)
)
);
$kreuzberg = new Kreuzberg($profileConfig);
$startTime = microtime(true);
$result = $kreuzberg->extractFile('sample_scan.pdf');
$elapsedTime = microtime(true) - $startTime;
echo ucfirst($profileName) . " profile:\n";
echo " Content length: " . strlen($result->content) . " characters\n";
echo " Processing time: " . number_format($elapsedTime, 3) . " seconds\n";
echo " Settings:\n";
echo " - DPI: {$preprocessing->targetDpi}\n";
echo " - Denoise: " . ($preprocessing->denoise ? 'Yes' : 'No') . "\n";
echo " - Deskew: " . ($preprocessing->deskew ? 'Yes' : 'No') . "\n";
echo " - Binarization: " . ($preprocessing->binarizationMethod ?? 'None') . "\n";
echo "\n";
}
function recommendPreprocessingSettings(string $documentType): ImagePreprocessingConfig
{
return match ($documentType) {
'modern_scan' => new ImagePreprocessingConfig(
targetDpi: 300,
denoise: true,
deskew: true,
contrastEnhance: false,
binarizationMethod: 'otsu'
),
'old_document' => new ImagePreprocessingConfig(
targetDpi: 600,
denoise: true,
deskew: true,
contrastEnhance: true,
binarizationMethod: 'adaptive',
removeBackground: true
),
'newspaper' => new ImagePreprocessingConfig(
targetDpi: 400,
denoise: true,
deskew: true,
contrastEnhance: true,
binarizationMethod: 'sauvola',
removeBackground: true
),
default => new ImagePreprocessingConfig(
targetDpi: 300,
denoise: true,
deskew: true,
contrastEnhance: true,
binarizationMethod: 'otsu'
),
};
}
echo "Recommended preprocessing for old documents:\n";
$recommended = recommendPreprocessingSettings('old_document');
echo " Target DPI: {$recommended->targetDpi}\n";
echo " Binarization: {$recommended->binarizationMethod}\n";
```