```php title="image_preprocessing.php" extractFile('scanned.pdf'); echo "OCR with Image Preprocessing:\n"; echo str_repeat('=', 60) . "\n"; echo "Content extracted: " . strlen($result->content) . " characters\n"; echo "Preview: " . substr($result->content, 0, 100) . "...\n\n"; $advancedConfig = new ExtractionConfig( ocr: new OcrConfig( backend: 'tesseract', language: 'eng', tesseractConfig: new TesseractConfig( preprocessing: new ImagePreprocessingConfig( targetDpi: 600, denoise: true, deskew: true, contrastEnhance: true, binarizationMethod: 'adaptive', sharpen: true, removeBackground: true ), pageSegmentationMode: 3, engineMode: 3 ) ) ); $kreuzberg = new Kreuzberg($advancedConfig); $result = $kreuzberg->extractFile('poor_quality_scan.pdf'); echo "Advanced Preprocessing Results:\n"; echo str_repeat('=', 60) . "\n"; echo "Content length: " . strlen($result->content) . " characters\n"; if (isset($result->metadata)) { $qualityScore = $result->qualityScore ?? null; $confidence = $result->metadata['ocr_confidence'] ?? null; if ($qualityScore !== null) { echo "Quality score: " . number_format($qualityScore, 2) . "\n"; if ($qualityScore < 0.5) { echo "Warning: Low quality extraction detected\n"; echo "Recommendations:\n"; echo " - Increase target DPI (current: 600)\n"; echo " - Try different binarization method\n"; echo " - Consider rescanning the original document\n"; } } if ($confidence !== null) { echo "OCR confidence: " . number_format($confidence * 100, 1) . "%\n"; } } echo "\n"; $preprocessingProfiles = [ 'basic' => new ImagePreprocessingConfig( targetDpi: 300, denoise: false, deskew: false, contrastEnhance: false ), 'balanced' => new ImagePreprocessingConfig( targetDpi: 300, denoise: true, deskew: true, contrastEnhance: true, binarizationMethod: 'otsu' ), 'aggressive' => new ImagePreprocessingConfig( targetDpi: 600, denoise: true, deskew: true, contrastEnhance: true, binarizationMethod: 'adaptive', sharpen: true, removeBackground: true ), ]; echo "Preprocessing Profile Comparison:\n"; echo str_repeat('=', 60) . "\n"; foreach ($preprocessingProfiles as $profileName => $preprocessing) { $profileConfig = new ExtractionConfig( ocr: new OcrConfig( tesseractConfig: new TesseractConfig( preprocessing: $preprocessing ) ) ); $kreuzberg = new Kreuzberg($profileConfig); $startTime = microtime(true); $result = $kreuzberg->extractFile('sample_scan.pdf'); $elapsedTime = microtime(true) - $startTime; echo ucfirst($profileName) . " profile:\n"; echo " Content length: " . strlen($result->content) . " characters\n"; echo " Processing time: " . number_format($elapsedTime, 3) . " seconds\n"; echo " Settings:\n"; echo " - DPI: {$preprocessing->targetDpi}\n"; echo " - Denoise: " . ($preprocessing->denoise ? 'Yes' : 'No') . "\n"; echo " - Deskew: " . ($preprocessing->deskew ? 'Yes' : 'No') . "\n"; echo " - Binarization: " . ($preprocessing->binarizationMethod ?? 'None') . "\n"; echo "\n"; } function recommendPreprocessingSettings(string $documentType): ImagePreprocessingConfig { return match ($documentType) { 'modern_scan' => new ImagePreprocessingConfig( targetDpi: 300, denoise: true, deskew: true, contrastEnhance: false, binarizationMethod: 'otsu' ), 'old_document' => new ImagePreprocessingConfig( targetDpi: 600, denoise: true, deskew: true, contrastEnhance: true, binarizationMethod: 'adaptive', removeBackground: true ), 'newspaper' => new ImagePreprocessingConfig( targetDpi: 400, denoise: true, deskew: true, contrastEnhance: true, binarizationMethod: 'sauvola', removeBackground: true ), default => new ImagePreprocessingConfig( targetDpi: 300, denoise: true, deskew: true, contrastEnhance: true, binarizationMethod: 'otsu' ), }; } echo "Recommended preprocessing for old documents:\n"; $recommended = recommendPreprocessingSettings('old_document'); echo " Target DPI: {$recommended->targetDpi}\n"; echo " Binarization: {$recommended->binarizationMethod}\n"; ```