This commit is contained in:
302
docs/snippets/php/configuration/image_preprocessing_config.php
Normal file
302
docs/snippets/php/configuration/image_preprocessing_config.php
Normal file
@@ -0,0 +1,302 @@
|
||||
```php title="image_preprocessing_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Image Preprocessing Configuration
|
||||
*
|
||||
* This example demonstrates image preprocessing options to improve OCR accuracy.
|
||||
* Preprocessing can significantly enhance text recognition quality for poor-quality scans.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\ImagePreprocessingConfig;
|
||||
|
||||
echo "Example 1: Default Image Preprocessing\n";
|
||||
echo "======================================\n";
|
||||
|
||||
$config1 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig()
|
||||
)
|
||||
);
|
||||
|
||||
echo "Default preprocessing settings:\n";
|
||||
echo "- Target DPI: 300 (standard for OCR)\n";
|
||||
echo "- Auto-rotate: Enabled\n";
|
||||
echo "- Denoise: Disabled\n\n";
|
||||
|
||||
echo "Example 2: High DPI Configuration\n";
|
||||
echo "=================================\n";
|
||||
|
||||
$config2 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
targetDpi: 600
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Target DPI: 600\n";
|
||||
echo "Best for:\n";
|
||||
echo "- Very small text\n";
|
||||
echo "- High-quality scans\n";
|
||||
echo "- Documents with fine details\n";
|
||||
echo "Note: Higher DPI = slower processing, more memory\n\n";
|
||||
|
||||
echo "Example 3: Lower DPI for Speed\n";
|
||||
echo "==============================\n";
|
||||
|
||||
$config3 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
targetDpi: 150
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Target DPI: 150\n";
|
||||
echo "Best for:\n";
|
||||
echo "- Large text\n";
|
||||
echo "- Low-resolution images\n";
|
||||
echo "- Fast processing needed\n";
|
||||
echo "Note: May reduce accuracy for small text\n\n";
|
||||
|
||||
echo "Example 4: Manual Rotation Control\n";
|
||||
echo "==================================\n";
|
||||
|
||||
$config4 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
autoRotate: false
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Auto-rotate: Disabled\n";
|
||||
echo "Use when:\n";
|
||||
echo "- Images are already correctly oriented\n";
|
||||
echo "- Auto-rotation causes issues\n";
|
||||
echo "- Processing time is critical\n\n";
|
||||
|
||||
echo "Example 5: Denoising for Poor Quality Scans\n";
|
||||
echo "===========================================\n";
|
||||
|
||||
$config5 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
targetDpi: 300,
|
||||
autoRotate: true,
|
||||
denoise: true
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config5);
|
||||
$result = $kreuzberg->extractFile('noisy_scan.pdf');
|
||||
|
||||
echo "Denoising: Enabled\n";
|
||||
echo "Best for:\n";
|
||||
echo "- Poor quality scans\n";
|
||||
echo "- Fax documents\n";
|
||||
echo "- Images with background noise\n";
|
||||
echo "- Old or damaged documents\n";
|
||||
echo "\nExtracted text length: " . strlen($result->content) . " characters\n\n";
|
||||
|
||||
echo "Example 6: Maximum Quality Configuration\n";
|
||||
echo "========================================\n";
|
||||
|
||||
$config6 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
targetDpi: 600,
|
||||
autoRotate: true,
|
||||
denoise: true
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Maximum quality preprocessing:\n";
|
||||
echo "- Target DPI: 600 (high quality)\n";
|
||||
echo "- Auto-rotate: Enabled\n";
|
||||
echo "- Denoise: Enabled\n";
|
||||
echo "\nBest for:\n";
|
||||
echo "- Very poor quality scans\n";
|
||||
echo "- Historical documents\n";
|
||||
echo "- Faded or damaged text\n";
|
||||
echo "- Critical accuracy requirements\n\n";
|
||||
|
||||
echo "Example 7: Fast Processing Configuration\n";
|
||||
echo "========================================\n";
|
||||
|
||||
$config7 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
targetDpi: 200,
|
||||
autoRotate: false,
|
||||
denoise: false
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Fast processing configuration:\n";
|
||||
echo "- Target DPI: 200 (faster)\n";
|
||||
echo "- Auto-rotate: Disabled\n";
|
||||
echo "- Denoise: Disabled\n";
|
||||
echo "\nBest for:\n";
|
||||
echo "- High-volume processing\n";
|
||||
echo "- Good quality source images\n";
|
||||
echo "- Performance-critical applications\n\n";
|
||||
|
||||
echo "Example 8: DPI Recommendations by Document Type\n";
|
||||
echo "===============================================\n";
|
||||
|
||||
$standardConfig = new ImagePreprocessingConfig(targetDpi: 300);
|
||||
echo "Standard documents (letters, reports): 300 DPI\n";
|
||||
|
||||
$newspaperConfig = new ImagePreprocessingConfig(targetDpi: 400);
|
||||
echo "Newspapers and magazines: 400 DPI\n";
|
||||
|
||||
$bookConfig = new ImagePreprocessingConfig(targetDpi: 600);
|
||||
echo "Books with small text: 600 DPI\n";
|
||||
|
||||
$receiptConfig = new ImagePreprocessingConfig(targetDpi: 300);
|
||||
echo "Receipts and forms: 300 DPI\n";
|
||||
|
||||
$businessCardConfig = new ImagePreprocessingConfig(targetDpi: 400);
|
||||
echo "Business cards: 400 DPI\n";
|
||||
|
||||
$faxConfig = new ImagePreprocessingConfig(
|
||||
targetDpi: 300,
|
||||
denoise: true
|
||||
);
|
||||
echo "Faxes: 300 DPI + denoising\n\n";
|
||||
|
||||
echo "Example 9: Adaptive Configuration by Image Quality\n";
|
||||
echo "==================================================\n";
|
||||
|
||||
function getPreprocessingConfig(string $quality): ImagePreprocessingConfig
|
||||
{
|
||||
return match ($quality) {
|
||||
'excellent' => new ImagePreprocessingConfig(
|
||||
targetDpi: 300,
|
||||
autoRotate: false,
|
||||
denoise: false
|
||||
),
|
||||
'good' => new ImagePreprocessingConfig(
|
||||
targetDpi: 300,
|
||||
autoRotate: true,
|
||||
denoise: false
|
||||
),
|
||||
'fair' => new ImagePreprocessingConfig(
|
||||
targetDpi: 400,
|
||||
autoRotate: true,
|
||||
denoise: true
|
||||
),
|
||||
'poor' => new ImagePreprocessingConfig(
|
||||
targetDpi: 600,
|
||||
autoRotate: true,
|
||||
denoise: true
|
||||
),
|
||||
default => new ImagePreprocessingConfig(),
|
||||
};
|
||||
}
|
||||
|
||||
echo "Quality-based configurations:\n\n";
|
||||
|
||||
echo "Excellent Quality:\n";
|
||||
echo "- DPI: 300, Auto-rotate: No, Denoise: No\n";
|
||||
echo "- Clean scans, properly oriented\n\n";
|
||||
|
||||
echo "Good Quality:\n";
|
||||
echo "- DPI: 300, Auto-rotate: Yes, Denoise: No\n";
|
||||
echo "- May need rotation correction\n\n";
|
||||
|
||||
echo "Fair Quality:\n";
|
||||
echo "- DPI: 400, Auto-rotate: Yes, Denoise: Yes\n";
|
||||
echo "- Some noise or quality issues\n\n";
|
||||
|
||||
echo "Poor Quality:\n";
|
||||
echo "- DPI: 600, Auto-rotate: Yes, Denoise: Yes\n";
|
||||
echo "- Significant quality problems\n\n";
|
||||
|
||||
echo "Example 10: Complete OCR Pipeline with Preprocessing\n";
|
||||
echo "===================================================\n";
|
||||
|
||||
$config10 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
targetDpi: 300,
|
||||
autoRotate: true,
|
||||
denoise: true
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
$result10 = (new Kreuzberg($config10))->extractFile('poor_quality_scan.pdf');
|
||||
|
||||
echo "Processing pipeline:\n";
|
||||
echo "1. Load image\n";
|
||||
echo "2. Auto-detect orientation and rotate if needed\n";
|
||||
echo "3. Upscale/downscale to 300 DPI\n";
|
||||
echo "4. Apply denoising filter\n";
|
||||
echo "5. Perform OCR\n";
|
||||
echo "\nResults:\n";
|
||||
echo "- Extracted text: " . strlen($result10->content) . " characters\n";
|
||||
echo "- Pages: " . ($result10->metadata->pageCount ?? 'N/A') . "\n";
|
||||
|
||||
echo "\n\nImage Preprocessing Parameters:\n";
|
||||
echo "================================\n";
|
||||
echo "- targetDpi: Target resolution in dots per inch\n";
|
||||
echo " * 150 DPI: Fast, lower quality\n";
|
||||
echo " * 300 DPI: Standard, good balance (RECOMMENDED)\n";
|
||||
echo " * 400 DPI: Better for small text\n";
|
||||
echo " * 600 DPI: Best quality, slower\n";
|
||||
echo "\n";
|
||||
echo "- autoRotate: Automatically detect and correct orientation\n";
|
||||
echo " * true: Recommended for most cases\n";
|
||||
echo " * false: Skip if images are already oriented\n";
|
||||
echo "\n";
|
||||
echo "- denoise: Apply noise reduction filter\n";
|
||||
echo " * true: Recommended for poor quality scans\n";
|
||||
echo " * false: Skip for clean images (faster)\n";
|
||||
|
||||
echo "\n\nBest Practices:\n";
|
||||
echo "===============\n";
|
||||
echo "1. Start with 300 DPI as a baseline\n";
|
||||
echo "2. Enable auto-rotate unless you know images are correct\n";
|
||||
echo "3. Enable denoising for poor quality documents\n";
|
||||
echo "4. Use higher DPI (400-600) for small text\n";
|
||||
echo "5. Use lower DPI (150-200) when speed is critical\n";
|
||||
echo "6. Test different settings to find optimal balance\n";
|
||||
echo "7. Consider source quality when choosing settings\n";
|
||||
echo "8. Remember: Higher quality = slower processing + more memory\n";
|
||||
|
||||
echo "\n\nPerformance vs Quality Trade-offs:\n";
|
||||
echo "==================================\n";
|
||||
echo "Fastest: DPI=150, AutoRotate=No, Denoise=No\n";
|
||||
echo "Balanced: DPI=300, AutoRotate=Yes, Denoise=No (RECOMMENDED)\n";
|
||||
echo "Quality: DPI=400, AutoRotate=Yes, Denoise=Yes\n";
|
||||
echo "Maximum: DPI=600, AutoRotate=Yes, Denoise=Yes\n";
|
||||
```
|
||||
Reference in New Issue
Block a user