This commit is contained in:
195
docs/snippets/php/ocr/advanced_ocr.php
Normal file
195
docs/snippets/php/ocr/advanced_ocr.php
Normal file
@@ -0,0 +1,195 @@
|
||||
```php title="advanced_ocr.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Advanced OCR Configuration
|
||||
*
|
||||
* Fine-tune OCR performance with Tesseract configuration, image preprocessing,
|
||||
* and page segmentation modes.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\TesseractConfig;
|
||||
use Kreuzberg\Config\ImagePreprocessingConfig;
|
||||
|
||||
// Example: OCR a scanned report with Tesseract table detection switched on.
// PSM 6 assumes the page is a single uniform block of text.
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(psm: 6, enableTableDetection: true),
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('financial_report_scan.pdf');

echo "OCR with Table Detection:\n", str_repeat('=', 60), "\n";
echo 'Tables found: ', count($result->tables), "\n\n";

// Print every detected table as Markdown, numbered from 1.
foreach ($result->tables as $index => $table) {
    echo 'Table ', $index + 1, ":\n";
    echo $table->markdown, "\n\n";
}
|
||||
|
||||
// Example: limit recognition to digits, separators and currency symbols
// by passing a character whitelist to Tesseract.
$invoiceConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            tesseditCharWhitelist: '0123456789.,€$£¥-/',
        ),
    ),
);

$kreuzberg = new Kreuzberg($invoiceConfig);
$result = $kreuzberg->extractFile('invoice_scan.pdf');

echo "Invoice OCR (numbers only):\n", str_repeat('=', 60), "\n";
echo $result->content, "\n\n";
|
||||
|
||||
// Example: apply image preprocessing before recognition on a poor-quality scan.
$preprocessedConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
        ),
        // PSM 3: fully automatic page segmentation.
        tesseractConfig: new TesseractConfig(psm: 3),
    ),
);

$kreuzberg = new Kreuzberg($preprocessedConfig);
$result = $kreuzberg->extractFile('poor_quality_scan.pdf');

echo "OCR with Image Preprocessing:\n";
echo str_repeat('=', 60) . "\n";
// mb_* functions work on characters rather than bytes, so the count is a real
// character count and the preview cannot end in the middle of a UTF-8 sequence.
echo "Extracted " . mb_strlen($result->content) . " characters\n";
echo "Preview: " . mb_substr($result->content, 0, 200) . "...\n\n";
|
||||
|
||||
// Tesseract page segmentation modes (PSM), keyed by their numeric value.
$psmModes = [
    0 => 'Orientation and script detection (OSD) only',
    1 => 'Automatic page segmentation with OSD',
    2 => 'Automatic page segmentation, but no OSD or OCR',
    3 => 'Fully automatic page segmentation (default)',
    4 => 'Assume a single column of text',
    5 => 'Assume a single uniform block of vertically aligned text',
    6 => 'Assume a single uniform block of text',
    7 => 'Treat the image as a single text line',
    8 => 'Treat the image as a single word',
    9 => 'Treat the image as a single word in a circle',
    10 => 'Treat the image as a single character',
    11 => 'Sparse text - find as much text as possible',
    12 => 'Sparse text with OSD',
    13 => 'Raw line - treat as a single text line',
];

// Run the same file through several PSM values and report timing and yield.
// The comparison is skipped entirely when the sample file is not present.
$testFile = 'various_layouts.pdf';
if (file_exists($testFile)) {
    echo "Testing different PSM modes:\n";
    echo str_repeat('=', 60) . "\n";

    foreach ([3, 4, 6, 11] as $psm) {
        $config = new ExtractionConfig(
            ocr: new OcrConfig(
                backend: 'tesseract',
                language: 'eng',
                tesseractConfig: new TesseractConfig(psm: $psm),
            ),
        );

        $kreuzberg = new Kreuzberg($config);
        $start = microtime(true);
        $result = $kreuzberg->extractFile($testFile);
        $elapsed = microtime(true) - $start;

        echo "PSM $psm - {$psmModes[$psm]}:\n";
        echo " Time: " . number_format($elapsed, 3) . "s\n";
        // Character-aware length and truncation: OCR output is UTF-8 text,
        // and a byte-wise cut could split a multi-byte character.
        echo " Characters: " . mb_strlen($result->content) . "\n";
        echo " Preview: " . mb_substr($result->content, 0, 80) . "...\n\n";
    }
}
|
||||
|
||||
// Example: book-style scan read with PSM 4 (assume a single column of text),
// with light preprocessing.
$singleColumnConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(psm: 4),
        imagePreprocessing: new ImagePreprocessingConfig(targetDpi: 300, denoise: true),
    ),
);

$kreuzberg = new Kreuzberg($singleColumnConfig);
$result = $kreuzberg->extractFile('book_scan.pdf');

echo "Single-column OCR:\n", $result->content, "\n\n";
|
||||
|
||||
// Example: receipt image read with PSM 11 (sparse text - find as much text
// as possible), after denoising and sharpening.
$sparseConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(psm: 11),
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true,
            sharpen: true,
        ),
    ),
);

$kreuzberg = new Kreuzberg($sparseConfig);
$result = $kreuzberg->extractFile('receipt.jpg');

echo "Sparse text OCR (receipt):\n", str_repeat('=', 60), "\n";
echo $result->content, "\n\n";
|
||||
|
||||
// Example: every preprocessing option used in this file, combined with table
// detection, applied to a legal document scan.
$highAccuracyConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 3,
            enableTableDetection: true,
        ),
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 400,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
            removeBackground: true,
        ),
    ),
);

$kreuzberg = new Kreuzberg($highAccuracyConfig);
$result = $kreuzberg->extractFile('legal_document_scan.pdf');

echo "High-accuracy OCR:\n";
// mb_strlen counts characters; strlen would report the UTF-8 byte length.
echo "Characters: " . mb_strlen($result->content) . "\n";
echo "Tables: " . count($result->tables) . "\n";
|
||||
```
|
||||
127
docs/snippets/php/ocr/basic_ocr.php
Normal file
127
docs/snippets/php/ocr/basic_ocr.php
Normal file
@@ -0,0 +1,127 @@
|
||||
```php title="basic_ocr.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Basic OCR with Tesseract
|
||||
*
|
||||
* Extract text from scanned PDFs and images using Tesseract OCR.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
|
||||
// Example: plain English OCR of a scanned PDF with the Tesseract backend.
$config = new ExtractionConfig(
    ocr: new OcrConfig(backend: 'tesseract', language: 'eng'),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned_document.pdf');

echo "OCR Extraction Results:\n", str_repeat('=', 60), "\n";
echo $result->content, "\n\n";
|
||||
|
||||
// Example: several Tesseract languages at once, joined with "+".
$multilingualConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng+fra+deu',
    ),
);

$kreuzberg = new Kreuzberg($multilingualConfig);
$result = $kreuzberg->extractFile('multilingual_scan.pdf');

echo "Multilingual OCR:\n";
echo str_repeat('=', 60) . "\n";
// French and German text contains multi-byte characters; mb_substr cuts on
// character boundaries so the preview is always valid UTF-8.
echo mb_substr($result->content, 0, 500) . "...\n\n";
|
||||
|
||||
// Example: run the same OCR configuration over several image formats.
$imageConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
);

$kreuzberg = new Kreuzberg($imageConfig);

$imageFormats = ['png', 'jpg', 'tiff'];
foreach ($imageFormats as $format) {
    $file = "scan.$format";
    // Only process the sample files that actually exist.
    if (!file_exists($file)) {
        continue;
    }

    echo "Processing $file...\n";
    $result = $kreuzberg->extractFile($file);
    // Count and truncate by character, not by byte.
    echo "Extracted " . mb_strlen($result->content) . " characters\n";
    echo "Preview: " . mb_substr($result->content, 0, 100) . "...\n\n";
}
|
||||
|
||||
// Tesseract language code => human-readable description of the sample file.
$languages = [
    'spa' => 'Spanish document',
    'fra' => 'French document',
    'deu' => 'German document',
    'ita' => 'Italian document',
    'por' => 'Portuguese document',
    'rus' => 'Russian document',
    'jpn' => 'Japanese document',
    'chi_sim' => 'Chinese (Simplified) document',
];

foreach ($languages as $lang => $description) {
    // The file name is the lower-cased description with spaces replaced by
    // underscores, e.g. "Spanish document" -> "spanish_document.pdf".
    $file = strtolower(str_replace(' ', '_', $description)) . '.pdf';

    if (!file_exists($file)) {
        continue;
    }

    $config = new ExtractionConfig(
        ocr: new OcrConfig(backend: 'tesseract', language: $lang),
    );

    $kreuzberg = new Kreuzberg($config);
    $result = $kreuzberg->extractFile($file);

    echo "$description ($lang):\n";
    echo ' Characters extracted: ', mb_strlen($result->content), "\n\n";
}
|
||||
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
// Example: the procedural extract_file() helper with an explicit configuration.
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
);

$result = extract_file('invoice_scan.pdf', config: $config);

echo "Invoice OCR:\n", str_repeat('=', 60), "\n";
echo $result->content, "\n";
|
||||
|
||||
// Rough OCR quality check based on the average number of characters per page.
//
// Build a fresh extractor from $config (English, defined just above) instead
// of reusing $kreuzberg: that variable still holds whatever instance the
// earlier loops created last, which may be configured for another language.
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned.pdf');

// Count characters rather than bytes so multi-byte text is not over-counted.
$contentLength = mb_strlen($result->content);
// Fall back to one page when the count is missing, and never go below one so
// the average below cannot divide by zero.
$pageCount = max(1, (int) ($result->metadata->pageCount ?? 1));
$avgCharsPerPage = $contentLength / $pageCount;

echo "\nOCR Quality Assessment:\n";
echo "Total characters: $contentLength\n";
echo "Pages: $pageCount\n";
echo "Average chars/page: " . number_format($avgCharsPerPage) . "\n";

// Below 100 chars/page: warn; above 2000: good; anything in between: moderate.
echo match (true) {
    $avgCharsPerPage < 100 => "Warning: Low character count may indicate poor scan quality\n"
        . "Consider using image preprocessing or higher DPI settings.\n",
    $avgCharsPerPage > 2000 => "Pass: Good - Adequate text extracted\n",
    default => "Pass: Moderate - Text extracted successfully\n",
};
|
||||
```
|
||||
34
docs/snippets/php/ocr/cloud_ocr_backend.md
Normal file
34
docs/snippets/php/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\LlmConfig;
|
||||
|
||||
// Cloud-based OCR using Vision Language Model (VLM)
// Requires API key and model configuration

// getenv() returns false when the variable is not set; stop with a clear
// message instead of passing that value on as the API key.
$apiKey = getenv('ANTHROPIC_API_KEY');
if ($apiKey === false || $apiKey === '') {
    throw new RuntimeException('The ANTHROPIC_API_KEY environment variable is not set.');
}

$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'vlm',
        language: 'eng',
        vlmConfig: new LlmConfig(
            provider: 'anthropic',
            apiKey: $apiKey,
            model: 'claude-3-5-sonnet-20241022',
        ),
        vlmPrompt: 'Extract all text from this document page. Preserve formatting and structure.',
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

echo "Cloud OCR Results:\n";
// Character-aware length and preview (the content is UTF-8 text).
echo "Content length: " . mb_strlen($result->content) . " characters\n";
echo "Preview: " . mb_substr($result->content, 0, 200) . "...\n";
|
||||
?>
|
||||
```
|
||||
39
docs/snippets/php/ocr/image_extraction.md
Normal file
39
docs/snippets/php/ocr/image_extraction.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ImageExtractionConfig;
|
||||
|
||||
// Extract images from documents alongside text
$config = new ExtractionConfig(
    images: new ImageExtractionConfig(
        extractImages: true,
        embedAsBase64: false, // Save images to disk
        maxImagesPerPage: 10,
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document_with_images.pdf');

echo "Extracted Content:\n", $result->content, "\n\n";

// Describe each extracted image; dimensions are printed only when both
// width and height are available.
if (!empty($result->images)) {
    echo 'Extracted ', count($result->images), " images\n";

    foreach ($result->images as $index => $image) {
        echo 'Image ', $index + 1, ":\n";
        echo ' Type: ', $image->mimeType, "\n";
        echo ' Size: ', strlen($image->data), " bytes\n";

        if (isset($image->width, $image->height)) {
            echo ' Dimensions: ', $image->width, 'x', $image->height, "\n";
        }

        echo "\n";
    }
}
|
||||
?>
|
||||
```
|
||||
36
docs/snippets/php/ocr/image_preprocessing.md
Normal file
36
docs/snippets/php/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\ImagePreprocessingConfig;
|
||||
|
||||
// Enhance OCR accuracy with image preprocessing
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 300,
            autoRotate: true,
            deskew: true,
            denoise: true,
            contrastEnhance: true,
            binarizationMethod: 'otsu',
            invertColors: false,
        ),
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned_document.pdf');

echo "Preprocessed OCR Results:\n";
// mb_* functions operate on characters, so the count is accurate for
// multi-byte text and the preview never ends inside a UTF-8 sequence.
echo "Characters extracted: " . mb_strlen($result->content) . "\n";
echo "Preview: " . mb_substr($result->content, 0, 300) . "...\n";
|
||||
?>
|
||||
```
|
||||
221
docs/snippets/php/ocr/image_preprocessing.php
Normal file
221
docs/snippets/php/ocr/image_preprocessing.php
Normal file
@@ -0,0 +1,221 @@
|
||||
```php title="image_preprocessing.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Image Preprocessing for OCR
|
||||
*
|
||||
* Improve OCR accuracy by preprocessing images before text recognition.
|
||||
* Useful for poor quality scans, photos, and challenging documents.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\ImagePreprocessingConfig;
|
||||
|
||||
// Example: minimal preprocessing (denoise + sharpen) for a noisy scan.
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            denoise: true,
            sharpen: true,
        ),
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('noisy_scan.pdf');

echo "Basic Preprocessing Results:\n";
echo str_repeat('=', 60) . "\n";
// Truncate on a character boundary so the preview stays valid UTF-8.
echo mb_substr($result->content, 0, 300) . "...\n\n";
|
||||
|
||||
// Example: raise the target DPI to 400 for a scan with small text.
$highDpiConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 400,
            denoise: true,
            sharpen: true,
        ),
    ),
);

$kreuzberg = new Kreuzberg($highDpiConfig);
$result = $kreuzberg->extractFile('small_text_scan.pdf');

echo "High DPI Preprocessing:\n";
echo str_repeat('=', 60) . "\n";
// Character-aware count and preview instead of byte-wise strlen/substr.
echo "Characters extracted: " . mb_strlen($result->content) . "\n";
echo "Preview: " . mb_substr($result->content, 0, 200) . "...\n\n";
|
||||
|
||||
// Example: deskew and auto-rotate a crooked scan before recognition.
$deskewConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(deskew: true, autoRotate: true, targetDpi: 300),
    ),
);

$kreuzberg = new Kreuzberg($deskewConfig);
$result = $kreuzberg->extractFile('crooked_scan.pdf');

echo "Deskewed OCR Results:\n", str_repeat('=', 60), "\n";
echo $result->content, "\n\n";
|
||||
|
||||
// Example: remove the page background and denoise a watermarked document.
$cleanConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            removeBackground: true,
            denoise: true,
            targetDpi: 300,
        ),
    ),
);

$kreuzberg = new Kreuzberg($cleanConfig);
$result = $kreuzberg->extractFile('watermarked_document.pdf');

echo "Background Removal Results:\n";
echo str_repeat('=', 60) . "\n";
// mb_strlen counts characters; strlen would report the UTF-8 byte length.
echo "Extracted " . mb_strlen($result->content) . " characters\n";
echo "Text quality improved by removing background noise\n\n";
|
||||
|
||||
// Example: every preprocessing step combined, for a very poor scan.
//
// NOTE(review): the contrast and binarization arguments use the spelling from
// the companion image_preprocessing.md snippet (contrastEnhance,
// binarizationMethod). An unknown named argument is a fatal error in PHP, so
// confirm the names against the ImagePreprocessingConfig constructor.
$comprehensiveConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 400,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
            removeBackground: true,
            contrastEnhance: true,
            binarizationMethod: 'otsu',
        ),
    ),
);

$kreuzberg = new Kreuzberg($comprehensiveConfig);
$result = $kreuzberg->extractFile('very_poor_quality.pdf');

echo "Comprehensive Preprocessing:\n";
echo str_repeat('=', 60) . "\n";
echo "Original quality: Very Poor\n";
echo "After preprocessing:\n";
// Character-aware count and preview instead of byte-wise strlen/substr.
echo " Characters: " . mb_strlen($result->content) . "\n";
echo " Content preview:\n";
echo " " . mb_substr($result->content, 0, 300) . "...\n\n";
|
||||
|
||||
// Compare preprocessing levels on one file: time taken, characters extracted
// and tables found. Skipped when the sample file is not present.
$testFile = 'test_scan.pdf';
if (file_exists($testFile)) {
    // Label => configuration, from no preprocessing up to the full pipeline.
    $configs = [
        'No preprocessing' => new ExtractionConfig(
            ocr: new OcrConfig(backend: 'tesseract', language: 'eng'),
        ),
        'Denoise only' => new ExtractionConfig(
            ocr: new OcrConfig(
                backend: 'tesseract',
                language: 'eng',
                imagePreprocessing: new ImagePreprocessingConfig(denoise: true),
            ),
        ),
        'Denoise + Sharpen' => new ExtractionConfig(
            ocr: new OcrConfig(
                backend: 'tesseract',
                language: 'eng',
                imagePreprocessing: new ImagePreprocessingConfig(
                    denoise: true,
                    sharpen: true,
                ),
            ),
        ),
        'Full preprocessing' => $comprehensiveConfig,
    ];

    echo "Preprocessing Comparison:\n";
    echo str_repeat('=', 60) . "\n";

    foreach ($configs as $name => $config) {
        $kreuzberg = new Kreuzberg($config);
        $start = microtime(true);
        $result = $kreuzberg->extractFile($testFile);
        $elapsed = microtime(true) - $start;

        echo "$name:\n";
        echo " Time: " . number_format($elapsed, 3) . "s\n";
        // mb_strlen so configurations are compared by characters, not bytes.
        echo " Characters: " . mb_strlen($result->content) . "\n";
        echo " Tables: " . count($result->tables) . "\n\n";
    }
}
|
||||
|
||||
/**
 * Choose a preprocessing profile from a quick OCR pass without preprocessing.
 *
 * The file is OCR'd once with a plain Tesseract configuration, and the length
 * of the extracted text in bytes is divided by the file size in bytes. The
 * lower that ratio, the more preprocessing steps are enabled:
 *
 *  - ratio < 0.01: the most aggressive profile (400 DPI, all steps)
 *  - ratio < 0.05: 300 DPI with denoise, sharpen and deskew
 *  - otherwise:    300 DPI with denoise only
 *
 * @param string $file Path to the document to analyse.
 *
 * @return ImagePreprocessingConfig The profile selected for the file.
 */
function getOptimalPreprocessing(string $file): ImagePreprocessingConfig
{
    $quickScan = new Kreuzberg(new ExtractionConfig(
        ocr: new OcrConfig(backend: 'tesseract', language: 'eng'),
    ));
    $quickResult = $quickScan->extractFile($file);

    $fileSize = filesize($file);
    // Both sides of the ratio are byte counts, so strlen is intentional here.
    $contentLength = strlen($quickResult->content);

    // filesize() returns false on failure and 0 for an empty file; dividing by
    // either would throw DivisionByZeroError. Treat both as the worst case so
    // the most aggressive profile is selected.
    $ratio = ($fileSize === false || $fileSize === 0)
        ? 0.0
        : $contentLength / $fileSize;

    if ($ratio < 0.01) {
        return new ImagePreprocessingConfig(
            targetDpi: 400,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
            removeBackground: true,
            contrastEnhancement: true,
        );
    }

    if ($ratio < 0.05) {
        return new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true,
            sharpen: true,
            deskew: true,
        );
    }

    return new ImagePreprocessingConfig(
        targetDpi: 300,
        denoise: true,
    );
}
|
||||
|
||||
// Example: let getOptimalPreprocessing() pick the profile, then run the real
// extraction with it. Skipped when the sample file is not present.
$file = 'auto_detect_quality.pdf';
if (file_exists($file)) {
    $preprocessing = getOptimalPreprocessing($file);

    $config = new ExtractionConfig(
        ocr: new OcrConfig(
            backend: 'tesseract',
            language: 'eng',
            imagePreprocessing: $preprocessing,
        ),
    );

    $kreuzberg = new Kreuzberg($config);
    $result = $kreuzberg->extractFile($file);

    echo "Adaptive preprocessing applied\n";
    // mb_strlen counts characters; strlen would report the UTF-8 byte length.
    echo "Result: " . mb_strlen($result->content) . " characters extracted\n";
}
|
||||
```
|
||||
40
docs/snippets/php/ocr/ocr_easyocr.md
Normal file
40
docs/snippets/php/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
|
||||
// Extract text using EasyOCR backend
// EasyOCR supports 90+ languages with multi-language detection
$config = new ExtractionConfig(
    ocr: new OcrConfig(backend: 'easyocr', language: 'eng'),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

echo "EasyOCR Results:\n", $result->content, "\n";

// Multi-language detection
$multiLangConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'easyocr',
        language: 'eng,fra,deu', // English, French, German
    ),
);

$kreuzberg = new Kreuzberg($multiLangConfig);
$result = $kreuzberg->extractFile('multilingual_document.pdf');

echo "\nMulti-language extraction:\n", $result->content, "\n";
|
||||
?>
|
||||
```
|
||||
32
docs/snippets/php/ocr/ocr_elements.md
Normal file
32
docs/snippets/php/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
|
||||
// Example: inspect the individual OCR elements from the PaddleOCR backend.
$config = new ExtractionConfig(
    ocr: new OcrConfig(backend: 'paddle-ocr', language: 'en'),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned.pdf');

// ocrElements may be null; in that case nothing is printed.
foreach ($result->ocrElements ?? [] as $element) {
    echo 'Text: ', $element->text, "\n";
    echo 'Confidence: ', number_format($element->confidence->recognition, 2), "\n";
    echo 'Geometry: ', json_encode($element->geometry), "\n";

    // Rotation is optional and printed only when present.
    if ($element->rotation !== null) {
        echo 'Rotation: ', $element->rotation->angle, "°\n";
    }

    echo "\n";
}
|
||||
```
|
||||
38
docs/snippets/php/ocr/ocr_extraction.md
Normal file
38
docs/snippets/php/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
|
||||
// Basic OCR extraction with Tesseract
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned_document.pdf');

echo "Extracted Text:\n";
echo str_repeat('=', 60) . "\n";
echo $result->content . "\n\n";

echo "Extraction Metadata:\n";
echo "Page count: " . ($result->metadata->pageCount ?? 'unknown') . "\n";
// mb_strlen counts characters; strlen would report the UTF-8 byte length.
echo "Characters: " . mb_strlen($result->content) . "\n";
echo "Tables found: " . count($result->tables) . "\n";

// Extract from image
if (file_exists('scanned_image.png')) {
    $imageResult = $kreuzberg->extractFile('scanned_image.png');
    echo "\nImage OCR Results:\n";
    echo $imageResult->content . "\n";
}
|
||||
?>
|
||||
```
|
||||
46
docs/snippets/php/ocr/ocr_force_all_pages.md
Normal file
46
docs/snippets/php/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
|
||||
// Force OCR on all pages, even those with native text
// Useful when native text extraction is unreliable or corrupted
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
    // Force OCR on all pages instead of falling back to native text
    forceOcr: true,
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('mixed_scanned_document.pdf');

echo "Force OCR Results:\n";
echo "All pages processed with OCR\n";
// Character-aware count and preview instead of byte-wise strlen/substr.
echo "Characters extracted: " . mb_strlen($result->content) . "\n";
echo "Content preview:\n";
echo mb_substr($result->content, 0, 500) . "...\n";

// Without force OCR - uses native text when available
$nativeConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
    forceOcr: false, // Default: use native text extraction when available
);

$kreuzbergNative = new Kreuzberg($nativeConfig);
$resultNative = $kreuzbergNative->extractFile('mixed_scanned_document.pdf');

echo "\nNative Text Extraction (no force):\n";
echo "Characters extracted: " . mb_strlen($resultNative->content) . "\n";
|
||||
?>
|
||||
```
|
||||
57
docs/snippets/php/ocr/ocr_multi_language.md
Normal file
57
docs/snippets/php/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,57 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
|
||||
// Extract text from multilingual documents
// Specify multiple language codes separated by plus (+)
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng+fra+deu', // English, French, German
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('multilingual_document.pdf');

echo "Multilingual OCR Results:\n";
echo "Supported languages: English, French, German\n";
echo "Extracted content:\n";
echo $result->content . "\n\n";

// A wider set of European languages in a single pass. The languages are
// listed explicitly in the configuration; nothing here detects them.
$autoDetectConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng+spa+fra+deu+ita+por', // Multiple European languages
    ),
);

$kreuzberg = new Kreuzberg($autoDetectConfig);
$result = $kreuzberg->extractFile('european_document.pdf');

echo "European Language Document:\n";
// Accented characters are multi-byte in UTF-8: count and cut by character.
echo "Extracted " . mb_strlen($result->content) . " characters\n";
echo "Preview: " . mb_substr($result->content, 0, 300) . "...\n\n";

// Mixed scripts: Latin plus Japanese and Simplified Chinese.
$mixedConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng+jpn+chi_sim', // English, Japanese, Chinese Simplified
    ),
);

$kreuzberg = new Kreuzberg($mixedConfig);
$result = $kreuzberg->extractFile('asian_document.pdf');

echo "Multi-script Document:\n";
echo "Characters extracted: " . mb_strlen($result->content) . "\n";
|
||||
?>
|
||||
```
|
||||
23
docs/snippets/php/ocr/ocr_paddleocr.md
Normal file
23
docs/snippets/php/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
|
||||
// Example: OCR a scanned PDF with the PaddleOCR backend.
$ocr = new OcrConfig(
    backend: 'paddle-ocr',
    language: 'en',
    // paddleOcrConfig: new PaddleOcrConfig(modelTier: 'server') // for max accuracy
);
$config = new ExtractionConfig(ocr: $ocr);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned_document.pdf');

echo $result->content, "\n";
|
||||
```
|
||||
Reference in New Issue
Block a user