Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,195 @@
```php title="advanced_ocr.php"
<?php
declare(strict_types=1);
/**
* Advanced OCR Configuration
*
* Fine-tune OCR performance with Tesseract configuration, image preprocessing,
* and page segmentation modes.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\TesseractConfig;
use Kreuzberg\Config\ImagePreprocessingConfig;
// Scenario: OCR a scanned financial report with Tesseract table detection enabled
// (page segmentation mode 6), then print every detected table as markdown.
$tableAwareTesseract = new TesseractConfig(
    psm: 6,
    enableTableDetection: true,
);
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: $tableAwareTesseract,
    ),
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('financial_report_scan.pdf');

echo "OCR with Table Detection:\n", str_repeat('=', 60), "\n";
echo "Tables found: ", count($result->tables), "\n\n";

// Tables are numbered from 1 for display; the array key supplies the position.
foreach ($result->tables as $index => $table) {
    echo "Table ", $index + 1, ":\n";
    echo $table->markdown, "\n\n";
}
// Scenario: invoice OCR limited to digits, separators and currency symbols
// through Tesseract's character whitelist.
$invoiceTesseract = new TesseractConfig(
    psm: 6,
    tesseditCharWhitelist: '0123456789.,€$£¥-/',
);
$invoiceOcr = new OcrConfig(
    backend: 'tesseract',
    language: 'eng',
    tesseractConfig: $invoiceTesseract,
);
$invoiceConfig = new ExtractionConfig(ocr: $invoiceOcr);

$kreuzberg = new Kreuzberg($invoiceConfig);
$result = $kreuzberg->extractFile('invoice_scan.pdf');

echo "Invoice OCR (numbers only):\n", str_repeat('=', 60), "\n";
echo $result->content, "\n\n";
// Scenario: run image preprocessing on a poor-quality scan before recognition.
// NOTE(review): confirm that ImagePreprocessingConfig accepts `sharpen`; PHP 8
// raises an Error for unknown named arguments.
$preprocessedConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
        ),
        tesseractConfig: new TesseractConfig(
            psm: 3,
        ),
    ),
);
$kreuzberg = new Kreuzberg($preprocessedConfig);
$result = $kreuzberg->extractFile('poor_quality_scan.pdf');

echo "OCR with Image Preprocessing:\n";
echo str_repeat('=', 60) . "\n";
// Multibyte-aware functions: strlen()/substr() count bytes and can split a
// UTF-8 character in half, yielding a wrong count and a corrupted preview.
echo "Extracted " . mb_strlen($result->content) . " characters\n";
echo "Preview: " . mb_substr($result->content, 0, 200) . "...\n\n";
// Human-readable descriptions of Tesseract page segmentation modes (PSM),
// keyed by the numeric mode passed to TesseractConfig.
$psmModes = [
    0 => 'Orientation and script detection (OSD) only',
    1 => 'Automatic page segmentation with OSD',
    2 => 'Automatic page segmentation, but no OSD or OCR',
    3 => 'Fully automatic page segmentation (default)',
    4 => 'Assume a single column of text',
    5 => 'Assume a single uniform block of vertically aligned text',
    6 => 'Assume a single uniform block of text',
    7 => 'Treat the image as a single text line',
    8 => 'Treat the image as a single word',
    9 => 'Treat the image as a single word in a circle',
    10 => 'Treat the image as a single character',
    11 => 'Sparse text - find as much text as possible',
    12 => 'Sparse text with OSD',
    13 => 'Raw line - treat as a single text line',
];

// Benchmark a handful of modes against the same file, if it is present.
$testFile = 'various_layouts.pdf';
if (file_exists($testFile)) {
    echo "Testing different PSM modes:\n";
    echo str_repeat('=', 60) . "\n";

    foreach ([3, 4, 6, 11] as $psm) {
        $config = new ExtractionConfig(
            ocr: new OcrConfig(
                backend: 'tesseract',
                language: 'eng',
                tesseractConfig: new TesseractConfig(psm: $psm),
            ),
        );
        $kreuzberg = new Kreuzberg($config);

        // Only the extraction call itself is timed.
        $start = microtime(true);
        $result = $kreuzberg->extractFile($testFile);
        $elapsed = microtime(true) - $start;

        // Fall back to a placeholder so a mode missing from the table cannot
        // trigger an undefined-array-key warning.
        $modeLabel = $psmModes[$psm] ?? 'Unknown mode';

        echo "PSM $psm - {$modeLabel}:\n";
        echo " Time: " . number_format($elapsed, 3) . "s\n";
        // mb_* functions count characters, not bytes, and never cut a
        // multi-byte UTF-8 sequence in the preview.
        echo " Characters: " . mb_strlen($result->content) . "\n";
        echo " Preview: " . mb_substr($result->content, 0, 80) . "...\n\n";
    }
}
// Scenario: book pages treated as one column of text (PSM 4), with the scan
// denoised and normalised to 300 DPI first.
$columnTesseract = new TesseractConfig(psm: 4);
$columnPreprocessing = new ImagePreprocessingConfig(
    targetDpi: 300,
    denoise: true,
);
$singleColumnConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: $columnTesseract,
        imagePreprocessing: $columnPreprocessing,
    ),
);

$kreuzberg = new Kreuzberg($singleColumnConfig);
$result = $kreuzberg->extractFile('book_scan.pdf');

echo "Single-column OCR:\n";
echo $result->content, "\n\n";
// Scenario: receipt image recognised in sparse-text mode (PSM 11).
// NOTE(review): confirm that ImagePreprocessingConfig accepts `sharpen`.
$sparseTesseract = new TesseractConfig(psm: 11);
$sparsePreprocessing = new ImagePreprocessingConfig(
    targetDpi: 300,
    denoise: true,
    sharpen: true,
);
$sparseConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: $sparseTesseract,
        imagePreprocessing: $sparsePreprocessing,
    ),
);

$kreuzberg = new Kreuzberg($sparseConfig);
$result = $kreuzberg->extractFile('receipt.jpg');

echo "Sparse text OCR (receipt):\n", str_repeat('=', 60), "\n";
echo $result->content, "\n\n";
// Scenario: maximum-effort settings for a legal document: automatic page
// segmentation with table detection plus every preprocessing step at 400 DPI.
// NOTE(review): confirm that ImagePreprocessingConfig accepts `sharpen` and
// `removeBackground`; PHP 8 raises an Error for unknown named arguments.
$highAccuracyConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 3,
            enableTableDetection: true,
        ),
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 400,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
            removeBackground: true,
        ),
    ),
);
$kreuzberg = new Kreuzberg($highAccuracyConfig);
$result = $kreuzberg->extractFile('legal_document_scan.pdf');

echo "High-accuracy OCR:\n";
// mb_strlen() counts characters; strlen() would report the UTF-8 byte length.
echo "Characters: " . mb_strlen($result->content) . "\n";
echo "Tables: " . count($result->tables) . "\n";
```

View File

@@ -0,0 +1,127 @@
```php title="basic_ocr.php"
<?php
declare(strict_types=1);
/**
* Basic OCR with Tesseract
*
* Extract text from scanned PDFs and images using Tesseract OCR.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
// Scenario: plain English OCR of a scanned PDF with the Tesseract backend.
$englishOcr = new OcrConfig(
    backend: 'tesseract',
    language: 'eng',
);
$config = new ExtractionConfig(ocr: $englishOcr);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned_document.pdf');

echo "OCR Extraction Results:\n", str_repeat('=', 60), "\n";
echo $result->content, "\n\n";
// Scenario: one document containing English, French and German text.
// Tesseract takes several language codes joined with '+'.
$multilingualConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng+fra+deu',
    ),
);
$kreuzberg = new Kreuzberg($multilingualConfig);
$result = $kreuzberg->extractFile('multilingual_scan.pdf');

echo "Multilingual OCR:\n";
echo str_repeat('=', 60) . "\n";
// French and German text contains multi-byte characters (é, ü, ß, ...);
// mb_substr() cuts on character boundaries, whereas substr() could end the
// preview in the middle of one and emit invalid UTF-8.
echo mb_substr($result->content, 0, 500) . "...\n\n";
// Scenario: run the same English OCR configuration over scan.png, scan.jpg
// and scan.tiff, skipping any file that is not present.
$imageConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
);
$kreuzberg = new Kreuzberg($imageConfig);

$imageFormats = ['png', 'jpg', 'tiff'];
foreach ($imageFormats as $format) {
    $file = "scan.$format";
    if (!file_exists($file)) {
        continue;
    }

    echo "Processing $file...\n";
    $result = $kreuzberg->extractFile($file);
    // Character-aware count and preview; the byte-wise strlen()/substr()
    // miscount and may split multi-byte UTF-8 characters.
    echo "Extracted " . mb_strlen($result->content) . " characters\n";
    echo "Preview: " . mb_substr($result->content, 0, 100) . "...\n\n";
}
// Tesseract language code => description. The description is also used to
// derive the expected file name (lower-cased, spaces replaced by underscores).
$languages = [
    'spa' => 'Spanish document',
    'fra' => 'French document',
    'deu' => 'German document',
    'ita' => 'Italian document',
    'por' => 'Portuguese document',
    'rus' => 'Russian document',
    'jpn' => 'Japanese document',
    'chi_sim' => 'Chinese (Simplified) document',
];

foreach ($languages as $lang => $description) {
    $underscored = str_replace(' ', '_', $description);
    $file = strtolower($underscored) . '.pdf';

    // Only process languages for which a sample file exists.
    if (!file_exists($file)) {
        continue;
    }

    $languageOcr = new OcrConfig(
        backend: 'tesseract',
        language: $lang,
    );
    $config = new ExtractionConfig(ocr: $languageOcr);
    $kreuzberg = new Kreuzberg($config);
    $result = $kreuzberg->extractFile($file);

    echo "$description ($lang):\n";
    echo " Characters extracted: ", mb_strlen($result->content), "\n\n";
}
use function Kreuzberg\extract_file;
// Scenario: the same English OCR through the procedural extract_file() helper
// instead of a Kreuzberg instance.
$proceduralOcr = new OcrConfig(backend: 'tesseract', language: 'eng');
$config = new ExtractionConfig(ocr: $proceduralOcr);

$result = extract_file('invoice_scan.pdf', config: $config);

echo "Invoice OCR:\n", str_repeat('=', 60), "\n";
echo $result->content, "\n";
// OCR quality heuristic: average number of extracted characters per page.
//
// A dedicated extractor is built here instead of reusing whichever $kreuzberg
// instance the earlier loops happened to leave behind, so the language used
// for this assessment is always explicit.
$qualityExtractor = new Kreuzberg(new ExtractionConfig(
    ocr: new OcrConfig(backend: 'tesseract', language: 'eng'),
));
$result = $qualityExtractor->extractFile('scanned.pdf');

// Count characters rather than UTF-8 bytes so the thresholds mean the same
// thing for every script.
$contentLength = mb_strlen($result->content);
// Missing metadata is treated as one page, and the value is clamped to at
// least 1 so a reported page count of 0 cannot cause a division by zero.
$pageCount = max(1, (int) ($result->metadata->pageCount ?? 1));
$avgCharsPerPage = $contentLength / $pageCount;

echo "\nOCR Quality Assessment:\n";
echo "Total characters: $contentLength\n";
echo "Pages: $pageCount\n";
echo "Average chars/page: " . number_format($avgCharsPerPage) . "\n";

echo match (true) {
    $avgCharsPerPage < 100 => "Warning: Low character count may indicate poor scan quality\n"
        . "Consider using image preprocessing or higher DPI settings.\n",
    $avgCharsPerPage > 2000 => "Pass: Good - Adequate text extracted\n",
    default => "Pass: Moderate - Text extracted successfully\n",
};
```

View File

@@ -0,0 +1,34 @@
```php title="PHP"
<?php
declare(strict_types=1);
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\LlmConfig;
// Cloud-based OCR using a Vision Language Model (VLM).
// Requires an API key (read from ANTHROPIC_API_KEY) and a model name.

// getenv() returns false when the variable is unset. Under strict_types that
// would surface as an opaque TypeError if the parameter is typed string, so
// fail early with a clear message instead.
$anthropicApiKey = getenv('ANTHROPIC_API_KEY');
if ($anthropicApiKey === false || $anthropicApiKey === '') {
    throw new RuntimeException('The ANTHROPIC_API_KEY environment variable must be set to use the VLM OCR backend.');
}

$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'vlm',
        language: 'eng',
        vlmConfig: new LlmConfig(
            provider: 'anthropic',
            apiKey: $anthropicApiKey,
            model: 'claude-3-5-sonnet-20241022',
        ),
        vlmPrompt: 'Extract all text from this document page. Preserve formatting and structure.',
    ),
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

echo "Cloud OCR Results:\n";
// mb_* functions count characters and keep the preview valid UTF-8.
echo "Content length: " . mb_strlen($result->content) . " characters\n";
echo "Preview: " . mb_substr($result->content, 0, 200) . "...\n";
?>
```

View File

@@ -0,0 +1,39 @@
```php title="PHP"
<?php
declare(strict_types=1);
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ImageExtractionConfig;
// Extract embedded images from a document in addition to its text.
$imageExtraction = new ImageExtractionConfig(
    extractImages: true,
    // Not embedded as base64; the original example notes this as
    // "Save images to disk" (TODO confirm against the library docs).
    embedAsBase64: false,
    maxImagesPerPage: 10,
);
$config = new ExtractionConfig(images: $imageExtraction);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document_with_images.pdf');

echo "Extracted Content:\n";
echo $result->content, "\n\n";

if (!empty($result->images)) {
    echo "Extracted ", count($result->images), " images\n";

    foreach ($result->images as $index => $image) {
        echo "Image ", $index + 1, ":\n";
        echo " Type: ", $image->mimeType, "\n";
        // strlen() is intentional here: the label is in bytes.
        echo " Size: ", strlen($image->data), " bytes\n";

        // Dimensions are printed only when both values are set.
        if (isset($image->width, $image->height)) {
            echo " Dimensions: ", $image->width, "x", $image->height, "\n";
        }

        echo "\n";
    }
}
?>
```

View File

@@ -0,0 +1,36 @@
```php title="PHP"
<?php
declare(strict_types=1);
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\ImagePreprocessingConfig;
// Enhance OCR accuracy by preprocessing page images before recognition.
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 300,
            autoRotate: true,
            deskew: true,
            denoise: true,
            contrastEnhance: true,
            binarizationMethod: 'otsu',
            invertColors: false,
        ),
    ),
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned_document.pdf');

echo "Preprocessed OCR Results:\n";
// Character-aware count and preview: strlen()/substr() work on bytes and can
// split a multi-byte UTF-8 character at the cut-off.
echo "Characters extracted: " . mb_strlen($result->content) . "\n";
echo "Preview: " . mb_substr($result->content, 0, 300) . "...\n";
?>
```

View File

@@ -0,0 +1,221 @@
```php title="image_preprocessing.php"
<?php
declare(strict_types=1);
/**
* Image Preprocessing for OCR
*
* Improve OCR accuracy by preprocessing images before text recognition.
* Useful for poor quality scans, photos, and challenging documents.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\ImagePreprocessingConfig;
// Scenario: minimal clean-up (denoise + sharpen) for a noisy scan.
// NOTE(review): confirm that ImagePreprocessingConfig accepts `sharpen`; PHP 8
// raises an Error for unknown named arguments.
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            denoise: true,
            sharpen: true,
        ),
    ),
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('noisy_scan.pdf');

echo "Basic Preprocessing Results:\n";
echo str_repeat('=', 60) . "\n";
// mb_substr() cuts on character boundaries so the preview stays valid UTF-8.
echo mb_substr($result->content, 0, 300) . "...\n\n";
// Scenario: upscale to 400 DPI for documents with small print.
// NOTE(review): confirm that ImagePreprocessingConfig accepts `sharpen`.
$highDpiConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 400,
            denoise: true,
            sharpen: true,
        ),
    ),
);
$kreuzberg = new Kreuzberg($highDpiConfig);
$result = $kreuzberg->extractFile('small_text_scan.pdf');

echo "High DPI Preprocessing:\n";
echo str_repeat('=', 60) . "\n";
// Count and truncate by character, not by byte.
echo "Characters extracted: " . mb_strlen($result->content) . "\n";
echo "Preview: " . mb_substr($result->content, 0, 200) . "...\n\n";
// Scenario: straighten a crooked scan (deskew + auto-rotate) at 300 DPI.
$straighten = new ImagePreprocessingConfig(
    deskew: true,
    autoRotate: true,
    targetDpi: 300,
);
$deskewConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: $straighten,
    ),
);

$kreuzberg = new Kreuzberg($deskewConfig);
$result = $kreuzberg->extractFile('crooked_scan.pdf');

echo "Deskewed OCR Results:\n", str_repeat('=', 60), "\n";
echo $result->content, "\n\n";
// Scenario: strip the page background from a watermarked document before OCR.
// NOTE(review): confirm that ImagePreprocessingConfig accepts
// `removeBackground`; PHP 8 raises an Error for unknown named arguments.
$cleanConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            removeBackground: true,
            denoise: true,
            targetDpi: 300,
        ),
    ),
);
$kreuzberg = new Kreuzberg($cleanConfig);
$result = $kreuzberg->extractFile('watermarked_document.pdf');

echo "Background Removal Results:\n";
echo str_repeat('=', 60) . "\n";
// mb_strlen() reports characters; strlen() would report UTF-8 bytes.
echo "Extracted " . mb_strlen($result->content) . " characters\n";
echo "Text quality improved by removing background noise\n\n";
// Scenario: every preprocessing step enabled for a very poor scan.
//
// Argument names follow the other ImagePreprocessingConfig example in this
// documentation set (`contrastEnhance`, `binarizationMethod`); the previous
// `contrastEnhancement` / `binarize` spellings disagreed with it.
// NOTE(review): confirm that `sharpen` and `removeBackground` are accepted;
// PHP 8 raises an Error for unknown named arguments.
$comprehensiveConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 400,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
            removeBackground: true,
            contrastEnhance: true,
            binarizationMethod: 'otsu',
        ),
    ),
);
$kreuzberg = new Kreuzberg($comprehensiveConfig);
$result = $kreuzberg->extractFile('very_poor_quality.pdf');

echo "Comprehensive Preprocessing:\n";
echo str_repeat('=', 60) . "\n";
echo "Original quality: Very Poor\n";
echo "After preprocessing:\n";
// Character-aware count and preview (strlen()/substr() operate on bytes).
echo " Characters: " . mb_strlen($result->content) . "\n";
echo " Content preview:\n";
echo " " . mb_substr($result->content, 0, 300) . "...\n\n";
// Compare extraction time and output size across preprocessing levels on the
// same file. Skipped entirely when the test file is absent.
$testFile = 'test_scan.pdf';
if (file_exists($testFile)) {
    // Label => configuration, ordered from no preprocessing to the full set.
    $configs = [
        'No preprocessing' => new ExtractionConfig(
            ocr: new OcrConfig(backend: 'tesseract', language: 'eng'),
        ),
        'Denoise only' => new ExtractionConfig(
            ocr: new OcrConfig(
                backend: 'tesseract',
                language: 'eng',
                imagePreprocessing: new ImagePreprocessingConfig(denoise: true),
            ),
        ),
        'Denoise + Sharpen' => new ExtractionConfig(
            ocr: new OcrConfig(
                backend: 'tesseract',
                language: 'eng',
                imagePreprocessing: new ImagePreprocessingConfig(
                    denoise: true,
                    sharpen: true,
                ),
            ),
        ),
        // Reuses the comprehensive configuration defined earlier in this script.
        'Full preprocessing' => $comprehensiveConfig,
    ];

    echo "Preprocessing Comparison:\n";
    echo str_repeat('=', 60) . "\n";

    foreach ($configs as $name => $config) {
        $kreuzberg = new Kreuzberg($config);

        // Only the extraction call itself is timed.
        $start = microtime(true);
        $result = $kreuzberg->extractFile($testFile);
        $elapsed = microtime(true) - $start;

        echo "$name:\n";
        echo " Time: " . number_format($elapsed, 3) . "s\n";
        // Characters, not bytes, so results are comparable across scripts.
        echo " Characters: " . mb_strlen($result->content) . "\n";
        echo " Tables: " . count($result->tables) . "\n\n";
    }
}
/**
 * Choose an image-preprocessing preset from a quick, unprocessed OCR pass.
 *
 * The file is extracted once with plain English Tesseract settings. The ratio
 * of extracted text bytes to file size in bytes is then used as a rough
 * quality signal:
 *   - below 0.01: heaviest preset (400 DPI plus every clean-up step)
 *   - below 0.05: medium preset (300 DPI, denoise, sharpen, deskew)
 *   - otherwise:  light preset (300 DPI, denoise)
 *
 * An empty file, or one whose size cannot be read, is treated as ratio 0 and
 * therefore gets the heaviest preset instead of causing a division by zero.
 *
 * NOTE(review): confirm that ImagePreprocessingConfig accepts `sharpen` and
 * `removeBackground`. `contrastEnhance` follows the spelling used by the other
 * preprocessing example in this documentation set.
 *
 * @param string $file Path of the document to probe.
 *
 * @return ImagePreprocessingConfig Preset matching the measured text density.
 */
function getOptimalPreprocessing(string $file): ImagePreprocessingConfig
{
    $quickScan = new Kreuzberg(new ExtractionConfig(
        ocr: new OcrConfig(backend: 'tesseract', language: 'eng'),
    ));
    $quickResult = $quickScan->extractFile($file);

    // Byte length is deliberate here: it is divided by a size in bytes.
    $contentLength = strlen($quickResult->content);
    $fileSize = filesize($file);

    // filesize() returns false on failure and 0 for an empty file.
    $ratio = ($fileSize === false || $fileSize === 0)
        ? 0.0
        : $contentLength / $fileSize;

    return match (true) {
        $ratio < 0.01 => new ImagePreprocessingConfig(
            targetDpi: 400,
            denoise: true,
            sharpen: true,
            autoRotate: true,
            deskew: true,
            removeBackground: true,
            contrastEnhance: true,
        ),
        $ratio < 0.05 => new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true,
            sharpen: true,
            deskew: true,
        ),
        default => new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true,
        ),
    };
}
// Scenario: let getOptimalPreprocessing() choose the preset for a file, then
// run the real extraction with it. Skipped when the file is absent.
$file = 'auto_detect_quality.pdf';
if (file_exists($file)) {
    $preprocessing = getOptimalPreprocessing($file);
    $config = new ExtractionConfig(
        ocr: new OcrConfig(
            backend: 'tesseract',
            language: 'eng',
            imagePreprocessing: $preprocessing,
        ),
    );
    $kreuzberg = new Kreuzberg($config);
    $result = $kreuzberg->extractFile($file);

    echo "Adaptive preprocessing applied\n";
    // mb_strlen() reports characters; strlen() would report UTF-8 bytes.
    echo "Result: " . mb_strlen($result->content) . " characters extracted\n";
}
```

View File

@@ -0,0 +1,40 @@
```php title="PHP"
<?php
declare(strict_types=1);
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
// Text extraction through the EasyOCR backend.
// The original example describes EasyOCR as supporting 90+ languages with
// multi-language detection.
// NOTE(review): verify the language-code format this backend expects; the
// PaddleOCR examples in this documentation set use 'en' rather than 'eng'.
$easyOcr = new OcrConfig(
    backend: 'easyocr',
    language: 'eng',
);
$config = new ExtractionConfig(ocr: $easyOcr);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

echo "EasyOCR Results:\n";
echo $result->content, "\n";

// Several languages in one pass, given as a comma-separated list.
$multiLangOcr = new OcrConfig(
    backend: 'easyocr',
    language: 'eng,fra,deu', // English, French, German
);
$multiLangConfig = new ExtractionConfig(ocr: $multiLangOcr);

$kreuzberg = new Kreuzberg($multiLangConfig);
$result = $kreuzberg->extractFile('multilingual_document.pdf');

echo "\nMulti-language extraction:\n";
echo $result->content, "\n";
?>
```

View File

@@ -0,0 +1,32 @@
```php title="PHP"
<?php
declare(strict_types=1);
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
// PaddleOCR extraction that prints each recognised OCR element with its text,
// recognition confidence, geometry and (when present) rotation angle.
$paddleOcr = new OcrConfig(
    backend: 'paddle-ocr',
    language: 'en',
);
$config = new ExtractionConfig(ocr: $paddleOcr);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned.pdf');

// ocrElements may be null, in which case nothing is printed.
$ocrElements = $result->ocrElements;
if ($ocrElements !== null) {
    foreach ($ocrElements as $element) {
        $confidence = number_format($element->confidence->recognition, 2);
        $geometry = json_encode($element->geometry);

        echo "Text: {$element->text}\n";
        echo "Confidence: ", $confidence, "\n";
        echo "Geometry: ", $geometry, "\n";

        // Rotation is optional per element.
        if ($element->rotation !== null) {
            echo "Rotation: {$element->rotation->angle}°\n";
        }

        echo "\n";
    }
}
```

View File

@@ -0,0 +1,38 @@
```php title="PHP"
<?php
declare(strict_types=1);
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
// Basic OCR extraction with Tesseract: print the text, then a short summary.
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned_document.pdf');

echo "Extracted Text:\n";
echo str_repeat('=', 60) . "\n";
echo $result->content . "\n\n";

echo "Extraction Metadata:\n";
// The page count may be missing from the metadata.
echo "Page count: " . ($result->metadata->pageCount ?? 'unknown') . "\n";
// mb_strlen() counts characters; strlen() would report UTF-8 bytes.
echo "Characters: " . mb_strlen($result->content) . "\n";
echo "Tables found: " . count($result->tables) . "\n";

// The same extractor also handles images; run it only if the sample exists.
if (file_exists('scanned_image.png')) {
    $imageResult = $kreuzberg->extractFile('scanned_image.png');
    echo "\nImage OCR Results:\n";
    echo $imageResult->content . "\n";
}
?>
```

View File

@@ -0,0 +1,46 @@
```php title="PHP"
<?php
declare(strict_types=1);
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
// Force OCR on all pages, even those with native text.
// Useful when native text extraction is unreliable or corrupted.
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
    // Force OCR on all pages instead of falling back to native text.
    forceOcr: true,
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('mixed_scanned_document.pdf');

echo "Force OCR Results:\n";
echo "All pages processed with OCR\n";
// Character-aware count and preview: strlen()/substr() work on bytes and can
// cut a multi-byte UTF-8 character in half.
echo "Characters extracted: " . mb_strlen($result->content) . "\n";
echo "Content preview:\n";
echo mb_substr($result->content, 0, 500) . "...\n";

// Without force OCR: native text is used when available.
$nativeConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
    forceOcr: false, // Default: use native text extraction when available
);
$kreuzbergNative = new Kreuzberg($nativeConfig);
$resultNative = $kreuzbergNative->extractFile('mixed_scanned_document.pdf');

echo "\nNative Text Extraction (no force):\n";
echo "Characters extracted: " . mb_strlen($resultNative->content) . "\n";
?>
```

View File

@@ -0,0 +1,57 @@
```php title="PHP"
<?php
declare(strict_types=1);
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
// Extract text from multilingual documents.
// Specify multiple Tesseract language codes separated by plus (+).
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng+fra+deu', // English, French, German
    ),
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('multilingual_document.pdf');

echo "Multilingual OCR Results:\n";
echo "Supported languages: English, French, German\n";
echo "Extracted content:\n";
echo $result->content . "\n\n";

// Broader language set for documents whose language is not known up front.
// Nothing here configures automatic detection; every listed language is
// simply passed to the backend.
$autoDetectConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng+spa+fra+deu+ita+por', // Multiple European languages
    ),
);
$kreuzberg = new Kreuzberg($autoDetectConfig);
$result = $kreuzberg->extractFile('european_document.pdf');

echo "European Language Document:\n";
// European text contains multi-byte characters (é, ñ, ü, ...), so count and
// truncate by character. The byte-wise strlen()/substr() would miscount and
// could end the preview in the middle of a character.
echo "Extracted " . mb_strlen($result->content) . " characters\n";
echo "Preview: " . mb_substr($result->content, 0, 300) . "...\n\n";

// Mixed scripts in one document.
$mixedConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng+jpn+chi_sim', // English, Japanese, Chinese Simplified
    ),
);
$kreuzberg = new Kreuzberg($mixedConfig);
$result = $kreuzberg->extractFile('asian_document.pdf');

echo "Multi-script Document:\n";
echo "Characters extracted: " . mb_strlen($result->content) . "\n";
?>
```

View File

@@ -0,0 +1,23 @@
```php title="PHP"
<?php
declare(strict_types=1);
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
// OCR a scanned PDF with the PaddleOCR backend and print the recognised text.
$paddleOcr = new OcrConfig(
    backend: 'paddle-ocr',
    language: 'en',
    // paddleOcrConfig: new PaddleOcrConfig(modelTier: 'server') // for max accuracy
);
$config = new ExtractionConfig(ocr: $paddleOcr);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned_document.pdf');

echo $result->content, "\n";
```