Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,207 @@
```php title="chunking_config.php"
<?php
declare(strict_types=1);
/**
* Text Chunking Configuration
*
* This example demonstrates how to configure text chunking for RAG (Retrieval-Augmented Generation)
* applications. Chunking splits long documents into smaller, semantically meaningful segments.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ChunkingConfig;
// Example 1: extract with a default-constructed ChunkingConfig and print the
// metadata reported for every chunk.
echo "Example 1: Basic Chunking\n",
    "=========================\n";

$config1 = new ExtractionConfig(chunking: new ChunkingConfig());
$kreuzberg = new Kreuzberg($config1);
$result = $kreuzberg->extractFile('long_document.pdf');

if ($result->chunks !== null) {
    echo 'Total chunks: ', count($result->chunks), "\n";

    foreach ($result->chunks as $index => $segment) {
        $meta = $segment->metadata;

        echo "\nChunk {$index}:\n",
            "- Text length: {$meta->charCount} characters\n",
            "- Byte range: {$meta->byteStart}-{$meta->byteEnd}\n";

        // The page range line is printed only when a first page is present.
        if ($meta->firstPage !== null) {
            echo "- Pages: {$meta->firstPage}-{$meta->lastPage}\n";
        }
    }
}

echo "\n\n";
// Example 2: small chunks (size 256, overlap 25) that respect sentence
// boundaries but not paragraph boundaries.
echo "Example 2: Custom Chunk Size (Small chunks for fine-grained retrieval)\n",
    "======================================================================\n";

$fineGrainedChunking = new ChunkingConfig(
    maxChunkSize: 256,
    chunkOverlap: 25,
    respectSentences: true,
    respectParagraphs: false,
);
$config2 = new ExtractionConfig(chunking: $fineGrainedChunking);
$result2 = (new Kreuzberg($config2))->extractFile('document.pdf');

// An unset or null chunk list is reported as zero chunks.
$createdCount2 = count($result2->chunks ?? []);
echo "Chunks created: {$createdCount2}\n\n";
// Example 3: large chunks (size 2000, overlap 200) that respect both sentence
// and paragraph boundaries.
echo "Example 3: Large Chunks (More context per chunk)\n",
    "================================================\n";

$largeChunking = new ChunkingConfig(
    maxChunkSize: 2000,
    chunkOverlap: 200,
    respectSentences: true,
    respectParagraphs: true,
);
$config3 = new ExtractionConfig(chunking: $largeChunking);
$result3 = (new Kreuzberg($config3))->extractFile('document.pdf');

// An unset or null chunk list is reported as zero chunks.
$createdCount3 = count($result3->chunks ?? []);
echo "Chunks created: {$createdCount3}\n\n";
// Example 4: chunk with size 512 / overlap 50 and print size statistics
// (average, minimum, maximum) over the chunks' reported character counts.
echo "Example 4: RAG-Optimized Configuration\n";
echo "=====================================\n";
$config4 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true,
        respectParagraphs: false
    )
);
$result4 = (new Kreuzberg($config4))->extractFile('document.pdf');
if ($result4->chunks !== null) {
    $chunkSizes = array_map(
        static fn ($chunk) => $chunk->metadata->charCount,
        $result4->chunks
    );
    $chunkCount = count($chunkSizes);
    echo "Total chunks: {$chunkCount}\n";

    // A non-null but empty chunk list must skip the statistics: the average
    // would divide by zero and min()/max() throw ValueError on an empty array.
    if ($chunkCount > 0) {
        echo "Average chunk size: " . round(array_sum($chunkSizes) / $chunkCount) . " characters\n";
        echo "Min chunk size: " . min($chunkSizes) . " characters\n";
        echo "Max chunk size: " . max($chunkSizes) . " characters\n";
    }
}
echo "\n\n";
// Example 5: shape every chunk into an associative record of the kind one
// would insert into a vector database. The record is built but not stored.
echo "Example 5: Processing Chunks for Vector Database\n",
    "================================================\n";

$vectorDbChunking = new ChunkingConfig(
    maxChunkSize: 512,
    chunkOverlap: 50,
    respectSentences: true,
);
$config5 = new ExtractionConfig(chunking: $vectorDbChunking);
$result5 = (new Kreuzberg($config5))->extractFile('document.pdf');

if ($result5->chunks !== null) {
    foreach ($result5->chunks as $position => $segment) {
        $documentId = "doc_123";
        $meta = $segment->metadata;

        // The page range is null unless the chunk reports a first page.
        $pageRange = null;
        if ($meta->firstPage !== null) {
            $pageRange = "{$meta->firstPage}-{$meta->lastPage}";
        }

        $chunkData = [
            'document_id' => $documentId,
            'chunk_index' => $position,
            'text' => $segment->content,
            'char_count' => $meta->charCount,
            'byte_start' => $meta->byteStart,
            'byte_end' => $meta->byteEnd,
            'page_range' => $pageRange,
        ];

        echo "Prepared chunk {$position} for database insertion\n";
    }
}

echo "\n\n";
// Example 6: markdown chunker with tokenizer-based sizing. Prints a short
// preview of each chunk and, when present, the headings in its context.
echo "Example 6: Markdown Chunker with Token-Based Sizing and Heading Context\n";
echo "========================================================================\n";
$config6 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        chunkerType: 'markdown',
        sizing: [
            'type' => 'tokenizer',
            'model' => 'Xenova/gpt-4o'
        ]
    )
);
$result6 = (new Kreuzberg($config6))->extractFile('document.md');
if ($result6->chunks !== null) {
    echo "Total chunks: " . count($result6->chunks) . "\n";
    foreach ($result6->chunks as $i => $chunk) {
        echo "\nChunk {$i}:\n";
        // mb_substr() cuts on character boundaries; byte-wise substr() could
        // split a multibyte character and print an invalid sequence.
        // Assumes the content uses mbstring's internal encoding (UTF-8 by default).
        echo "- Text preview: " . mb_substr($chunk->content, 0, 60) . "...\n";
        if (isset($chunk->metadata->headingContext->headings)) {
            $headings = $chunk->metadata->headingContext->headings;
            echo "- Headings in context:\n";
            foreach ($headings as $heading) {
                echo " - Level {$heading->level}: {$heading->text}\n";
            }
        }
    }
}
// Static reference text: prints a summary of the ChunkingConfig parameters
// used by the examples in this script. No extraction is performed here.
echo "\n\nChunking Configuration Parameters:\n";
echo "==================================\n";
echo "- maxChunkSize: Maximum number of characters per chunk\n";
echo "- chunkOverlap: Number of overlapping characters between chunks\n";
echo "- respectSentences: Split at sentence boundaries when possible\n";
echo "- respectParagraphs: Split at paragraph boundaries when possible\n";
echo "- chunkerType: Type of chunker ('simple' or 'markdown')\n";
echo "- sizing: Sizing strategy configuration\n";
echo " - type: 'character' or 'tokenizer'\n";
echo " - model: Tokenizer model (e.g., 'Xenova/gpt-4o')\n";
echo "\n\n";
// Example 7: markdown chunker with prependHeadingContext enabled; prints a
// short preview of every chunk.
echo "Example 7: Prepend Heading Context\n";
echo "====================================\n";
$config7 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        chunkerType: 'markdown',
        prependHeadingContext: true
    )
);
$result7 = (new Kreuzberg($config7))->extractFile('document.md');
if ($result7->chunks !== null) {
    echo "Total chunks: " . count($result7->chunks) . "\n";
    foreach ($result7->chunks as $i => $chunk) {
        // Each chunk's content is prefixed with its heading breadcrumb,
        // e.g. "# Section > ## Subsection\n\nActual content..."
        // mb_substr() keeps the preview on a character boundary; byte-wise
        // substr() could cut a multibyte character in half.
        echo "\nChunk {$i} preview: " . mb_substr($chunk->content, 0, 80) . "...\n";
    }
}
// Static reference text: prints the chunking recommendations that close this
// example script.
echo "\nBest Practices:\n";
echo "- Use 256-512 chars for fine-grained retrieval\n";
echo "- Use 1000-2000 chars for more context\n";
echo "- Set overlap to ~10% of chunk size\n";
echo "- Enable respectSentences for better coherence\n";
echo "- Use markdown chunker for structured documents with headings\n";
echo "- Use token-based sizing for LLM token budgets\n";
echo "- Enable prependHeadingContext to embed heading breadcrumbs in chunk content\n";
```

View File

@@ -0,0 +1,200 @@
```php title="embedding_config.php"
<?php
declare(strict_types=1);
/**
* Embedding Generation Configuration
*
* This example demonstrates how to configure embedding generation for semantic search
* and vector database applications.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ChunkingConfig;
use Kreuzberg\Config\EmbeddingConfig;
// Example 1: chunk a document, generate embeddings with a default-constructed
// EmbeddingConfig, and print a preview of each chunk and its vector.
echo "Example 1: Basic Embedding Generation\n";
echo "=====================================\n";
$config1 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50
    ),
    embedding: new EmbeddingConfig()
);
$kreuzberg = new Kreuzberg($config1);
$result = $kreuzberg->extractFile('document.pdf');
if ($result->chunks !== null) {
    foreach ($result->chunks as $i => $chunk) {
        echo "\nChunk {$i}:\n";
        // Reads the chunk text from `content`, the property the chunking
        // example script uses (this script previously read `text`).
        // NOTE(review): confirm the property name against the Chunk type.
        // mb_substr() avoids cutting a multibyte character in the preview.
        echo "- Text: " . mb_substr($chunk->content, 0, 50) . "...\n";
        if ($chunk->embedding !== null) {
            echo "- Embedding dimension: " . count($chunk->embedding) . "\n";
            echo "- First 5 values: [" . implode(', ', array_slice($chunk->embedding, 0, 5)) . "...]\n";
        }
    }
}
echo "\n\n";
// Example 2: two configurations that differ in embedding model and batch
// size. The configs are only constructed here; nothing is extracted.
echo "Example 2: Different Embedding Models\n",
    "====================================\n";

$miniLmEmbedding = new EmbeddingConfig(
    model: 'all-MiniLM-L6-v2',
    normalize: true,
    batchSize: 32,
);
$config2a = new ExtractionConfig(
    chunking: new ChunkingConfig(maxChunkSize: 512),
    embedding: $miniLmEmbedding,
);

echo "Model: all-MiniLM-L6-v2\n",
    "- Dimensions: 384\n",
    "- Speed: Very Fast\n",
    "- Use case: General purpose, quick retrieval\n\n";

$mpnetEmbedding = new EmbeddingConfig(
    model: 'all-mpnet-base-v2',
    normalize: true,
    batchSize: 16,
);
$config2b = new ExtractionConfig(
    chunking: new ChunkingConfig(maxChunkSize: 512),
    embedding: $mpnetEmbedding,
);

echo "Model: all-mpnet-base-v2\n",
    "- Dimensions: 768\n",
    "- Speed: Medium\n",
    "- Use case: Higher quality semantic search\n\n";
// Example 3: the same model configured once with normalize: true and once
// with normalize: false. The configs are only constructed here.
echo "Example 3: Normalized vs Non-Normalized Embeddings\n",
    "==================================================\n";

$normalizedEmbedding = new EmbeddingConfig(model: 'all-MiniLM-L6-v2', normalize: true);
$config3a = new ExtractionConfig(
    chunking: new ChunkingConfig(maxChunkSize: 512),
    embedding: $normalizedEmbedding,
);

echo "Normalized embeddings:\n",
    "- Better for cosine similarity\n",
    "- Values in range [-1, 1]\n",
    "- Faster similarity computation\n\n";

$rawEmbedding = new EmbeddingConfig(model: 'all-MiniLM-L6-v2', normalize: false);
$config3b = new ExtractionConfig(
    chunking: new ChunkingConfig(maxChunkSize: 512),
    embedding: $rawEmbedding,
);

echo "Non-normalized embeddings:\n",
    "- Raw model output\n",
    "- Useful for specific distance metrics\n\n";
// Example 4: the same model configured with a small (8) and a large (64)
// batch size. The configs are only constructed here.
echo "Example 4: Batch Size Configuration\n",
    "===================================\n";

$smallBatchEmbedding = new EmbeddingConfig(
    model: 'all-MiniLM-L6-v2',
    normalize: true,
    batchSize: 8,
);
$config4a = new ExtractionConfig(
    chunking: new ChunkingConfig(maxChunkSize: 512),
    embedding: $smallBatchEmbedding,
);

echo "Batch size: 8\n",
    "- Lower memory usage\n",
    "- Slower processing\n",
    "- Good for limited resources\n\n";

$largeBatchEmbedding = new EmbeddingConfig(
    model: 'all-MiniLM-L6-v2',
    normalize: true,
    batchSize: 64,
);
$config4b = new ExtractionConfig(
    chunking: new ChunkingConfig(maxChunkSize: 512),
    embedding: $largeBatchEmbedding,
);

echo "Batch size: 64\n",
    "- Higher memory usage\n",
    "- Faster processing\n",
    "- Good for high-performance systems\n\n";
// Example 5: chunk + embed a document and collect one record per embedded
// chunk in the shape one would send to a vector database.
echo "Example 5: Complete RAG Pipeline\n";
echo "================================\n";
$config5 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true
    ),
    embedding: new EmbeddingConfig(
        model: 'all-MiniLM-L6-v2',
        normalize: true,
        batchSize: 32
    )
);
$result5 = (new Kreuzberg($config5))->extractFile('document.pdf');
if ($result5->chunks !== null) {
    echo "Processing " . count($result5->chunks) . " chunks with embeddings...\n\n";
    $vectorDbData = [];
    foreach ($result5->chunks as $i => $chunk) {
        // Chunks without an embedding are skipped.
        if ($chunk->embedding !== null) {
            $vectorDbData[] = [
                'id' => "chunk_{$i}",
                // Reads `content`, the property the chunking example script
                // uses (this script previously read `text`).
                // NOTE(review): confirm the property name against the Chunk type.
                'text' => $chunk->content,
                'embedding' => $chunk->embedding,
                'metadata' => [
                    'char_count' => $chunk->metadata->charCount,
                    'page_range' => $chunk->metadata->firstPage !== null
                        ? "{$chunk->metadata->firstPage}-{$chunk->metadata->lastPage}"
                        : null,
                ],
            ];
        }
    }
    echo "Prepared " . count($vectorDbData) . " vectors for database\n";
    // The list is empty when there are no chunks or none carries an embedding;
    // reading index 0 would then be an undefined-key access followed by a
    // TypeError in count().
    if ($vectorDbData !== []) {
        echo "Each vector has " . count($vectorDbData[0]['embedding']) . " dimensions\n";
    }
}
// Static reference text: prints the embedding parameter summary, best
// practices, and a model comparison table. No extraction is performed here.
echo "\n\nEmbedding Configuration Parameters:\n";
echo "===================================\n";
echo "- model: Embedding model name\n";
echo " * 'all-MiniLM-L6-v2': 384 dims, fast, general purpose\n";
echo " * 'all-mpnet-base-v2': 768 dims, higher quality\n";
echo "- normalize: L2 normalize embeddings (recommended: true)\n";
echo "- batchSize: Number of chunks to process at once\n";
echo "\nBest Practices:\n";
echo "- Use normalized embeddings for cosine similarity\n";
echo "- Choose batch size based on available memory\n";
echo "- Use all-MiniLM-L6-v2 for speed, all-mpnet-base-v2 for quality\n";
echo "- Combine with chunking for optimal RAG performance\n";
// Fixed-width comparison table; the column padding is part of the output.
echo "\n\nCommon Embedding Models:\n";
echo "========================\n";
echo "Model | Dimensions | Speed | Use Case\n";
echo "--------------------------|------------|----------|---------------------------\n";
echo "all-MiniLM-L6-v2 | 384 | Fast | General purpose, QA\n";
echo "all-mpnet-base-v2 | 768 | Medium | Better semantic search\n";
echo "paraphrase-MiniLM-L6-v2 | 384 | Fast | Paraphrase detection\n";
echo "paraphrase-mpnet-base-v2 | 768 | Medium | High-quality paraphrase\n";
```

View File

@@ -0,0 +1,65 @@
```php title="extraction_config.php"
<?php
declare(strict_types=1);
/**
* ExtractionConfig - Main Configuration
*
* The ExtractionConfig class is the primary configuration object that controls
* all aspects of document extraction. It can be passed to the Kreuzberg constructor
* or to individual extraction methods.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\PdfConfig;
// Basic usage: pass an ExtractionConfig to the Kreuzberg constructor and
// report how many images and tables the extraction returned.
$config = new ExtractionConfig(
    extractImages: true,
    extractTables: true,
    preserveFormatting: false,
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

// A null image list is counted as zero images.
$imageTotal = count($result->images ?? []);
$tableTotal = count($result->tables);
echo "Extracted with images: {$imageTotal}\n";
echo "Extracted with tables: {$tableTotal}\n\n";
// Advanced usage: combine OCR and PDF sub-configurations with top-level
// flags, then echo two settings back from the config object.
$advancedOcr = new OcrConfig(backend: 'tesseract', language: 'eng');
$advancedPdf = new PdfConfig(extractImages: true, imageQuality: 95);
$advancedConfig = new ExtractionConfig(
    ocr: $advancedOcr,
    pdf: $advancedPdf,
    extractImages: true,
    extractTables: true,
    preserveFormatting: true,
    outputFormat: 'markdown',
);
$kreuzberg = new Kreuzberg($advancedConfig);
$result = $kreuzberg->extractFile('complex_document.pdf');

echo "Advanced extraction complete\n";

// Falls back to 'plain' when no output format is set on the config.
$formatLabel = $advancedConfig->outputFormat ?? 'plain';
echo "Content format: {$formatLabel}\n";

if ($advancedConfig->preserveFormatting) {
    $preservedLabel = 'Yes';
} else {
    $preservedLabel = 'No';
}
echo "Formatting preserved: {$preservedLabel}\n";
// Per-call override: the instance is built with extractTables: false, and the
// second call passes its own config with extractTables: true.
$defaultConfig = new ExtractionConfig(extractTables: false);
$kreuzberg = new Kreuzberg($defaultConfig);
$result1 = $kreuzberg->extractFile('doc1.pdf');

$overrideConfig = new ExtractionConfig(extractTables: true);
$result2 = $kreuzberg->extractFile('doc2.pdf', config: $overrideConfig);

printf("\nDoc1 tables: %d\n", count($result1->tables));
printf("Doc2 tables: %d\n", count($result2->tables));
```

View File

@@ -0,0 +1,277 @@
```php title="image_extraction_config.php"
<?php
declare(strict_types=1);
/**
* Image Extraction Configuration
*
* This example demonstrates how to configure image extraction from documents,
* including size filtering and OCR on extracted images.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ImageExtractionConfig;
use Kreuzberg\Config\OcrConfig;
// Example 1: enable image extraction and print format, dimensions, page
// number, and data length for every extracted image.
echo "Example 1: Basic Image Extraction\n",
    "=================================\n";

$basicImageExtraction = new ImageExtractionConfig(extractImages: true);
$config1 = new ExtractionConfig(imageExtraction: $basicImageExtraction);
$kreuzberg = new Kreuzberg($config1);
$result = $kreuzberg->extractFile('presentation.pptx');

if ($result->images !== null) {
    echo 'Total images extracted: ', count($result->images), "\n";

    foreach ($result->images as $index => $picture) {
        $dataLength = strlen($picture->data);

        echo "\nImage {$index}:\n",
            "- Format: {$picture->format}\n",
            "- Size: {$picture->width}x{$picture->height} pixels\n",
            "- Page: {$picture->pageNumber}\n",
            "- Data size: {$dataLength} bytes\n";
    }
}

echo "\n\n";
// Example 2: extract images with a 200x200 minimum size and report how many
// were returned.
echo "Example 2: Image Extraction with Size Filter\n",
    "============================================\n";

$filteredExtraction = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 200,
    minHeight: 200,
);
$config2 = new ExtractionConfig(imageExtraction: $filteredExtraction);
$result2 = (new Kreuzberg($config2))->extractFile('document.pdf');

echo "Filtering images smaller than 200x200 pixels\n";
if ($result2->images !== null) {
    echo 'Filtered images: ', count($result2->images), "\n";
}

echo "\n\n";
// Example 3: configuration with an 800x600 minimum image size. The config is
// only constructed here; nothing is extracted.
echo "Example 3: Extract Only Large Images\n",
    "====================================\n";

$largeImageExtraction = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 800,
    minHeight: 600,
);
$config3 = new ExtractionConfig(imageExtraction: $largeImageExtraction);

echo "Configured to extract images >= 800x600 pixels\n",
    "Good for: Photos, large diagrams, full-page scans\n\n";
// Example 4: configuration with a 50x50 minimum image size. The config is
// only constructed here; nothing is extracted.
echo "Example 4: Extract All Images (Including Thumbnails)\n",
    "===================================================\n";

$thumbnailExtraction = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 50,
    minHeight: 50,
);
$config4 = new ExtractionConfig(imageExtraction: $thumbnailExtraction);

echo "Configured to extract images >= 50x50 pixels\n",
    "Good for: Extracting all images including icons and thumbnails\n\n";
// Example 5: extract images (minimum 100x100), run OCR on them, and print a
// preview and the length of each image's OCR text.
echo "Example 5: Image Extraction with OCR\n";
echo "====================================\n";
$config5 = new ExtractionConfig(
    imageExtraction: new ImageExtractionConfig(
        extractImages: true,
        performOcr: true,
        minWidth: 100,
        minHeight: 100
    ),
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng'
    )
);
$result5 = (new Kreuzberg($config5))->extractFile('document_with_images.pdf');
if ($result5->images !== null) {
    echo "Extracted " . count($result5->images) . " images with OCR:\n\n";
    foreach ($result5->images as $i => $image) {
        echo "Image {$i} (Page {$image->pageNumber}):\n";
        echo "- Size: {$image->width}x{$image->height}\n";
        if ($image->ocrResult !== null) {
            $ocrContent = $image->ocrResult->content;
            // The output is labelled "characters", so count and cut by
            // character: strlen()/substr() work on bytes and would miscount
            // or split multibyte text.
            echo "- OCR Text: " . mb_substr($ocrContent, 0, 100) . "...\n";
            echo "- OCR Text Length: " . mb_strlen($ocrContent) . " characters\n";
        }
        echo "\n";
    }
}
echo "\n\n";
// Example 6: extract images (minimum 200x200) and write each one to the
// `extracted_images` directory, creating the directory when needed.
echo "Example 6: Extract and Save Images to Disk\n";
echo "=========================================\n";
$config6 = new ExtractionConfig(
    imageExtraction: new ImageExtractionConfig(
        extractImages: true,
        minWidth: 200,
        minHeight: 200
    )
);
$result6 = (new Kreuzberg($config6))->extractFile('presentation.pptx');
if ($result6->images !== null) {
    $outputDir = 'extracted_images';
    // Re-check is_dir() after a failed mkdir(): the directory may have been
    // created concurrently, which is not an error.
    if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
        throw new RuntimeException("Unable to create output directory: {$outputDir}");
    }
    foreach ($result6->images as $i => $image) {
        $filename = "{$outputDir}/image_{$i}_page_{$image->pageNumber}.{$image->format}";
        // NOTE(review): this assumes $image->data is base64 text. Example 1
        // reports strlen($image->data) as the size in bytes, which suggests
        // raw bytes instead; confirm against the image type before relying on
        // this decode.
        $imageData = base64_decode($image->data);
        // Only report "Saved" when the write actually succeeded.
        if (file_put_contents($filename, $imageData) === false) {
            fwrite(STDERR, "Failed to write: {$filename}\n");
            continue;
        }
        echo "Saved: {$filename} ({$image->width}x{$image->height})\n";
    }
}
echo "\n\n";
// Example 7: three configurations with different minimum sizes and OCR
// settings, one per source file type. The configs are only constructed here.
echo "Example 7: File Type-Specific Image Extraction\n",
    "==============================================\n";

$pdfImageExtraction = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 300,
    minHeight: 300,
    performOcr: false,
);
$pdfConfig = new ExtractionConfig(imageExtraction: $pdfImageExtraction);

$pptxImageExtraction = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 100,
    minHeight: 100,
    performOcr: false,
);
$pptxConfig = new ExtractionConfig(imageExtraction: $pptxImageExtraction);

$imageFileExtraction = new ImageExtractionConfig(
    extractImages: true,
    performOcr: true,
    minWidth: 50,
    minHeight: 50,
);
$imageConfig = new ExtractionConfig(
    imageExtraction: $imageFileExtraction,
    ocr: new OcrConfig(backend: 'tesseract', language: 'eng'),
);

echo "PDF Configuration:\n",
    "- Min size: 300x300 (larger images only)\n",
    "- OCR: Disabled (PDFs have embedded text)\n\n";

echo "PowerPoint Configuration:\n",
    "- Min size: 100x100 (include icons/logos)\n",
    "- OCR: Disabled\n\n";

echo "Image File Configuration:\n",
    "- Min size: 50x50 (all images)\n",
    "- OCR: Enabled\n\n";
// Example 8: extract images with OCR, then for each image write the image
// file, its OCR text (when non-empty), and a JSON metadata file.
echo "Example 8: Complete Image Processing Pipeline\n";
echo "=============================================\n";
$config8 = new ExtractionConfig(
    imageExtraction: new ImageExtractionConfig(
        extractImages: true,
        performOcr: true,
        minWidth: 200,
        minHeight: 200
    ),
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng'
    )
);
$result8 = (new Kreuzberg($config8))->extractFile('mixed_content.pdf');
if ($result8->images !== null) {
    echo "Extracted images: " . count($result8->images) . "\n\n";
    foreach ($result8->images as $i => $image) {
        echo "Processing Image {$i}:\n";
        $isValid = $image->width >= 200 && $image->height >= 200;
        echo "- Valid size: " . ($isValid ? 'Yes' : 'No') . "\n";

        $filename = "image_{$i}.{$image->format}";
        // NOTE(review): assumes $image->data is base64 text; confirm against
        // the image type (Example 1 treats its length as a byte count).
        if (file_put_contents($filename, base64_decode($image->data)) === false) {
            fwrite(STDERR, "- Failed to write: {$filename}\n");
        } else {
            echo "- Saved: {$filename}\n";
        }

        if ($image->ocrResult !== null) {
            $ocrText = trim($image->ocrResult->content);
            // Compare against '' rather than using empty(): empty('0') is
            // true, which would silently drop OCR text consisting of "0".
            if ($ocrText !== '') {
                echo "- OCR text available: " . mb_strlen($ocrText) . " characters\n";
                if (file_put_contents("image_{$i}_ocr.txt", $ocrText) === false) {
                    fwrite(STDERR, "- Failed to write: image_{$i}_ocr.txt\n");
                }
            }
        }

        $metadata = [
            'format' => $image->format,
            'width' => $image->width,
            'height' => $image->height,
            'page' => $image->pageNumber,
            // A zero height would raise DivisionByZeroError; record null.
            'aspect_ratio' => $image->height > 0
                ? round($image->width / $image->height, 2)
                : null,
        ];
        // JSON_THROW_ON_ERROR surfaces encoding failures instead of letting
        // json_encode() return false and be written out as an empty file.
        $metadataJson = json_encode($metadata, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR);
        if (file_put_contents("image_{$i}_metadata.json", $metadataJson) === false) {
            fwrite(STDERR, "- Failed to write: image_{$i}_metadata.json\n\n");
        } else {
            echo "- Metadata saved\n\n";
        }
    }
}
// Static reference text: prints the image-extraction parameter summary, best
// practices, and common use cases. No extraction is performed here.
// NOTE(review): the "default:" values printed below are not established by
// anything in this script; verify them against ImageExtractionConfig.
echo "\nImage Extraction Configuration Parameters:\n";
echo "==========================================\n";
echo "- extractImages: Enable image extraction (default: false)\n";
echo "- performOcr: Run OCR on extracted images (default: false)\n";
echo "- minWidth: Minimum image width in pixels (default: 100)\n";
echo "- minHeight: Minimum image height in pixels (default: 100)\n";
echo "\n\nBest Practices:\n";
echo "===============\n";
echo "- Set minWidth/minHeight to filter out unwanted small images\n";
echo "- Use 200x200 as a good default for meaningful images\n";
echo "- Use 800x600+ for large photos and diagrams only\n";
echo "- Use 50x50 to include all images including icons\n";
echo "- Enable performOcr only when images contain text\n";
echo "- Combine with OCR config for multilingual text in images\n";
echo "- Save images to disk for further processing\n";
echo "\n\nCommon Use Cases:\n";
echo "=================\n";
echo "1. Extract photos from reports: minWidth=800, minHeight=600\n";
echo "2. Extract all graphics: minWidth=100, minHeight=100\n";
echo "3. OCR on images: performOcr=true + OcrConfig\n";
echo "4. Extract logos/icons: minWidth=50, minHeight=50\n";
```

View File

@@ -0,0 +1,302 @@
```php title="image_preprocessing_config.php"
<?php
declare(strict_types=1);
/**
* Image Preprocessing Configuration
*
* This example demonstrates image preprocessing options to improve OCR accuracy.
* Preprocessing can significantly enhance text recognition quality for poor-quality scans.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\ImagePreprocessingConfig;
// Example 1: Tesseract OCR (English) with a default-constructed
// ImagePreprocessingConfig. The config is only constructed here.
echo "Example 1: Default Image Preprocessing\n",
    "======================================\n";

$defaultPreprocessing = new ImagePreprocessingConfig();
$config1 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: $defaultPreprocessing,
    ),
);

echo "Default preprocessing settings:\n",
    "- Target DPI: 300 (standard for OCR)\n",
    "- Auto-rotate: Enabled\n",
    "- Denoise: Disabled\n\n";
// Example 2: preprocessing with targetDpi set to 600. The config is only
// constructed here.
echo "Example 2: High DPI Configuration\n",
    "=================================\n";

$highDpiPreprocessing = new ImagePreprocessingConfig(targetDpi: 600);
$config2 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: $highDpiPreprocessing,
    ),
);

echo "Target DPI: 600\n",
    "Best for:\n",
    "- Very small text\n",
    "- High-quality scans\n",
    "- Documents with fine details\n",
    "Note: Higher DPI = slower processing, more memory\n\n";
// Example 3: preprocessing with targetDpi set to 150. The config is only
// constructed here.
echo "Example 3: Lower DPI for Speed\n",
    "==============================\n";

$lowDpiPreprocessing = new ImagePreprocessingConfig(targetDpi: 150);
$config3 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: $lowDpiPreprocessing,
    ),
);

echo "Target DPI: 150\n",
    "Best for:\n",
    "- Large text\n",
    "- Low-resolution images\n",
    "- Fast processing needed\n",
    "Note: May reduce accuracy for small text\n\n";
// Example 4: preprocessing with autoRotate switched off. The config is only
// constructed here.
echo "Example 4: Manual Rotation Control\n",
    "==================================\n";

$noRotatePreprocessing = new ImagePreprocessingConfig(autoRotate: false);
$config4 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: $noRotatePreprocessing,
    ),
);

echo "Auto-rotate: Disabled\n",
    "Use when:\n",
    "- Images are already correctly oriented\n",
    "- Auto-rotation causes issues\n",
    "- Processing time is critical\n\n";
// Example 5: run OCR with denoising enabled (300 DPI, auto-rotate on) and
// report the length of the extracted text.
echo "Example 5: Denoising for Poor Quality Scans\n";
echo "===========================================\n";
$config5 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 300,
            autoRotate: true,
            denoise: true
        )
    )
);
$kreuzberg = new Kreuzberg($config5);
$result = $kreuzberg->extractFile('noisy_scan.pdf');
echo "Denoising: Enabled\n";
echo "Best for:\n";
echo "- Poor quality scans\n";
echo "- Fax documents\n";
echo "- Images with background noise\n";
echo "- Old or damaged documents\n";
// The figure is labelled "characters": mb_strlen() counts characters, whereas
// strlen() counts bytes and over-reports multibyte text.
echo "\nExtracted text length: " . mb_strlen($result->content) . " characters\n\n";
// Example 6: preprocessing with 600 DPI, auto-rotate, and denoise all
// enabled. The config is only constructed here.
echo "Example 6: Maximum Quality Configuration\n",
    "========================================\n";

$maxQualityPreprocessing = new ImagePreprocessingConfig(
    targetDpi: 600,
    autoRotate: true,
    denoise: true,
);
$config6 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: $maxQualityPreprocessing,
    ),
);

echo "Maximum quality preprocessing:\n",
    "- Target DPI: 600 (high quality)\n",
    "- Auto-rotate: Enabled\n",
    "- Denoise: Enabled\n";

echo "\nBest for:\n",
    "- Very poor quality scans\n",
    "- Historical documents\n",
    "- Faded or damaged text\n",
    "- Critical accuracy requirements\n\n";
// Example 7: preprocessing with 200 DPI and both auto-rotate and denoise
// switched off. The config is only constructed here.
echo "Example 7: Fast Processing Configuration\n",
    "========================================\n";

$fastPreprocessing = new ImagePreprocessingConfig(
    targetDpi: 200,
    autoRotate: false,
    denoise: false,
);
$config7 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: $fastPreprocessing,
    ),
);

echo "Fast processing configuration:\n",
    "- Target DPI: 200 (faster)\n",
    "- Auto-rotate: Disabled\n",
    "- Denoise: Disabled\n";

echo "\nBest for:\n",
    "- High-volume processing\n",
    "- Good quality source images\n",
    "- Performance-critical applications\n\n";
// Example 8: one ImagePreprocessingConfig per document type, each followed by
// the line describing it. The configs are only constructed here.
echo "Example 8: DPI Recommendations by Document Type\n",
    "===============================================\n";

// Builds a config that sets only the target DPI.
$withDpi = static fn (int $dpi): ImagePreprocessingConfig => new ImagePreprocessingConfig(targetDpi: $dpi);

$standardConfig = $withDpi(300);
echo "Standard documents (letters, reports): 300 DPI\n";

$newspaperConfig = $withDpi(400);
echo "Newspapers and magazines: 400 DPI\n";

$bookConfig = $withDpi(600);
echo "Books with small text: 600 DPI\n";

$receiptConfig = $withDpi(300);
echo "Receipts and forms: 300 DPI\n";

$businessCardConfig = $withDpi(400);
echo "Business cards: 400 DPI\n";

// Faxes additionally enable denoising.
$faxConfig = new ImagePreprocessingConfig(targetDpi: 300, denoise: true);
echo "Faxes: 300 DPI + denoising\n\n";
// Heading for Example 9 (quality-based preprocessing presets).
echo "Example 9: Adaptive Configuration by Image Quality\n";
echo "==================================================\n";
/**
 * Pick preprocessing settings for a named image-quality level.
 *
 * Known levels are 'excellent', 'good', 'fair' and 'poor'; any other value
 * yields a default-constructed ImagePreprocessingConfig.
 *
 * @param string $quality Quality label, matched exactly (case-sensitive).
 *
 * @return ImagePreprocessingConfig Settings for the given quality level.
 */
function getPreprocessingConfig(string $quality): ImagePreprocessingConfig
{
    // Per-level presets as [targetDpi, autoRotate, denoise].
    $presets = [
        'excellent' => [300, false, false],
        'good' => [300, true, false],
        'fair' => [400, true, true],
        'poor' => [600, true, true],
    ];

    if (!isset($presets[$quality])) {
        return new ImagePreprocessingConfig();
    }

    [$dpi, $rotate, $denoise] = $presets[$quality];

    return new ImagePreprocessingConfig(
        targetDpi: $dpi,
        autoRotate: $rotate,
        denoise: $denoise,
    );
}
// Static text describing the four quality presets: the DPI, auto-rotate and
// denoise values printed here mirror the ones returned by
// getPreprocessingConfig() for 'excellent', 'good', 'fair' and 'poor'.
echo "Quality-based configurations:\n\n";
echo "Excellent Quality:\n";
echo "- DPI: 300, Auto-rotate: No, Denoise: No\n";
echo "- Clean scans, properly oriented\n\n";
echo "Good Quality:\n";
echo "- DPI: 300, Auto-rotate: Yes, Denoise: No\n";
echo "- May need rotation correction\n\n";
echo "Fair Quality:\n";
echo "- DPI: 400, Auto-rotate: Yes, Denoise: Yes\n";
echo "- Some noise or quality issues\n\n";
echo "Poor Quality:\n";
echo "- DPI: 600, Auto-rotate: Yes, Denoise: Yes\n";
echo "- Significant quality problems\n\n";
// Example 10: run OCR with 300 DPI, auto-rotate and denoise enabled, then
// print the pipeline steps and the size of the result.
echo "Example 10: Complete OCR Pipeline with Preprocessing\n";
echo "===================================================\n";
$config10 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        imagePreprocessing: new ImagePreprocessingConfig(
            targetDpi: 300,
            autoRotate: true,
            denoise: true
        )
    )
);
$result10 = (new Kreuzberg($config10))->extractFile('poor_quality_scan.pdf');
echo "Processing pipeline:\n";
echo "1. Load image\n";
echo "2. Auto-detect orientation and rotate if needed\n";
echo "3. Upscale/downscale to 300 DPI\n";
echo "4. Apply denoising filter\n";
echo "5. Perform OCR\n";
echo "\nResults:\n";
// The figure is labelled "characters": mb_strlen() counts characters, whereas
// strlen() counts bytes and over-reports multibyte text.
echo "- Extracted text: " . mb_strlen($result10->content) . " characters\n";
// Prints 'N/A' when the metadata carries no page count.
echo "- Pages: " . ($result10->metadata->pageCount ?? 'N/A') . "\n";
// Static reference text: prints the preprocessing parameter summary, best
// practices, and a speed/quality comparison. No extraction is performed here.
echo "\n\nImage Preprocessing Parameters:\n";
echo "================================\n";
echo "- targetDpi: Target resolution in dots per inch\n";
echo " * 150 DPI: Fast, lower quality\n";
echo " * 300 DPI: Standard, good balance (RECOMMENDED)\n";
echo " * 400 DPI: Better for small text\n";
echo " * 600 DPI: Best quality, slower\n";
echo "\n";
echo "- autoRotate: Automatically detect and correct orientation\n";
echo " * true: Recommended for most cases\n";
echo " * false: Skip if images are already oriented\n";
echo "\n";
echo "- denoise: Apply noise reduction filter\n";
echo " * true: Recommended for poor quality scans\n";
echo " * false: Skip for clean images (faster)\n";
echo "\n\nBest Practices:\n";
echo "===============\n";
echo "1. Start with 300 DPI as a baseline\n";
echo "2. Enable auto-rotate unless you know images are correct\n";
echo "3. Enable denoising for poor quality documents\n";
echo "4. Use higher DPI (400-600) for small text\n";
echo "5. Use lower DPI (150-200) when speed is critical\n";
echo "6. Test different settings to find optimal balance\n";
echo "7. Consider source quality when choosing settings\n";
echo "8. Remember: Higher quality = slower processing + more memory\n";
// The four rows below correspond to the settings shown in the earlier examples.
echo "\n\nPerformance vs Quality Trade-offs:\n";
echo "==================================\n";
echo "Fastest: DPI=150, AutoRotate=No, Denoise=No\n";
echo "Balanced: DPI=300, AutoRotate=Yes, Denoise=No (RECOMMENDED)\n";
echo "Quality: DPI=400, AutoRotate=Yes, Denoise=Yes\n";
echo "Maximum: DPI=600, AutoRotate=Yes, Denoise=Yes\n";
```

View File

@@ -0,0 +1,115 @@
```php title="keyword_config.php"
<?php
declare(strict_types=1);
/**
* KeywordConfig - Keyword Extraction
*
* Automatically extract keywords and key phrases from documents.
* Useful for document categorization, search indexing, and summarization.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\KeywordConfig;
// Basic keyword extraction: request up to 10 keywords and list them.
$basicKeywordConfig = new KeywordConfig(
    maxKeywords: 10,
    minScore: 0.0,
    language: 'en',
);
$config = new ExtractionConfig(keyword: $basicKeywordConfig);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('article.pdf');

echo "Top Keywords:\n", str_repeat('=', 40), "\n";

// A missing keyword list is treated as empty.
$topKeywords = $result->metadata->keywords ?? [];
foreach ($topKeywords as $term) {
    echo " • {$term}\n";
}

echo "\n";
// Detailed analysis: request up to 25 keywords and print them grouped by
// their upper-cased first letter.
$detailedConfig = new ExtractionConfig(
    keyword: new KeywordConfig(
        maxKeywords: 25,
        minScore: 0.0,
        language: 'en'
    )
);
$kreuzberg = new Kreuzberg($detailedConfig);
$result = $kreuzberg->extractFile('research_paper.pdf');
echo "Detailed keyword analysis:\n";
echo "Total keywords: " . count($result->metadata->keywords ?? []) . "\n";
if (!empty($result->metadata->keywords)) {
    $grouped = [];
    foreach ($result->metadata->keywords as $keyword) {
        // An empty keyword has no first letter; indexing it with [0] would
        // raise an "Uninitialized string offset" warning.
        if ($keyword === '') {
            continue;
        }
        // Take the first character, not the first byte, so keywords starting
        // with a multibyte letter (e.g. "é") are grouped under a valid key.
        $first = mb_strtoupper(mb_substr($keyword, 0, 1));
        $grouped[$first][] = $keyword;
    }
    foreach ($grouped as $letter => $keywords) {
        echo "\n$letter:\n";
        foreach ($keywords as $keyword) {
            echo " - $keyword\n";
        }
    }
}
// Cross-document frequency: count in how many of the listed files each
// keyword appears, then print the ten most common keywords.
$files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];
$allKeywords = [];
foreach ($files as $file) {
    // Files that are not present are skipped.
    if (!file_exists($file)) {
        continue;
    }
    $result = $kreuzberg->extractFile($file);
    // De-duplicate within one document: the output below says "appears in N
    // documents", so a keyword repeated inside a single document must only
    // count once.
    foreach (array_unique($result->metadata->keywords ?? []) as $keyword) {
        $allKeywords[$keyword] = ($allKeywords[$keyword] ?? 0) + 1;
    }
}
// Highest document count first; keys (the keywords) are preserved.
arsort($allKeywords);
echo "\n\nMost common keywords across documents:\n";
$count = 0;
// preserve_keys keeps numeric-looking keywords from being renumbered.
foreach (array_slice($allKeywords, 0, 10, true) as $keyword => $frequency) {
    $count++;
    echo sprintf(" %2d. %-30s (appears in %d documents)\n",
        $count, $keyword, $frequency);
}
// Naive categorisation: score each category by how many of its terms occur
// (exact, case-sensitive match) among the document's keywords.
$categoryKeywords = [
    'technology' => ['software', 'computer', 'algorithm', 'data', 'system'],
    'business' => ['market', 'revenue', 'sales', 'customer', 'profit'],
    'science' => ['research', 'experiment', 'hypothesis', 'analysis', 'study'],
];
// NOTE(review): $result is whichever extraction ran last above (the last
// existing file in the loop, or research_paper.pdf when none exist) — confirm
// that this is the document intended for categorisation.
$docKeywords = $result->metadata->keywords ?? [];
$scores = [];
foreach ($categoryKeywords as $category => $terms) {
    $matched = array_filter(
        $terms,
        static fn (string $term): bool => in_array($term, $docKeywords, true)
    );
    $scores[$category] = count($matched);
}
arsort($scores);
$topCategory = array_key_first($scores);
// When nothing matched, every score is 0 and the "top" category is just the
// first one listed; report that no category applies instead.
if ($scores[$topCategory] > 0) {
    echo "\nDocument category: $topCategory (score: {$scores[$topCategory]})\n";
} else {
    echo "\nDocument category: none (no category keywords matched)\n";
}
```

View File

@@ -0,0 +1,97 @@
```php title="language_detection_config.php"
<?php
declare(strict_types=1);
/**
* LanguageDetectionConfig - Language Detection
*
* Automatically detect the languages present in a document.
* Useful for multilingual documents and routing to appropriate OCR languages.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\LanguageDetectionConfig;
// Basic usage: enable language detection and list whatever was detected.
$config = new ExtractionConfig(
    languageDetection: new LanguageDetectionConfig(enabled: true)
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('multilingual.pdf');
echo "Detected languages:\n";
// A null result is treated as "no languages detected".
$detected = $result->detectedLanguages ?? [];
foreach ($detected as $code) {
    echo ' - ' . $code . "\n";
}
echo "\n";
// Advanced usage: at most three languages, confidence threshold of 0.8.
$advancedConfig = new ExtractionConfig(
    languageDetection: new LanguageDetectionConfig(
        enabled: true,
        maxLanguages: 3,
        confidenceThreshold: 0.8
    )
);
$kreuzberg = new Kreuzberg($advancedConfig);
$result = $kreuzberg->extractFile('document.pdf');
if (empty($result->detectedLanguages)) {
    echo "No languages detected with sufficient confidence\n\n";
} else {
    echo "High-confidence languages detected:\n", implode(', ', $result->detectedLanguages), "\n\n";
}
// NOTE(review): import statements conventionally sit with the others at the
// top of the file; this one is kept here so the example reads unchanged.
use Kreuzberg\Config\OcrConfig;

// Two-pass workflow: detect the language first, then re-run the same file
// through Tesseract using the first detected language.
$detectConfig = new ExtractionConfig(
    languageDetection: new LanguageDetectionConfig(enabled: true)
);
$kreuzberg = new Kreuzberg($detectConfig);
$result = $kreuzberg->extractFile('scanned.pdf');
if (!empty($result->detectedLanguages)) {
    $primaryLanguage = $result->detectedLanguages[0];
    echo 'Primary language detected: ' . $primaryLanguage . "\n";
    echo 'Re-processing with OCR optimized for ' . $primaryLanguage . "...\n";
    // NOTE(review): the detected code is handed to Tesseract as-is; confirm
    // that detection results use the same code format Tesseract expects.
    $kreuzberg = new Kreuzberg(new ExtractionConfig(
        ocr: new OcrConfig(
            backend: 'tesseract',
            language: $primaryLanguage
        )
    ));
    $result = $kreuzberg->extractFile('scanned.pdf');
    echo "OCR extraction complete\n";
}
// Batch example: group existing files by their first detected language.
$files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];
// Use a dedicated detector. The preceding OCR step may have reassigned
// $kreuzberg to an instance whose configuration only sets OCR options, in
// which case no languages would be reported and every file would be
// filed under "unknown".
$languageDetector = new Kreuzberg(new ExtractionConfig(
    languageDetection: new LanguageDetectionConfig(enabled: true)
));
$languageMap = [];
foreach ($files as $file) {
    if (!file_exists($file)) {
        continue;
    }
    $result = $languageDetector->extractFile($file);
    // Fall back to "unknown" when nothing was detected.
    $lang = $result->detectedLanguages[0] ?? 'unknown';
    // Appending to a missing key creates the list implicitly.
    $languageMap[$lang][] = $file;
}
echo "\nDocuments grouped by language:\n";
foreach ($languageMap as $lang => $docs) {
    echo "$lang: " . implode(', ', $docs) . "\n";
}
```

View File

@@ -0,0 +1,205 @@
```php title="ocr_config.php"
<?php
declare(strict_types=1);
/**
* OCR Configuration
*
* This example demonstrates how to configure OCR (Optical Character Recognition)
* for extracting text from scanned documents and images.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\TesseractConfig;
// Example 1: Tesseract backend with English as the OCR language.
echo "Example 1: Basic OCR Configuration\n", "==================================\n";
$config1 = new ExtractionConfig(
    ocr: new OcrConfig(backend: 'tesseract', language: 'eng')
);
$kreuzberg = new Kreuzberg($config1);
$result = $kreuzberg->extractFile('scanned_document.pdf');
// strlen() reports the length in bytes.
$contentLength = strlen($result->content);
echo "Extracted text length: {$contentLength} characters\n\n";
// Example 2: several languages in one OCR pass, joined with '+'.
echo "Example 2: Multi-Language OCR\n", "=============================\n";
$config2 = new ExtractionConfig(
    ocr: new OcrConfig(backend: 'tesseract', language: 'eng+fra+deu')
);
echo "Configured for languages: English, French, German\n", "Use this for multilingual documents\n\n";

// Example 3: one single-language configuration per variable, built in the
// order Spanish, French, German, Chinese (Simplified), Chinese (Traditional),
// Japanese, Korean, Arabic.
echo "Example 3: Language-Specific OCR\n", "================================\n";
[$config3a, $config3b, $config3c, $config3d, $config3e, $config3f, $config3g, $config3h] = array_map(
    static fn (string $code): ExtractionConfig => new ExtractionConfig(
        ocr: new OcrConfig(backend: 'tesseract', language: $code)
    ),
    ['spa', 'fra', 'deu', 'chi_sim', 'chi_tra', 'jpn', 'kor', 'ara']
);

// Cheat sheet of the language codes printed for the reader.
$languageCodes = [
    'eng' => 'English',
    'fra' => 'French',
    'deu' => 'German',
    'spa' => 'Spanish',
    'ita' => 'Italian',
    'por' => 'Portuguese',
    'rus' => 'Russian',
    'chi_sim' => 'Chinese (Simplified)',
    'chi_tra' => 'Chinese (Traditional)',
    'jpn' => 'Japanese',
    'kor' => 'Korean',
    'ara' => 'Arabic',
];
echo "Common Tesseract Language Codes:\n";
foreach ($languageCodes as $code => $languageName) {
    echo "- {$code}: {$languageName}\n";
}
echo "\n";
// Example 4: pass explicit Tesseract options (page segmentation mode, engine
// mode, table detection) through OcrConfig.
echo "Example 4: Advanced Tesseract Configuration\n";
echo "==========================================\n";
$config4 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            oem: 3,
            enableTableDetection: true
        )
    )
);
echo "Tesseract Configuration:\n";
echo "- PSM (Page Segmentation Mode): 6 (uniform text block)\n";
// OEM 3 is the "default, based on what is available" mode; "LSTM only" is
// OEM 1, as the engine-mode table printed later in this script states.
echo "- OEM (OCR Engine Mode): 3 (default, based on what is available)\n";
echo "- Table Detection: Enabled\n\n";
// Example 5: forms and invoices - table detection plus a whitelist of
// alphanumerics and a few punctuation characters.
echo "Example 5: OCR for Forms and Invoices\n", "=====================================\n";
$formWhitelist = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz$.,- ';
$config5 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            oem: 3,
            enableTableDetection: true,
            tesseditCharWhitelist: $formWhitelist
        )
    )
);
echo "Optimized for forms and invoices:\n", "- Table detection enabled\n", "- Character whitelist for common form characters\n\n";

// Example 6: numeric documents - digits and basic money punctuation only.
echo "Example 6: OCR for Numeric Documents\n", "====================================\n";
$numericWhitelist = '0123456789$.,- ';
$config6 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            oem: 3,
            tesseditCharWhitelist: $numericWhitelist
        )
    )
);
echo "Character whitelist: '{$numericWhitelist}'\n", "Best for: Invoices, receipts, financial documents\n\n";

// Example 7: blacklist characters that should never be recognised.
echo "Example 7: OCR with Character Blacklist\n", "=======================================\n";
$symbolBlacklist = '|!@#%^&*()';
$config7 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            oem: 3,
            tesseditCharBlacklist: $symbolBlacklist
        )
    )
);
echo "Character blacklist: '{$symbolBlacklist}'\n", "Use to exclude problematic characters\n\n";
// Reference table: page segmentation modes, keyed by PSM number.
$pageSegmentationModes = [
    0 => 'Orientation and script detection (OSD) only',
    1 => 'Automatic page segmentation with OSD',
    2 => 'Automatic page segmentation (no OSD or OCR)',
    3 => 'Fully automatic page segmentation (default)',
    4 => 'Assume a single column of text of variable sizes',
    5 => 'Assume a single uniform block of vertically aligned text',
    6 => 'Assume a single uniform block of text (recommended for most)',
    7 => 'Treat the image as a single text line',
    8 => 'Treat the image as a single word',
    9 => 'Treat the image as a single word in a circle',
    10 => 'Treat the image as a single character',
    11 => 'Sparse text. Find as much text as possible',
    12 => 'Sparse text with OSD',
    13 => 'Raw line. Treat the image as a single text line',
];
echo "\nPage Segmentation Modes (PSM):\n";
echo "==============================\n";
foreach ($pageSegmentationModes as $mode => $description) {
    echo "{$mode} = {$description}\n";
}

// Reference table: OCR engine modes, keyed by OEM number.
$engineModes = [
    0 => 'Legacy engine only',
    1 => 'Neural nets LSTM engine only',
    2 => 'Legacy + LSTM engines',
    3 => 'Default, based on what is available (recommended)',
];
echo "\n\nOCR Engine Modes (OEM):\n";
echo "======================\n";
foreach ($engineModes as $mode => $description) {
    echo "{$mode} = {$description}\n";
}

// Closing checklist, printed as a dash-prefixed list.
$bestPractices = [
    'Use PSM 6 for general documents',
    'Use PSM 11 for sparse text (screenshots, signs)',
    'Use OEM 3 (default) for best results',
    'Enable table detection for structured documents',
    'Use character whitelists for forms/invoices',
    "Combine multiple languages with '+' separator",
    'Preprocess images for better accuracy (see image_preprocessing.php)',
];
echo "\n\nBest Practices:\n";
echo "===============\n";
foreach ($bestPractices as $tip) {
    echo "- {$tip}\n";
}
```

View File

@@ -0,0 +1,82 @@
```php title="page_config.php"
<?php
declare(strict_types=1);
/**
* PageConfig - Page-Level Extraction
*
* Configure per-page content extraction and page markers for maintaining
* document structure in the extracted text.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\PageConfig;
// Keep a single content string but have a marker line inserted per page.
$config = new ExtractionConfig(
    page: new PageConfig(
        extractPages: false,
        insertPageMarkers: true,
        markerFormat: '--- Page {page_number} ---'
    )
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('report.pdf');
$rule = str_repeat('=', 60);
echo "Content with page markers:\n";
echo "{$rule}\n";
echo "{$result->content}\n\n";
// Extract each page separately and print a short preview plus the number
// of tables and images found on it.
$pageConfig = new ExtractionConfig(
    page: new PageConfig(
        extractPages: true,
        insertPageMarkers: false
    )
);
$kreuzberg = new Kreuzberg($pageConfig);
$result = $kreuzberg->extractFile('multi_page.pdf');
// A null page list is treated as "no pages".
foreach ($result->pages ?? [] as $page) {
    echo "Page {$page->pageNumber}:\n";
    echo str_repeat('-', 60) . "\n";
    // Cut on a character boundary: byte-wise substr() could split a
    // multi-byte UTF-8 character and emit an invalid sequence.
    echo mb_substr($page->content, 0, 200, 'UTF-8') . "...\n";
    echo "Tables on page: " . count($page->tables) . "\n";
    echo "Images on page: " . count($page->images) . "\n\n";
}
// Insert a distinctive marker between pages, then split the combined
// content back into per-page sections on that marker.
$customConfig = new ExtractionConfig(
    page: new PageConfig(
        extractPages: false,
        insertPageMarkers: true,
        markerFormat: "\n\n========== PAGE {page_number} ==========\n\n"
    )
);
$kreuzberg = new Kreuzberg($customConfig);
$result = $kreuzberg->extractFile('document.pdf');
$sections = preg_split('/={10} PAGE \d+ ={10}/', $result->content);
if ($sections === false) {
    // preg_split() returns false on a PCRE failure; count(false) would throw.
    $sections = [];
}
// Drop whitespace-only fragments (for example, whatever precedes the first
// marker) so they are not counted as sections.
$pages = array_values(array_filter(
    $sections,
    static fn (string $section): bool => trim($section) !== ''
));
echo "Split into " . count($pages) . " sections\n";
// Extract all pages, then keep only those numbered 10 through 20.
$allPagesConfig = new ExtractionConfig(
    page: new PageConfig(extractPages: true)
);
$kreuzberg = new Kreuzberg($allPagesConfig);
$result = $kreuzberg->extractFile('large_doc.pdf');
$selectedPages = [];
foreach ($result->pages ?? [] as $index => $candidate) {
    // Original keys are kept for the pages that pass the range check.
    if ($candidate->pageNumber >= 10 && $candidate->pageNumber <= 20) {
        $selectedPages[$index] = $candidate;
    }
}
echo "\nSelected pages 10-20:\n";
foreach ($selectedPages as $page) {
    // strlen() reports the length in bytes.
    $pageLength = strlen($page->content);
    echo "Page {$page->pageNumber}: {$pageLength} chars\n";
}
```

View File

@@ -0,0 +1,70 @@
```php title="pdf_config.php"
<?php
declare(strict_types=1);
/**
* PdfConfig - PDF-Specific Configuration
*
* Configure PDF extraction behavior including image quality, text extraction
* methods, and performance optimization.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\PdfConfig;
// Basic PDF options: extract images at quality 85, keeping their format.
$config = new ExtractionConfig(
    pdf: new PdfConfig(extractImages: true, imageQuality: 85, preserveImageFormat: true)
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');
// A null image list counts as zero images.
$imageCount = count($result->images ?? []);
echo "PDF extraction complete\n", "Images extracted: {$imageCount}\n\n";
// High-quality variant: extract images at quality 100 and write each one
// to the current directory.
$highQualityConfig = new ExtractionConfig(
    pdf: new PdfConfig(
        extractImages: true,
        imageQuality: 100,
        preserveImageFormat: true
    ),
    extractImages: true
);
$kreuzberg = new Kreuzberg($highQualityConfig);
$result = $kreuzberg->extractFile('presentation.pdf');
// A null image list is treated as "no images".
foreach ($result->images ?? [] as $image) {
    // File name pattern: image_<index>_page_<page>.<format>
    $filename = sprintf('image_%d_page_%d.%s',
        $image->imageIndex,
        $image->pageNumber,
        $image->format
    );
    // file_put_contents() returns false on failure; do not claim the image
    // was saved when the write did not succeed.
    if (file_put_contents($filename, $image->data) === false) {
        echo "Failed to save image: $filename\n";
        continue;
    }
    echo "Saved high-quality image: $filename ({$image->width}x{$image->height})\n";
}
// Speed-oriented variant: no image or table extraction; the extraction call
// is timed with microtime().
$fastConfig = new ExtractionConfig(
    pdf: new PdfConfig(extractImages: false, imageQuality: 50),
    extractTables: false
);
$kreuzberg = new Kreuzberg($fastConfig);
$start = microtime(true);
$result = $kreuzberg->extractFile('large_document.pdf');
$elapsed = microtime(true) - $start;
$elapsedLabel = number_format($elapsed, 3);
echo "\nFast extraction completed in {$elapsedLabel} seconds\n";
// strlen() reports the length in bytes.
$contentLength = strlen($result->content);
echo "Content length: {$contentLength} characters\n";
```

View File

@@ -0,0 +1,71 @@
```php title="pdf_hierarchy_config.php"
<?php
declare(strict_types=1);
/**
* PdfHierarchyConfig - Hierarchy Detection Configuration
*
* Configure PDF document structure analysis and hierarchy detection
* using k-clustering for document organization recognition.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\PdfConfig;
// Hierarchy detection is switched on through the `hierarchy` options array
// passed to PdfConfig.
$hierarchyOptions = [
    'enabled' => true,
    'k_clusters' => 6,
    'include_bbox' => true,
    'ocr_coverage_threshold' => 0.8,
];
$config = new ExtractionConfig(
    pdf: new PdfConfig(extractImages: true, hierarchy: $hierarchyOptions)
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');
echo "Hierarchy detection enabled\n";
// strlen() reports the length in bytes.
$contentLength = strlen($result->content);
echo "Content length: {$contentLength} characters\n";
// Alternative for complex documents: more clusters and a different OCR
// coverage threshold.
$advancedHierarchy = [
    'enabled' => true,
    // 12 clusters for a more detailed hierarchy
    'k_clusters' => 12,
    // include bounding box coordinates
    'include_bbox' => true,
    // 0.7 here; NOTE(review): the original comment called this a "higher"
    // threshold, but the basic example in this file uses 0.8 - confirm intent
    'ocr_coverage_threshold' => 0.7,
];
$advancedConfig = new ExtractionConfig(
    pdf: new PdfConfig(extractImages: true, hierarchy: $advancedHierarchy)
);
$kreuzberg = new Kreuzberg($advancedConfig);
$result = $kreuzberg->extractFile('complex_document.pdf');
echo "Advanced hierarchy detection completed\n", "Detected structure preserved in output\n";
// Speed-oriented variant: hierarchy detection and image extraction both off.
$hierarchyOff = ['enabled' => false];
$fastConfig = new ExtractionConfig(
    pdf: new PdfConfig(extractImages: false, hierarchy: $hierarchyOff)
);
$kreuzberg = new Kreuzberg($fastConfig);
$result = $kreuzberg->extractFile('simple_document.pdf');
echo "Fast extraction without hierarchy detection\n";
```

View File

@@ -0,0 +1,313 @@
```php title="tesseract_config.php"
<?php
declare(strict_types=1);
/**
* Tesseract OCR Configuration
*
* This example demonstrates advanced Tesseract OCR configuration options
* for fine-tuning OCR performance and accuracy.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\TesseractConfig;
// Example 1: a TesseractConfig built with no arguments.
echo "Example 1: Default Tesseract Configuration\n", "==========================================\n";
$defaultTesseract = new TesseractConfig();
$config1 = new ExtractionConfig(
    ocr: new OcrConfig(backend: 'tesseract', language: 'eng', tesseractConfig: $defaultTesseract)
);
echo implode("\n", [
    "Default settings:",
    "- PSM: 3 (Fully automatic page segmentation)",
    "- OEM: 3 (Default, based on what's available)",
    "- Table Detection: Disabled",
]), "\n\n";
// Example 2: the same English Tesseract setup with different page
// segmentation modes.
echo "Example 2: Different Page Segmentation Modes\n", "============================================\n";

// Builds an English Tesseract configuration that differs only in its PSM.
$withPsm = static fn (int $psm): ExtractionConfig => new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(psm: $psm)
    )
);

$config2a = $withPsm(6);
echo "PSM 6 - Uniform block of text:\n",
    "- Best for: Most documents, clean text blocks\n",
    "- Use when: Document has clear text structure\n\n";

$config2b = $withPsm(11);
echo "PSM 11 - Sparse text:\n",
    "- Best for: Screenshots, signs, sparse documents\n",
    "- Use when: Text is scattered across the image\n\n";

$config2c = $withPsm(7);
echo "PSM 7 - Single text line:\n",
    "- Best for: Single line of text, headers, captions\n",
    "- Use when: Processing individual text lines\n\n";

$config2d = $withPsm(8);
echo "PSM 8 - Single word:\n",
    "- Best for: Individual words, labels\n",
    "- Use when: Processing single words\n\n";
// Example 3: enable table detection, run an extraction and print every
// table that was found as Markdown.
echo "Example 3: Table Detection\n";
echo "=========================\n";
$tableTesseract = new TesseractConfig(psm: 6, enableTableDetection: true);
$config3 = new ExtractionConfig(
    ocr: new OcrConfig(backend: 'tesseract', language: 'eng', tesseractConfig: $tableTesseract)
);
$kreuzberg = new Kreuzberg($config3);
$result = $kreuzberg->extractFile('scanned_invoice.pdf');
echo "Table detection enabled\n", "Best for: Forms, invoices, spreadsheets, reports\n";
$tableCount = count($result->tables);
if ($tableCount > 0) {
    echo "\nExtracted tables: {$tableCount}\n";
    foreach ($result->tables as $index => $table) {
        // Tables are numbered from 1 for display.
        $tableNumber = $index + 1;
        echo "\nTable {$tableNumber}:\n", $table->markdown, "\n";
    }
}
echo "\n\n";
// Example 4: restrict recognition to a whitelist of characters. Three
// variants: digits only, alphanumerics, and financial characters.
echo "Example 4: Character Whitelisting\n";
echo "=================================\n";
$config4a = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            tesseditCharWhitelist: '0123456789'
        )
    )
);
echo "Whitelist: '0123456789' (digits only)\n";
echo "Best for: Serial numbers, IDs, numeric codes\n\n";
$config4b = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            tesseditCharWhitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
        )
    )
);
echo "Whitelist: Letters and numbers only\n";
echo "Best for: Product codes, alphanumeric IDs\n\n";
// Single-quoted on purpose. In a double-quoted string "$" followed by the
// multi-byte currency symbols is parsed as a variable name (bytes >= 0x80
// are valid identifier bytes), which would drop the symbols from the output
// and raise an "Undefined variable" warning.
$financialWhitelist = '0123456789$€£¥.,- ';
$config4c = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            tesseditCharWhitelist: $financialWhitelist
        )
    )
);
echo "Whitelist: '{$financialWhitelist}' (financial data)\n";
echo "Best for: Invoices, receipts, price lists\n\n";
// Example 5: blacklist characters that should never be recognised.
echo "Example 5: Character Blacklisting\n", "=================================\n";
$symbolBlacklist = '|!@#%^&*()';
$config5 = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(psm: 6, tesseditCharBlacklist: $symbolBlacklist)
    )
);
echo "Blacklist: '{$symbolBlacklist}'\n", "Use to: Exclude problematic characters that cause OCR errors\n\n";
// Example 6: the same English Tesseract setup with different OCR engine
// modes.
echo "Example 6: OCR Engine Modes\n", "===========================\n";

// Builds an English Tesseract configuration that differs only in its OEM.
$withOem = static fn (int $oem): ExtractionConfig => new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(oem: $oem)
    )
);

$config6a = $withOem(0);
echo "OEM 0 - Legacy engine:\n",
    "- Older, simpler algorithm\n",
    "- Sometimes better for very low-quality scans\n\n";

$config6b = $withOem(1);
echo "OEM 1 - LSTM neural network:\n",
    "- Modern deep learning approach\n",
    "- Better accuracy for most documents\n\n";

$config6c = $withOem(3);
echo "OEM 3 - Default (recommended):\n",
    "- Chooses best available engine\n",
    "- Use this unless you have specific needs\n\n";
// Example 7: invoice preset - PSM 6, OEM 3, table detection, and a whitelist
// of alphanumerics, currency symbols and a few separators.
echo "Example 7: Complete Invoice Processing Configuration\n", "====================================================\n";
$invoiceWhitelist = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz$€£.,- :#/';
$invoiceTesseract = new TesseractConfig(
    psm: 6,
    oem: 3,
    enableTableDetection: true,
    tesseditCharWhitelist: $invoiceWhitelist
);
$config7 = new ExtractionConfig(
    ocr: new OcrConfig(backend: 'tesseract', language: 'eng', tesseractConfig: $invoiceTesseract)
);
echo "Invoice-optimized configuration:\n",
    "- PSM 6: Structured text\n",
    "- Table detection: Enabled\n",
    "- Character whitelist: Alphanumeric + currency + common symbols\n",
    "- Best for: Invoices, receipts, financial documents\n\n";

// Example 8: form preset - same modes, with a whitelist of alphanumerics
// plus ". , - space @".
echo "Example 8: Complete Form Processing Configuration\n", "=================================================\n";
$formWhitelist = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- @';
$formTesseract = new TesseractConfig(
    psm: 6,
    oem: 3,
    enableTableDetection: true,
    tesseditCharWhitelist: $formWhitelist
);
$config8 = new ExtractionConfig(
    ocr: new OcrConfig(backend: 'tesseract', language: 'eng', tesseractConfig: $formTesseract)
);
echo "Form-optimized configuration:\n",
    "- PSM 6: Structured text\n",
    "- Table detection: Enabled\n",
    "- Character whitelist: Alphanumeric + common form characters\n",
    "- Best for: Forms, applications, surveys\n\n";
// Example 9: sparse-text preset (PSM 11 with OEM 3).
echo "Example 9: Sparse Text Configuration\n", "====================================\n";
$sparseTesseract = new TesseractConfig(psm: 11, oem: 3);
$config9 = new ExtractionConfig(
    ocr: new OcrConfig(backend: 'tesseract', language: 'eng', tesseractConfig: $sparseTesseract)
);
echo "Sparse text configuration:\n",
    "- PSM 11: Find scattered text\n",
    "- Best for: Screenshots, signs, posters, sparse documents\n\n";
// Reference section: prints the page segmentation modes, the OCR engine
// modes and a short best-practice checklist.
echo "\nAll Page Segmentation Modes:\n";
echo "============================\n";
echo "0 = OSD only (orientation and script detection)\n";
echo "1 = Automatic page segmentation with OSD\n";
echo "2 = Automatic page segmentation (no OSD or OCR)\n";
echo "3 = Fully automatic page segmentation (default)\n";
echo "4 = Single column of variable-sized text\n";
echo "5 = Single uniform block of vertically aligned text\n";
echo "6 = Single uniform block of text (RECOMMENDED)\n";
echo "7 = Single text line\n";
echo "8 = Single word\n";
echo "9 = Single word in a circle\n";
echo "10 = Single character\n";
echo "11 = Sparse text (RECOMMENDED for screenshots)\n";
echo "12 = Sparse text with OSD\n";
echo "13 = Raw line\n";
echo "\n\nOCR Engine Modes:\n";
echo "=================\n";
echo "0 = Legacy engine only\n";
echo "1 = LSTM neural network only\n";
echo "2 = Legacy + LSTM\n";
echo "3 = Default (RECOMMENDED)\n";
echo "\n\nBest Practices:\n";
echo "===============\n";
// Only OEM 3 is a default; the PSM table above lists 3, not 6, as the
// default page segmentation mode, so the wording must not call both defaults.
echo "1. Start with PSM 6 and OEM 3 (the default engine mode)\n";
echo "2. Use PSM 11 for sparse/scattered text\n";
echo "3. Enable table detection for structured documents\n";
echo "4. Use character whitelists for constrained input\n";
echo "5. Use blacklists to exclude problem characters\n";
echo "6. Test different PSM values if accuracy is poor\n";
echo "7. Combine with image preprocessing for better results\n";
```