Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/php/configuration/chunking_config.php
+++ b/docs/snippets/php/configuration/chunking_config.php
@@ -0,0 +1,207 @@
+```php title="chunking_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Text Chunking Configuration
+ *
+ * This example demonstrates how to configure text chunking for RAG (Retrieval-Augmented Generation)
+ * applications. Chunking splits long documents into smaller, semantically meaningful segments.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\ChunkingConfig;
+
+echo "Example 1: Basic Chunking\n";
+echo "=========================\n";
+
+$config1 = new ExtractionConfig(
+    chunking: new ChunkingConfig()
+);
+
+$kreuzberg = new Kreuzberg($config1);
+$result = $kreuzberg->extractFile('long_document.pdf');
+
+// Example 1 report: for every chunk print its character count, byte span and,
+// when page information is present, the page span.
+if ($result->chunks !== null) {
+    printf("Total chunks: %s\n", count($result->chunks));
+    foreach ($result->chunks as $i => $chunk) {
+        $meta = $chunk->metadata;
+
+        printf("\nChunk %s:\n", $i);
+        printf("- Text length: %s characters\n", $meta->charCount);
+        printf("- Byte range: %s-%s\n", $meta->byteStart, $meta->byteEnd);
+
+        // Page numbers are only printed when the first page is known.
+        if ($meta->firstPage !== null) {
+            printf("- Pages: %s-%s\n", $meta->firstPage, $meta->lastPage);
+        }
+    }
+}
+
+echo "\n\n";
+
+echo "Example 2: Custom Chunk Size (Small chunks for fine-grained retrieval)\n";
+echo "======================================================================\n";
+
+$config2 = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        maxChunkSize: 256,      
+        chunkOverlap: 25,       
+        respectSentences: true, 
+        respectParagraphs: false
+    )
+);
+
+// Extract with the small-chunk configuration and report the chunk count
+// (0 when the result carries no chunk list).
+$result2 = (new Kreuzberg($config2))->extractFile('document.pdf');
+echo "Chunks created: " . count($result2->chunks ?? []) . "\n\n";
+
+echo "Example 3: Large Chunks (More context per chunk)\n";
+echo "================================================\n";
+
+$config3 = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        maxChunkSize: 2000,      
+        chunkOverlap: 200,       
+        respectSentences: true,  
+        respectParagraphs: true  
+    )
+);
+
+// Extract with the large-chunk configuration and report the chunk count
+// (0 when the result carries no chunk list).
+$result3 = (new Kreuzberg($config3))->extractFile('document.pdf');
+echo "Chunks created: " . count($result3->chunks ?? []) . "\n\n";
+
+echo "Example 4: RAG-Optimized Configuration\n";
+echo "=====================================\n";
+
+$config4 = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        maxChunkSize: 512,       
+        chunkOverlap: 50,        
+        respectSentences: true,  
+        respectParagraphs: false 
+    )
+);
+
+$result4 = (new Kreuzberg($config4))->extractFile('document.pdf');
+
+// Example 4 report: total number of chunks plus average/min/max chunk size.
+if ($result4->chunks !== null) {
+    $chunkSizes = array_map(
+        static fn ($chunk) => $chunk->metadata->charCount,
+        $result4->chunks
+    );
+    $chunkTotal = count($chunkSizes);
+    echo "Total chunks: " . $chunkTotal . "\n";
+
+    // A non-null but empty chunk list would otherwise divide by zero and make
+    // min()/max() throw a ValueError, so statistics are printed only when
+    // there is at least one chunk.
+    if ($chunkTotal > 0) {
+        echo "Average chunk size: " . round(array_sum($chunkSizes) / $chunkTotal) . " characters\n";
+        echo "Min chunk size: " . min($chunkSizes) . " characters\n";
+        echo "Max chunk size: " . max($chunkSizes) . " characters\n";
+    }
+}
+
+echo "\n\n";
+
+echo "Example 5: Processing Chunks for Vector Database\n";
+echo "================================================\n";
+
+$config5 = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        maxChunkSize: 512,
+        chunkOverlap: 50,
+        respectSentences: true
+    )
+);
+
+$result5 = (new Kreuzberg($config5))->extractFile('document.pdf');
+
+// Example 5: build one flat record per chunk, in the shape a vector database
+// row might take.
+if ($result5->chunks !== null) {
+    // The document identifier is identical for every chunk, so assign it once
+    // instead of on every iteration.
+    $documentId = "doc_123";
+
+    foreach ($result5->chunks as $i => $chunk) {
+        $meta = $chunk->metadata;
+        $chunkData = [
+            'document_id' => $documentId,
+            'chunk_index' => $i,
+            'text' => $chunk->content,
+            'char_count' => $meta->charCount,
+            'byte_start' => $meta->byteStart,
+            'byte_end' => $meta->byteEnd,
+            // Page range is only available when the first page is known.
+            'page_range' => $meta->firstPage !== null
+                ? "{$meta->firstPage}-{$meta->lastPage}"
+                : null,
+        ];
+
+        // $chunkData is not persisted by this example; hand it to your
+        // database client here.
+        echo "Prepared chunk {$i} for database insertion\n";
+    }
+}
+
+echo "\n\n";
+
+echo "Example 6: Markdown Chunker with Token-Based Sizing and Heading Context\n";
+echo "========================================================================\n";
+
+$config6 = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        chunkerType: 'markdown',
+        sizing: [
+            'type' => 'tokenizer',
+            'model' => 'Xenova/gpt-4o'
+        ]
+    )
+);
+
+$result6 = (new Kreuzberg($config6))->extractFile('document.md');
+
+// Example 6 report: a short preview of each chunk plus the headings that
+// apply to it, when heading context is present.
+if ($result6->chunks !== null) {
+    echo "Total chunks: " . count($result6->chunks) . "\n";
+
+    foreach ($result6->chunks as $i => $chunk) {
+        echo "\nChunk {$i}:\n";
+        // mb_substr() counts characters rather than bytes, so the 60-character
+        // preview never ends in the middle of a multibyte UTF-8 sequence.
+        // Requires the mbstring extension.
+        echo "- Text preview: " . mb_substr($chunk->content, 0, 60, 'UTF-8') . "...\n";
+
+        // isset() on the full chain also covers a missing or null headingContext.
+        if (isset($chunk->metadata->headingContext->headings)) {
+            echo "- Headings in context:\n";
+            foreach ($chunk->metadata->headingContext->headings as $heading) {
+                echo "  - Level {$heading->level}: {$heading->text}\n";
+            }
+        }
+    }
+}
+
+echo "\n\nChunking Configuration Parameters:\n";
+echo "==================================\n";
+echo "- maxChunkSize: Maximum number of characters per chunk\n";
+echo "- chunkOverlap: Number of overlapping characters between chunks\n";
+echo "- respectSentences: Split at sentence boundaries when possible\n";
+echo "- respectParagraphs: Split at paragraph boundaries when possible\n";
+echo "- chunkerType: Type of chunker ('simple' or 'markdown')\n";
+echo "- sizing: Sizing strategy configuration\n";
+echo "  - type: 'character' or 'tokenizer'\n";
+echo "  - model: Tokenizer model (e.g., 'Xenova/gpt-4o')\n";
+echo "\n\n";
+
+echo "Example 7: Prepend Heading Context\n";
+echo "====================================\n";
+
+$config7 = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        chunkerType: 'markdown',
+        prependHeadingContext: true
+    )
+);
+
+$result7 = (new Kreuzberg($config7))->extractFile('document.md');
+
+// Example 7 report: preview the start of each chunk.
+if ($result7->chunks !== null) {
+    echo "Total chunks: " . count($result7->chunks) . "\n";
+
+    foreach ($result7->chunks as $i => $chunk) {
+        // Each chunk's content is prefixed with its heading breadcrumb,
+        // e.g. "# Section > ## Subsection\n\nActual content..."
+        //
+        // mb_substr() truncates by characters, not bytes, so the preview cannot
+        // split a multibyte UTF-8 character. Requires the mbstring extension.
+        echo "\nChunk {$i} preview: " . mb_substr($chunk->content, 0, 80, 'UTF-8') . "...\n";
+    }
+}
+
+echo "\nBest Practices:\n";
+echo "- Use 256-512 chars for fine-grained retrieval\n";
+echo "- Use 1000-2000 chars for more context\n";
+echo "- Set overlap to ~10% of chunk size\n";
+echo "- Enable respectSentences for better coherence\n";
+echo "- Use markdown chunker for structured documents with headings\n";
+echo "- Use token-based sizing for LLM token budgets\n";
+echo "- Enable prependHeadingContext to embed heading breadcrumbs in chunk content\n";
+```
--- a/docs/snippets/php/configuration/embedding_config.php
+++ b/docs/snippets/php/configuration/embedding_config.php
@@ -0,0 +1,200 @@
+```php title="embedding_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Embedding Generation Configuration
+ *
+ * This example demonstrates how to configure embedding generation for semantic search
+ * and vector database applications.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\ChunkingConfig;
+use Kreuzberg\Config\EmbeddingConfig;
+
+echo "Example 1: Basic Embedding Generation\n";
+echo "=====================================\n";
+
+$config1 = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        maxChunkSize: 512,
+        chunkOverlap: 50
+    ),
+    embedding: new EmbeddingConfig()  
+);
+
+$kreuzberg = new Kreuzberg($config1);
+$result = $kreuzberg->extractFile('document.pdf');
+
+// Example 1 report: preview each chunk and, when an embedding was generated,
+// print its dimension and first few values.
+if ($result->chunks !== null) {
+    foreach ($result->chunks as $i => $chunk) {
+        echo "\nChunk {$i}:\n";
+        // The chunk text is read from `content`, matching how the chunking
+        // snippet accesses it. mb_substr() truncates by characters so the
+        // preview cannot split a multibyte UTF-8 character (needs mbstring).
+        echo "- Text: " . mb_substr($chunk->content, 0, 50, 'UTF-8') . "...\n";
+        if ($chunk->embedding !== null) {
+            echo "- Embedding dimension: " . count($chunk->embedding) . "\n";
+            echo "- First 5 values: [" . implode(', ', array_slice($chunk->embedding, 0, 5)) . "...]\n";
+        }
+    }
+}
+
+echo "\n\n";
+
+echo "Example 2: Different Embedding Models\n";
+echo "====================================\n";
+
+$config2a = new ExtractionConfig(
+    chunking: new ChunkingConfig(maxChunkSize: 512),
+    embedding: new EmbeddingConfig(
+        model: 'all-MiniLM-L6-v2',  
+        normalize: true,
+        batchSize: 32
+    )
+);
+
+echo "Model: all-MiniLM-L6-v2\n";
+echo "- Dimensions: 384\n";
+echo "- Speed: Very Fast\n";
+echo "- Use case: General purpose, quick retrieval\n\n";
+
+$config2b = new ExtractionConfig(
+    chunking: new ChunkingConfig(maxChunkSize: 512),
+    embedding: new EmbeddingConfig(
+        model: 'all-mpnet-base-v2',  
+        normalize: true,
+        batchSize: 16  
+    )
+);
+
+echo "Model: all-mpnet-base-v2\n";
+echo "- Dimensions: 768\n";
+echo "- Speed: Medium\n";
+echo "- Use case: Higher quality semantic search\n\n";
+
+echo "Example 3: Normalized vs Non-Normalized Embeddings\n";
+echo "==================================================\n";
+
+$config3a = new ExtractionConfig(
+    chunking: new ChunkingConfig(maxChunkSize: 512),
+    embedding: new EmbeddingConfig(
+        model: 'all-MiniLM-L6-v2',
+        normalize: true  
+    )
+);
+
+echo "Normalized embeddings:\n";
+echo "- Better for cosine similarity\n";
+echo "- Values in range [-1, 1]\n";
+echo "- Faster similarity computation\n\n";
+
+$config3b = new ExtractionConfig(
+    chunking: new ChunkingConfig(maxChunkSize: 512),
+    embedding: new EmbeddingConfig(
+        model: 'all-MiniLM-L6-v2',
+        normalize: false  
+    )
+);
+
+echo "Non-normalized embeddings:\n";
+echo "- Raw model output\n";
+echo "- Useful for specific distance metrics\n\n";
+
+echo "Example 4: Batch Size Configuration\n";
+echo "===================================\n";
+
+$config4a = new ExtractionConfig(
+    chunking: new ChunkingConfig(maxChunkSize: 512),
+    embedding: new EmbeddingConfig(
+        model: 'all-MiniLM-L6-v2',
+        normalize: true,
+        batchSize: 8  
+    )
+);
+
+echo "Batch size: 8\n";
+echo "- Lower memory usage\n";
+echo "- Slower processing\n";
+echo "- Good for limited resources\n\n";
+
+$config4b = new ExtractionConfig(
+    chunking: new ChunkingConfig(maxChunkSize: 512),
+    embedding: new EmbeddingConfig(
+        model: 'all-MiniLM-L6-v2',
+        normalize: true,
+        batchSize: 64  
+    )
+);
+
+echo "Batch size: 64\n";
+echo "- Higher memory usage\n";
+echo "- Faster processing\n";
+echo "- Good for high-performance systems\n\n";
+
+echo "Example 5: Complete RAG Pipeline\n";
+echo "================================\n";
+
+$config5 = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        maxChunkSize: 512,
+        chunkOverlap: 50,
+        respectSentences: true
+    ),
+    embedding: new EmbeddingConfig(
+        model: 'all-MiniLM-L6-v2',
+        normalize: true,
+        batchSize: 32
+    )
+);
+
+$result5 = (new Kreuzberg($config5))->extractFile('document.pdf');
+
+// Example 5: turn every chunk that carries an embedding into a record ready
+// for a vector database, then report how many were prepared.
+if ($result5->chunks !== null) {
+    echo "Processing " . count($result5->chunks) . " chunks with embeddings...\n\n";
+
+    $vectorDbData = [];
+    foreach ($result5->chunks as $i => $chunk) {
+        // Chunks without an embedding are left out of the result.
+        if ($chunk->embedding === null) {
+            continue;
+        }
+
+        $meta = $chunk->metadata;
+        $vectorDbData[] = [
+            'id' => "chunk_{$i}",
+            // The chunk text is read from `content`, matching how the
+            // chunking snippet accesses it.
+            'text' => $chunk->content,
+            'embedding' => $chunk->embedding,
+            'metadata' => [
+                'char_count' => $meta->charCount,
+                'page_range' => $meta->firstPage !== null
+                    ? "{$meta->firstPage}-{$meta->lastPage}"
+                    : null,
+            ],
+        ];
+    }
+
+    echo "Prepared " . count($vectorDbData) . " vectors for database\n";
+    // When no chunk carried an embedding the list is empty and index 0 does
+    // not exist, so the dimension line is printed only when there is a vector.
+    if ($vectorDbData !== []) {
+        echo "Each vector has " . count($vectorDbData[0]['embedding']) . " dimensions\n";
+    }
+}
+
+echo "\n\nEmbedding Configuration Parameters:\n";
+echo "===================================\n";
+echo "- model: Embedding model name\n";
+echo "  * 'all-MiniLM-L6-v2': 384 dims, fast, general purpose\n";
+echo "  * 'all-mpnet-base-v2': 768 dims, higher quality\n";
+echo "- normalize: L2 normalize embeddings (recommended: true)\n";
+echo "- batchSize: Number of chunks to process at once\n";
+echo "\nBest Practices:\n";
+echo "- Use normalized embeddings for cosine similarity\n";
+echo "- Choose batch size based on available memory\n";
+echo "- Use all-MiniLM-L6-v2 for speed, all-mpnet-base-v2 for quality\n";
+echo "- Combine with chunking for optimal RAG performance\n";
+
+echo "\n\nCommon Embedding Models:\n";
+echo "========================\n";
+echo "Model                     | Dimensions | Speed    | Use Case\n";
+echo "--------------------------|------------|----------|---------------------------\n";
+echo "all-MiniLM-L6-v2         | 384        | Fast     | General purpose, QA\n";
+echo "all-mpnet-base-v2        | 768        | Medium   | Better semantic search\n";
+echo "paraphrase-MiniLM-L6-v2  | 384        | Fast     | Paraphrase detection\n";
+echo "paraphrase-mpnet-base-v2 | 768        | Medium   | High-quality paraphrase\n";
+```
--- a/docs/snippets/php/configuration/extraction_config.php
+++ b/docs/snippets/php/configuration/extraction_config.php
@@ -0,0 +1,65 @@
+```php title="extraction_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * ExtractionConfig - Main Configuration
+ *
+ * The ExtractionConfig class is the primary configuration object that controls
+ * all aspects of document extraction. It can be passed to the Kreuzberg constructor
+ * or to individual extraction methods.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\OcrConfig;
+use Kreuzberg\Config\PdfConfig;
+
+$config = new ExtractionConfig(
+    extractImages: true,
+    extractTables: true,
+    preserveFormatting: false
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document.pdf');
+
+echo "Extracted with images: " . count($result->images ?? []) . "\n";
+echo "Extracted with tables: " . count($result->tables) . "\n\n";
+
+$advancedConfig = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng'
+    ),
+    pdf: new PdfConfig(
+        extractImages: true,
+        imageQuality: 95
+    ),
+    extractImages: true,
+    extractTables: true,
+    preserveFormatting: true,
+    outputFormat: 'markdown'
+);
+
+$kreuzberg = new Kreuzberg($advancedConfig);
+$result = $kreuzberg->extractFile('complex_document.pdf');
+
+echo "Advanced extraction complete\n";
+echo "Content format: " . ($advancedConfig->outputFormat ?? 'plain') . "\n";
+echo "Formatting preserved: " . ($advancedConfig->preserveFormatting ? 'Yes' : 'No') . "\n";
+
+$defaultConfig = new ExtractionConfig(extractTables: false);
+$kreuzberg = new Kreuzberg($defaultConfig);
+
+$result1 = $kreuzberg->extractFile('doc1.pdf');
+
+$overrideConfig = new ExtractionConfig(extractTables: true);
+$result2 = $kreuzberg->extractFile('doc2.pdf', config: $overrideConfig);
+
+echo "\nDoc1 tables: " . count($result1->tables) . "\n";
+echo "Doc2 tables: " . count($result2->tables) . "\n";
+```
--- a/docs/snippets/php/configuration/image_extraction_config.php
+++ b/docs/snippets/php/configuration/image_extraction_config.php
@@ -0,0 +1,277 @@
+```php title="image_extraction_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Image Extraction Configuration
+ *
+ * This example demonstrates how to configure image extraction from documents,
+ * including size filtering and OCR on extracted images.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\ImageExtractionConfig;
+use Kreuzberg\Config\OcrConfig;
+
+echo "Example 1: Basic Image Extraction\n";
+echo "=================================\n";
+
+$config1 = new ExtractionConfig(
+    imageExtraction: new ImageExtractionConfig(
+        extractImages: true
+    )
+);
+
+$kreuzberg = new Kreuzberg($config1);
+$result = $kreuzberg->extractFile('presentation.pptx');
+
+if ($result->images !== null) {
+    echo "Total images extracted: " . count($result->images) . "\n";
+    foreach ($result->images as $i => $image) {
+        echo "\nImage {$i}:\n";
+        echo "- Format: {$image->format}\n";
+        echo "- Size: {$image->width}x{$image->height} pixels\n";
+        echo "- Page: {$image->pageNumber}\n";
+        echo "- Data size: " . strlen($image->data) . " bytes\n";
+    }
+}
+
+echo "\n\n";
+
+echo "Example 2: Image Extraction with Size Filter\n";
+echo "============================================\n";
+
+$config2 = new ExtractionConfig(
+    imageExtraction: new ImageExtractionConfig(
+        extractImages: true,
+        minWidth: 200,    
+        minHeight: 200    
+    )
+);
+
+$result2 = (new Kreuzberg($config2))->extractFile('document.pdf');
+
+echo "Filtering images smaller than 200x200 pixels\n";
+if ($result2->images !== null) {
+    echo "Filtered images: " . count($result2->images) . "\n";
+}
+
+echo "\n\n";
+
+echo "Example 3: Extract Only Large Images\n";
+echo "====================================\n";
+
+$config3 = new ExtractionConfig(
+    imageExtraction: new ImageExtractionConfig(
+        extractImages: true,
+        minWidth: 800,    
+        minHeight: 600
+    )
+);
+
+echo "Configured to extract images >= 800x600 pixels\n";
+echo "Good for: Photos, large diagrams, full-page scans\n\n";
+
+echo "Example 4: Extract All Images (Including Thumbnails)\n";
+echo "===================================================\n";
+
+$config4 = new ExtractionConfig(
+    imageExtraction: new ImageExtractionConfig(
+        extractImages: true,
+        minWidth: 50,     
+        minHeight: 50
+    )
+);
+
+echo "Configured to extract images >= 50x50 pixels\n";
+echo "Good for: Extracting all images including icons and thumbnails\n\n";
+
+echo "Example 5: Image Extraction with OCR\n";
+echo "====================================\n";
+
+$config5 = new ExtractionConfig(
+    imageExtraction: new ImageExtractionConfig(
+        extractImages: true,
+        performOcr: true,  
+        minWidth: 100,
+        minHeight: 100
+    ),
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng'
+    )
+);
+
+$result5 = (new Kreuzberg($config5))->extractFile('document_with_images.pdf');
+
+// Example 5 report: size of each extracted image and, when OCR produced a
+// result, a preview and the length of the recognised text.
+if ($result5->images !== null) {
+    echo "Extracted " . count($result5->images) . " images with OCR:\n\n";
+
+    foreach ($result5->images as $i => $image) {
+        echo "Image {$i} (Page {$image->pageNumber}):\n";
+        echo "- Size: {$image->width}x{$image->height}\n";
+
+        if ($image->ocrResult !== null) {
+            $ocrText = $image->ocrResult->content;
+            // The output is labelled in characters, so use the multibyte-aware
+            // functions: substr()/strlen() work on bytes and would both
+            // miscount and potentially split non-ASCII text (needs mbstring).
+            echo "- OCR Text: " . mb_substr($ocrText, 0, 100, 'UTF-8') . "...\n";
+            echo "- OCR Text Length: " . mb_strlen($ocrText, 'UTF-8') . " characters\n";
+        }
+        echo "\n";
+    }
+}
+
+echo "\n\n";
+
+echo "Example 6: Extract and Save Images to Disk\n";
+echo "=========================================\n";
+
+$config6 = new ExtractionConfig(
+    imageExtraction: new ImageExtractionConfig(
+        extractImages: true,
+        minWidth: 200,
+        minHeight: 200
+    )
+);
+
+$result6 = (new Kreuzberg($config6))->extractFile('presentation.pptx');
+
+if ($result6->images !== null) {
+    $outputDir = 'extracted_images';
+    if (!is_dir($outputDir)) {
+        mkdir($outputDir, 0755, true);
+    }
+
+    foreach ($result6->images as $i => $image) {
+        $filename = "{$outputDir}/image_{$i}_page_{$image->pageNumber}.{$image->format}";
+
+        $imageData = base64_decode($image->data);
+        file_put_contents($filename, $imageData);
+
+        echo "Saved: {$filename} ({$image->width}x{$image->height})\n";
+    }
+}
+
+echo "\n\n";
+
+echo "Example 7: File Type-Specific Image Extraction\n";
+echo "==============================================\n";
+
+$pdfConfig = new ExtractionConfig(
+    imageExtraction: new ImageExtractionConfig(
+        extractImages: true,
+        minWidth: 300,
+        minHeight: 300,
+        performOcr: false  
+    )
+);
+
+$pptxConfig = new ExtractionConfig(
+    imageExtraction: new ImageExtractionConfig(
+        extractImages: true,
+        minWidth: 100,     
+        minHeight: 100,
+        performOcr: false
+    )
+);
+
+$imageConfig = new ExtractionConfig(
+    imageExtraction: new ImageExtractionConfig(
+        extractImages: true,
+        performOcr: true,  
+        minWidth: 50,
+        minHeight: 50
+    ),
+    ocr: new OcrConfig(backend: 'tesseract', language: 'eng')
+);
+
+echo "PDF Configuration:\n";
+echo "- Min size: 300x300 (larger images only)\n";
+echo "- OCR: Disabled (PDFs have embedded text)\n\n";
+
+echo "PowerPoint Configuration:\n";
+echo "- Min size: 100x100 (include icons/logos)\n";
+echo "- OCR: Disabled\n\n";
+
+echo "Image File Configuration:\n";
+echo "- Min size: 50x50 (all images)\n";
+echo "- OCR: Enabled\n\n";
+
+echo "Example 8: Complete Image Processing Pipeline\n";
+echo "=============================================\n";
+
+$config8 = new ExtractionConfig(
+    imageExtraction: new ImageExtractionConfig(
+        extractImages: true,
+        performOcr: true,
+        minWidth: 200,
+        minHeight: 200
+    ),
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng'
+    )
+);
+
+$result8 = (new Kreuzberg($config8))->extractFile('mixed_content.pdf');
+
+// Example 8: for every extracted image, check its size, write the image file,
+// write any OCR text, and write a small JSON metadata file next to it.
+if ($result8->images !== null) {
+    echo "Extracted images: " . count($result8->images) . "\n\n";
+
+    foreach ($result8->images as $i => $image) {
+        echo "Processing Image {$i}:\n";
+
+        // Same 200x200 threshold as the minWidth/minHeight in the config above.
+        $isValid = $image->width >= 200 && $image->height >= 200;
+        echo "- Valid size: " . ($isValid ? 'Yes' : 'No') . "\n";
+
+        // NOTE(review): assumes $image->data is base64-encoded — confirm
+        // against the binding before relying on this decode step.
+        $filename = "image_{$i}.{$image->format}";
+        file_put_contents($filename, base64_decode($image->data));
+        echo "- Saved: {$filename}\n";
+
+        if ($image->ocrResult !== null) {
+            $ocrText = trim($image->ocrResult->content);
+            // Compare against '' explicitly: empty() also treats the valid
+            // OCR text "0" as empty and would silently drop it.
+            if ($ocrText !== '') {
+                // mb_strlen() counts characters, matching the label (needs mbstring).
+                echo "- OCR text available: " . mb_strlen($ocrText, 'UTF-8') . " characters\n";
+                file_put_contents("image_{$i}_ocr.txt", $ocrText);
+            }
+        }
+
+        $metadata = [
+            'format' => $image->format,
+            'width' => $image->width,
+            'height' => $image->height,
+            'page' => $image->pageNumber,
+            // A zero height would raise DivisionByZeroError; store null instead.
+            'aspect_ratio' => $image->height > 0
+                ? round($image->width / $image->height, 2)
+                : null,
+        ];
+        file_put_contents("image_{$i}_metadata.json", json_encode($metadata, JSON_PRETTY_PRINT));
+
+        echo "- Metadata saved\n\n";
+    }
+}
+
+echo "\nImage Extraction Configuration Parameters:\n";
+echo "==========================================\n";
+echo "- extractImages: Enable image extraction (default: false)\n";
+echo "- performOcr: Run OCR on extracted images (default: false)\n";
+echo "- minWidth: Minimum image width in pixels (default: 100)\n";
+echo "- minHeight: Minimum image height in pixels (default: 100)\n";
+
+echo "\n\nBest Practices:\n";
+echo "===============\n";
+echo "- Set minWidth/minHeight to filter out unwanted small images\n";
+echo "- Use 200x200 as a good default for meaningful images\n";
+echo "- Use 800x600+ for large photos and diagrams only\n";
+echo "- Use 50x50 to include all images including icons\n";
+echo "- Enable performOcr only when images contain text\n";
+echo "- Combine with OCR config for multilingual text in images\n";
+echo "- Save images to disk for further processing\n";
+
+echo "\n\nCommon Use Cases:\n";
+echo "=================\n";
+echo "1. Extract photos from reports: minWidth=800, minHeight=600\n";
+echo "2. Extract all graphics: minWidth=100, minHeight=100\n";
+echo "3. OCR on images: performOcr=true + OcrConfig\n";
+echo "4. Extract logos/icons: minWidth=50, minHeight=50\n";
+```
--- a/docs/snippets/php/configuration/image_preprocessing_config.php
+++ b/docs/snippets/php/configuration/image_preprocessing_config.php
@@ -0,0 +1,302 @@
+```php title="image_preprocessing_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Image Preprocessing Configuration
+ *
+ * This example demonstrates image preprocessing options to improve OCR accuracy.
+ * Preprocessing can significantly enhance text recognition quality for poor-quality scans.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\OcrConfig;
+use Kreuzberg\Config\ImagePreprocessingConfig;
+
+echo "Example 1: Default Image Preprocessing\n";
+echo "======================================\n";
+
+$config1 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        imagePreprocessing: new ImagePreprocessingConfig()
+    )
+);
+
+echo "Default preprocessing settings:\n";
+echo "- Target DPI: 300 (standard for OCR)\n";
+echo "- Auto-rotate: Enabled\n";
+echo "- Denoise: Disabled\n\n";
+
+echo "Example 2: High DPI Configuration\n";
+echo "=================================\n";
+
+$config2 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        imagePreprocessing: new ImagePreprocessingConfig(
+            targetDpi: 600  
+        )
+    )
+);
+
+echo "Target DPI: 600\n";
+echo "Best for:\n";
+echo "- Very small text\n";
+echo "- High-quality scans\n";
+echo "- Documents with fine details\n";
+echo "Note: Higher DPI = slower processing, more memory\n\n";
+
+echo "Example 3: Lower DPI for Speed\n";
+echo "==============================\n";
+
+$config3 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        imagePreprocessing: new ImagePreprocessingConfig(
+            targetDpi: 150  
+        )
+    )
+);
+
+echo "Target DPI: 150\n";
+echo "Best for:\n";
+echo "- Large text\n";
+echo "- Low-resolution images\n";
+echo "- Fast processing needed\n";
+echo "Note: May reduce accuracy for small text\n\n";
+
+echo "Example 4: Manual Rotation Control\n";
+echo "==================================\n";
+
+$config4 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        imagePreprocessing: new ImagePreprocessingConfig(
+            autoRotate: false  
+        )
+    )
+);
+
+echo "Auto-rotate: Disabled\n";
+echo "Use when:\n";
+echo "- Images are already correctly oriented\n";
+echo "- Auto-rotation causes issues\n";
+echo "- Processing time is critical\n\n";
+
+echo "Example 5: Denoising for Poor Quality Scans\n";
+echo "===========================================\n";
+
+$config5 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        imagePreprocessing: new ImagePreprocessingConfig(
+            targetDpi: 300,
+            autoRotate: true,
+            denoise: true  
+        )
+    )
+);
+
+$kreuzberg = new Kreuzberg($config5);
+$result = $kreuzberg->extractFile('noisy_scan.pdf');
+
+echo "Denoising: Enabled\n";
+echo "Best for:\n";
+echo "- Poor quality scans\n";
+echo "- Fax documents\n";
+echo "- Images with background noise\n";
+echo "- Old or damaged documents\n";
+echo "\nExtracted text length: " . strlen($result->content) . " characters\n\n";
+
+echo "Example 6: Maximum Quality Configuration\n";
+echo "========================================\n";
+
+$config6 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        imagePreprocessing: new ImagePreprocessingConfig(
+            targetDpi: 600,     
+            autoRotate: true,   
+            denoise: true       
+        )
+    )
+);
+
+echo "Maximum quality preprocessing:\n";
+echo "- Target DPI: 600 (high quality)\n";
+echo "- Auto-rotate: Enabled\n";
+echo "- Denoise: Enabled\n";
+echo "\nBest for:\n";
+echo "- Very poor quality scans\n";
+echo "- Historical documents\n";
+echo "- Faded or damaged text\n";
+echo "- Critical accuracy requirements\n\n";
+
+echo "Example 7: Fast Processing Configuration\n";
+echo "========================================\n";
+
+$config7 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        imagePreprocessing: new ImagePreprocessingConfig(
+            targetDpi: 200,     
+            autoRotate: false,  
+            denoise: false      
+        )
+    )
+);
+
+echo "Fast processing configuration:\n";
+echo "- Target DPI: 200 (faster)\n";
+echo "- Auto-rotate: Disabled\n";
+echo "- Denoise: Disabled\n";
+echo "\nBest for:\n";
+echo "- High-volume processing\n";
+echo "- Good quality source images\n";
+echo "- Performance-critical applications\n\n";
+
+echo "Example 8: DPI Recommendations by Document Type\n";
+echo "===============================================\n";
+
+$standardConfig = new ImagePreprocessingConfig(targetDpi: 300);
+echo "Standard documents (letters, reports): 300 DPI\n";
+
+$newspaperConfig = new ImagePreprocessingConfig(targetDpi: 400);
+echo "Newspapers and magazines: 400 DPI\n";
+
+$bookConfig = new ImagePreprocessingConfig(targetDpi: 600);
+echo "Books with small text: 600 DPI\n";
+
+$receiptConfig = new ImagePreprocessingConfig(targetDpi: 300);
+echo "Receipts and forms: 300 DPI\n";
+
+$businessCardConfig = new ImagePreprocessingConfig(targetDpi: 400);
+echo "Business cards: 400 DPI\n";
+
+$faxConfig = new ImagePreprocessingConfig(
+    targetDpi: 300,
+    denoise: true  
+);
+echo "Faxes: 300 DPI + denoising\n\n";
+
+echo "Example 9: Adaptive Configuration by Image Quality\n";
+echo "==================================================\n";
+
+/**
+ * Pick an image preprocessing configuration for a named quality level.
+ *
+ * Recognised levels are 'excellent', 'good', 'fair' and 'poor'; any other
+ * value yields a configuration constructed with no arguments.
+ *
+ * @param string $quality Quality label of the source image.
+ *
+ * @return ImagePreprocessingConfig Configuration for that quality level.
+ */
+function getPreprocessingConfig(string $quality): ImagePreprocessingConfig
+{
+    // Each preset is [targetDpi, autoRotate, denoise].
+    $presets = [
+        'excellent' => [300, false, false],
+        'good' => [300, true, false],
+        'fair' => [400, true, true],
+        'poor' => [600, true, true],
+    ];
+
+    // Unknown labels fall back to the library defaults.
+    if (!isset($presets[$quality])) {
+        return new ImagePreprocessingConfig();
+    }
+
+    [$dpi, $rotate, $clean] = $presets[$quality];
+
+    return new ImagePreprocessingConfig(
+        targetDpi: $dpi,
+        autoRotate: $rotate,
+        denoise: $clean
+    );
+}
+
+echo "Quality-based configurations:\n\n";
+
+echo "Excellent Quality:\n";
+echo "- DPI: 300, Auto-rotate: No, Denoise: No\n";
+echo "- Clean scans, properly oriented\n\n";
+
+echo "Good Quality:\n";
+echo "- DPI: 300, Auto-rotate: Yes, Denoise: No\n";
+echo "- May need rotation correction\n\n";
+
+echo "Fair Quality:\n";
+echo "- DPI: 400, Auto-rotate: Yes, Denoise: Yes\n";
+echo "- Some noise or quality issues\n\n";
+
+echo "Poor Quality:\n";
+echo "- DPI: 600, Auto-rotate: Yes, Denoise: Yes\n";
+echo "- Significant quality problems\n\n";
+
+echo "Example 10: Complete OCR Pipeline with Preprocessing\n";
+echo "===================================================\n";
+
+$config10 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        imagePreprocessing: new ImagePreprocessingConfig(
+            targetDpi: 300,
+            autoRotate: true,
+            denoise: true
+        )
+    )
+);
+
+$result10 = (new Kreuzberg($config10))->extractFile('poor_quality_scan.pdf');
+
+echo "Processing pipeline:\n";
+echo "1. Load image\n";
+echo "2. Auto-detect orientation and rotate if needed\n";
+echo "3. Upscale/downscale to 300 DPI\n";
+echo "4. Apply denoising filter\n";
+echo "5. Perform OCR\n";
+echo "\nResults:\n";
+echo "- Extracted text: " . strlen($result10->content) . " characters\n";
+echo "- Pages: " . ($result10->metadata->pageCount ?? 'N/A') . "\n";
+
+echo "\n\nImage Preprocessing Parameters:\n";
+echo "================================\n";
+echo "- targetDpi: Target resolution in dots per inch\n";
+echo "  * 150 DPI: Fast, lower quality\n";
+echo "  * 300 DPI: Standard, good balance (RECOMMENDED)\n";
+echo "  * 400 DPI: Better for small text\n";
+echo "  * 600 DPI: Best quality, slower\n";
+echo "\n";
+echo "- autoRotate: Automatically detect and correct orientation\n";
+echo "  * true: Recommended for most cases\n";
+echo "  * false: Skip if images are already oriented\n";
+echo "\n";
+echo "- denoise: Apply noise reduction filter\n";
+echo "  * true: Recommended for poor quality scans\n";
+echo "  * false: Skip for clean images (faster)\n";
+
+echo "\n\nBest Practices:\n";
+echo "===============\n";
+echo "1. Start with 300 DPI as a baseline\n";
+echo "2. Enable auto-rotate unless you know images are correct\n";
+echo "3. Enable denoising for poor quality documents\n";
+echo "4. Use higher DPI (400-600) for small text\n";
+echo "5. Use lower DPI (150-200) when speed is critical\n";
+echo "6. Test different settings to find optimal balance\n";
+echo "7. Consider source quality when choosing settings\n";
+echo "8. Remember: Higher quality = slower processing + more memory\n";
+
+echo "\n\nPerformance vs Quality Trade-offs:\n";
+echo "==================================\n";
+echo "Fastest:  DPI=150, AutoRotate=No,  Denoise=No\n";
+echo "Balanced: DPI=300, AutoRotate=Yes, Denoise=No  (RECOMMENDED)\n";
+echo "Quality:  DPI=400, AutoRotate=Yes, Denoise=Yes\n";
+echo "Maximum:  DPI=600, AutoRotate=Yes, Denoise=Yes\n";
+```
--- a/docs/snippets/php/configuration/keyword_config.php
+++ b/docs/snippets/php/configuration/keyword_config.php
@@ -0,0 +1,115 @@
+```php title="keyword_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * KeywordConfig - Keyword Extraction
+ *
+ * Automatically extract keywords and key phrases from documents.
+ * Useful for document categorization, search indexing, and summarization.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\KeywordConfig;
+
+// Basic extraction: up to 10 English keywords, with a minimum score of 0.0.
+$config = new ExtractionConfig(
+    keyword: new KeywordConfig(maxKeywords: 10, minScore: 0.0, language: 'en')
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('article.pdf');
+
+// A null/unset keyword list is treated as empty, so nothing is listed in that case.
+$topKeywords = $result->metadata->keywords ?? [];
+
+// Heading, a 40-character rule, then one bullet per keyword.
+echo "Top Keywords:\n", str_repeat('=', 40), "\n";
+foreach ($topKeywords as $term) {
+    echo "  • {$term}\n";
+}
+echo "\n";
+
+// Detailed analysis: request up to 25 keywords and list them grouped by initial.
+$detailedConfig = new ExtractionConfig(
+    keyword: new KeywordConfig(
+        maxKeywords: 25,
+        minScore: 0.0,
+        language: 'en'
+    )
+);
+
+$kreuzberg = new Kreuzberg($detailedConfig);
+$result = $kreuzberg->extractFile('research_paper.pdf');
+
+echo "Detailed keyword analysis:\n";
+echo "Total keywords: " . count($result->metadata->keywords ?? []) . "\n";
+
+if (!empty($result->metadata->keywords)) {
+    $grouped = [];
+    foreach ($result->metadata->keywords as $keyword) {
+        // Take the first *character*, not the first byte: indexing with [0]
+        // splits multibyte initials (e.g. "é") and warns on an empty string.
+        $first = mb_strtoupper(mb_substr($keyword, 0, 1));
+        $grouped[$first][] = $keyword; // the bucket is created on first append
+    }
+
+    foreach ($grouped as $letter => $keywords) {
+        echo "\n$letter:\n";
+        foreach ($keywords as $keyword) {
+            echo "  - $keyword\n";
+        }
+    }
+}
+
+// Cross-document tally: count how many times each keyword is returned over a batch
+// of files, re-using the $kreuzberg instance configured above.
+$files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];
+$allKeywords = [];
+
+foreach ($files as $file) {
+    // Files that do not exist are skipped without any message.
+    if (file_exists($file)) {
+        $result = $kreuzberg->extractFile($file);
+        foreach ($result->metadata->keywords ?? [] as $keyword) {
+            $allKeywords[$keyword] = ($allKeywords[$keyword] ?? 0) + 1;
+        }
+    }
+}
+
+// Largest counts first (arsort keeps the keyword keys), then print the first ten.
+arsort($allKeywords);
+echo "\n\nMost common keywords across documents:\n";
+$count = 0;
+foreach (array_slice($allKeywords, 0, 10, true) as $keyword => $frequency) {
+    $count++;
+    printf("  %2d. %-30s (appears in %d documents)\n", $count, $keyword, $frequency);
+}
+
+// Naive categorisation: score each category by how many of its terms are keywords.
+$categoryKeywords = [
+    'technology' => ['software', 'computer', 'algorithm', 'data', 'system'],
+    'business' => ['market', 'revenue', 'sales', 'customer', 'profit'],
+    'science' => ['research', 'experiment', 'hypothesis', 'analysis', 'study'],
+];
+
+// Category terms are lowercase; lowercase the keywords too so "Software" still matches.
+$docKeywords = array_map(
+    static fn ($keyword): string => mb_strtolower((string) $keyword),
+    $result->metadata->keywords ?? []
+);
+$scores = [];
+foreach ($categoryKeywords as $category => $terms) {
+    $scores[$category] = count(array_intersect($terms, $docKeywords));
+}
+
+arsort($scores);
+$topCategory = array_key_first($scores);
+// When nothing matched, every score is 0: report that instead of the first category.
+$label = $scores[$topCategory] > 0 ? $topCategory : 'uncategorized';
+echo "\nDocument category: $label (score: {$scores[$topCategory]})\n";
+```
--- a/docs/snippets/php/configuration/language_detection_config.php
+++ b/docs/snippets/php/configuration/language_detection_config.php
@@ -0,0 +1,97 @@
+```php title="language_detection_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * LanguageDetectionConfig - Language Detection
+ *
+ * Automatically detect the languages present in a document.
+ * Useful for multilingual documents and routing to appropriate OCR languages.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\LanguageDetectionConfig;
+
+// Basic detection: switch language detection on and list whatever is reported.
+$config = new ExtractionConfig(
+    languageDetection: new LanguageDetectionConfig(enabled: true)
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('multilingual.pdf');
+
+// A null/unset language list is treated as empty, so nothing is listed in that case.
+echo "Detected languages:\n";
+foreach ($result->detectedLanguages ?? [] as $code) {
+    echo "  - {$code}\n";
+}
+echo "\n";
+
+// Stricter detection: maxLanguages 3 and confidenceThreshold 0.8.
+$advancedConfig = new ExtractionConfig(
+    languageDetection: new LanguageDetectionConfig(
+        enabled: true,
+        maxLanguages: 3,
+        confidenceThreshold: 0.8
+    )
+);
+
+$kreuzberg = new Kreuzberg($advancedConfig);
+$result = $kreuzberg->extractFile('document.pdf');
+// Print the reported languages on one comma-separated line, or say that none were found.
+if (empty($result->detectedLanguages)) {
+    echo "No languages detected with sufficient confidence\n\n";
+} else {
+    echo "High-confidence languages detected:\n", implode(', ', $result->detectedLanguages), "\n\n";
+}
+
+use Kreuzberg\Config\OcrConfig;
+
+$detectConfig = new ExtractionConfig(
+    languageDetection: new LanguageDetectionConfig(enabled: true)
+);
+// Pass 1: extract with language detection enabled to find the document's language.
+$kreuzberg = new Kreuzberg($detectConfig);
+$result = $kreuzberg->extractFile('scanned.pdf');
+// Pass 2 runs only when at least one language was reported; the first entry is used.
+if (!empty($result->detectedLanguages)) {
+    $primaryLanguage = $result->detectedLanguages[0];
+    echo "Primary language detected: $primaryLanguage\n";
+    echo "Re-processing with OCR optimized for $primaryLanguage...\n";
+    // NOTE(review): the detected code is passed to Tesseract unchanged — confirm its format matches what Tesseract expects.
+    $ocrConfig = new ExtractionConfig(
+        ocr: new OcrConfig(
+            backend: 'tesseract',
+            language: $primaryLanguage
+        )
+    );
+    // When this branch runs, $kreuzberg and $result are rebound to the OCR-only extractor.
+    $kreuzberg = new Kreuzberg($ocrConfig);
+    $result = $kreuzberg->extractFile('scanned.pdf');
+    echo "OCR extraction complete\n";
+}
+
+// Group a batch of documents by their first detected language ('unknown' when none).
+$files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];
+$languageMap = [];
+
+// Use a dedicated detection-enabled extractor: the OCR re-processing step above may have
+// rebound $kreuzberg to an instance that was configured without language detection.
+$detector = new Kreuzberg($detectConfig);
+
+foreach ($files as $file) {
+    if (!file_exists($file)) continue;
+    $result = $detector->extractFile($file);
+    $lang = $result->detectedLanguages[0] ?? 'unknown';
+    $languageMap[$lang][] = $file; // the per-language list is created on first append
+}
+
+echo "\nDocuments grouped by language:\n";
+foreach ($languageMap as $lang => $docs) {
+    echo "$lang: " . implode(', ', $docs) . "\n";
+}
+```
--- a/docs/snippets/php/configuration/ocr_config.php
+++ b/docs/snippets/php/configuration/ocr_config.php
@@ -0,0 +1,205 @@
+```php title="ocr_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * OCR Configuration
+ *
+ * This example demonstrates how to configure OCR (Optical Character Recognition)
+ * for extracting text from scanned documents and images.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\OcrConfig;
+use Kreuzberg\Config\TesseractConfig;
+
+echo "Example 1: Basic OCR Configuration\n";
+echo "==================================\n";
+
+$config1 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng'  // Tesseract code for English (see the list in Example 3)
+    )
+);
+// Run OCR and report the length in characters (mb_strlen), not bytes (strlen).
+$kreuzberg = new Kreuzberg($config1);
+$result = $kreuzberg->extractFile('scanned_document.pdf');
+echo "Extracted text length: " . mb_strlen($result->content) . " characters\n\n";
+
+echo "Example 2: Multi-Language OCR\n",
+    "=============================\n";
+
+// Three Tesseract language codes are joined with '+' in a single string
+// (English, French and German, as the message below states).
+$config2 = new ExtractionConfig(
+    ocr: new OcrConfig(backend: 'tesseract', language: 'eng+fra+deu')
+);
+
+// The configuration is only built and described here; no file is extracted with it.
+echo "Configured for languages: English, French, German\n",
+    "Use this for multilingual documents\n\n";
+
+echo "Example 3: Language-Specific OCR\n",
+    "================================\n";
+// One single-language configuration per Tesseract code, starting with Spanish.
+$config3a = new ExtractionConfig(
+    ocr: new OcrConfig(backend: 'tesseract', language: 'spa')
+);
+// French
+$config3b = new ExtractionConfig(
+    ocr: new OcrConfig(backend: 'tesseract', language: 'fra')
+);
+// German
+$config3c = new ExtractionConfig(
+    ocr: new OcrConfig(backend: 'tesseract', language: 'deu')
+);
+// Chinese (Simplified)
+$config3d = new ExtractionConfig(
+    ocr: new OcrConfig(backend: 'tesseract', language: 'chi_sim')
+);
+// Chinese (Traditional)
+$config3e = new ExtractionConfig(
+    ocr: new OcrConfig(backend: 'tesseract', language: 'chi_tra')
+);
+// Japanese
+$config3f = new ExtractionConfig(
+    ocr: new OcrConfig(backend: 'tesseract', language: 'jpn')
+);
+// Korean
+$config3g = new ExtractionConfig(
+    ocr: new OcrConfig(backend: 'tesseract', language: 'kor')
+);
+// Arabic
+$config3h = new ExtractionConfig(
+    ocr: new OcrConfig(backend: 'tesseract', language: 'ara')
+);
+// Reference list of codes; it also names ita, por and rus, which have no config above.
+echo "Common Tesseract Language Codes:\n",
+    "- eng: English\n",
+    "- fra: French\n",
+    "- deu: German\n",
+    "- spa: Spanish\n",
+    "- ita: Italian\n",
+    "- por: Portuguese\n",
+    "- rus: Russian\n",
+    "- chi_sim: Chinese (Simplified)\n",
+    "- chi_tra: Chinese (Traditional)\n",
+    "- jpn: Japanese\n",
+    "- kor: Korean\n",
+    "- ara: Arabic\n\n";
+
+echo "Example 4: Advanced Tesseract Configuration\n";
+echo "==========================================\n";
+
+$config4 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            psm: 6,                     // single uniform block of text
+            oem: 3,                     // default engine, based on what is available
+            enableTableDetection: true  // table detection on
+        )
+    )
+);
+
+echo "Tesseract Configuration:\n";
+echo "- PSM (Page Segmentation Mode): 6 (uniform text block)\n";
+echo "- OEM (OCR Engine Mode): 3 (default, based on what is available)\n";
+echo "- Table Detection: Enabled\n\n";
+
+echo "Example 5: OCR for Forms and Invoices\n",
+    "=====================================\n";
+// PSM 6, OEM 3 and table detection, plus a whitelist of letters, digits and "$.,- ".
+$config5 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            psm: 6,
+            oem: 3,
+            enableTableDetection: true,
+            tesseditCharWhitelist: '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz$.,- '
+        )
+    )
+);
+
+echo "Optimized for forms and invoices:\n",
+    "- Table detection enabled\n",
+    "- Character whitelist for common form characters\n\n";
+
+echo "Example 6: OCR for Numeric Documents\n",
+    "====================================\n";
+// Whitelist limited to digits plus '$', '.', ',', '-' and space.
+$config6 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            psm: 6,
+            oem: 3,
+            tesseditCharWhitelist: '0123456789$.,- '
+        )
+    )
+);
+
+echo "Character whitelist: '0123456789$.,- '\n",
+    "Best for: Invoices, receipts, financial documents\n\n";
+
+echo "Example 7: OCR with Character Blacklist\n",
+    "=======================================\n";
+// Blacklist counterpart of the whitelist examples: the listed symbols are excluded.
+$config7 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            psm: 6,
+            oem: 3,
+            tesseditCharBlacklist: '|!@#%^&*()'
+        )
+    )
+);
+
+echo "Character blacklist: '|!@#%^&*()'\n",
+    "Use to exclude problematic characters\n\n";
+
+echo "\nPage Segmentation Modes (PSM):\n",
+    "==============================\n",
+    "0  = Orientation and script detection (OSD) only\n",
+    "1  = Automatic page segmentation with OSD\n",
+    "2  = Automatic page segmentation (no OSD or OCR)\n",
+    "3  = Fully automatic page segmentation (default)\n",
+    "4  = Assume a single column of text of variable sizes\n",
+    "5  = Assume a single uniform block of vertically aligned text\n",
+    "6  = Assume a single uniform block of text (recommended for most)\n",
+    "7  = Treat the image as a single text line\n",
+    "8  = Treat the image as a single word\n",
+    "9  = Treat the image as a single word in a circle\n",
+    "10 = Treat the image as a single character\n",
+    "11 = Sparse text. Find as much text as possible\n",
+    "12 = Sparse text with OSD\n",
+    "13 = Raw line. Treat the image as a single text line\n";
+// Reference table for the `oem` values used in the examples above.
+echo "\n\nOCR Engine Modes (OEM):\n",
+    "======================\n",
+    "0 = Legacy engine only\n",
+    "1 = Neural nets LSTM engine only\n",
+    "2 = Legacy + LSTM engines\n",
+    "3 = Default, based on what is available (recommended)\n";
+// Closing summary of the recommendations made throughout this file.
+echo "\n\nBest Practices:\n",
+    "===============\n",
+    "- Use PSM 6 for general documents\n",
+    "- Use PSM 11 for sparse text (screenshots, signs)\n",
+    "- Use OEM 3 (default) for best results\n",
+    "- Enable table detection for structured documents\n",
+    "- Use character whitelists for forms/invoices\n",
+    "- Combine multiple languages with '+' separator\n",
+    "- Preprocess images for better accuracy (see image_preprocessing.php)\n";
+```
--- a/docs/snippets/php/configuration/page_config.php
+++ b/docs/snippets/php/configuration/page_config.php
@@ -0,0 +1,82 @@
+```php title="page_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * PageConfig - Page-Level Extraction
+ *
+ * Configure per-page content extraction and page markers for maintaining
+ * document structure in the extracted text.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\PageConfig;
+
+// Markers-only setup: extractPages off, insertPageMarkers on with a custom markerFormat.
+$config = new ExtractionConfig(
+    page: new PageConfig(
+        extractPages: false,
+        insertPageMarkers: true,
+        markerFormat: '--- Page {page_number} ---'
+    )
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('report.pdf');
+
+echo "Content with page markers:\n", str_repeat('=', 60), "\n",
+    $result->content, "\n\n";
+
+// Per-page setup: extractPages on, insertPageMarkers off; each page is summarised below.
+$pageConfig = new ExtractionConfig(
+    page: new PageConfig(extractPages: true, insertPageMarkers: false)
+);
+
+$kreuzberg = new Kreuzberg($pageConfig);
+$result = $kreuzberg->extractFile('multi_page.pdf');
+
+foreach ($result->pages ?? [] as $page) {
+    echo "Page {$page->pageNumber}:\n";
+    echo str_repeat('-', 60) . "\n";
+    // Preview the first 200 characters: substr() counts bytes and can cut a
+    // multibyte character in half, producing invalid output.
+    echo mb_substr($page->content, 0, 200) . "...\n";
+    echo "Tables on page: " . count($page->tables ?? []) . "\n";
+    echo "Images on page: " . count($page->images ?? []) . "\n\n";
+}
+
+// Custom marker: a banner line that the regular expression below splits on.
+$customConfig = new ExtractionConfig(
+    page: new PageConfig(
+        extractPages: false,
+        insertPageMarkers: true,
+        markerFormat: "\n\n========== PAGE {page_number} ==========\n\n"
+    )
+);
+
+$kreuzberg = new Kreuzberg($customConfig);
+$result = $kreuzberg->extractFile('document.pdf');
+$pages = preg_split('/={10} PAGE \d+ ={10}/', $result->content); // ten '=', " PAGE <digits> ", ten '='
+echo "Split into ", count($pages), " sections\n";
+
+// Extract all pages, then keep only pages 10 through 20 (inclusive) for reporting.
+$allPagesConfig = new ExtractionConfig(
+    page: new PageConfig(extractPages: true)
+);
+$kreuzberg = new Kreuzberg($allPagesConfig);
+$result = $kreuzberg->extractFile('large_doc.pdf');
+
+$selectedPages = array_filter(
+    $result->pages ?? [],
+    static fn ($page): bool => $page->pageNumber >= 10 && $page->pageNumber <= 20
+);
+// The label says "chars", so count characters (mb_strlen) rather than bytes (strlen).
+echo "\nSelected pages 10-20:\n";
+foreach ($selectedPages as $page) {
+    echo "Page {$page->pageNumber}: " . mb_strlen($page->content) . " chars\n";
+}
+```
--- a/docs/snippets/php/configuration/pdf_config.php
+++ b/docs/snippets/php/configuration/pdf_config.php
@@ -0,0 +1,70 @@
+```php title="pdf_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * PdfConfig - PDF-Specific Configuration
+ *
+ * Configure PDF extraction behavior including image quality, text extraction
+ * methods, and performance optimization.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\PdfConfig;
+
+// Basic PDF options: extractImages on, imageQuality 85, preserveImageFormat on.
+$config = new ExtractionConfig(
+    pdf: new PdfConfig(extractImages: true, imageQuality: 85, preserveImageFormat: true)
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document.pdf');
+
+// A null/unset image list counts as zero images.
+$imageCount = count($result->images ?? []);
+
+echo "PDF extraction complete\n",
+    "Images extracted: {$imageCount}\n\n";
+
+$highQualityConfig = new ExtractionConfig(
+    pdf: new PdfConfig(
+        extractImages: true,
+        imageQuality: 100,  // highest quality value used in this file
+        preserveImageFormat: true
+    ),
+    extractImages: true
+);
+
+$kreuzberg = new Kreuzberg($highQualityConfig);
+$result = $kreuzberg->extractFile('presentation.pdf');
+// Write every extracted image to the working directory as image_<index>_page_<page>.<format>.
+foreach ($result->images ?? [] as $image) {
+    $filename = sprintf('image_%d_page_%d.%s', $image->imageIndex, $image->pageNumber, $image->format);
+    // file_put_contents() returns false on failure; do not claim success in that case.
+    if (file_put_contents($filename, $image->data) === false) {
+        echo "Failed to save image: $filename\n";
+        continue;
+    }
+    echo "Saved high-quality image: $filename ({$image->width}x{$image->height})\n";
+}
+
+// Speed-oriented setup: image and table extraction are switched off, and the run is timed.
+$fastConfig = new ExtractionConfig(
+    pdf: new PdfConfig(extractImages: false, imageQuality: 50),
+    extractTables: false
+);
+
+$kreuzberg = new Kreuzberg($fastConfig);
+$start = microtime(true);
+$result = $kreuzberg->extractFile('large_document.pdf');
+$elapsed = microtime(true) - $start;
+
+// Elapsed wall-clock time is shown with three decimals. The length is reported in
+// characters (mb_strlen); strlen would count bytes for non-ASCII content.
+echo "\nFast extraction completed in " . number_format($elapsed, 3) . " seconds\n";
+echo "Content length: " . mb_strlen($result->content) . " characters\n";
+```
--- a/docs/snippets/php/configuration/pdf_hierarchy_config.php
+++ b/docs/snippets/php/configuration/pdf_hierarchy_config.php
@@ -0,0 +1,71 @@
+```php title="pdf_hierarchy_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * PdfHierarchyConfig - Hierarchy Detection Configuration
+ *
+ * Configure PDF document structure analysis and hierarchy detection
+ * using k-clustering for document organization recognition.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\PdfConfig;
+
+// Hierarchy detection in PDF options array
+$config = new ExtractionConfig(
+    pdf: new PdfConfig(
+        extractImages: true,
+        hierarchy: [
+            'enabled' => true,
+            'k_clusters' => 6,
+            'include_bbox' => true,
+            'ocr_coverage_threshold' => 0.8
+        ]
+    )
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document.pdf');
+// Count characters (mb_strlen), not bytes (strlen), to match the "characters" label.
+echo "Hierarchy detection enabled\n";
+echo "Content length: " . mb_strlen($result->content) . " characters\n";
+
+// Alternative: Custom hierarchy parameters for complex documents
+$advancedConfig = new ExtractionConfig(
+    pdf: new PdfConfig(
+        extractImages: true,
+        hierarchy: [
+            'enabled' => true,
+            'k_clusters' => 12,           // More clusters for detailed hierarchy
+            'include_bbox' => true,       // Include bounding box coordinates
+            'ocr_coverage_threshold' => 0.7  // Lower OCR coverage threshold than the first example (0.8)
+        ]
+    )
+);
+
+$kreuzberg = new Kreuzberg($advancedConfig);
+$result = $kreuzberg->extractFile('complex_document.pdf');
+
+echo "Advanced hierarchy detection completed\n";
+echo "Detected structure preserved in output\n";
+
+// Speed-oriented setup: image extraction is off and the hierarchy
+// options carry only 'enabled' => false.
+$fastConfig = new ExtractionConfig(
+    pdf: new PdfConfig(
+        extractImages: false,
+        hierarchy: ['enabled' => false]
+    )
+);
+
+// The extraction result is not inspected here; only a confirmation line is printed.
+$kreuzberg = new Kreuzberg($fastConfig);
+$result = $kreuzberg->extractFile('simple_document.pdf');
+
+print "Fast extraction without hierarchy detection\n";
+```
--- a/docs/snippets/php/configuration/tesseract_config.php
+++ b/docs/snippets/php/configuration/tesseract_config.php
@@ -0,0 +1,313 @@
+```php title="tesseract_config.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Tesseract OCR Configuration
+ *
+ * This example demonstrates advanced Tesseract OCR configuration options
+ * for fine-tuning OCR performance and accuracy.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\OcrConfig;
+use Kreuzberg\Config\TesseractConfig;
+
+echo "Example 1: Default Tesseract Configuration\n",
+    "==========================================\n";
+// TesseractConfig is constructed with no arguments, i.e. with its own defaults.
+$config1 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig()
+    )
+);
+
+echo "Default settings:\n",
+    "- PSM: 3 (Fully automatic page segmentation)\n",
+    "- OEM: 3 (Default, based on what's available)\n",
+    "- Table Detection: Disabled\n\n";
+
+echo "Example 2: Different Page Segmentation Modes\n",
+    "============================================\n";
+// Four configurations that differ only in `psm`; the first uses PSM 6.
+$config2a = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(psm: 6)
+    )
+);
+
+echo "PSM 6 - Uniform block of text:\n",
+    "- Best for: Most documents, clean text blocks\n",
+    "- Use when: Document has clear text structure\n\n";
+// PSM 11
+$config2b = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(psm: 11)
+    )
+);
+
+echo "PSM 11 - Sparse text:\n",
+    "- Best for: Screenshots, signs, sparse documents\n",
+    "- Use when: Text is scattered across the image\n\n";
+// PSM 7
+$config2c = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(psm: 7)
+    )
+);
+
+echo "PSM 7 - Single text line:\n",
+    "- Best for: Single line of text, headers, captions\n",
+    "- Use when: Processing individual text lines\n\n";
+// PSM 8
+$config2d = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(psm: 8)
+    )
+);
+
+echo "PSM 8 - Single word:\n",
+    "- Best for: Individual words, labels\n",
+    "- Use when: Processing single words\n\n";
+
+echo "Example 3: Table Detection\n",
+    "=========================\n";
+
+$config3 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(psm: 6, enableTableDetection: true)
+    )
+);
+
+// This is the only example in this file that actually runs an extraction.
+$kreuzberg = new Kreuzberg($config3);
+$result = $kreuzberg->extractFile('scanned_invoice.pdf');
+
+echo "Table detection enabled\n",
+    "Best for: Forms, invoices, spreadsheets, reports\n";
+
+// Print each table's Markdown, numbered from 1; nothing is printed when there are none.
+$tableCount = count($result->tables);
+if ($tableCount > 0) {
+    echo "\nExtracted tables: {$tableCount}\n";
+    foreach ($result->tables as $index => $table) {
+        $number = $index + 1;
+        echo "\nTable {$number}:\n", $table->markdown, "\n";
+    }
+}
+
+echo "\n\n";
+
+echo "Example 4: Character Whitelisting\n";
+echo "=================================\n";
+// Three whitelist variants: digits only, alphanumeric, and financial symbols.
+$config4a = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            psm: 6,
+            tesseditCharWhitelist: '0123456789'  // digits only
+        )
+    )
+);
+
+echo "Whitelist: '0123456789' (digits only)\n";
+echo "Best for: Serial numbers, IDs, numeric codes\n\n";
+
+$config4b = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            psm: 6,
+            tesseditCharWhitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
+        )
+    )
+);
+
+echo "Whitelist: Letters and numbers only\n";
+echo "Best for: Product codes, alphanumeric IDs\n\n";
+
+$config4c = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            psm: 6,
+            tesseditCharWhitelist: '0123456789$€£¥.,- '
+        )
+    )
+);
+// The '$' must be escaped: in a double-quoted string "$€£¥" parses as a variable (bytes >= 0x80 are identifier characters).
+echo "Whitelist: '0123456789\$€£¥.,- ' (financial data)\n";
+echo "Best for: Invoices, receipts, price lists\n\n";
+
+echo "Example 5: Character Blacklisting\n",
+    "=================================\n";
+// PSM 6 with a blacklist of symbols, set through tesseditCharBlacklist.
+$config5 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            psm: 6,
+            tesseditCharBlacklist: '|!@#%^&*()'
+        )
+    )
+);
+
+echo "Blacklist: '|!@#%^&*()'\n",
+    "Use to: Exclude problematic characters that cause OCR errors\n\n";
+
+echo "Example 6: OCR Engine Modes\n",
+    "===========================\n";
+// Three configurations that differ only in `oem`; the first uses OEM 0.
+$config6a = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(oem: 0)
+    )
+);
+
+echo "OEM 0 - Legacy engine:\n",
+    "- Older, simpler algorithm\n",
+    "- Sometimes better for very low-quality scans\n\n";
+// OEM 1
+$config6b = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(oem: 1)
+    )
+);
+
+echo "OEM 1 - LSTM neural network:\n",
+    "- Modern deep learning approach\n",
+    "- Better accuracy for most documents\n\n";
+// OEM 3
+$config6c = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(oem: 3)
+    )
+);
+
+echo "OEM 3 - Default (recommended):\n",
+    "- Chooses best available engine\n",
+    "- Use this unless you have specific needs\n\n";
+
+echo "Example 7: Complete Invoice Processing Configuration\n",
+    "====================================================\n";
+// PSM 6, OEM 3 and table detection, with a whitelist of letters, digits and "$€£.,- :#/".
+$config7 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            psm: 6,
+            oem: 3,
+            enableTableDetection: true,
+            tesseditCharWhitelist: '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz$€£.,- :#/'
+        )
+    )
+);
+
+echo "Invoice-optimized configuration:\n",
+    "- PSM 6: Structured text\n",
+    "- Table detection: Enabled\n",
+    "- Character whitelist: Alphanumeric + currency + common symbols\n",
+    "- Best for: Invoices, receipts, financial documents\n\n";
+
+echo "Example 8: Complete Form Processing Configuration\n",
+    "=================================================\n";
+// PSM 6, OEM 3 and table detection, with a whitelist of letters, digits and ".,- @".
+$config8 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            psm: 6,
+            oem: 3,
+            enableTableDetection: true,
+            tesseditCharWhitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- @'
+        )
+    )
+);
+
+echo "Form-optimized configuration:\n",
+    "- PSM 6: Structured text\n",
+    "- Table detection: Enabled\n",
+    "- Character whitelist: Alphanumeric + common form characters\n",
+    "- Best for: Forms, applications, surveys\n\n";
+
+echo "Example 9: Sparse Text Configuration\n",
+    "====================================\n";
+// PSM 11 (sparse text) combined with OEM 3; no whitelist or table detection is set.
+$config9 = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            psm: 11,
+            oem: 3
+        )
+    )
+);
+
+echo "Sparse text configuration:\n",
+    "- PSM 11: Find scattered text\n",
+    "- Best for: Screenshots, signs, posters, sparse documents\n\n";
+
+echo "\nAll Page Segmentation Modes:\n";
+echo "============================\n";
+echo "0  = OSD only (orientation and script detection)\n";
+echo "1  = Automatic page segmentation with OSD\n";
+echo "2  = Automatic page segmentation (no OSD or OCR)\n";
+echo "3  = Fully automatic page segmentation (default)\n";
+echo "4  = Single column of variable-sized text\n";
+echo "5  = Single uniform block of vertically aligned text\n";
+echo "6  = Single uniform block of text (RECOMMENDED)\n";
+echo "7  = Single text line\n";
+echo "8  = Single word\n";
+echo "9  = Single word in a circle\n";
+echo "10 = Single character\n";
+echo "11 = Sparse text (RECOMMENDED for screenshots)\n";
+echo "12 = Sparse text with OSD\n";
+echo "13 = Raw line\n";
+
+echo "\n\nOCR Engine Modes:\n";
+echo "=================\n";
+echo "0 = Legacy engine only\n";
+echo "1 = LSTM neural network only\n";
+echo "2 = Legacy + LSTM\n";
+echo "3 = Default (RECOMMENDED)\n";
+// Item 1 says "recommended", not "defaults": the PSM table above lists 3 as the default mode.
+echo "\n\nBest Practices:\n";
+echo "===============\n";
+echo "1. Start with PSM 6 and OEM 3 (recommended)\n";
+echo "2. Use PSM 11 for sparse/scattered text\n";
+echo "3. Enable table detection for structured documents\n";
+echo "4. Use character whitelists for constrained input\n";
+echo "5. Use blacklists to exclude problem characters\n";
+echo "6. Test different PSM values if accuracy is poor\n";
+echo "7. Combine with image preprocessing for better results\n";
+```