Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/php/utils/chunking.php
+++ b/docs/snippets/php/utils/chunking.php
@@ -0,0 +1,48 @@
+```php title="chunking.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Text Chunking Configuration
+ *
+ * Configure document chunking for processing long texts into manageable pieces.
+ * Useful for RAG systems, embedding generation, and token limit management.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\ChunkingConfig;
+use Kreuzberg\Config\EmbeddingConfig;
+
+// Chunk the document (maxChars 1500, maxOverlap 200) with an embedding config per chunk.
+$config = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        maxChars: 1500,
+        maxOverlap: 200,
+        embedding: new EmbeddingConfig(model: 'balanced'),
+    ),
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document.pdf');
+$chunks = $result->chunks ?? []; // chunks may be null; treat that as "no chunks"
+
+echo "Chunking Results:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Total chunks created: " . count($chunks) . "\n\n";
+
+foreach ($chunks as $index => $chunk) {
+    // mb_* counts characters; strlen()/substr() count bytes and can cut a
+    // multi-byte UTF-8 character in half, corrupting the preview.
+    echo "Chunk " . ($index + 1) . ":\n";
+    echo "  Length: " . mb_strlen($chunk->content) . " characters\n";
+    echo "  Preview: " . mb_substr($chunk->content, 0, 100) . "...\n";
+    if ($chunk->embedding !== null) {
+        echo "  Embedding dimensions: " . count($chunk->embedding) . "\n";
+    }
+    echo "\n";
+}
+```
--- a/docs/snippets/php/utils/chunking_rag.php
+++ b/docs/snippets/php/utils/chunking_rag.php
@@ -0,0 +1,80 @@
+```php title="chunking_rag.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Chunking for RAG (Retrieval-Augmented Generation)
+ *
+ * Advanced chunking configuration optimized for RAG systems with embeddings.
+ * Demonstrates how to process documents into chunks with embeddings for
+ * vector database storage and semantic search.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\ChunkingConfig;
+use Kreuzberg\Config\EmbeddingConfig;
+
+// Small chunks (maxChars 500, maxOverlap 50); embeddings use normalize: true.
+$config = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        maxChars: 500,
+        maxOverlap: 50,
+        embedding: new EmbeddingConfig(model: 'balanced', normalize: true, batchSize: 16),
+    ),
+);
+
+$sourceFile = 'research_paper.pdf';
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile($sourceFile);
+
+echo "RAG Chunking Results:\n";
+echo str_repeat('=', 60) . "\n";
+
+$chunksWithEmbeddings = [];
+foreach ($result->chunks ?? [] as $chunk) {
+    if ($chunk->embedding !== null) {
+        $chunksWithEmbeddings[] = [
+            // mb_substr() cuts on character boundaries; substr() can split a UTF-8 character.
+            'content' => mb_substr($chunk->content, 0, 100) . '...',
+            'embedding_dims' => count($chunk->embedding),
+            'full_content' => $chunk->content,
+            'embedding' => $chunk->embedding,
+        ];
+    }
+}
+
+echo "Chunks with embeddings: " . count($chunksWithEmbeddings) . "\n\n";
+echo "Sample chunks for vector database:\n";
+echo str_repeat('=', 60) . "\n";
+
+foreach (array_slice($chunksWithEmbeddings, 0, 3) as $index => $chunk) {
+    echo "Chunk " . ($index + 1) . ":\n";
+    echo "  Content preview: {$chunk['content']}\n";
+    echo "  Embedding dimensions: {$chunk['embedding_dims']}\n";
+    echo "  Ready for vector DB: Yes\n\n";
+}
+
+// Hash the source name once instead of once per record; md5 only builds an ID prefix here.
+$docId = md5($sourceFile);
+$vectorDbRecords = array_map(
+    static fn(array $chunk, int $idx): array => [
+        'id' => sprintf('doc_%s_chunk_%d', $docId, $idx),
+        'content' => $chunk['full_content'],
+        'embedding' => $chunk['embedding'],
+        'metadata' => [
+            'source' => $sourceFile,
+            'chunk_index' => $idx,
+            'char_count' => mb_strlen($chunk['full_content']),
+        ],
+    ],
+    $chunksWithEmbeddings,
+    array_keys($chunksWithEmbeddings),
+);
+
+echo "Prepared " . count($vectorDbRecords) . " records for vector database\n";
+echo "Each record contains: id, content, embedding, and metadata\n";
+```
--- a/docs/snippets/php/utils/embedding_with_chunking.php
+++ b/docs/snippets/php/utils/embedding_with_chunking.php
@@ -0,0 +1,81 @@
+```php title="embedding_with_chunking.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Embedding Generation with Chunking
+ *
+ * Configure chunking with automatic embedding generation for each chunk.
+ * Ideal for semantic search, similarity matching, and vector databases.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\ChunkingConfig;
+use Kreuzberg\Config\EmbeddingConfig;
+
+$config = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        maxChars: 1024,
+        maxOverlap: 100,
+        embedding: new EmbeddingConfig(
+            model: 'balanced',
+            normalize: true,
+            batchSize: 32,
+            showDownloadProgress: false,
+        ),
+    ),
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document.pdf');
+$chunks = $result->chunks ?? []; // chunks may be null; normalize once for all checks below
+$chunkCount = count($chunks);
+
+echo "Embedding Generation Results:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Total chunks: " . $chunkCount . "\n\n";
+
+$chunksWithEmbeddings = 0;
+$embeddingDimensions = 0; // vector length of the last embedded chunk, not a running total
+foreach ($chunks as $chunk) {
+    if ($chunk->embedding !== null) {
+        $chunksWithEmbeddings++;
+        $embeddingDimensions = count($chunk->embedding);
+    }
+}
+
+echo "Chunks with embeddings: $chunksWithEmbeddings\n";
+echo "Embedding dimensions: $embeddingDimensions\n";
+// $chunksWithEmbeddings > 0 implies $chunkCount > 0, so the division is safe.
+echo "Coverage: " . ($chunksWithEmbeddings > 0
+    ? sprintf("%.1f%%", ($chunksWithEmbeddings / $chunkCount) * 100)
+    : "0%") . "\n\n";
+
+$sampleChunk = $chunks[0] ?? null;
+if ($sampleChunk !== null && $sampleChunk->embedding !== null) {
+    echo "Sample Chunk:\n";
+    echo str_repeat('=', 60) . "\n";
+    // mb_* works on characters; strlen()/substr() work on bytes and can split UTF-8 text.
+    echo "Content preview: " . mb_substr($sampleChunk->content, 0, 150) . "...\n";
+    echo "Content length: " . mb_strlen($sampleChunk->content) . " chars\n";
+    echo "Embedding dimensions: " . count($sampleChunk->embedding) . "\n";
+    echo "First 5 embedding values: [";
+    echo implode(', ', array_map(
+        static fn($v): string => number_format($v, 4),
+        array_slice($sampleChunk->embedding, 0, 5),
+    ));
+    echo ", ...]\n\n";
+}
+
+if ($chunkCount > 0) {
+    $totalChars = array_sum(array_map(
+        static fn($chunk): int => mb_strlen($chunk->content),
+        $chunks,
+    ));
+    echo "Average chunk size: " . round($totalChars / $chunkCount) . " characters\n";
+}
+```
--- a/docs/snippets/php/utils/error_handling.php
+++ b/docs/snippets/php/utils/error_handling.php
@@ -0,0 +1,114 @@
+```php title="error_handling.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Comprehensive Error Handling
+ *
+ * Demonstrate proper error handling for document extraction operations.
+ * Shows how to catch and handle different types of Kreuzberg exceptions.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Exceptions\KreuzbergException;
+use Kreuzberg\Exceptions\ParsingException;
+use Kreuzberg\Exceptions\OcrException;
+use Kreuzberg\Exceptions\ValidationException;
+
+$kreuzberg = new Kreuzberg();
+
+// 1) File extraction: specific exception types are caught before KreuzbergException.
+try {
+    $result = $kreuzberg->extractFile('document.pdf');
+    printf("Extracted %d characters\n", strlen($result->content));
+} catch (ParsingException $e) {
+    echo "Failed to parse document: {$e->getMessage()}\n";
+    echo "Error code: {$e->getCode()}\n";
+} catch (OcrException $e) {
+    echo "OCR processing failed: {$e->getMessage()}\n";
+    echo "Suggestion: Check if document is scanned and OCR is properly configured\n";
+} catch (KreuzbergException $e) {
+    echo "Extraction error: {$e->getMessage()}\n";
+    $cause = $e->getPrevious();
+    if ($cause !== null) {
+        echo "Caused by: {$cause->getMessage()}\n";
+    }
+}
+
+// 2) In-memory extraction: read the bytes ourselves, then pass them to extractBytes().
+try {
+    $config = new ExtractionConfig();
+    $pdfBytes = file_get_contents('sample.pdf');
+    if ($pdfBytes === false) {
+        throw new \RuntimeException('Failed to read file');
+    }
+    $result = $kreuzberg->extractBytes($pdfBytes, 'application/pdf', $config);
+    echo "Extracted from bytes: " . substr($result->content, 0, 100) . "...\n";
+} catch (ValidationException $e) {
+    echo "Invalid configuration or input: {$e->getMessage()}\n";
+    echo "Details: {$e->getFile()} at line {$e->getLine()}\n";
+} catch (OcrException $e) {
+    echo "OCR failed: {$e->getMessage()}\n";
+} catch (KreuzbergException $e) {
+    echo "Extraction failed: {$e->getMessage()}\n";
+} catch (\RuntimeException $e) {
+    echo "File system error: {$e->getMessage()}\n";
+}
+
+// 3) Batch extraction: a failing file is recorded and the loop carries on.
+$files = ['doc1.pdf', 'corrupted.pdf', 'doc3.docx'];
+$successfulExtractions = [];
+$failedExtractions = [];
+foreach ($files as $file) {
+    try {
+        $successfulExtractions[$file] = $kreuzberg->extractFile($file);
+        echo "Success: $file\n";
+    } catch (KreuzbergException $e) {
+        $failedExtractions[$file] = [
+            'error' => $e->getMessage(),
+            'type' => $e::class,
+        ];
+        echo "Failed: $file - {$e->getMessage()}\n";
+    }
+}
+
+echo "\nResults:\n";
+echo "Successful: " . count($successfulExtractions) . "\n";
+echo "Failed: " . count($failedExtractions) . "\n";
+
+/**
+ * Extract a file, retrying (with a one-second pause) only when OCR fails.
+ * Any other KreuzbergException aborts at once; failure is reported as null.
+ */
+function extractWithRetry(
+    Kreuzberg $kreuzberg,
+    string $file,
+    int $maxRetries = 3
+): ?\Kreuzberg\Result\ExtractionResult {
+    for ($attempt = 1; $attempt <= $maxRetries; $attempt++) {
+        try {
+            return $kreuzberg->extractFile($file);
+        } catch (OcrException $e) {
+            if ($attempt === $maxRetries) {
+                echo "OCR failed after $maxRetries attempts: " . $e->getMessage() . "\n";
+                return null;
+            }
+            echo "OCR attempt $attempt failed, retrying...\n";
+            sleep(1);
+        } catch (KreuzbergException $e) {
+            echo "Fatal error (no retry): " . $e->getMessage() . "\n";
+            return null;
+        }
+    }
+    return null;
+}
+
+$result = extractWithRetry($kreuzberg, 'difficult_scan.pdf');
+if ($result !== null) {
+    printf("Successfully extracted with retry: %d chars\n", strlen($result->content));
+}
+```
--- a/docs/snippets/php/utils/error_handling_extract.php
+++ b/docs/snippets/php/utils/error_handling_extract.php
@@ -0,0 +1,160 @@
+```php title="error_handling_extract.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Error Handling for HTTP/API Extraction
+ *
+ * Demonstrate error handling when using Kreuzberg extraction via HTTP API.
+ * Shows how to properly handle HTTP errors and API response errors.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use GuzzleHttp\Client;
+use GuzzleHttp\Exception\RequestException;
+use GuzzleHttp\Exception\ClientException;
+use GuzzleHttp\Exception\ServerException;
+
+/**
+ * Extract a document by uploading it to the HTTP API.
+ *
+ * Every failure is reported on stdout and mapped to a null return value.
+ *
+ * @param string $filePath Path to the document file
+ * @param string $apiUrl API endpoint URL
+ * @return array|null Decoded JSON response, or null on any error
+ */
+function extractViaApi(string $filePath, string $apiUrl = 'http://localhost:8000/extract'): ?array
+{
+    $client = new Client([
+        'timeout' => 30.0,
+        'connect_timeout' => 5.0,
+    ]);
+
+    try {
+        if (!file_exists($filePath)) {
+            throw new \RuntimeException("File not found: $filePath");
+        }
+        $handle = fopen($filePath, 'r');
+        if ($handle === false) {
+            throw new \RuntimeException("Unable to open file: $filePath");
+        }
+
+        $response = $client->post($apiUrl, [
+            'multipart' => [
+                ['name' => 'files', 'contents' => $handle, 'filename' => basename($filePath)],
+            ],
+        ]);
+
+        $results = json_decode($response->getBody()->getContents(), true);
+        if (json_last_error() !== JSON_ERROR_NONE) {
+            throw new \RuntimeException('Invalid JSON response: ' . json_last_error_msg());
+        }
+        // Valid JSON is not necessarily an array ("null", "42", ...); count() and the
+        // ?array return type would both raise a TypeError on such a payload.
+        if (!is_array($results)) {
+            throw new \RuntimeException('Unexpected JSON response: expected an array');
+        }
+
+        echo "Success: Extracted " . count($results) . " documents\n";
+        return $results;
+    } catch (ClientException $e) {
+        $response = $e->getResponse();
+        $statusCode = $response->getStatusCode();
+        $body = json_decode($response->getBody()->getContents(), true);
+        if (!is_array($body)) {
+            $body = []; // non-JSON error body: fall back to the defaults below
+        }
+
+        echo "Client Error ($statusCode): " . ($body['error_type'] ?? 'Unknown') . "\n";
+        echo "Message: " . ($body['message'] ?? 'No message provided') . "\n";
+        if (isset($body['details'])) {
+            echo "Details: " . json_encode($body['details']) . "\n";
+        }
+        return null;
+    } catch (ServerException $e) {
+        echo "Server Error ({$e->getResponse()->getStatusCode()}): " . $e->getMessage() . "\n";
+        echo "The API server encountered an error. Please try again later.\n";
+        return null;
+    } catch (\GuzzleHttp\Exception\ConnectException $e) {
+        // In Guzzle 7 ConnectException no longer extends RequestException, so it
+        // needs its own branch to get the "server not running" hint.
+        echo "Request Error: " . $e->getMessage() . "\n";
+        echo "No response received - check if the API server is running\n";
+        return null;
+    } catch (RequestException $e) {
+        echo "Request Error: " . $e->getMessage() . "\n";
+        if ($e->hasResponse()) {
+            echo "Response code: " . $e->getResponse()->getStatusCode() . "\n";
+        } else {
+            echo "No response received - check if the API server is running\n";
+        }
+        return null;
+    } catch (\RuntimeException $e) {
+        echo "Runtime Error: " . $e->getMessage() . "\n";
+        return null;
+    }
+}
+
+// Run one extraction through the API helper and summarise every returned document.
+echo "Attempting to extract document via API...\n";
+echo str_repeat('=', 60) . "\n";
+
+$result = extractViaApi('document.pdf');
+
+if ($result === null) {
+    echo "\nExtraction failed. Check the error messages above.\n";
+} else {
+    foreach ($result as $document) {
+        $length = strlen($document['content'] ?? '');
+        $mime = $document['mime_type'] ?? 'unknown';
+
+        echo "\nDocument extracted:\n";
+        echo "  Content length: {$length} characters\n";
+        echo "  MIME type: {$mime}\n";
+        if (isset($document['metadata'])) {
+            echo "  Metadata keys: " . implode(', ', array_keys($document['metadata'])) . "\n";
+        }
+    }
+}
+
+/**
+ * Call extractViaApi() up to $maxRetries times, doubling the pause after each failure.
+ * Returns the first non-null result, or null once every attempt has failed.
+ */
+function extractWithRetry(
+    string $filePath,
+    string $apiUrl = 'http://localhost:8000/extract',
+    int $maxRetries = 3,
+    float $initialDelay = 1.0
+): ?array {
+    $delay = $initialDelay;
+
+    for ($attempt = 1; $attempt <= $maxRetries; $attempt++) {
+        $result = extractViaApi($filePath, $apiUrl);
+        if ($result !== null) {
+            return $result;
+        }
+        if ($attempt === $maxRetries) {
+            break; // no pause after the final attempt
+        }
+
+        echo "\nRetrying in " . number_format($delay, 1) . " seconds... (Attempt " . ($attempt + 1) . "/$maxRetries)\n";
+        usleep((int)($delay * 1000000));
+        $delay *= 2;
+    }
+
+    echo "\nFailed after $maxRetries attempts\n";
+    return null;
+}
+
+$separator = str_repeat('=', 60) . "\n";
+echo "\n" . $separator . "Extracting with retry logic...\n" . $separator;
+
+$resultWithRetry = extractWithRetry('document.pdf', 'http://localhost:8000/extract');
+if ($resultWithRetry !== null) {
+    echo "\nSuccessfully extracted with retry mechanism\n";
+}
+```
--- a/docs/snippets/php/utils/image_extraction.php
+++ b/docs/snippets/php/utils/image_extraction.php
@@ -0,0 +1,134 @@
+```php title="image_extraction.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Image Extraction from Documents
+ *
+ * Extract embedded images from PDF and other document formats.
+ * Demonstrates saving images, analyzing metadata, and processing image data.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\PdfConfig;
+use Kreuzberg\Result\ExtractedImage;
+
+// extractImages is switched on both on the ExtractionConfig and on the PdfConfig.
+$config = new ExtractionConfig(
+    extractImages: true,
+    pdf: new PdfConfig(extractImages: true, imageQuality: 90),
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document_with_images.pdf');
+
+echo "Image Extraction Results:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Total images extracted: " . count($result->images ?? []) . "\n\n";
+
+$outputDir = './extracted_images';
+// mkdir() returns false on failure; stop here rather than failing on every
+// file_put_contents() later. The second is_dir() tolerates a concurrent creator.
+if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
+    throw new \RuntimeException("Unable to create output directory: $outputDir");
+}
+
+foreach ($result->images ?? [] as $index => $image) {
+    echo "Image " . ($index + 1) . ":\n";
+    echo str_repeat('-', 40) . "\n";
+
+    // Resolve the format once so the file extension and the reported format
+    // always agree, even when the image carries no format.
+    $format = $image->format ?? 'png';
+    $filename = sprintf(
+        'page_%d_image_%d.%s',
+        $image->pageNumber ?? 0,
+        $image->imageIndex ?? $index,
+        $format,
+    );
+    $bytesWritten = file_put_contents($outputDir . '/' . $filename, $image->data);
+    if ($bytesWritten === false) {
+        echo "  Error: Failed to save image\n\n";
+        continue;
+    }
+    echo "  Saved: $filename\n";
+    echo "  Size: {$image->width}x{$image->height} pixels\n";
+    echo "  Format: {$format}\n";
+    echo "  File size: " . number_format($bytesWritten) . " bytes\n";
+    echo "  Page: " . ($image->pageNumber ?? 'N/A') . "\n";
+
+    if ($image->width > 0 && $image->height > 0) {
+        echo "  Aspect ratio: " . number_format($image->width / $image->height, 2) . ":1\n";
+        // Sides within 10px of each other count as square.
+        $orientation = match (true) {
+            abs($image->width - $image->height) < 10 => 'Square',
+            $image->width > $image->height => 'Landscape',
+            default => 'Portrait',
+        };
+        echo "  Orientation: $orientation\n";
+    }
+
+    echo "\n";
+}
+
+echo "Image Analysis:\n";
+echo str_repeat('=', 60) . "\n";
+
+// A single pass over the images gathers every statistic reported below.
+$images = $result->images ?? [];
+if (!empty($images)) {
+    $largeCount = 0;
+    $totalBytes = 0;
+    $totalWidth = 0;
+    $totalHeight = 0;
+    $formatCounts = [];
+
+    foreach ($images as $image) {
+        if ($image->width > 800 || $image->height > 800) {
+            $largeCount++;
+        }
+        $totalBytes += strlen($image->data);
+        $totalWidth += $image->width;
+        $totalHeight += $image->height;
+        $format = $image->format ?? 'unknown';
+        $formatCounts[$format] = ($formatCounts[$format] ?? 0) + 1;
+    }
+    $imageCount = count($images);
+
+    echo "Large images (>800px): " . $largeCount . "\n";
+    echo "Total image data: " . number_format($totalBytes / 1024, 2) . " KB\n";
+
+    echo "\nImages by format:\n";
+    foreach ($formatCounts as $format => $count) {
+        echo "  $format: $count\n";
+    }
+
+    echo "\nAverage dimensions: " .
+        round($totalWidth / $imageCount) . "x" .
+        round($totalHeight / $imageCount) . " pixels\n";
+}
+
+/**
+ * Describe the aspect-preserving thumbnail for images wider than $maxWidth (null if it already fits).
+ */
+function createThumbnail(ExtractedImage $image, int $maxWidth = 200): ?string
+{
+    if ($image->width > $maxWidth) {
+        $newHeight = (int)($image->height * ($maxWidth / $image->width));
+        return "Thumbnail would be: {$maxWidth}x{$newHeight}";
+    }
+    return null;
+}
+
+echo "\nThumbnail recommendations:\n";
+foreach ($result->images ?? [] as $position => $image) {
+    $description = createThumbnail($image, 200);
+    if ($description !== null) {
+        printf("  Image %d: %s\n", $position + 1, $description);
+    }
+}
+```
--- a/docs/snippets/php/utils/image_preprocessing.php
+++ b/docs/snippets/php/utils/image_preprocessing.php
@@ -0,0 +1,187 @@
+```php title="image_preprocessing.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Image Preprocessing for OCR
+ *
+ * Configure image preprocessing settings to improve OCR accuracy on scanned documents.
+ * Demonstrates various preprocessing techniques like denoising, deskewing, and contrast enhancement.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\OcrConfig;
+use Kreuzberg\Config\TesseractConfig;
+use Kreuzberg\Config\ImagePreprocessingConfig;
+
+// Basic run: targetDpi 300 with denoise, deskew, contrast enhancement and 'otsu' binarization.
+$basicPreprocessing = new ImagePreprocessingConfig(
+    targetDpi: 300,
+    denoise: true,
+    deskew: true,
+    contrastEnhance: true,
+    binarizationMethod: 'otsu'
+);
+$config = new ExtractionConfig(
+    ocr: new OcrConfig(
+        tesseractConfig: new TesseractConfig(preprocessing: $basicPreprocessing)
+    )
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('scanned.pdf');
+
+echo "OCR with Image Preprocessing:\n";
+echo str_repeat('=', 60) . "\n";
+printf("Content extracted: %d characters\n", strlen($result->content));
+echo "Preview: " . substr($result->content, 0, 100) . "...\n\n";
+
+// Advanced run: explicit backend and language plus every preprocessing option.
+$advancedPreprocessing = new ImagePreprocessingConfig(
+    targetDpi: 600,
+    denoise: true,
+    deskew: true,
+    contrastEnhance: true,
+    binarizationMethod: 'adaptive',
+    sharpen: true,
+    removeBackground: true
+);
+$advancedConfig = new ExtractionConfig(
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng',
+        tesseractConfig: new TesseractConfig(
+            preprocessing: $advancedPreprocessing,
+            pageSegmentationMode: 3,
+            engineMode: 3
+        )
+    )
+);
+
+$kreuzberg = new Kreuzberg($advancedConfig);
+$result = $kreuzberg->extractFile('poor_quality_scan.pdf');
+
+echo "Advanced Preprocessing Results:\n";
+echo str_repeat('=', 60) . "\n";
+printf("Content length: %d characters\n", strlen($result->content));
+
+// Quality figures are only reported when the result exposes metadata.
+if (isset($result->metadata)) {
+    $qualityScore = $result->qualityScore ?? null;
+    $confidence = $result->metadata['ocr_confidence'] ?? null;
+    if ($qualityScore !== null) {
+        echo "Quality score: " . number_format($qualityScore, 2) . "\n";
+    }
+    if ($qualityScore !== null && $qualityScore < 0.5) {
+        echo "Warning: Low quality extraction detected\n",
+            "Recommendations:\n",
+            "  - Increase target DPI (current: 600)\n",
+            "  - Try different binarization method\n",
+            "  - Consider rescanning the original document\n";
+    }
+    if ($confidence !== null) {
+        echo "OCR confidence: " . number_format($confidence * 100, 1) . "%\n";
+    }
+}
+
+echo "\n";
+
+// Presets keyed by profile name; each one is timed against the same scan below.
+$preprocessingProfiles = [
+    // Every clean-up step switched off.
+    'basic' => new ImagePreprocessingConfig(
+        targetDpi: 300,
+        denoise: false,
+        deskew: false,
+        contrastEnhance: false
+    ),
+    // Denoise, deskew and contrast enhancement with 'otsu' binarization.
+    'balanced' => new ImagePreprocessingConfig(
+        targetDpi: 300,
+        denoise: true,
+        deskew: true,
+        contrastEnhance: true,
+        binarizationMethod: 'otsu'
+    ),
+    // Every option switched on, at double the DPI.
+    'aggressive' => new ImagePreprocessingConfig(
+        targetDpi: 600,
+        denoise: true,
+        deskew: true,
+        contrastEnhance: true,
+        binarizationMethod: 'adaptive',
+        sharpen: true,
+        removeBackground: true
+    ),
+];
+
+echo "Preprocessing Profile Comparison:\n";
+echo str_repeat('=', 60) . "\n";
+
+$yesNo = static fn($flag): string => $flag ? 'Yes' : 'No';
+
+foreach ($preprocessingProfiles as $profileName => $preprocessing) {
+    $tesseract = new TesseractConfig(preprocessing: $preprocessing);
+    $kreuzberg = new Kreuzberg(new ExtractionConfig(ocr: new OcrConfig(tesseractConfig: $tesseract)));
+
+    // Wall-clock time of the extraction call only; object construction is excluded.
+    $startTime = microtime(true);
+    $result = $kreuzberg->extractFile('sample_scan.pdf');
+    $elapsedTime = microtime(true) - $startTime;
+
+    echo ucfirst($profileName) . " profile:\n";
+    printf("  Content length: %d characters\n", strlen($result->content));
+    echo "  Processing time: " . number_format($elapsedTime, 3) . " seconds\n";
+    echo "  Settings:\n";
+    echo "    - DPI: {$preprocessing->targetDpi}\n";
+    echo "    - Denoise: " . $yesNo($preprocessing->denoise) . "\n";
+    echo "    - Deskew: " . $yesNo($preprocessing->deskew) . "\n";
+    echo "    - Binarization: " . ($preprocessing->binarizationMethod ?? 'None') . "\n";
+    echo "\n";
+}
+
+/**
+ * Recommend a preprocessing preset for a kind of scanned document.
+ *
+ * Recognised types are 'modern_scan', 'old_document' and 'newspaper'; any
+ * other value falls back to the default preset.
+ */
+function recommendPreprocessingSettings(string $documentType): ImagePreprocessingConfig
+{
+    if ($documentType === 'modern_scan') {
+        return new ImagePreprocessingConfig(
+            targetDpi: 300, denoise: true, deskew: true,
+            contrastEnhance: false, binarizationMethod: 'otsu'
+        );
+    }
+
+    if ($documentType === 'old_document') {
+        return new ImagePreprocessingConfig(
+            targetDpi: 600, denoise: true, deskew: true,
+            contrastEnhance: true, binarizationMethod: 'adaptive', removeBackground: true
+        );
+    }
+
+    if ($documentType === 'newspaper') {
+        return new ImagePreprocessingConfig(
+            targetDpi: 400, denoise: true, deskew: true,
+            contrastEnhance: true, binarizationMethod: 'sauvola', removeBackground: true
+        );
+    }
+
+    // Fallback preset for any unrecognised document type.
+    return new ImagePreprocessingConfig(
+        targetDpi: 300, denoise: true, deskew: true,
+        contrastEnhance: true, binarizationMethod: 'otsu'
+    );
+}
+
+echo "Recommended preprocessing for old documents:\n";
+$recommended = recommendPreprocessingSettings('old_document');
+printf("  Target DPI: %s\n", $recommended->targetDpi);
+printf("  Binarization: %s\n", $recommended->binarizationMethod);
+```
--- a/docs/snippets/php/utils/keyword_extraction_example.php
+++ b/docs/snippets/php/utils/keyword_extraction_example.php
@@ -0,0 +1,200 @@
+```php title="keyword_extraction_example.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Keyword Extraction Example
+ *
+ * Extract keywords from documents using various algorithms.
+ * Demonstrates automatic keyword detection for document analysis and indexing.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\KeywordConfig;
+use Kreuzberg\Enums\KeywordAlgorithm;
+
+// Single-document run: YAKE with maxKeywords 10 and minScore 0.3.
+$yakeSettings = new KeywordConfig(
+    algorithm: KeywordAlgorithm::YAKE,
+    maxKeywords: 10,
+    minScore: 0.3
+);
+$config = new ExtractionConfig(keywords: $yakeSettings);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('research_paper.pdf');
+
+echo "Keyword Extraction Results:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Document: research_paper.pdf\n";
+printf("Content length: %d characters\n\n", strlen($result->content));
+
+$keywords = $result->metadata['keywords'] ?? [];
+
+if (empty($keywords)) {
+    echo "No keywords extracted. Try adjusting minScore or maxKeywords.\n\n";
+} else {
+    echo "Extracted Keywords:\n";
+    echo str_repeat('-', 40) . "\n";
+
+    foreach ($keywords as $keyword) {
+        // Missing fields fall back to an empty text, a zero score and no frequency.
+        $text = $keyword['text'] ?? '';
+        $score = $keyword['score'] ?? 0.0;
+        $frequency = $keyword['frequency'] ?? null;
+
+        $line = sprintf("  %-30s  Score: %.3f", $text, $score);
+        if ($frequency !== null) {
+            $line .= sprintf("  (appears %d times)", $frequency);
+        }
+        echo $line . "\n";
+    }
+    echo "\n";
+}
+
+// Compare three algorithms on article.pdf (display label => enum case).
+$algorithms = [
+    'YAKE' => KeywordAlgorithm::YAKE,
+    'TextRank' => KeywordAlgorithm::TEXT_RANK,
+    'TF-IDF' => KeywordAlgorithm::TF_IDF,
+];
+
+echo "Algorithm Comparison:\n";
+echo str_repeat('=', 60) . "\n";
+
+foreach ($algorithms as $name => $algorithm) {
+    // Each algorithm runs with maxKeywords 5 and minScore 0.2.
+    $algoSettings = new KeywordConfig(
+        algorithm: $algorithm,
+        maxKeywords: 5,
+        minScore: 0.2
+    );
+    $algoConfig = new ExtractionConfig(keywords: $algoSettings);
+
+    $kreuzberg = new Kreuzberg($algoConfig);
+    $result = $kreuzberg->extractFile('article.pdf');
+
+    $keywords = $result->metadata['keywords'] ?? [];
+
+    echo "$name algorithm:\n";
+
+    if (empty($keywords)) {
+        echo "  No keywords extracted\n";
+    } else {
+        foreach ($keywords as $keyword) {
+            echo "  - {$keyword['text']} ({$keyword['score']})\n";
+        }
+    }
+
+    echo "\n";
+}
+
+/**
+ * Guess a document category by summing, per category, the scores of keywords
+ * that contain one of its terms. Returns 'uncategorized' when nothing matches.
+ *
+ * @param array<int, array{text?: string, score?: float}> $keywords
+ */
+function categorizeDocument(array $keywords): string
+{
+    $categories = [
+        'technical' => ['algorithm', 'system', 'implementation', 'performance', 'architecture'],
+        'business' => ['revenue', 'market', 'customer', 'strategy', 'investment'],
+        'scientific' => ['research', 'study', 'analysis', 'experiment', 'hypothesis'],
+        'legal' => ['contract', 'agreement', 'liability', 'clause', 'provision'],
+    ];
+    $scores = array_fill_keys(array_keys($categories), 0);
+    foreach ($keywords as $keyword) {
+        $keywordText = strtolower($keyword['text'] ?? '');
+        foreach ($categories as $category => $terms) {
+            foreach ($terms as $term) {
+                if (str_contains($keywordText, $term)) {
+                    $scores[$category] += $keyword['score'] ?? 0.0;
+                }
+            }
+        }
+    }
+    arsort($scores);
+    $topCategory = array_key_first($scores);
+    // Without this check an all-zero score table would report the first category.
+    return $scores[$topCategory] > 0 ? $topCategory : 'uncategorized';
+}
+
+// NOTE(review): $keywords still holds the keywords of the last algorithm-comparison
+// run, so this categorizes that run's document - confirm that is intended.
+if (!empty($keywords)) {
+    $category = categorizeDocument($keywords);
+    echo "Document Category: " . ucfirst($category) . "\n\n";
+}
+
+// Files that do not exist are reported and skipped by the loop below.
+$documents = [
+    'tech_article.pdf',
+    'business_report.pdf',
+    'research_paper.pdf',
+];
+
+// One shared configuration for the whole batch.
+$keywordConfig = new ExtractionConfig(
+    keywords: new KeywordConfig(
+        algorithm: KeywordAlgorithm::YAKE,
+        maxKeywords: 8,
+        minScore: 0.25
+    )
+);
+
+$kreuzberg = new Kreuzberg($keywordConfig);
+
+echo "Batch Keyword Extraction:\n";
+echo str_repeat('=', 60) . "\n";
+
+// Inverted index: lower-cased keyword => [document name => true]. It is filled in
+// the same pass that prints the batch report, so every file is extracted once.
+$keywordIndex = [];
+
+foreach ($documents as $document) {
+    if (!file_exists($document)) {
+        echo "$document: File not found\n\n";
+        continue;
+    }
+
+    // Extract once; both the report and the index use this result.
+    $result = $kreuzberg->extractFile($document);
+    $keywords = $result->metadata['keywords'] ?? [];
+    $documentName = basename($document);
+
+    echo $documentName . ":\n";
+
+    if (!empty($keywords)) {
+        $topKeywords = array_slice($keywords, 0, 5);
+        $keywordTexts = array_column($topKeywords, 'text');
+        echo "  Top keywords: " . implode(', ', $keywordTexts) . "\n";
+
+        $category = categorizeDocument($keywords);
+        echo "  Category: " . ucfirst($category) . "\n";
+    } else {
+        echo "  No keywords extracted\n";
+    }
+
+    echo "\n";
+
+    foreach ($keywords as $keyword) {
+        // Keying by document name de-duplicates a keyword that repeats in one file.
+        $text = mb_strtolower($keyword['text'] ?? '');
+        $keywordIndex[$text][$documentName] = true;
+    }
+}
+
+echo "Keyword Index (for search):\n";
+echo str_repeat('=', 60) . "\n";
+
+// preserve_keys: true keeps numeric keywords such as "2024" as keys; without it
+// array_slice() renumbers integer-like keys to 0, 1, 2, ...
+foreach (array_slice($keywordIndex, 0, 10, true) as $keyword => $docs) {
+    echo "$keyword: " . implode(', ', array_keys($docs)) . "\n";
+}
+```
--- a/docs/snippets/php/utils/language_detection.php
+++ b/docs/snippets/php/utils/language_detection.php
@@ -0,0 +1,197 @@
+```php title="language_detection.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Language Detection
+ *
+ * Automatically detect the language of extracted document content.
+ * Useful for routing documents to language-specific processing pipelines.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\LanguageDetectionConfig;
+
+$config = new ExtractionConfig(
+    languageDetection: new LanguageDetectionConfig(
+        enabled: true,
+        minConfidence: 0.9,
+        detectMultiple: true
+    )
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document.pdf');
+
+echo "Language Detection Results:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Document: document.pdf\n";
+echo "Content length: " . strlen($result->content) . " characters\n\n";
+
+$detectedLanguages = $result->detectedLanguages ?? [];
+
+if (!empty($detectedLanguages)) {
+    echo "Detected languages: " . implode(', ', $detectedLanguages) . "\n";
+
+    $primaryLanguage = $detectedLanguages[0];
+    echo "Primary language: $primaryLanguage\n\n";
+
+    if (isset($result->metadata['language_confidence'])) {
+        echo "Language confidence scores:\n";
+        foreach ($result->metadata['language_confidence'] as $lang => $confidence) {
+            echo sprintf("  %-10s: %.1f%%\n", $lang, $confidence * 100);
+        }
+        echo "\n";
+    }
+} else {
+    echo "No language detected or confidence too low.\n";
+    echo "Try lowering minConfidence threshold.\n\n";
+}
+
+if (!empty($detectedLanguages)) {
+    $primaryLanguage = $detectedLanguages[0];
+
+    match ($primaryLanguage) {
+        'en', 'eng' => print("Processing as English document...\n"),
+        'es', 'spa' => print("Processing as Spanish document...\n"),
+        'fr', 'fra' => print("Processing as French document...\n"),
+        'de', 'deu' => print("Processing as German document...\n"),
+        'zh', 'zho' => print("Processing as Chinese document...\n"),
+        default => print("Processing as $primaryLanguage document...\n"),
+    };
+}
+
+echo "\n" . str_repeat('=', 60) . "\n";
+echo "Testing Different Confidence Thresholds:\n";
+echo str_repeat('=', 60) . "\n";
+
+$thresholds = [0.5, 0.7, 0.9, 0.95];
+
+foreach ($thresholds as $threshold) {
+    $thresholdConfig = new ExtractionConfig(
+        languageDetection: new LanguageDetectionConfig(
+            enabled: true,
+            minConfidence: $threshold,
+            detectMultiple: true
+        )
+    );
+
+    $kreuzberg = new Kreuzberg($thresholdConfig);
+    $result = $kreuzberg->extractFile('document.pdf');
+
+    $languages = $result->detectedLanguages ?? [];
+
+    echo sprintf("Threshold %.2f: ", $threshold);
+    if (!empty($languages)) {
+        echo implode(', ', $languages) . "\n";
+    } else {
+        echo "No languages detected\n";
+    }
+}
+
+/**
+ * Map a language code to a human-readable English name.
+ *
+ * Both two-letter codes ('en') and their three-letter counterparts ('eng') are
+ * recognised, so the names stay consistent with the code pairs this script
+ * accepts when routing documents.
+ *
+ * @param string $code Language code as reported by language detection.
+ * @return string The language name, or the code with its first character
+ *                upper-cased when the code is not in the table.
+ */
+function getLanguageName(string $code): string
+{
+    $languageNames = [
+        'en' => 'English',
+        'es' => 'Spanish',
+        'fr' => 'French',
+        'de' => 'German',
+        'it' => 'Italian',
+        'pt' => 'Portuguese',
+        'ru' => 'Russian',
+        'zh' => 'Chinese',
+        'ja' => 'Japanese',
+        'ko' => 'Korean',
+        'ar' => 'Arabic',
+        'hi' => 'Hindi',
+        'nl' => 'Dutch',
+        'pl' => 'Polish',
+        'tr' => 'Turkish',
+    ];
+
+    // Three-letter codes resolve to the two-letter key used by the name table.
+    $threeLetterAliases = [
+        'eng' => 'en',
+        'spa' => 'es',
+        'fra' => 'fr',
+        'deu' => 'de',
+        'ita' => 'it',
+        'por' => 'pt',
+        'rus' => 'ru',
+        'zho' => 'zh',
+        'jpn' => 'ja',
+        'kor' => 'ko',
+        'ara' => 'ar',
+        'hin' => 'hi',
+        'nld' => 'nl',
+        'pol' => 'pl',
+        'tur' => 'tr',
+    ];
+
+    $key = $threeLetterAliases[$code] ?? $code;
+
+    return $languageNames[$key] ?? ucfirst($code);
+}
+
+echo "\n" . str_repeat('=', 60) . "\n";
+echo "Detected Languages (Full Names):\n";
+echo str_repeat('=', 60) . "\n";
+
+if (!empty($detectedLanguages)) {
+    foreach ($detectedLanguages as $langCode) {
+        echo "  - " . getLanguageName($langCode) . " ($langCode)\n";
+    }
+} else {
+    echo "No languages detected.\n";
+}
+
+$documents = [
+    'english_doc.pdf',
+    'spanish_doc.pdf',
+    'german_doc.pdf',
+];
+
+echo "\n" . str_repeat('=', 60) . "\n";
+echo "Batch Language Detection:\n";
+echo str_repeat('=', 60) . "\n";
+
+$detectionConfig = new ExtractionConfig(
+    languageDetection: new LanguageDetectionConfig(
+        enabled: true,
+        minConfidence: 0.8,
+        detectMultiple: false  
+    )
+);
+
+$kreuzberg = new Kreuzberg($detectionConfig);
+
+foreach ($documents as $document) {
+    if (!file_exists($document)) {
+        echo basename($document) . ": File not found\n";
+        continue;
+    }
+
+    $result = $kreuzberg->extractFile($document);
+    $languages = $result->detectedLanguages ?? [];
+
+    echo basename($document) . ": ";
+
+    if (!empty($languages)) {
+        $primaryLang = $languages[0];
+        echo getLanguageName($primaryLang) . " ($primaryLang)\n";
+    } else {
+        echo "Language not detected\n";
+    }
+}
+
+/**
+ * Pick a processing queue name from the first detected language.
+ *
+ * @param string $filePath Path of the document being routed (not consulted by the lookup).
+ * @param array $detectedLanguages Detected language codes; the first entry decides the queue.
+ * @return string 'default_queue' when the list is empty, a dedicated queue for
+ *                known codes, and 'multilingual_queue' for any other code.
+ */
+function routeDocumentByLanguage(string $filePath, array $detectedLanguages): string
+{
+    if (empty($detectedLanguages)) {
+        return 'default_queue';
+    }
+
+    // Queue name => language codes served by that queue.
+    $codesByQueue = [
+        'english_processing_queue' => ['en', 'eng'],
+        'spanish_processing_queue' => ['es', 'spa'],
+        'french_processing_queue' => ['fr', 'fra'],
+        'german_processing_queue' => ['de', 'deu'],
+        'cjk_processing_queue' => ['zh', 'zho', 'ja', 'jpn', 'ko', 'kor'],
+        'rtl_processing_queue' => ['ar', 'ara', 'he', 'heb'],
+    ];
+
+    $leadCode = $detectedLanguages[0];
+
+    foreach ($codesByQueue as $queueName => $codes) {
+        // Strict comparison, so only exact string matches select a queue.
+        if (in_array($leadCode, $codes, true)) {
+            return $queueName;
+        }
+    }
+
+    return 'multilingual_queue';
+}
+
+echo "\n" . str_repeat('=', 60) . "\n";
+echo "Document Routing Based on Language:\n";
+echo str_repeat('=', 60) . "\n";
+
+if (!empty($detectedLanguages)) {
+    $queue = routeDocumentByLanguage('document.pdf', $detectedLanguages);
+    echo "Document routed to: $queue\n";
+}
+```
--- a/docs/snippets/php/utils/language_detection_multilingual.php
+++ b/docs/snippets/php/utils/language_detection_multilingual.php
@@ -0,0 +1,236 @@
+```php title="language_detection_multilingual.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Multilingual Document Language Detection
+ *
+ * Detect multiple languages in documents that contain mixed-language content.
+ * Useful for processing multilingual documents, translations, and international content.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\LanguageDetectionConfig;
+
+$config = new ExtractionConfig(
+    languageDetection: new LanguageDetectionConfig(
+        enabled: true,
+        minConfidence: 0.7,
+        detectMultiple: true  
+    )
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('multilingual_document.pdf');
+
+echo "Multilingual Language Detection:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Document: multilingual_document.pdf\n\n";
+
+$detectedLanguages = $result->detectedLanguages ?? [];
+$languageCount = count($detectedLanguages);
+
+echo "Detected $languageCount language(s): " . implode(', ', $detectedLanguages) . "\n\n";
+
+if ($languageCount > 1) {
+    echo "This is a multilingual document.\n";
+    echo "Languages present:\n";
+
+    foreach ($detectedLanguages as $index => $language) {
+        $label = $index === 0 ? 'Primary' : 'Secondary';
+        echo "  $label: $language\n";
+    }
+
+    echo "\n";
+} elseif ($languageCount === 1) {
+    echo "This is a monolingual document.\n";
+    echo "Language: {$detectedLanguages[0]}\n\n";
+} else {
+    echo "No languages detected.\n\n";
+}
+
+// Render each entry of the 'language_distribution' metadata as a fixed-width bar.
+// NOTE(review): values are presumably fractions in 0..1 (they are multiplied by
+// 100 for display) - confirm against the extractor's output.
+if (isset($result->metadata['language_distribution'])) {
+    echo "Language Distribution:\n";
+    echo str_repeat('-', 40) . "\n";
+
+    $barWidth = 40;
+
+    foreach ($result->metadata['language_distribution'] as $lang => $percentage) {
+        // Clamp so an out-of-range share can neither pass a negative count to
+        // str_repeat() (which throws) nor overflow the bar.
+        $barLength = max(0, min($barWidth, (int)($percentage * $barWidth)));
+
+        // Pad by hand: sprintf's %-40s pads by bytes and '█' is multi-byte in
+        // UTF-8, so it would leave the closing bracket misaligned.
+        $bar = str_repeat('█', $barLength) . str_repeat(' ', $barWidth - $barLength);
+
+        echo sprintf(
+            "  %-10s [%s] %5.1f%%\n",
+            $lang,
+            $bar,
+            $percentage * 100
+        );
+    }
+
+    echo "\n";
+}
+
+/**
+ * Classify a document by the number of languages detected in it.
+ *
+ * @param array $languages Detected language codes.
+ * @return string 'unknown' for no languages, 'monolingual' for one, a named
+ *                pair (e.g. 'English-German bilingual') or plain 'bilingual'
+ *                for two, and 'multilingual' for three or more.
+ */
+function categorizeMultilingualDocument(array $languages): string
+{
+    $count = count($languages);
+
+    if ($count === 0) {
+        return 'unknown';
+    }
+
+    if ($count === 1) {
+        return 'monolingual';
+    }
+
+    if ($count === 2) {
+        // Sort so the pair key does not depend on detection order.
+        sort($languages);
+        $pair = implode('-', $languages);
+
+        // Keys must be written in sorted order to match $pair: 'de' sorts
+        // before 'en', so English-German is keyed as 'de-en'.
+        $commonPairs = [
+            'en-es' => 'English-Spanish bilingual',
+            'en-fr' => 'English-French bilingual',
+            'de-en' => 'English-German bilingual',
+            'en-zh' => 'English-Chinese bilingual',
+        ];
+
+        return $commonPairs[$pair] ?? 'bilingual';
+    }
+
+    return 'multilingual';
+}
+
+$docType = categorizeMultilingualDocument($detectedLanguages);
+echo "Document type: $docType\n\n";
+
+if ($languageCount > 1) {
+    echo "Multilingual Processing Recommendations:\n";
+    echo str_repeat('=', 60) . "\n";
+
+    echo "1. Consider splitting content by language\n";
+    echo "2. Use language-specific OCR models if available\n";
+    echo "3. Apply appropriate tokenization for each language\n";
+    echo "4. Use multilingual embedding models for semantic search\n\n";
+}
+
+/**
+ * Group the non-blank lines of a text by language.
+ *
+ * This is a placeholder implementation: no per-line detection is performed, so
+ * every non-blank line is filed under the first entry of $languages, or under
+ * 'unknown' when the list is empty.
+ *
+ * @param string $content Text to split on "\n".
+ * @param array $languages Detected language codes; only the first one is used.
+ * @return array Map of language code => list of lines (untrimmed); empty when
+ *               the text has no non-blank lines.
+ */
+function extractLanguageSections(string $content, array $languages): array
+{
+    $sections = [];
+    $currentLang = $languages[0] ?? 'unknown';
+
+    foreach (explode("\n", $content) as $line) {
+        // Compare against '' explicitly: empty() also treats the string "0" as
+        // empty, which would drop lines consisting of a single zero.
+        if (trim($line) === '') {
+            continue;
+        }
+
+        $sections[$currentLang][] = $line;
+    }
+
+    return $sections;
+}
+
+$testDocuments = [
+    'english_only.pdf',
+    'spanish_english.pdf',
+    'multilingual_eu.pdf',
+];
+
+echo "Batch Multilingual Analysis:\n";
+echo str_repeat('=', 60) . "\n";
+
+$multilingualConfig = new ExtractionConfig(
+    languageDetection: new LanguageDetectionConfig(
+        enabled: true,
+        minConfidence: 0.6,
+        detectMultiple: true
+    )
+);
+
+$kreuzberg = new Kreuzberg($multilingualConfig);
+
+$statistics = [
+    'monolingual' => 0,
+    'bilingual' => 0,
+    'multilingual' => 0,
+];
+
+foreach ($testDocuments as $document) {
+    if (!file_exists($document)) {
+        echo basename($document) . ": File not found\n";
+        continue;
+    }
+
+    $result = $kreuzberg->extractFile($document);
+    $languages = $result->detectedLanguages ?? [];
+    $type = categorizeMultilingualDocument($languages);
+
+    echo basename($document) . ":\n";
+    echo "  Languages: " . implode(', ', $languages) . "\n";
+    echo "  Type: $type\n\n";
+
+    if (count($languages) === 1) {
+        $statistics['monolingual']++;
+    } elseif (count($languages) === 2) {
+        $statistics['bilingual']++;
+    } elseif (count($languages) > 2) {
+        $statistics['multilingual']++;
+    }
+}
+
+echo "Statistics:\n";
+echo "  Monolingual: {$statistics['monolingual']}\n";
+echo "  Bilingual: {$statistics['bilingual']}\n";
+echo "  Multilingual: {$statistics['multilingual']}\n\n";
+
+/**
+ * Count how often each language pair occurs across a set of documents.
+ *
+ * Paths that do not exist are skipped. For every document with two or more
+ * detected languages, the codes are sorted and the first two are joined with
+ * '-' to form the pair key.
+ *
+ * @param array $documents Paths of the documents to extract.
+ * @param Kreuzberg $kreuzberg Extractor used to process each document.
+ * @return array Map of pair key => occurrence count, highest count first.
+ */
+function analyzeLanguagePairs(array $documents, Kreuzberg $kreuzberg): array
+{
+    $pairCounts = [];
+
+    foreach ($documents as $path) {
+        if (!file_exists($path)) {
+            continue;
+        }
+
+        $extraction = $kreuzberg->extractFile($path);
+        $codes = $extraction->detectedLanguages ?? [];
+
+        // Documents with fewer than two languages contribute no pair.
+        if (count($codes) < 2) {
+            continue;
+        }
+
+        sort($codes);
+        $pairKey = implode('-', array_slice($codes, 0, 2));
+
+        $pairCounts[$pairKey] = ($pairCounts[$pairKey] ?? 0) + 1;
+    }
+
+    arsort($pairCounts);
+
+    return $pairCounts;
+}
+
+$translationPairs = [
+    'en-es' => 'English ↔ Spanish',
+    'en-fr' => 'English ↔ French',
+    'en-de' => 'English ↔ German',
+    'en-zh' => 'English ↔ Chinese',
+    'en-ja' => 'English ↔ Japanese',
+];
+
+echo "Common Translation Pairs:\n";
+echo str_repeat('=', 60) . "\n";
+
+foreach ($translationPairs as $code => $name) {
+    echo "  $code: $name\n";
+}
+
+echo "\nUse these configurations for translation document processing.\n";
+```
--- a/docs/snippets/php/utils/quality_processing_example.php
+++ b/docs/snippets/php/utils/quality_processing_example.php
@@ -0,0 +1,203 @@
+```php title="quality_processing_example.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Quality Processing Example
+ *
+ * Enable quality processing to assess and improve extraction quality.
+ * Useful for detecting low-quality scans and suggesting improvements.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+
+$config = new ExtractionConfig(
+    enableQualityProcessing: true
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('scanned_document.pdf');
+
+echo "Quality Processing Results:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Document: scanned_document.pdf\n";
+echo "Content length: " . strlen($result->content) . " characters\n\n";
+
+$qualityScore = $result->qualityScore ?? null;
+
+if ($qualityScore !== null) {
+    echo "Quality Score: " . number_format($qualityScore, 2) . "\n";
+    echo "Rating: ";
+
+    if ($qualityScore >= 0.8) {
+        echo "Excellent\n";
+        echo "Status: ✓ Ready for production use\n";
+    } elseif ($qualityScore >= 0.6) {
+        echo "Good\n";
+        echo "Status: ✓ Acceptable quality\n";
+    } elseif ($qualityScore >= 0.5) {
+        echo "Fair\n";
+        echo "Status: ⚠ May require review\n";
+    } else {
+        echo "Poor\n";
+        echo "Status: ✗ Requires attention\n";
+    }
+
+    echo "\n";
+
+    if ($qualityScore < 0.5) {
+        echo "Recommendations for Improvement:\n";
+        echo str_repeat('-', 40) . "\n";
+        echo "1. Re-scan with higher DPI (300+ recommended)\n";
+        echo "2. Ensure original is clean and well-lit\n";
+        echo "3. Adjust OCR preprocessing settings:\n";
+        echo "   - Enable denoising\n";
+        echo "   - Enable deskewing\n";
+        echo "   - Increase contrast enhancement\n";
+        echo "4. Try different binarization methods\n";
+        echo "5. Consider manual review and correction\n\n";
+    }
+} else {
+    echo "Quality score not available.\n";
+    echo "Enable quality processing in configuration.\n\n";
+}
+
+if (isset($result->metadata['ocr_confidence'])) {
+    $ocrConfidence = $result->metadata['ocr_confidence'];
+    echo "OCR Confidence: " . number_format($ocrConfidence * 100, 1) . "%\n\n";
+
+    if ($ocrConfidence < 0.7) {
+        echo "⚠ Low OCR confidence detected.\n";
+        echo "The extracted text may contain errors.\n\n";
+    }
+}
+
+// Print every entry of the 'quality_metrics' metadata as an aligned "Name: value" line.
+if (isset($result->metadata['quality_metrics'])) {
+    echo "Detailed Quality Metrics:\n";
+    echo str_repeat('-', 40) . "\n";
+
+    $metrics = $result->metadata['quality_metrics'];
+
+    foreach ($metrics as $metric => $value) {
+        $formattedValue = match (true) {
+            // Cast first: with strict_types enabled, number_format() throws a
+            // TypeError for numeric strings even though is_numeric() accepts them.
+            is_numeric($value) => number_format((float) $value, 3),
+            is_scalar($value) || $value === null => (string) $value,
+            // Arrays and objects cannot be printed through %s; show them as JSON.
+            default => json_encode($value, JSON_UNESCAPED_UNICODE),
+        };
+
+        // Turn the metric key into a title, e.g. 'some_key' => 'Some Key'.
+        $label = ucwords(str_replace('_', ' ', (string) $metric));
+
+        echo sprintf("  %-25s: %s\n", $label, $formattedValue);
+    }
+
+    echo "\n";
+}
+
+$documents = [
+    'high_quality_scan.pdf',
+    'medium_quality_scan.pdf',
+    'low_quality_scan.pdf',
+];
+
+echo "Batch Quality Analysis:\n";
+echo str_repeat('=', 60) . "\n";
+
+$qualityConfig = new ExtractionConfig(
+    enableQualityProcessing: true
+);
+
+$kreuzberg = new Kreuzberg($qualityConfig);
+$qualityResults = [];
+
+foreach ($documents as $document) {
+    if (!file_exists($document)) {
+        echo basename($document) . ": File not found\n\n";
+        continue;
+    }
+
+    $result = $kreuzberg->extractFile($document);
+    $score = $result->qualityScore ?? 0.0;
+
+    $qualityResults[$document] = [
+        'score' => $score,
+        'content_length' => strlen($result->content),
+        'result' => $result,
+    ];
+
+    echo basename($document) . ":\n";
+    echo "  Quality score: " . number_format($score, 2) . "\n";
+    echo "  Content length: " . strlen($result->content) . " chars\n";
+
+    $indicator = match(true) {
+        $score >= 0.8 => '✓ Excellent',
+        $score >= 0.6 => '✓ Good',
+        $score >= 0.5 => '⚠ Fair',
+        default => '✗ Poor',
+    };
+
+    echo "  Status: $indicator\n\n";
+}
+
+if (!empty($qualityResults)) {
+    $scores = array_column($qualityResults, 'score');
+    $avgScore = array_sum($scores) / count($scores);
+    $maxScore = max($scores);
+    $minScore = min($scores);
+
+    echo "Quality Statistics:\n";
+    echo str_repeat('-', 40) . "\n";
+    echo "  Average: " . number_format($avgScore, 2) . "\n";
+    echo "  Highest: " . number_format($maxScore, 2) . "\n";
+    echo "  Lowest:  " . number_format($minScore, 2) . "\n\n";
+
+    $lowQualityDocs = array_filter(
+        $qualityResults,
+        fn($result) => $result['score'] < 0.5
+    );
+
+    if (!empty($lowQualityDocs)) {
+        echo "Documents Requiring Attention:\n";
+        echo str_repeat('-', 40) . "\n";
+
+        foreach ($lowQualityDocs as $doc => $data) {
+            echo "  - " . basename($doc) . " (score: " . number_format($data['score'], 2) . ")\n";
+        }
+
+        echo "\n";
+    }
+}
+
+/**
+ * Decide whether a document should be run through extraction again.
+ *
+ * @param float $qualityScore Quality score of the extraction.
+ * @param int $contentLength Length of the extracted content.
+ * @return bool True when the score is below 0.5 or the content is shorter than 100.
+ */
+function needsReprocessing(float $qualityScore, int $contentLength): bool
+{
+    if ($qualityScore < 0.5) {
+        return true;
+    }
+
+    return $contentLength < 100;
+}
+
+/**
+ * Pick a review queue name from a quality score.
+ *
+ * @param string $filePath Path of the document being routed (not consulted by the lookup).
+ * @param float $qualityScore Quality score of the extraction.
+ * @return string Name of the first queue whose minimum score is met, or
+ *                'manual_review_queue' when the score is below every minimum.
+ */
+function routeDocumentByQuality(string $filePath, float $qualityScore): string
+{
+    // Queue name => minimum score, checked from the highest minimum down.
+    $minimumScores = [
+        'auto_processing_queue' => 0.8,
+        'standard_review_queue' => 0.6,
+        'detailed_review_queue' => 0.5,
+    ];
+
+    foreach ($minimumScores as $queueName => $minimum) {
+        if ($qualityScore >= $minimum) {
+            return $queueName;
+        }
+    }
+
+    return 'manual_review_queue';
+}
+
+echo "Document Routing Based on Quality:\n";
+echo str_repeat('=', 60) . "\n";
+
+foreach ($qualityResults as $doc => $data) {
+    $queue = routeDocumentByQuality($doc, $data['score']);
+    $reprocess = needsReprocessing($data['score'], $data['content_length']);
+
+    echo basename($doc) . ":\n";
+    echo "  Route to: $queue\n";
+
+    if ($reprocess) {
+        echo "  Action: Reprocess with enhanced settings\n";
+    } else {
+        echo "  Action: Continue standard workflow\n";
+    }
+
+    echo "\n";
+}
+```
--- a/docs/snippets/php/utils/standalone_embed.md
+++ b/docs/snippets/php/utils/standalone_embed.md
@@ -0,0 +1,20 @@
+```php
+<?php
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\EmbeddingConfig;
+use Kreuzberg\Config\EmbeddingModelType;
+
+$kreuzberg = new Kreuzberg();
+
+// Embed with default config (balanced preset)
+$embeddings = $kreuzberg->embed(["Hello world", "How are you?"]);
+
+// Embed with specific preset
+$config = new EmbeddingConfig(model: EmbeddingModelType::preset("fast"));
+$embeddings = $kreuzberg->embed(["Hello world"], $config);
+
+// Each embedding is a float array
+foreach ($embeddings as $i => $vector) {
+    echo "Text $i: " . count($vector) . " dimensions\n";
+}
+```
--- a/docs/snippets/php/utils/tables.php
+++ b/docs/snippets/php/utils/tables.php
@@ -0,0 +1,237 @@
+```php title="tables.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Table Extraction and Processing
+ *
+ * Extract tables from documents and convert them to various formats.
+ * Demonstrates table processing, formatting, and export capabilities.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Result\ExtractedTable;
+
+$config = new ExtractionConfig(
+    extractTables: true
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document.pdf');
+
+echo "Table Extraction Results:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Tables found: " . count($result->tables) . "\n\n";
+
+foreach ($result->tables as $tableIndex => $table) {
+    echo "Table " . ($tableIndex + 1) . ":\n";
+    echo str_repeat('-', 40) . "\n";
+
+    $rowCount = count($table->cells);
+    $colCount = !empty($table->cells) ? count($table->cells[0]) : 0;
+
+    echo "  Dimensions: $rowCount rows × $colCount columns\n";
+
+    if (isset($table->pageNumber)) {
+        echo "  Page: {$table->pageNumber}\n";
+    }
+
+    echo "\n";
+
+    echo "  Markdown representation:\n";
+    echo str_repeat('-', 40) . "\n";
+    echo $table->markdown . "\n\n";
+
+    echo "  Raw data preview:\n";
+    echo str_repeat('-', 40) . "\n";
+
+    $previewRows = array_slice($table->cells, 0, 3);
+    foreach ($previewRows as $rowIndex => $row) {
+        echo "  Row " . ($rowIndex + 1) . ": [" . implode(' | ', $row) . "]\n";
+    }
+
+    if ($rowCount > 3) {
+        echo "  ... and " . ($rowCount - 3) . " more rows\n";
+    }
+
+    echo "\n";
+}
+
+echo "Exporting Tables to CSV:\n";
+echo str_repeat('=', 60) . "\n";
+
+$outputDir = './exported_tables';
+if (!is_dir($outputDir)) {
+    mkdir($outputDir, 0755, true);
+}
+
+foreach ($result->tables as $index => $table) {
+    $filename = sprintf('table_%d.csv', $index + 1);
+    $filepath = $outputDir . '/' . $filename;
+
+    $fp = fopen($filepath, 'w');
+
+    if ($fp !== false) {
+        foreach ($table->cells as $row) {
+            fputcsv($fp, $row);
+        }
+
+        fclose($fp);
+        echo "Saved: $filename\n";
+    } else {
+        echo "Error: Failed to create $filename\n";
+    }
+}
+
+echo "\n";
+
+echo "Exporting Tables to JSON:\n";
+echo str_repeat('=', 60) . "\n";
+
+// Write one JSON file per table into $outputDir, reporting failures instead of
+// claiming success unconditionally.
+foreach ($result->tables as $index => $table) {
+    $filename = sprintf('table_%d.json', $index + 1);
+    $filepath = $outputDir . '/' . $filename;
+
+    $tableData = [
+        'index' => $index + 1,
+        'page' => $table->pageNumber ?? null,
+        'dimensions' => [
+            'rows' => count($table->cells),
+            // Column count is taken from the first row.
+            'columns' => !empty($table->cells) ? count($table->cells[0]) : 0,
+        ],
+        'data' => $table->cells,
+        'markdown' => $table->markdown,
+    ];
+
+    $json = json_encode($tableData, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
+
+    // json_encode() returns false on failure; writing that value would leave
+    // an empty file behind while still reporting success.
+    if ($json === false) {
+        echo "Error: Failed to encode $filename (" . json_last_error_msg() . ")\n";
+        continue;
+    }
+
+    if (file_put_contents($filepath, $json) === false) {
+        echo "Error: Failed to create $filename\n";
+        continue;
+    }
+
+    echo "Saved: $filename\n";
+}
+
+echo "\n";
+
+/**
+ * Render an extracted table as an HTML <table> fragment.
+ *
+ * The row with index 0 is emitted with <th> cells and every other row with
+ * <td> cells. Cell text is HTML-escaped.
+ *
+ * @param ExtractedTable $table Table whose $cells rows are rendered in order.
+ * @return string HTML markup, without a trailing newline.
+ */
+function tableToHtml(ExtractedTable $table): string
+{
+    $html = "<table>\n";
+
+    foreach ($table->cells as $rowIndex => $row) {
+        $html .= "  <tr>\n";
+
+        $tag = $rowIndex === 0 ? 'th' : 'td';
+
+        foreach ($row as $cell) {
+            // Cast because strict_types makes htmlspecialchars() reject
+            // non-string cells; ENT_SUBSTITUTE keeps a cell containing invalid
+            // UTF-8 from being rendered as an empty string.
+            $escapedCell = htmlspecialchars((string) $cell, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
+            $html .= "    <$tag>$escapedCell</$tag>\n";
+        }
+
+        $html .= "  </tr>\n";
+    }
+
+    $html .= "</table>";
+
+    return $html;
+}
+
+echo "Exporting Tables to HTML:\n";
+echo str_repeat('=', 60) . "\n";
+
+foreach ($result->tables as $index => $table) {
+    $filename = sprintf('table_%d.html', $index + 1);
+    $filepath = $outputDir . '/' . $filename;
+
+    $html = "<!DOCTYPE html>\n";
+    $html .= "<html>\n<head>\n";
+    $html .= "  <meta charset=\"UTF-8\">\n";
+    $html .= "  <title>Table " . ($index + 1) . "</title>\n";
+    $html .= "  <style>\n";
+    $html .= "    table { border-collapse: collapse; width: 100%; }\n";
+    $html .= "    th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }\n";
+    $html .= "    th { background-color: #f2f2f2; }\n";
+    $html .= "  </style>\n";
+    $html .= "</head>\n<body>\n";
+    $html .= "  <h1>Table " . ($index + 1) . "</h1>\n";
+    $html .= tableToHtml($table) . "\n";
+    $html .= "</body>\n</html>";
+
+    file_put_contents($filepath, $html);
+
+    echo "Saved: $filename\n";
+}
+
+echo "\n";
+
+echo "Table Analysis:\n";
+echo str_repeat('=', 60) . "\n";
+
+// Per-table statistics: total, blank, and numeric cell counts, plus a coarse
+// classification based on the share of numeric cells.
+foreach ($result->tables as $index => $table) {
+    echo "Table " . ($index + 1) . " Analysis:\n";
+
+    $cells = $table->cells;
+    $totalCells = array_sum(array_map('count', $cells));
+    $emptyCells = 0;
+    $numericCells = 0;
+
+    foreach ($cells as $row) {
+        foreach ($row as $cell) {
+            // Compare against '' explicitly: empty() also treats the string "0"
+            // as empty, which would count zero-valued cells as blank.
+            if (trim((string) $cell) === '') {
+                $emptyCells++;
+            } elseif (is_numeric($cell)) {
+                $numericCells++;
+            }
+        }
+    }
+
+    // max(..., 1) avoids dividing by zero for a table without cells.
+    $divisor = max($totalCells, 1);
+    $emptyRatio = $emptyCells / $divisor;
+    $numericRatio = $numericCells / $divisor;
+
+    echo "  Total cells: $totalCells\n";
+    echo "  Empty cells: $emptyCells (" . number_format($emptyRatio * 100, 1) . "%)\n";
+    echo "  Numeric cells: $numericCells (" . number_format($numericRatio * 100, 1) . "%)\n";
+
+    $tableType = match(true) {
+        $numericRatio > 0.5 => 'Data/Numeric Table',
+        $numericRatio > 0.2 => 'Mixed Content Table',
+        default => 'Text Table',
+    };
+
+    echo "  Table type: $tableType\n\n";
+}
+
+/**
+ * Convert a table into a list of records keyed by the header row.
+ *
+ * The first row supplies the keys; every following row becomes one record.
+ * Cells missing from a short row are filled with ''. The table object itself
+ * is left untouched.
+ *
+ * @param ExtractedTable $table Table whose first row holds the column headers.
+ * @return array List of header => cell maps, one per data row; empty when the
+ *               table has no rows.
+ */
+function tableToAssociativeArray(ExtractedTable $table): array
+{
+    // Work on a copy: array_shift() applied to $table->cells would strip the
+    // header row from the caller's table object (or fail outright if the
+    // property is readonly).
+    $rows = $table->cells;
+
+    if (empty($rows)) {
+        return [];
+    }
+
+    $headers = array_shift($rows);
+    $data = [];
+
+    foreach ($rows as $row) {
+        $rowData = [];
+        foreach ($headers as $index => $header) {
+            $rowData[$header] = $row[$index] ?? '';
+        }
+        $data[] = $rowData;
+    }
+
+    return $data;
+}
+
+if (!empty($result->tables)) {
+    $firstTable = $result->tables[0];
+    $associativeData = tableToAssociativeArray($firstTable);
+
+    echo "First Table as Associative Array:\n";
+    echo str_repeat('=', 60) . "\n";
+    echo json_encode(array_slice($associativeData, 0, 3), JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE) . "\n";
+
+    if (count($associativeData) > 3) {
+        echo "... and " . (count($associativeData) - 3) . " more records\n";
+    }
+}
+```
--- a/docs/snippets/php/utils/token_reduction.php
+++ b/docs/snippets/php/utils/token_reduction.php
@@ -0,0 +1,169 @@
+```php title="token_reduction.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Token Reduction Configuration
+ *
+ * Configure token reduction to compress extracted content while preserving meaning.
+ * Useful for reducing token costs in LLM applications and staying within token limits.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\TokenReductionConfig;
+
+$config = new ExtractionConfig(
+    tokenReduction: new TokenReductionConfig(
+        mode: 'moderate',
+        preserveImportantWords: true
+    )
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document.pdf');
+
+echo "Token Reduction Results:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Content length: " . strlen($result->content) . " characters\n\n";
+
+if (isset($result->metadata['original_token_count'])) {
+    $originalTokens = $result->metadata['original_token_count'];
+    $reducedTokens = $result->metadata['token_count'] ?? strlen($result->content);
+    $reductionRatio = $result->metadata['token_reduction_ratio'] ?? 0;
+
+    echo "Token Reduction Statistics:\n";
+    echo str_repeat('-', 40) . "\n";
+    echo "  Original tokens: " . number_format($originalTokens) . "\n";
+    echo "  Reduced tokens: " . number_format($reducedTokens) . "\n";
+    echo "  Reduction: " . number_format($reductionRatio * 100, 1) . "%\n";
+    echo "  Tokens saved: " . number_format($originalTokens - $reducedTokens) . "\n\n";
+}
+
+$modes = [
+    'light' => 'Light reduction - minimal changes',
+    'moderate' => 'Moderate reduction - balanced',
+    'aggressive' => 'Aggressive reduction - maximum compression',
+];
+
+echo "Token Reduction Mode Comparison:\n";
+echo str_repeat('=', 60) . "\n";
+
+$comparisonResults = [];
+
+foreach ($modes as $mode => $description) {
+    $modeConfig = new ExtractionConfig(
+        tokenReduction: new TokenReductionConfig(
+            mode: $mode,
+            preserveImportantWords: true
+        )
+    );
+
+    $kreuzberg = new Kreuzberg($modeConfig);
+    $result = $kreuzberg->extractFile('sample.pdf');
+
+    $contentLength = strlen($result->content);
+    $tokenCount = $result->metadata['token_count'] ?? $contentLength;
+
+    $comparisonResults[$mode] = [
+        'length' => $contentLength,
+        'tokens' => $tokenCount,
+        'content' => substr($result->content, 0, 100),
+    ];
+
+    echo "$mode mode:\n";
+    echo "  Description: $description\n";
+    echo "  Content length: " . number_format($contentLength) . " characters\n";
+    echo "  Estimated tokens: " . number_format($tokenCount) . "\n";
+    echo "  Preview: " . substr($result->content, 0, 80) . "...\n\n";
+}
+
+if (count($comparisonResults) > 1) {
+    $lightLength = $comparisonResults['light']['length'] ?? 0;
+    $aggressiveLength = $comparisonResults['aggressive']['length'] ?? 0;
+
+    if ($lightLength > 0) {
+        $savings = (($lightLength - $aggressiveLength) / $lightLength) * 100;
+
+        echo "Comparison Summary:\n";
+        echo str_repeat('-', 40) . "\n";
+        echo "Aggressive vs Light mode saves: " . number_format($savings, 1) . "%\n\n";
+    }
+}
+
+$advancedConfig = new ExtractionConfig(
+    tokenReduction: new TokenReductionConfig(
+        mode: 'moderate',
+        preserveImportantWords: true,
+        preserveMarkdown: true,
+        preserveNumbers: true,
+        removeStopWords: true
+    )
+);
+
+$kreuzberg = new Kreuzberg($advancedConfig);
+$result = $kreuzberg->extractFile('verbose_document.pdf');
+
+echo "Advanced Token Reduction:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Configuration:\n";
+echo "  - Preserve important words: Yes\n";
+echo "  - Preserve markdown: Yes\n";
+echo "  - Preserve numbers: Yes\n";
+echo "  - Remove stop words: Yes\n\n";
+
+echo "Result:\n";
+echo "  Content length: " . strlen($result->content) . " characters\n";
+
+if (isset($result->metadata['token_reduction_ratio'])) {
+    echo "  Reduction ratio: " . number_format($result->metadata['token_reduction_ratio'] * 100, 1) . "%\n";
+}
+
+echo "\n";
+
+/**
+ * Estimate the cost of a number of tokens at a per-million-token price.
+ *
+ * @param int $tokens Number of tokens.
+ * @param float $pricePerMillion Price charged for one million tokens.
+ * @return float Cost, in the same unit as $pricePerMillion.
+ */
+function estimateTokenCost(int $tokens, float $pricePerMillion = 0.50): float
+{
+    $millionsOfTokens = $tokens / 1_000_000;
+
+    return $millionsOfTokens * $pricePerMillion;
+}
+
+echo "Cost Estimation (based on reduction):\n";
+echo str_repeat('=', 60) . "\n";
+
+foreach ($comparisonResults as $mode => $data) {
+    $tokens = $data['tokens'];
+    $cost = estimateTokenCost($tokens);
+
+    echo ucfirst($mode) . " mode:\n";
+    echo "  Tokens: " . number_format($tokens) . "\n";
+    echo "  Estimated cost: $" . number_format($cost, 4) . "\n\n";
+}
+
+/**
+ * Recommend a token reduction mode from how far a document exceeds a token limit.
+ *
+ * @param int $maxTokens Token limit the content has to fit into; must be positive.
+ * @param int $estimatedTokens Estimated token count of the document.
+ * @return string 'none', 'light', 'moderate' or 'aggressive'.
+ * @throws InvalidArgumentException When $maxTokens is zero or negative.
+ */
+function chooseReductionMode(int $maxTokens, int $estimatedTokens): string
+{
+    // A zero limit would divide by zero, and a negative one yields a negative
+    // ratio that would be reported as 'none'.
+    if ($maxTokens <= 0) {
+        throw new InvalidArgumentException('maxTokens must be a positive integer.');
+    }
+
+    $ratio = $estimatedTokens / $maxTokens;
+
+    return match(true) {
+        $ratio <= 1.0 => 'none',       // already within the limit
+        $ratio <= 1.3 => 'light',      // up to 30% over the limit
+        $ratio <= 1.7 => 'moderate',   // up to 70% over the limit
+        default => 'aggressive',       // more than 70% over the limit
+    };
+}
+
+$maxTokenLimit = 8000;
+$documentTokens = 12000;
+
+$recommendedMode = chooseReductionMode($maxTokenLimit, $documentTokens);
+
+echo "Reduction Mode Recommendation:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Document tokens: " . number_format($documentTokens) . "\n";
+echo "Token limit: " . number_format($maxTokenLimit) . "\n";
+echo "Recommended mode: $recommendedMode\n";
+echo "Reason: " . ($documentTokens > $maxTokenLimit
+    ? "Document exceeds limit by " . number_format($documentTokens - $maxTokenLimit) . " tokens"
+    : "Document within limits") . "\n";
+```
--- a/docs/snippets/php/utils/token_reduction_example.php
+++ b/docs/snippets/php/utils/token_reduction_example.php
@@ -0,0 +1,222 @@
+```php title="token_reduction_example.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Token Reduction Example
+ *
+ * Practical example of using token reduction to fit documents within token limits.
+ * Demonstrates tracking reduction statistics and optimizing for LLM usage.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\TokenReductionConfig;
+
+// Moderate token reduction that keeps markdown structure intact.
+$tokenReduction = new TokenReductionConfig(
+    mode: 'moderate',
+    preserveMarkdown: true
+);
+$config = new ExtractionConfig(tokenReduction: $tokenReduction);
+
+// Extract the sample document with the reduction settings applied.
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('verbose_document.pdf');
+
+echo "Token Reduction Example:\n", str_repeat('=', 60), "\n";
+echo "Document: verbose_document.pdf\n\n";
+
+// Show before/after statistics only when every metadata key read below is set.
+// Guarding on 'original_token_count' alone would let a missing 'token_count' or
+// 'token_reduction_ratio' surface as "Undefined array key" warnings (and as a
+// TypeError from number_format() receiving null under strict_types).
+if (
+    isset(
+        $result->metadata['original_token_count'],
+        $result->metadata['token_count'],
+        $result->metadata['token_reduction_ratio']
+    )
+) {
+    $originalTokens = $result->metadata['original_token_count'];
+    $reducedTokens = $result->metadata['token_count'];
+    $reductionRatio = $result->metadata['token_reduction_ratio'];
+
+    echo "Token Reduction Statistics:\n";
+    echo str_repeat('-', 40) . "\n";
+    echo sprintf("  Before:    %s tokens\n", number_format($originalTokens));
+    echo sprintf("  After:     %s tokens\n", number_format($reducedTokens));
+    // The ratio is printed as a percentage, hence the * 100.
+    echo sprintf("  Reduction: %.1f%%\n", $reductionRatio * 100);
+    echo sprintf("  Saved:     %s tokens\n\n", number_format($originalTokens - $reducedTokens));
+
+    // One bar segment per 100 tokens; the (int) cast drops any remainder.
+    $beforeBar = str_repeat('█', (int)($originalTokens / 100));
+    $afterBar = str_repeat('█', (int)($reducedTokens / 100));
+
+    echo "Visual comparison (each █ = ~100 tokens):\n";
+    echo "  Before: $beforeBar\n";
+    echo "  After:  $afterBar\n\n";
+}
+
+// Print the size and a short preview of the extracted content.
+// strlen() and substr() work on bytes, so both figures are byte-based.
+echo "Content Analysis:\n", str_repeat('-', 40), "\n";
+echo "  Content length: ", strlen($result->content), " characters\n";
+echo "  First 200 chars: ", substr($result->content, 0, 200), "...\n\n";
+
+// Documents processed by the batch example below.
+$documents = ['long_article.pdf', 'research_paper.pdf', 'technical_doc.pdf'];
+
+echo "Batch Token Reduction:\n", str_repeat('=', 60), "\n";
+
+// Same moderate mode as the single-document run, additionally keeping important words.
+$batchTokenReduction = new TokenReductionConfig(
+    mode: 'moderate',
+    preserveImportantWords: true,
+    preserveMarkdown: true
+);
+$batchConfig = new ExtractionConfig(tokenReduction: $batchTokenReduction);
+
+$kreuzberg = new Kreuzberg($batchConfig);
+
+// Running totals accumulated across all processed documents.
+$totalOriginal = 0;
+$totalReduced = 0;
+
+// Extract each document, print its token statistics and add them to the totals.
+foreach ($documents as $document) {
+    $documentLabel = basename($document);
+
+    // Skip files that are not present on disk.
+    if (!file_exists($document)) {
+        echo $documentLabel, ": File not found\n\n";
+        continue;
+    }
+
+    $result = $kreuzberg->extractFile($document);
+
+    // Missing metadata entries count as zero.
+    $originalTokens = $result->metadata['original_token_count'] ?? 0;
+    $reducedTokens = $result->metadata['token_count'] ?? 0;
+    $reductionRatio = $result->metadata['token_reduction_ratio'] ?? 0;
+
+    $totalOriginal += $originalTokens;
+    $totalReduced += $reducedTokens;
+
+    echo $documentLabel, ":\n";
+    printf("  Original: %s tokens\n", number_format($originalTokens));
+    printf("  Reduced:  %s tokens\n", number_format($reducedTokens));
+    printf("  Saved:    %.1f%%\n\n", $reductionRatio * 100);
+}
+
+// Print aggregate figures; skipped when no original tokens were counted, which
+// also avoids dividing by zero.
+if ($totalOriginal > 0) {
+    $savedTokens = $totalOriginal - $totalReduced;
+    $overallReduction = ($savedTokens / $totalOriginal) * 100;
+
+    echo "Overall Statistics:\n", str_repeat('-', 40), "\n";
+    printf("  Total original: %s tokens\n", number_format($totalOriginal));
+    printf("  Total reduced:  %s tokens\n", number_format($totalReduced));
+    printf("  Overall saving: %.1f%%\n\n", $overallReduction);
+}
+
+/**
+ * Find the lightest token reduction mode that brings a file within a token limit.
+ *
+ * Modes are tried in order 'light', 'moderate', 'aggressive'; the first attempt
+ * whose token count is <= $maxTokens is returned with 'fits' => true. If none
+ * fits, the 'aggressive' attempt is returned with 'fits' => false. That last
+ * attempt is reused as-is rather than extracting the file a second time with
+ * the same configuration.
+ *
+ * The token count is read from the result's 'token_count' metadata; when that
+ * key is absent, the byte length of the content is used as a stand-in.
+ *
+ * @param string    $filePath  Path of the document to extract.
+ * @param int       $maxTokens Token budget the result should fit within.
+ * @param Kreuzberg $kreuzberg Not used: every attempt builds its own client
+ *                             because the mode is part of the extraction
+ *                             config. Kept so existing callers keep working.
+ *
+ * @return array|null Array with keys 'mode' (string), 'tokens', 'result'
+ *                    (extraction result) and 'fits' (bool). The nullable
+ *                    return type is kept for compatibility; this
+ *                    implementation always returns an array.
+ */
+function fitWithinTokenLimit(
+    string $filePath,
+    int $maxTokens,
+    Kreuzberg $kreuzberg
+): ?array {
+    $attempt = null;
+
+    foreach (['light', 'moderate', 'aggressive'] as $mode) {
+        $config = new ExtractionConfig(
+            tokenReduction: new TokenReductionConfig(
+                mode: $mode,
+                preserveImportantWords: true
+            )
+        );
+
+        $result = (new Kreuzberg($config))->extractFile($filePath);
+        $tokens = $result->metadata['token_count'] ?? strlen($result->content);
+
+        $attempt = [
+            'mode' => $mode,
+            'tokens' => $tokens,
+            'result' => $result,
+            'fits' => $tokens <= $maxTokens,
+        ];
+
+        if ($attempt['fits']) {
+            return $attempt;
+        }
+    }
+
+    // Nothing fit: $attempt holds the final ('aggressive') try with 'fits' => false.
+    return $attempt;
+}
+
+echo "Fitting Document to Token Limit:\n", str_repeat('=', 60), "\n";
+
+$tokenLimit = 8000;
+$testFile = 'large_document.pdf';
+
+// The demo only runs when the sample file is present.
+if (file_exists($testFile)) {
+    $fitResult = fitWithinTokenLimit($testFile, $tokenLimit, $kreuzberg);
+
+    echo "Target limit: ", number_format($tokenLimit), " tokens\n";
+    echo "Reduction mode used: {$fitResult['mode']}\n";
+    echo "Final token count: ", number_format($fitResult['tokens']), "\n";
+
+    if (!$fitResult['fits']) {
+        // Even the strongest mode tried was not enough; report the overshoot.
+        $excess = $fitResult['tokens'] - $tokenLimit;
+        echo "Status: ✗ Still exceeds limit\n";
+        echo "Tokens over limit: ", number_format($excess), "\n";
+        echo "Suggestion: Consider chunking the document\n";
+    } else {
+        // Report how much of the budget is left over.
+        $remaining = $tokenLimit - $fitResult['tokens'];
+        echo "Status: ✓ Successfully fits within limit\n";
+        echo "Tokens remaining: ", number_format($remaining), "\n";
+    }
+
+    echo "\n";
+}
+
+/**
+ * Compare the cost of processing a document before and after token reduction.
+ *
+ * @param int   $originalTokens  Token count before reduction.
+ * @param int   $reducedTokens   Token count after reduction.
+ * @param float $pricePerMillion Price per one million tokens (default 0.50).
+ *
+ * @return array{original_cost: float, reduced_cost: float, savings: float, savings_percent: float}
+ *               Costs are in the currency unit of $pricePerMillion;
+ *               'savings_percent' is the savings relative to the original
+ *               cost, or 0.0 when the original cost is not positive.
+ */
+function calculateCostSavings(
+    int $originalTokens,
+    int $reducedTokens,
+    float $pricePerMillion = 0.50
+): array {
+    $originalCost = ($originalTokens / 1_000_000) * $pricePerMillion;
+    $reducedCost = ($reducedTokens / 1_000_000) * $pricePerMillion;
+    $savings = $originalCost - $reducedCost;
+
+    // Divide by the real original cost. Clamping the divisor to a small epsilon
+    // skews the percentage for tiny but valid costs (1 token reduced to 0 at
+    // the default price would report 50% instead of 100%). Only a non-positive
+    // cost, where a percentage is undefined, is special-cased.
+    $savingsPercent = $originalCost > 0.0
+        ? ($savings / $originalCost) * 100
+        : 0.0;
+
+    return [
+        'original_cost' => $originalCost,
+        'reduced_cost' => $reducedCost,
+        'savings' => $savings,
+        'savings_percent' => $savingsPercent,
+    ];
+}
+
+// Cost report for the batch totals; needs both totals to be positive.
+if ($totalOriginal > 0 && $totalReduced > 0) {
+    $savings = calculateCostSavings($totalOriginal, $totalReduced);
+
+    echo "Cost Analysis:\n", str_repeat('=', 60), "\n";
+    echo "Price: $0.50 per million tokens (example)\n\n";
+    printf("  Original cost: $%.6f\n", $savings['original_cost']);
+    printf("  Reduced cost:  $%.6f\n", $savings['reduced_cost']);
+    printf("  Savings:       $%.6f (%.1f%%)\n\n", $savings['savings'], $savings['savings_percent']);
+
+    // Scale the batch savings by an example workload of 100 per day, all year.
+    $documentsPerDay = 100;
+    $daysPerYear = 365;
+    $dailySavings = $savings['savings'] * $documentsPerDay;
+    $annualSavings = $dailySavings * $daysPerYear;
+
+    echo "Projected Annual Savings:\n";
+    echo "  Documents per day: $documentsPerDay\n";
+    echo "  Annual savings: $", number_format($annualSavings, 2), "\n";
+}
+```
--- a/docs/snippets/php/utils/vector_database_integration.php
+++ b/docs/snippets/php/utils/vector_database_integration.php
@@ -0,0 +1,284 @@
+```php title="vector_database_integration.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Vector Database Integration
+ *
+ * Extract documents with chunking and embeddings for vector database storage.
+ * Demonstrates preparing data for semantic search and RAG applications.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\ChunkingConfig;
+use Kreuzberg\Config\EmbeddingConfig;
+
+// Chunks of up to 512 characters with 50 of overlap, each embedded with the
+// 'balanced' model and normalization enabled.
+$embeddingConfig = new EmbeddingConfig(
+    model: 'balanced',
+    normalize: true
+);
+$chunkingConfig = new ChunkingConfig(
+    maxChars: 512,
+    maxOverlap: 50,
+    embedding: $embeddingConfig
+);
+$config = new ExtractionConfig(chunking: $chunkingConfig);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document.pdf');
+
+echo "Vector Database Integration:\n", str_repeat('=', 60), "\n";
+echo "Document: document.pdf\n";
+echo "Total chunks: ", count($result->chunks ?? []), "\n\n";
+
+// Turn every embedded chunk into a record ready for a vector database.
+$vectorRecords = [];
+
+// The document hash is identical for every chunk, so compute it once.
+$documentHash = md5('document.pdf');
+
+foreach ($result->chunks ?? [] as $index => $chunk) {
+    // Only chunks that actually carry an embedding become records.
+    if ($chunk->embedding !== null) {
+        $vectorRecords[] = [
+            'id' => sprintf('doc_%s_chunk_%d', $documentHash, $index),
+            'content' => $chunk->content,
+            'embedding' => $chunk->embedding,
+            'metadata' => [
+                'source_file' => 'document.pdf',
+                'chunk_index' => $index,
+                'chunk_length' => strlen($chunk->content),
+                'embedding_model' => 'balanced',
+                'created_at' => date('c'),
+            ],
+        ];
+    }
+}
+
+echo "Prepared ", count($vectorRecords), " records for vector database\n\n";
+
+// Show the shape of the first prepared record, if there is one.
+if (!empty($vectorRecords)) {
+    $sample = $vectorRecords[0];
+    $metadataKeys = array_keys($sample['metadata']);
+
+    echo "Sample Vector Record Structure:\n", str_repeat('-', 40), "\n";
+    echo "ID: {$sample['id']}\n";
+    echo "Content preview: ", substr($sample['content'], 0, 100), "...\n";
+    echo "Embedding dimensions: ", count($sample['embedding']), "\n";
+    echo "Metadata keys: ", implode(', ', $metadataKeys), "\n\n";
+}
+
+/**
+ * Simulate upserting records into Pinecone in batches of 100.
+ *
+ * Nothing is sent anywhere: the function only prints one progress line per
+ * batch and a completion line.
+ *
+ * @param array  $records   Records to "upsert".
+ * @param string $namespace Namespace named in the progress output.
+ */
+function insertIntoPinecone(array $records, string $namespace = 'default'): void
+{
+    echo "Inserting into Pinecone:\n", str_repeat('-', 40), "\n";
+
+    $batchNumber = 0;
+
+    foreach (array_chunk($records, 100) as $batch) {
+        $batchNumber++;
+
+        printf(
+            "Batch %d: Upserting %d vectors to namespace '%s'...\n",
+            $batchNumber,
+            count($batch),
+            $namespace
+        );
+    }
+
+    echo "Completed inserting ", count($records), " vectors\n\n";
+}
+
+/**
+ * Simulate inserting records into Weaviate one object at a time.
+ *
+ * Each record is mapped to the object shape shown below, but nothing is sent
+ * anywhere; the function prints a progress line after every tenth record and
+ * a completion line.
+ *
+ * @param array  $records   Records with 'content', 'embedding' and a 'metadata'
+ *                          array holding 'source_file', 'chunk_index' and
+ *                          'created_at'.
+ * @param string $className Class name placed on each object.
+ */
+function insertIntoWeaviate(array $records, string $className = 'Document'): void
+{
+    echo "Inserting into Weaviate:\n", str_repeat('-', 40), "\n";
+
+    $total = count($records);
+
+    foreach ($records as $index => $record) {
+        $metadata = $record['metadata'];
+
+        // Built to illustrate the object layout; not used further in this simulation.
+        $object = [
+            'class' => $className,
+            'properties' => [
+                'content' => $record['content'],
+                'sourceFile' => $metadata['source_file'],
+                'chunkIndex' => $metadata['chunk_index'],
+                'createdAt' => $metadata['created_at'],
+            ],
+            'vector' => $record['embedding'],
+        ];
+
+        $position = $index + 1;
+
+        if ($position % 10 === 0) {
+            printf("Inserted %d/%d objects\n", $position, $total);
+        }
+    }
+
+    echo "Completed inserting ", $total, " objects\n\n";
+}
+
+/**
+ * Simulate upserting records into a Qdrant collection.
+ *
+ * Every record is mapped to a point with 'id', 'vector' and 'payload', but
+ * nothing is sent anywhere; the function only prints how many points would be
+ * upserted and a completion line.
+ *
+ * NOTE(review): Qdrant point ids are expected to be unsigned integers or UUIDs,
+ * so free-form string ids may need mapping before a real upsert. Confirm
+ * against the client in use.
+ *
+ * @param array  $records        Records with 'id', 'embedding', 'content' and 'metadata'.
+ * @param string $collectionName Collection named in the progress output.
+ */
+function insertIntoQdrant(
+    array $records,
+    string $collectionName = 'documents'
+): void {
+    echo "Inserting into Qdrant:\n", str_repeat('-', 40), "\n";
+
+    $points = array_map(
+        static fn ($record) => [
+            'id' => $record['id'],
+            'vector' => $record['embedding'],
+            'payload' => [
+                'content' => $record['content'],
+                'metadata' => $record['metadata'],
+            ],
+        ],
+        $records
+    );
+
+    printf(
+        "Upserting %d points to collection '%s'...\n",
+        count($points),
+        $collectionName
+    );
+
+    echo "Completed\n\n";
+}
+
+// Run the three simulated back-end inserts against the same prepared records.
+echo "Vector Database Integration Examples:\n", str_repeat('=', 60), "\n\n";
+
+insertIntoPinecone($vectorRecords, 'documents');
+insertIntoWeaviate($vectorRecords, 'DocumentChunk');
+insertIntoQdrant($vectorRecords, 'document_chunks');
+
+// Documents processed by the batch example below.
+$documents = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];
+
+echo "Batch Processing for Vector Database:\n", str_repeat('=', 60), "\n";
+
+// Records collected across all documents.
+$allVectorRecords = [];
+
+// Same chunking and embedding settings as the single-document run.
+$vectorEmbedding = new EmbeddingConfig(
+    model: 'balanced',
+    normalize: true
+);
+$vectorChunking = new ChunkingConfig(
+    maxChars: 512,
+    maxOverlap: 50,
+    embedding: $vectorEmbedding
+);
+$vectorConfig = new ExtractionConfig(chunking: $vectorChunking);
+
+$kreuzberg = new Kreuzberg($vectorConfig);
+
+// Extract each document and append one record per embedded chunk.
+foreach ($documents as $document) {
+    $documentName = basename($document);
+
+    // Skip files that are not present on disk.
+    if (!file_exists($document)) {
+        echo $documentName, ": File not found\n";
+        continue;
+    }
+
+    $result = $kreuzberg->extractFile($document);
+    $chunks = $result->chunks ?? [];
+
+    echo $documentName, ":\n";
+    echo "  Chunks: ", count($chunks), "\n";
+
+    // The hash depends only on the document path, so compute it once per file.
+    $documentHash = md5($document);
+
+    foreach ($chunks as $index => $chunk) {
+        // Only chunks that actually carry an embedding become records.
+        if ($chunk->embedding !== null) {
+            $allVectorRecords[] = [
+                'id' => sprintf('doc_%s_chunk_%d', $documentHash, $index),
+                'content' => $chunk->content,
+                'embedding' => $chunk->embedding,
+                'metadata' => [
+                    'source_file' => $documentName,
+                    'chunk_index' => $index,
+                    'chunk_length' => strlen($chunk->content),
+                    'embedding_model' => 'balanced',
+                    'created_at' => date('c'),
+                ],
+            ];
+        }
+    }
+}
+
+echo "\nTotal records prepared: ", count($allVectorRecords), "\n\n";
+
+/**
+ * Simulate a semantic search over prepared vector records.
+ *
+ * No similarity is computed: the first $topK records are taken as the hits and
+ * printed with a made-up score that starts at 0.9 and drops by 0.05 per rank.
+ *
+ * @param string $query   Query text, echoed in the output only.
+ * @param array  $records Records with 'content' and 'metadata' => ['source_file' => ...].
+ * @param int    $topK    Number of leading records to report.
+ *
+ * @return array The first $topK records, as produced by array_slice().
+ */
+function simulateSemanticSearch(string $query, array $records, int $topK = 5): array
+{
+    echo "Simulating semantic search:\n",
+        "  Query: \"$query\"\n",
+        "  Searching ", count($records), " vectors...\n",
+        "  Top $topK results:\n\n";
+
+    $hits = array_slice($records, 0, $topK);
+
+    foreach ($hits as $rank => $hit) {
+        $preview = substr($hit['content'], 0, 60) . '...';
+        // Placeholder score, not a real similarity value.
+        $score = 0.9 - ($rank * 0.05);
+
+        printf("  %d. %s (score: %.3f)\n", $rank + 1, $preview, $score);
+        printf("     Source: %s\n", $hit['metadata']['source_file']);
+        echo "\n";
+    }
+
+    return $hits;
+}
+
+// Run the simulated search only when the batch produced at least one record.
+if (!empty($allVectorRecords)) {
+    echo "Semantic Search Example:\n", str_repeat('=', 60), "\n";
+
+    $searchQuery = "How to configure document extraction?";
+
+    simulateSemanticSearch($searchQuery, $allVectorRecords, 3);
+}
+
+/**
+ * Write vector records to a JSON file wrapped in a small envelope.
+ *
+ * The envelope carries a format version ('1.0'), the record count, a
+ * generation timestamp from date('c'), and the records themselves.
+ *
+ * @param array  $records  Records to export; must be JSON-encodable.
+ * @param string $filename Destination path; existing contents are replaced.
+ *
+ * @throws \JsonException    If the records cannot be encoded, e.g. because an
+ *                           embedding contains NAN/INF or a string is not
+ *                           valid UTF-8.
+ * @throws \RuntimeException If the file cannot be written.
+ */
+function exportVectorRecordsToJson(array $records, string $filename): void
+{
+    $recordCount = count($records);
+
+    $data = [
+        'version' => '1.0',
+        'count' => $recordCount,
+        'generated_at' => date('c'),
+        'records' => $records,
+    ];
+
+    // Without JSON_THROW_ON_ERROR, json_encode() returns false on failure and
+    // the export would be reported as successful with no usable data written.
+    $json = json_encode(
+        $data,
+        JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
+    );
+
+    // file_put_contents() signals failure by returning false; do not report
+    // success for a file that was never written.
+    if (file_put_contents($filename, $json) === false) {
+        throw new \RuntimeException("Failed to write vector records to: $filename");
+    }
+
+    echo "Exported " . $recordCount . " vector records to: $filename\n";
+}
+
+// Export only when the batch produced at least one record.
+$exportFile = 'vector_records.json';
+
+if (!empty($allVectorRecords)) {
+    exportVectorRecordsToJson($allVectorRecords, $exportFile);
+}
+```