Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/php/extraction/batch_processing.php
+++ b/docs/snippets/php/extraction/batch_processing.php
@@ -0,0 +1,154 @@
+```php title="batch_processing.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Batch Document Processing
+ *
+ * Process multiple documents in parallel for maximum performance.
+ * Kreuzberg's batch API uses multiple threads to extract documents concurrently.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use function Kreuzberg\batch_extract_files;
+use function Kreuzberg\batch_extract_bytes;
+
+// Candidate input paths; entries that do not exist on disk are dropped below.
+$files = [
+    'document1.pdf',
+    'document2.docx',
+    'document3.xlsx',
+    'presentation.pptx',
+];
+
+// array_filter() preserves the original keys, so a missing file in the middle
+// would leave a gap (e.g. keys 0, 2, 3). Re-index with array_values() because
+// the loop below looks up $files[$index] using the keys of $results.
+// NOTE(review): this assumes batch_extract_files() returns a 0-based list in
+// input order — confirm against the library.
+$files = array_values(array_filter($files, 'file_exists'));
+
+if (!empty($files)) {
+    $fileCount = count($files);
+    echo "Processing " . $fileCount . " files in batch...\n\n";
+
+    // Time only the extraction call itself.
+    $start = microtime(true);
+    $results = batch_extract_files($files);
+    $elapsed = microtime(true) - $start;
+
+    echo "Batch extraction completed in " . number_format($elapsed, 3) . " seconds\n";
+    echo "Average: " . number_format($elapsed / $fileCount, 3) . " seconds per file\n\n";
+
+    // Per-file summary: content size, table count and detected MIME type.
+    foreach ($results as $index => $result) {
+        $filename = basename($files[$index]);
+        echo "$filename:\n";
+        echo "  Content: " . strlen($result->content) . " chars\n";
+        echo "  Tables: " . count($result->tables) . "\n";
+        echo "  MIME: " . $result->mimeType . "\n\n";
+    }
+}
+
+// Configuration passed to the extractor below: tables on, images off.
+$config = new ExtractionConfig(extractTables: true, extractImages: false);
+
+// This instance is reused by the later examples in this script.
+$kreuzberg = new Kreuzberg($config);
+
+$pdfFiles = glob('*.pdf');
+if (!empty($pdfFiles)) {
+    $pdfCount = count($pdfFiles);
+    echo "Processing " . $pdfCount . " PDF files...\n";
+
+    // Measure wall-clock time of the batch call only.
+    $start = microtime(true);
+    $results = $kreuzberg->batchExtractFiles($pdfFiles, $config);
+    $elapsed = microtime(true) - $start;
+
+    echo "Completed in " . number_format($elapsed, 2) . " seconds\n";
+    echo "Throughput: " . number_format($pdfCount / $elapsed, 2) . " files/second\n\n";
+
+    // Aggregate content size and table count over every extracted PDF.
+    [$totalChars, $totalTables] = [0, 0];
+    foreach ($results as $extracted) {
+        $totalChars += strlen($extracted->content);
+        $totalTables += count($extracted->tables);
+    }
+
+    echo "Total content: " . number_format($totalChars) . " characters\n";
+    echo "Total tables: $totalTables\n";
+}
+
+// Simulated in-memory uploads: source path => MIME type.
+$uploadSources = [
+    'file1.pdf' => 'application/pdf',
+    'file2.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+];
+
+// file_get_contents() returns false on failure; skip such files so that a
+// boolean never ends up in the byte list handed to batch_extract_bytes().
+$uploadedFiles = [];
+foreach ($uploadSources as $path => $mime) {
+    $data = is_file($path) ? file_get_contents($path) : false;
+    if ($data === false) {
+        echo "Skipping unreadable file: $path\n";
+        continue;
+    }
+    $uploadedFiles[] = ['data' => $data, 'mime' => $mime];
+}
+
+// Split into the two parallel lists passed to batch_extract_bytes() below.
+$dataList = array_column($uploadedFiles, 'data');
+$mimeTypes = array_column($uploadedFiles, 'mime');
+
+// Nothing to extract when every source was skipped.
+$results = $dataList === [] ? [] : batch_extract_bytes($dataList, $mimeTypes);
+
+echo "\nProcessed " . count($results) . " files from memory\n";
+
+/**
+ * Recursively collect supported documents under a directory and extract them in batches.
+ *
+ * Files are selected by extension (pdf, docx, xlsx, pptx, txt; case-insensitive) and
+ * handed to $kreuzberg->batchExtractFiles() ten paths at a time. A progress line is
+ * printed before each batch.
+ *
+ * @param string    $dir       Directory to walk recursively.
+ * @param Kreuzberg $kreuzberg Extractor used for every batch.
+ *
+ * @return array All batch results concatenated in processing order; an empty array
+ *               when no supported file is found.
+ */
+function processDirectory(string $dir, Kreuzberg $kreuzberg): array
+{
+    $supportedExtensions = ['pdf', 'docx', 'xlsx', 'pptx', 'txt'];
+
+    $iterator = new RecursiveIteratorIterator(
+        new RecursiveDirectoryIterator($dir)
+    );
+
+    $files = [];
+    foreach ($iterator as $file) {
+        // isFile() also filters out the "." and ".." entries the iterator yields.
+        if (!$file->isFile()) {
+            continue;
+        }
+
+        if (in_array(strtolower($file->getExtension()), $supportedExtensions, true)) {
+            $files[] = $file->getPathname();
+        }
+    }
+
+    if (empty($files)) {
+        return [];
+    }
+
+    $batches = array_chunk($files, 10);
+    $batchCount = count($batches);
+
+    // Collect each batch's results and merge once at the end; calling
+    // array_merge() inside the loop would re-copy the growing array per batch.
+    $resultsPerBatch = [];
+    foreach ($batches as $batchIndex => $batch) {
+        echo "Processing batch " . ($batchIndex + 1) . "/" . $batchCount . "...\n";
+        $resultsPerBatch[] = $kreuzberg->batchExtractFiles($batch);
+    }
+
+    return array_merge(...$resultsPerBatch);
+}
+
+// Extract everything under ./documents when that directory is present.
+$directory = './documents';
+if (is_dir($directory)) {
+    echo "\nProcessing directory: $directory\n";
+    $results = processDirectory($directory, $kreuzberg);
+    $processedCount = count($results);
+    echo "Processed " . $processedCount . " files\n";
+}
+
+// A batch that includes a path named 'nonexistent.pdf'; a library exception
+// is reported on the console instead of aborting the script.
+$mixedFiles = ['valid.pdf', 'nonexistent.pdf', 'another.docx'];
+
+try {
+    $results = batch_extract_files($mixedFiles);
+} catch (\Kreuzberg\Exceptions\KreuzbergException $batchError) {
+    $message = $batchError->getMessage();
+    echo "Batch processing error: " . $message . "\n";
+}
+
+// glob() returns false on error; fall back to an empty list so array_chunk()
+// always receives an array.
+// NOTE(review): the PHP manual states GLOB_BRACE is not available on some
+// non-GNU systems (e.g. Alpine Linux) — confirm the target platform.
+$allFiles = glob('documents/*.{pdf,docx,xlsx}', GLOB_BRACE) ?: [];
+$batchSize = 5;
+$batches = array_chunk($allFiles, $batchSize);
+$batchCount = count($batches);
+$totalProcessed = 0;
+
+echo "\nProcessing " . count($allFiles) . " files in " . $batchCount . " batches...\n";
+
+foreach ($batches as $index => $batch) {
+    // The leading "\r" returns to the start of the line so each update
+    // overwrites the previous progress text.
+    $progress = (($index + 1) / $batchCount) * 100;
+    echo sprintf("\rProgress: %.1f%% [%d/%d batches]", $progress, $index + 1, $batchCount);
+
+    $results = $kreuzberg->batchExtractFiles($batch);
+    $totalProcessed += count($results);
+}
+
+echo "\n\nCompleted! Processed $totalProcessed files.\n";
+```
--- a/docs/snippets/php/extraction/docx_extraction.php
+++ b/docs/snippets/php/extraction/docx_extraction.php
@@ -0,0 +1,118 @@
+```php title="docx_extraction.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * DOCX (Word) Document Extraction
+ *
+ * Extract text, tables, and metadata from Microsoft Word documents.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use function Kreuzberg\extract_file;
+
+// Extract with the procedural helper, then print content and core metadata.
+$result = extract_file('document.docx');
+
+$separator = str_repeat('=', 60) . "\n";
+
+echo "Word Document Extraction:\n";
+echo $separator;
+echo "Content:\n";
+echo $result->content . "\n\n";
+
+echo "Document Metadata:\n";
+echo $separator;
+echo "Title: " . ($result->metadata->title ?? 'N/A') . "\n";
+
+// Authors is a list when present; fall back to 'N/A' when it is not set.
+if (isset($result->metadata->authors)) {
+    $authorList = implode(', ', $result->metadata->authors);
+} else {
+    $authorList = 'N/A';
+}
+echo "Authors: " . $authorList . "\n";
+
+echo "Created: " . ($result->metadata->createdAt ?? 'N/A') . "\n";
+echo "Modified: " . ($result->metadata->modifiedAt ?? 'N/A') . "\n";
+echo "Subject: " . ($result->metadata->subject ?? 'N/A') . "\n";
+
+$keywordList = implode(', ', $result->metadata->keywords ?? []);
+echo "Keywords: " . $keywordList . "\n\n";
+
+// Extract report.docx with table extraction and formatting preservation enabled.
+$config = new ExtractionConfig(extractTables: true, preserveFormatting: true);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('report.docx');
+
+$rule = str_repeat('-', 60) . "\n";
+
+foreach ($result->tables as $index => $table) {
+    echo "Table " . ($index + 1) . ":\n";
+    echo $rule;
+
+    foreach ($table->cells as $rowIndex => $row) {
+        echo implode(' | ', $row) . "\n";
+
+        // Only the first row is followed by a rule, separating it from the rest.
+        if ($rowIndex !== 0) {
+            continue;
+        }
+        echo $rule;
+    }
+    echo "\n";
+}
+
+// Output label => value passed as outputFormat (null is passed through as-is).
+$conversions = ['plain' => null, 'markdown' => 'markdown'];
+
+foreach ($conversions as $name => $format) {
+    // Formatting is preserved only when an explicit output format is requested.
+    $keepFormatting = $format !== null;
+    $config = new ExtractionConfig(outputFormat: $format, preserveFormatting: $keepFormatting);
+
+    $kreuzberg = new Kreuzberg($config);
+    $result = $kreuzberg->extractFile('document.docx');
+
+    // One output file per conversion, named after its label.
+    $outputFile = "output_$name.txt";
+    file_put_contents($outputFile, $result->content);
+    echo "Saved $name format to: $outputFile\n";
+}
+
+use function Kreuzberg\batch_extract_files;
+
+// Batch-extract every .docx file in the current directory, if there are any.
+$docxFiles = glob('*.docx');
+if (!empty($docxFiles)) {
+    $docxCount = count($docxFiles);
+    echo "\nBatch processing " . $docxCount . " DOCX files...\n";
+
+    $results = batch_extract_files($docxFiles);
+
+    foreach ($results as $index => $result) {
+        $filename = basename($docxFiles[$index]);
+
+        if (isset($result->metadata->authors)) {
+            $authorList = implode(', ', $result->metadata->authors);
+        } else {
+            $authorList = 'Unknown';
+        }
+
+        echo "\n$filename:\n";
+        echo "  Characters: " . strlen($result->content) . "\n";
+        echo "  Tables: " . count($result->tables) . "\n";
+        echo "  Authors: " . $authorList . "\n";
+    }
+}
+
+// Print creator/producer details only when the metadata carries them.
+$result = extract_file('reviewed_document.docx');
+
+$createdBy = $result->metadata->createdBy ?? null;
+if (!empty($createdBy)) {
+    echo "\nDocument Information:\n";
+    echo "Created by: " . $createdBy . "\n";
+}
+
+$producer = $result->metadata->producer ?? null;
+if (!empty($producer)) {
+    echo "Producer: " . $producer . "\n";
+}
+
+$result = extract_file('document.docx');
+$content = $result->content;
+
+// Count lines and paragraphs on the trimmed text: counting separators alone
+// under-reports by one whenever the text does not end with a newline.
+$trimmed = trim($content);
+$lineCount = $trimmed === '' ? 0 : substr_count($trimmed, "\n") + 1;
+$paragraphCount = $trimmed === ''
+    ? 0
+    : count(preg_split('/\n\s*\n/', $trimmed, -1, PREG_SPLIT_NO_EMPTY) ?: []);
+
+// str_word_count() only understands single-byte letters; match Unicode letters
+// and digits instead (allowing inner apostrophes/hyphens as in "don't").
+// preg_match_all() returns false on malformed UTF-8, hence the int cast.
+$wordCount = (int) preg_match_all('/[\p{L}\p{N}]+(?:[\'-][\p{L}\p{N}]+)*/u', $content);
+
+$stats = [
+    'characters' => mb_strlen($content),
+    'words' => $wordCount,
+    'lines' => $lineCount,
+    'paragraphs' => $paragraphCount,
+    // Each run of terminal punctuation counts as one sentence end.
+    'sentences' => (int) preg_match_all('/[.!?]+/', $content),
+];
+
+echo "\nDocument Statistics:\n";
+echo str_repeat('=', 60) . "\n";
+foreach ($stats as $metric => $value) {
+    echo ucfirst($metric) . ": " . number_format($value) . "\n";
+}
+```
--- a/docs/snippets/php/extraction/excel_extraction.php
+++ b/docs/snippets/php/extraction/excel_extraction.php
@@ -0,0 +1,288 @@
+```php title="excel_extraction.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Excel Spreadsheet Extraction
+ *
+ * This example demonstrates extracting content from Excel files (.xlsx, .xls).
+ * Excel spreadsheets are automatically converted to tables and text.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+
+echo "Example 1: Basic Excel Extraction\n";
+echo "=================================\n";
+
+// Default-configured extractor; reused by the later examples in this script.
+$kreuzberg = new Kreuzberg();
+$result = $kreuzberg->extractFile('financial_report.xlsx');
+
+echo "Content:\n";
+echo $result->content . "\n\n";
+
+echo "Metadata:\n";
+echo "- Title: " . ($result->metadata->title ?? 'N/A') . "\n";
+
+// Authors is joined into one line when present.
+if (isset($result->metadata->authors)) {
+    $authorLine = implode(', ', $result->metadata->authors);
+} else {
+    $authorLine = 'N/A';
+}
+echo "- Author: " . $authorLine . "\n";
+
+echo "- Created: " . ($result->metadata->createdAt ?? 'N/A') . "\n";
+echo "- Modified: " . ($result->metadata->modifiedAt ?? 'N/A') . "\n\n";
+
+echo "Example 2: Extract Excel Tables\n";
+echo "===============================\n";
+
+$config2 = new ExtractionConfig(extractTables: true);
+
+$extractor2 = new Kreuzberg($config2);
+$result2 = $extractor2->extractFile('data.xlsx');
+
+$tableCount = count($result2->tables);
+if ($tableCount > 0) {
+    echo "Found " . $tableCount . " table(s)\n\n";
+
+    foreach ($result2->tables as $i => $table) {
+        // Markdown rendering first, then the raw grid dimensions.
+        echo "Table " . ($i + 1) . " (Sheet/Page {$table->pageNumber}):\n";
+        echo $table->markdown . "\n\n";
+
+        echo "Raw data:\n";
+        $rowCount = count($table->cells);
+        echo "Rows: " . $rowCount . "\n";
+
+        // Column count is taken from the first row; an empty table has none.
+        $columnCount = $rowCount > 0 ? count($table->cells[0]) : 0;
+        echo "Columns: " . $columnCount . "\n\n";
+    }
+}
+
+echo "Example 3: Convert Excel to CSV\n";
+echo "===============================\n";
+
+$result3 = $kreuzberg->extractFile('spreadsheet.xlsx');
+
+// One CSV file per extracted table, named after the table's index.
+foreach ($result3->tables as $i => $table) {
+    $csvFilename = "sheet_{$i}.csv";
+
+    // fopen() returns false when the file cannot be created; passing that
+    // to fputcsv() would raise a TypeError, so report and move on instead.
+    $fp = fopen($csvFilename, 'w');
+    if ($fp === false) {
+        echo "Could not open {$csvFilename} for writing\n";
+        continue;
+    }
+
+    // finally guarantees the handle is closed even if a row fails to write.
+    try {
+        foreach ($table->cells as $row) {
+            fputcsv($fp, $row);
+        }
+    } finally {
+        fclose($fp);
+    }
+
+    echo "Saved: {$csvFilename}\n";
+}
+
+echo "\n";
+
+echo "Example 4: Convert Excel to JSON\n";
+echo "================================\n";
+
+$result4 = $kreuzberg->extractFile('data.xlsx');
+
+// One JSON file per table: an array of objects keyed by the header row.
+foreach ($result4->tables as $i => $table) {
+    $jsonData = [];
+
+    if (count($table->cells) > 0) {
+        // The first row supplies the keys for every following row.
+        $headers = $table->cells[0];
+
+        foreach (array_slice($table->cells, 1) as $row) {
+            $rowData = [];
+
+            foreach ($headers as $k => $header) {
+                // Rows shorter than the header are padded with empty strings.
+                $rowData[$header] = $row[$k] ?? '';
+            }
+
+            $jsonData[] = $rowData;
+        }
+    }
+
+    $jsonFilename = "sheet_{$i}.json";
+
+    try {
+        // Without JSON_THROW_ON_ERROR json_encode() returns false on failure
+        // (e.g. malformed UTF-8) and an empty file would be written silently.
+        $json = json_encode($jsonData, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR);
+    } catch (JsonException $e) {
+        echo "Could not encode {$jsonFilename}: {$e->getMessage()}\n";
+        continue;
+    }
+
+    file_put_contents($jsonFilename, $json);
+    echo "Saved: {$jsonFilename}\n";
+}
+
+echo "\n";
+
+echo "Example 5: Process Multiple Sheets\n";
+echo "==================================\n";
+
+$result5 = $kreuzberg->extractFile('multi_sheet_workbook.xlsx');
+
+echo "Total sheets/tables: " . count($result5->tables) . "\n\n";
+
+foreach ($result5->tables as $i => $table) {
+    $rowTotal = count($table->cells);
+
+    echo "Sheet " . ($i + 1) . ":\n";
+    echo "- Rows: " . $rowTotal . "\n";
+    echo "- Columns: " . ($rowTotal > 0 ? count($table->cells[0]) : 0) . "\n";
+
+    // Column analysis needs a header row plus at least one data row.
+    if ($rowTotal > 1) {
+        $columnTotal = count($table->cells[0]);
+        $numericColumns = [];
+
+        for ($c = 0; $c < $columnTotal; $c++) {
+            // A column is numeric when every data cell is numeric or blank.
+            $allNumeric = true;
+
+            for ($r = 1; $r < $rowTotal; $r++) {
+                $text = trim($table->cells[$r][$c] ?? '');
+                if ($text !== '' && !is_numeric($text)) {
+                    $allNumeric = false;
+                    break;
+                }
+            }
+
+            if ($allNumeric) {
+                $numericColumns[] = $c;
+            }
+        }
+
+        if (!empty($numericColumns)) {
+            echo "- Numeric columns: " . count($numericColumns) . "\n";
+
+            // Sum only the first numeric column, skipping the header row.
+            $firstNumeric = $numericColumns[0];
+            $sum = 0;
+            for ($r = 1; $r < $rowTotal; $r++) {
+                $sum += (float) ($table->cells[$r][$firstNumeric] ?? '0');
+            }
+
+            $columnName = $table->cells[0][$firstNumeric] ?? "Column {$firstNumeric}";
+            echo "- Sum of '{$columnName}': {$sum}\n";
+        }
+    }
+
+    echo "\n";
+}
+
+echo "Example 6: Extract Specific Data\n";
+echo "================================\n";
+
+$result6 = $kreuzberg->extractFile('budget.xlsx');
+
+if (count($result6->tables) > 0) {
+    // Only the first table is inspected in this example.
+    $table = $result6->tables[0];
+    $rowCount = count($table->cells);
+
+    echo "Header row:\n";
+    if ($rowCount > 0) {
+        print_r($table->cells[0]);
+    }
+
+    echo "\nFirst data row:\n";
+    $hasDataRow = $rowCount > 1;
+    if ($hasDataRow) {
+        print_r($table->cells[1]);
+    }
+
+    // Read the third cell of the first data row, when it exists.
+    if ($hasDataRow && count($table->cells[1]) > 2) {
+        $cellValue = $table->cells[1][2];
+        echo "\nCell [1][2]: {$cellValue}\n";
+    }
+}
+
+echo "\n";
+
+echo "Example 7: Batch Process Excel Files\n";
+echo "====================================\n";
+
+$excelFiles = ['january_sales.xlsx', 'february_sales.xlsx', 'march_sales.xlsx'];
+
+$results = $kreuzberg->batchExtractFiles($excelFiles);
+
+// Report each workbook and keep a running total of its tables.
+$totalSheets = 0;
+foreach ($results as $i => $result) {
+    $sheetCount = count($result->tables);
+    $totalSheets = $totalSheets + $sheetCount;
+
+    // $i indexes both the results and the input file list.
+    echo "{$excelFiles[$i]}:\n";
+    echo "- Sheets: {$sheetCount}\n";
+    echo "- Text length: " . strlen($result->content) . " characters\n\n";
+}
+
+echo "Total sheets across all files: {$totalSheets}\n\n";
+
+echo "Example 8: Convert Excel to HTML\n";
+echo "================================\n";
+
+$result8 = $kreuzberg->extractFile('report.xlsx');
+
+// One HTML file per table; the first row is rendered with header cells.
+foreach ($result8->tables as $i => $table) {
+    $html = "<table border='1'>\n";
+
+    foreach ($table->cells as $rowIndex => $row) {
+        if ($rowIndex === 0) {
+            $tag = 'th';
+        } else {
+            $tag = 'td';
+        }
+
+        $html .= "  <tr>\n";
+
+        foreach ($row as $cell) {
+            // Cell text is escaped before it is placed into the markup.
+            $html .= "    <{$tag}>" . htmlspecialchars($cell) . "</{$tag}>\n";
+        }
+
+        $html .= "  </tr>\n";
+    }
+
+    $html .= "</table>\n";
+
+    $htmlFilename = "sheet_{$i}.html";
+    file_put_contents($htmlFilename, $html);
+    echo "Saved: {$htmlFilename}\n";
+}
+
+echo "\n";
+
+echo "Example 9: Excel Metadata Extraction\n";
+echo "====================================\n";
+
+$result9 = $kreuzberg->extractFile('workbook.xlsx');
+
+echo "File Metadata:\n";
+echo "- Title: " . ($result9->metadata->title ?? 'N/A') . "\n";
+echo "- Subject: " . ($result9->metadata->subject ?? 'N/A') . "\n";
+
+// List-valued fields are joined with ", " when present.
+if (isset($result9->metadata->authors)) {
+    $authorLine = implode(', ', $result9->metadata->authors);
+} else {
+    $authorLine = 'N/A';
+}
+echo "- Authors: " . $authorLine . "\n";
+
+echo "- Created: " . ($result9->metadata->createdAt ?? 'N/A') . "\n";
+echo "- Modified: " . ($result9->metadata->modifiedAt ?? 'N/A') . "\n";
+echo "- Created By: " . ($result9->metadata->createdBy ?? 'N/A') . "\n";
+
+if (isset($result9->metadata->keywords)) {
+    $keywordLine = implode(', ', $result9->metadata->keywords);
+} else {
+    $keywordLine = 'N/A';
+}
+echo "- Keywords: " . $keywordLine . "\n";
+
+// Custom properties are printed as "key: value" pairs.
+// NOTE(review): interpolation assumes each value is string-convertible — confirm.
+if (!empty($result9->metadata->custom)) {
+    echo "\nCustom Properties:\n";
+    foreach ($result9->metadata->custom as $propertyName => $propertyValue) {
+        echo "- {$propertyName}: {$propertyValue}\n";
+    }
+}
+
+echo "\n";
+
+echo "Example 10: Error Handling\n";
+echo "=========================\n";
+
+use Kreuzberg\Exceptions\KreuzbergException;
+
+// Library failures are reported on the console instead of aborting the script.
+try {
+    $result = $kreuzberg->extractFile('protected.xlsx');
+    $sheetCount = count($result->tables);
+    echo "Success: Extracted " . $sheetCount . " sheets\n";
+} catch (KreuzbergException $extractionError) {
+    echo "Error: {$extractionError->getMessage()}\n";
+    echo "Note: Password-protected files may require special handling\n";
+}
+
+// Static reference text printed at the end of the example run.
+$supportedFormats = [
+    "- .xlsx (Office Open XML)\n",
+    "- .xls (Legacy Excel format)\n",
+    "- .xlsm (Macro-enabled)\n",
+    "- .xlsb (Binary workbook)\n",
+    "- .xltx (Template)\n",
+];
+
+$bestPractices = [
+    "1. Excel tables are automatically detected as Table objects\n",
+    "2. Each sheet becomes a separate table\n",
+    "3. Use table->cells for programmatic access to cell data\n",
+    "4. Use table->markdown for human-readable output\n",
+    "5. First row is often headers - handle accordingly\n",
+    "6. Check for numeric columns to perform calculations\n",
+    "7. Export to CSV/JSON for database import\n",
+    "8. Use batch processing for multiple Excel files\n",
+];
+
+echo "\n\nSupported Excel Formats:\n";
+echo "========================\n";
+foreach ($supportedFormats as $formatLine) {
+    echo $formatLine;
+}
+
+echo "\n\nBest Practices:\n";
+echo "===============\n";
+foreach ($bestPractices as $practiceLine) {
+    echo $practiceLine;
+}
+```
--- a/docs/snippets/php/extraction/image_extraction.php
+++ b/docs/snippets/php/extraction/image_extraction.php
@@ -0,0 +1,159 @@
+```php title="image_extraction.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Image Extraction from Documents
+ *
+ * Extract embedded images from PDFs, Office documents, and other formats.
+ * Optionally perform OCR on extracted images.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\ImageExtractionConfig;
+use Kreuzberg\Config\OcrConfig;
+
+// NOTE(review): minWidth/minHeight presumably skip images smaller than
+// 100x100 pixels — confirm against ImageExtractionConfig.
+$imageSettings = new ImageExtractionConfig(
+    extractImages: true,
+    minWidth: 100,
+    minHeight: 100
+);
+
+$config = new ExtractionConfig(imageExtraction: $imageSettings, extractImages: true);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('presentation.pptx');
+
+// A missing image list is treated as "no images".
+$images = $result->images ?? [];
+
+echo "Image Extraction Results:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Images found: " . count($images) . "\n\n";
+
+foreach ($images as $image) {
+    // File name encodes page, index, dimensions and format of the image.
+    $filename = sprintf(
+        'extracted_p%d_i%d_%dx%d.%s',
+        $image->pageNumber,
+        $image->imageIndex,
+        $image->width,
+        $image->height,
+        $image->format
+    );
+
+    file_put_contents($filename, $image->data);
+
+    $byteCount = number_format(strlen($image->data));
+    echo "Saved: $filename\n";
+    echo "  Size: {$image->width}x{$image->height} pixels\n";
+    echo "  Format: {$image->format}\n";
+    echo "  Data: " . $byteCount . " bytes\n\n";
+}
+
+// Image extraction with OCR enabled, using the tesseract backend and 'eng'.
+$ocrConfig = new ExtractionConfig(
+    imageExtraction: new ImageExtractionConfig(
+        extractImages: true,
+        performOcr: true,
+        minWidth: 200,
+        minHeight: 100
+    ),
+    ocr: new OcrConfig(
+        backend: 'tesseract',
+        language: 'eng'
+    )
+);
+
+$kreuzberg = new Kreuzberg($ocrConfig);
+$result = $kreuzberg->extractFile('scanned_images.pdf');
+
+echo "Images with OCR:\n";
+echo str_repeat('=', 60) . "\n";
+
+foreach ($result->images ?? [] as $image) {
+    echo "Image {$image->imageIndex} from page {$image->pageNumber}:\n";
+
+    if ($image->ocrResult !== null) {
+        $ocrText = $image->ocrResult->content;
+
+        // Measure and truncate by characters, not bytes, so a multi-byte
+        // character is never cut in half; add the ellipsis only when text
+        // was actually dropped.
+        // NOTE(review): assumes the OCR text is UTF-8 — confirm.
+        $ocrLength = mb_strlen($ocrText);
+        $preview = $ocrLength > 100 ? mb_substr($ocrText, 0, 100) . '...' : $ocrText;
+
+        echo "  OCR Text: " . $preview . "\n";
+        echo "  OCR Length: " . $ocrLength . " chars\n";
+    } else {
+        echo "  No OCR result\n";
+    }
+    echo "\n";
+}
+
+// Same extraction as before, with both minimum dimensions raised to 500.
+$largeImageSettings = new ImageExtractionConfig(
+    extractImages: true,
+    minWidth: 500,
+    minHeight: 500
+);
+
+$largeImageConfig = new ExtractionConfig(
+    imageExtraction: $largeImageSettings,
+    extractImages: true
+);
+
+$kreuzberg = new Kreuzberg($largeImageConfig);
+$result = $kreuzberg->extractFile('photo_album.pdf');
+
+echo "Large images (>500x500):\n";
+
+$largeImages = $result->images ?? [];
+foreach ($largeImages as $image) {
+    // Each image is written next to the script, named by index and format.
+    $filename = "large_image_{$image->imageIndex}.{$image->format}";
+    file_put_contents($filename, $image->data);
+    echo "Saved: $filename ({$image->width}x{$image->height})\n";
+}
+
+$result = $kreuzberg->extractFile('document.pdf');
+
+// Bucket the images by format; appending with [] creates a bucket on first use.
+$imageTypes = [];
+foreach ($result->images ?? [] as $image) {
+    $imageTypes[$image->format][] = $image;
+}
+
+echo "\nImages by format:\n";
+foreach ($imageTypes as $format => $images) {
+    echo "  $format: " . count($images) . " images\n";
+
+    // One output directory per format, created on demand.
+    $dir = "images_$format";
+    $directoryExists = is_dir($dir);
+    if (!$directoryExists) {
+        mkdir($dir, 0755, true);
+    }
+
+    foreach ($images as $index => $image) {
+        file_put_contents("$dir/image_$index.$format", $image->data);
+    }
+    echo "    Saved to: $dir/\n";
+}
+
+// Create 200px-wide thumbnails for PNG/JPEG images; requires the GD extension.
+if (extension_loaded('gd')) {
+    foreach ($result->images ?? [] as $image) {
+        // Accept both common JPEG spellings in addition to PNG.
+        if (!in_array($image->format, ['png', 'jpg', 'jpeg'], true)) {
+            continue;
+        }
+
+        // Returns false when the bytes cannot be decoded as an image.
+        $gdImage = imagecreatefromstring($image->data);
+        if ($gdImage === false) {
+            continue;
+        }
+
+        $width = imagesx($gdImage);
+        $height = imagesy($gdImage);
+        $thumbWidth = 200;
+
+        // Keep the aspect ratio, but never let the height truncate to 0 for
+        // very wide images: imagecreatetruecolor() rejects dimensions below 1.
+        $thumbHeight = max(1, (int) (($height / $width) * $thumbWidth));
+
+        $thumb = imagecreatetruecolor($thumbWidth, $thumbHeight);
+        if ($thumb === false) {
+            continue;
+        }
+
+        $isPng = $image->format === 'png';
+        if ($isPng) {
+            // Keep the alpha channel; otherwise transparent areas turn black.
+            imagealphablending($thumb, false);
+            imagesavealpha($thumb, true);
+        }
+
+        imagecopyresampled(
+            $thumb,
+            $gdImage,
+            0,
+            0,
+            0,
+            0,
+            $thumbWidth,
+            $thumbHeight,
+            $width,
+            $height
+        );
+
+        $thumbFile = "thumb_{$image->imageIndex}.{$image->format}";
+        if ($isPng) {
+            imagepng($thumb, $thumbFile);
+        } else {
+            imagejpeg($thumb, $thumbFile, 85);
+        }
+
+        echo "Created thumbnail: $thumbFile\n";
+
+        // No imagedestroy(): it has no effect since PHP 8.0, where GD images
+        // are objects released automatically.
+    }
+}
+```
--- a/docs/snippets/php/extraction/metadata_extraction.php
+++ b/docs/snippets/php/extraction/metadata_extraction.php
@@ -0,0 +1,196 @@
+```php title="metadata_extraction.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Metadata Extraction
+ *
+ * Extract and process document metadata including title, author,
+ * creation date, keywords, and custom properties.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use function Kreuzberg\extract_file;
+
+// Extract once, then print every metadata field with an 'N/A' fallback.
+$result = extract_file('document.pdf');
+$metadata = $result->metadata;
+
+echo "Document Metadata:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Title: " . ($metadata->title ?? 'N/A') . "\n";
+
+if (isset($metadata->authors)) {
+    $authorList = implode(', ', $metadata->authors);
+} else {
+    $authorList = 'N/A';
+}
+echo "Authors: " . $authorList . "\n";
+
+echo "Subject: " . ($metadata->subject ?? 'N/A') . "\n";
+echo "Created By: " . ($metadata->createdBy ?? 'N/A') . "\n";
+echo "Producer: " . ($metadata->producer ?? 'N/A') . "\n";
+echo "Created: " . ($metadata->createdAt ?? 'N/A') . "\n";
+echo "Modified: " . ($metadata->modifiedAt ?? 'N/A') . "\n";
+echo "Page Count: " . ($metadata->pageCount ?? 'N/A') . "\n";
+
+// Keywords print as an empty string when the list is missing.
+$keywordList = implode(', ', $metadata->keywords ?? []);
+echo "Keywords: " . $keywordList . "\n";
+echo "Language: " . ($metadata->language ?? 'N/A') . "\n\n";
+
+// glob() returns false on error; fall back to an empty list so the loop
+// below always iterates over an array.
+$files = glob('documents/*.{pdf,docx,xlsx}', GLOB_BRACE) ?: [];
+$metadataCollection = [];
+
+// Build one flat record per document; missing fields get placeholder values
+// ('Untitled', 'Unknown', 0) that the later helpers in this script test for.
+foreach ($files as $file) {
+    $result = extract_file($file);
+    $metadataCollection[] = [
+        'file' => basename($file),
+        'title' => $result->metadata->title ?? 'Untitled',
+        'author' => isset($result->metadata->authors) ? implode(', ', $result->metadata->authors) : 'Unknown',
+        'created' => $result->metadata->createdAt ?? 'Unknown',
+        'pages' => $result->metadata->pageCount ?? 0,
+        // filesize() returns false on failure; store 0 so 'size' is always an int.
+        'size' => filesize($file) ?: 0,
+    ];
+}
+
+echo "Metadata Collection:\n";
+echo str_repeat('=', 60) . "\n";
+
+// One indented summary per collected document; size is shown in kilobytes.
+foreach ($metadataCollection as $meta) {
+    $sizeInKb = number_format($meta['size'] / 1024, 2);
+
+    echo "{$meta['file']}:\n";
+    echo "  Title: {$meta['title']}\n";
+    echo "  Author: {$meta['author']}\n";
+    echo "  Created: {$meta['created']}\n";
+    echo "  Pages: {$meta['pages']}\n";
+    echo "  Size: " . $sizeInKb . " KB\n\n";
+}
+
+/**
+ * Keep the records whose 'author' entry contains the given text, ignoring case.
+ *
+ * @param array  $collection Records, each with an 'author' string.
+ * @param string $author     Text to look for inside the author field.
+ *
+ * @return array Matching records; the original array keys are preserved.
+ */
+function searchByAuthor(array $collection, string $author): array
+{
+    $mentionsAuthor = static fn ($meta) => stripos($meta['author'], $author) !== false;
+
+    return array_filter($collection, $mentionsAuthor);
+}
+
+/**
+ * Keep the records whose creation date falls inside an inclusive range.
+ *
+ * Only the first 10 characters of 'created' are compared, as plain strings,
+ * against $start and $end. Records whose 'created' is 'Unknown' never match.
+ *
+ * @param array  $collection Records, each with a 'created' string.
+ * @param string $start      Lower bound, e.g. '2024-01-01'.
+ * @param string $end        Upper bound, e.g. '2024-12-31'.
+ *
+ * @return array Matching records; the original array keys are preserved.
+ */
+function searchByDateRange(array $collection, string $start, string $end): array
+{
+    $isWithinRange = static function ($meta) use ($start, $end) {
+        if ($meta['created'] === 'Unknown') {
+            return false;
+        }
+
+        $day = substr($meta['created'], 0, 10);
+
+        return !($day < $start || $day > $end);
+    };
+
+    return array_filter($collection, $isWithinRange);
+}
+
+// Author search: count the documents whose author field mentions "John".
+$johnDocs = searchByAuthor($metadataCollection, 'John');
+$johnCount = count($johnDocs);
+echo "Documents by John: " . $johnCount . "\n";
+
+// Date search: count the documents created during 2024.
+$recentDocs = searchByDateRange($metadataCollection, '2024-01-01', '2024-12-31');
+$recentCount = count($recentDocs);
+echo "Documents from 2024: " . $recentCount . "\n\n";
+
+/**
+ * Render the collection as a minimal HTML page containing one table.
+ *
+ * Every cell value is passed through htmlspecialchars() before being
+ * inserted; the page count is cast to string first.
+ *
+ * @param array $collection Records with 'file', 'title', 'author', 'created' and 'pages'.
+ *
+ * @return string The complete HTML document.
+ */
+function generateCatalog(array $collection): string
+{
+    $html = "<html><head><title>Document Catalog</title></head><body>\n"
+        . "<h1>Document Catalog</h1>\n"
+        . "<table border='1'>\n"
+        . "<tr><th>File</th><th>Title</th><th>Author</th><th>Created</th><th>Pages</th></tr>\n";
+
+    foreach ($collection as $meta) {
+        // Cell order matches the header row above.
+        $cells = [
+            $meta['file'],
+            $meta['title'],
+            $meta['author'],
+            $meta['created'],
+            (string)$meta['pages'],
+        ];
+
+        $html .= "<tr>";
+        foreach ($cells as $cell) {
+            $html .= "<td>" . htmlspecialchars($cell) . "</td>";
+        }
+        $html .= "</tr>\n";
+    }
+
+    return $html . "</table>\n</body></html>";
+}
+
+// Render the catalog page and write it next to the script.
+$catalogFile = 'catalog.html';
+$catalog = generateCatalog($metadataCollection);
+file_put_contents($catalogFile, $catalog);
+echo "Catalog saved to: catalog.html\n";
+
+/**
+ * Write the collection to a CSV file with a header row.
+ *
+ * Columns: File, Title, Author, Created, Pages, Size (KB). The size column is
+ * the byte size divided by 1024, formatted with two decimals.
+ *
+ * @param array  $collection Records with 'file', 'title', 'author', 'created', 'pages' and 'size'.
+ * @param string $filename   Destination path; an existing file is overwritten.
+ *
+ * @throws RuntimeException When the destination cannot be opened for writing.
+ */
+function exportMetadataToCSV(array $collection, string $filename): void
+{
+    // fopen() returns false on failure; fail with a clear message instead of
+    // letting fputcsv() raise a TypeError on a boolean handle.
+    $fp = fopen($filename, 'w');
+    if ($fp === false) {
+        throw new RuntimeException("Unable to open {$filename} for writing");
+    }
+
+    // finally guarantees the handle is closed even if a row fails to write.
+    try {
+        fputcsv($fp, ['File', 'Title', 'Author', 'Created', 'Pages', 'Size (KB)']);
+
+        foreach ($collection as $meta) {
+            fputcsv($fp, [
+                $meta['file'],
+                $meta['title'],
+                $meta['author'],
+                $meta['created'],
+                $meta['pages'],
+                number_format($meta['size'] / 1024, 2),
+            ]);
+        }
+    } finally {
+        fclose($fp);
+    }
+}
+
+exportMetadataToCSV($metadataCollection, 'metadata.csv');
+echo "Metadata exported to: metadata.csv\n";
+
+// Aggregate figures across the whole collection.
+$documentCount = count($metadataCollection);
+$totalPages = array_sum(array_column($metadataCollection, 'pages'));
+$totalSize = array_sum(array_column($metadataCollection, 'size'));
+$authors = array_unique(array_column($metadataCollection, 'author'));
+
+// Guard the average: dividing by the count of an empty collection would
+// raise a DivisionByZeroError.
+$averagePages = $documentCount > 0 ? $totalPages / $documentCount : 0;
+
+echo "\nCollection Statistics:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Total documents: " . $documentCount . "\n";
+echo "Total pages: " . number_format($totalPages) . "\n";
+echo "Total size: " . number_format($totalSize / 1024 / 1024, 2) . " MB\n";
+echo "Unique authors: " . count($authors) . "\n";
+echo "Average pages per document: " . number_format($averagePages, 1) . "\n";
+
+// Group the records by author; appending with [] creates a group on first use.
+$byAuthor = [];
+foreach ($metadataCollection as $meta) {
+    $byAuthor[$meta['author']][] = $meta;
+}
+
+echo "\nDocuments by Author:\n";
+echo str_repeat('=', 60) . "\n";
+foreach ($byAuthor as $author => $docs) {
+    $documentTotal = count($docs);
+    echo "$author: " . $documentTotal . " documents\n";
+}
+
+/**
+ * List the quality problems of one metadata record.
+ *
+ * A field counts as missing when it is empty or still holds its placeholder
+ * ('Untitled' for the title, 'Unknown' for author and creation date). A page
+ * count that is exactly the integer 0 is reported as invalid.
+ *
+ * @param array $meta Record with 'title', 'author', 'created' and 'pages'.
+ *
+ * @return array Issue descriptions in a fixed order; empty when nothing is wrong.
+ */
+function validateMetadata(array $meta): array
+{
+    // Issue message => whether that issue applies to this record.
+    $checks = [
+        'Missing title' => empty($meta['title']) || $meta['title'] === 'Untitled',
+        'Missing author' => empty($meta['author']) || $meta['author'] === 'Unknown',
+        'Missing creation date' => empty($meta['created']) || $meta['created'] === 'Unknown',
+        'Invalid page count' => $meta['pages'] === 0,
+    ];
+
+    // Keep only the messages whose check is true.
+    return array_keys(array_filter($checks));
+}
+
+echo "\nMetadata Quality Check:\n";
+echo str_repeat('=', 60) . "\n";
+
+// Report every record that has at least one issue and count those records.
+$incomplete = 0;
+foreach ($metadataCollection as $meta) {
+    $issues = validateMetadata($meta);
+    if (empty($issues)) {
+        continue;
+    }
+
+    $incomplete++;
+    $issueList = implode(', ', $issues);
+    echo "{$meta['file']}: " . $issueList . "\n";
+}
+
+echo "\nIncomplete metadata: $incomplete/" . count($metadataCollection) . " documents\n";
+```
--- a/docs/snippets/php/extraction/multi_format.php
+++ b/docs/snippets/php/extraction/multi_format.php
@@ -0,0 +1,282 @@
+```php title="multi_format.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Multi-Format Document Extraction
+ *
+ * Handle various document formats (PDF, DOCX, XLSX, PPTX, images, etc.)
+ * with format-specific processing and unified output.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use function Kreuzberg\extract_file;
+use function Kreuzberg\detect_mime_type_from_path;
+
+// Sample input files keyed by a human-readable format label.
+$formats = [
+    'PDF' => 'document.pdf',
+    'Word' => 'document.docx',
+    'Excel' => 'spreadsheet.xlsx',
+    'PowerPoint' => 'presentation.pptx',
+    'Text' => 'readme.txt',
+    'HTML' => 'page.html',
+    'Markdown' => 'guide.md',
+    'Image' => 'scan.png',
+];
+
+echo "Multi-Format Extraction:\n";
+echo str_repeat('=', 60) . "\n\n";
+
+$kreuzberg = new Kreuzberg();
+
+foreach ($formats as $type => $file) {
+    // Samples that are not present on disk are skipped without a message.
+    if (file_exists($file)) {
+        echo "Processing $type ($file):\n";
+
+        $mimeType = detect_mime_type_from_path($file);
+        echo "  MIME type: $mimeType\n";
+
+        $result = $kreuzberg->extractFile($file);
+
+        $contentLength = strlen($result->content);
+        echo "  Content length: ", $contentLength, " chars\n";
+
+        $tableCount = count($result->tables);
+        echo "  Tables: ", $tableCount, "\n";
+
+        $imageCount = count($result->images ?? []);
+        echo "  Images: ", $imageCount, "\n";
+
+        // The page count falls back to 'N/A' when the metadata has none.
+        $pageCount = $result->metadata->pageCount ?? 'N/A';
+        echo "  Pages: ", $pageCount, "\n";
+        echo "\n";
+    }
+}
+
+// glob() returns false on error (and, on some systems, for an empty match),
+// so normalize to an empty list before it is iterated here and further down.
+$mixedFiles = glob('documents/*.*') ?: [];
+$byFormat = [];
+
+foreach ($mixedFiles as $file) {
+    $mimeType = detect_mime_type_from_path($file);
+    // Lower-case the extension so "report.PDF" and "report.pdf" land in the
+    // same bucket, matching the normalization of the per-format config lookup.
+    $extension = strtolower(pathinfo($file, PATHINFO_EXTENSION));
+
+    if (!isset($byFormat[$extension])) {
+        $byFormat[$extension] = [];
+    }
+
+    $result = extract_file($file);
+    // 'size' is the length of the extracted content, not the size on disk.
+    $byFormat[$extension][] = [
+        'file' => basename($file),
+        'mime' => $mimeType,
+        'size' => strlen($result->content),
+        'tables' => count($result->tables),
+    ];
+}
+
+echo "Files by Format:\n";
+echo str_repeat('=', 60) . "\n";
+foreach ($byFormat as $ext => $files) {
+    echo strtoupper($ext) . ": " . count($files) . " files\n";
+
+    $totalSize = array_sum(array_column($files, 'size'));
+    $totalTables = array_sum(array_column($files, 'tables'));
+
+    echo "  Total content: " . number_format($totalSize) . " chars\n";
+    echo "  Total tables: $totalTables\n\n";
+}
+
+// One extraction configuration per lower-case file extension.
+$formatConfigs = [
+    'pdf' => new ExtractionConfig(
+        extractTables: true,
+        extractImages: true,
+        pdf: new \Kreuzberg\Config\PdfConfig(
+            extractImages: true,
+            imageQuality: 85,
+        ),
+    ),
+    'docx' => new ExtractionConfig(
+        extractTables: true,
+        preserveFormatting: true,
+    ),
+    'xlsx' => new ExtractionConfig(
+        extractTables: true,
+    ),
+    'png' => new ExtractionConfig(
+        ocr: new \Kreuzberg\Config\OcrConfig(
+            backend: 'tesseract',
+            language: 'eng',
+        ),
+    ),
+];
+
+foreach ($mixedFiles as $file) {
+    $ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));
+
+    // Only extensions with a dedicated configuration are processed here.
+    if (isset($formatConfigs[$ext])) {
+        $config = $formatConfigs[$ext];
+        $kreuzberg = new Kreuzberg($config);
+        $result = $kreuzberg->extractFile($file);
+
+        $name = basename($file);
+        echo "Processed ", $name, " with $ext config\n";
+    }
+}
+
+/**
+ * Extract a document and assemble a Markdown rendition of it.
+ *
+ * The output starts with a level-1 heading (the document title, falling back
+ * to the file's base name), optionally followed by an authors line, then the
+ * extracted content and one "## Table N" section per extracted table.
+ *
+ * @param string $inputFile Path of the document to convert.
+ * @return string The assembled Markdown text.
+ */
+function convertToMarkdown(string $inputFile): string
+{
+    $extractor = new Kreuzberg(new ExtractionConfig(
+        preserveFormatting: true,
+        outputFormat: 'markdown',
+        extractTables: true
+    ));
+    $result = $extractor->extractFile($inputFile);
+
+    // Collect the pieces in order and join them once at the end.
+    $parts = ["# " . ($result->metadata->title ?? basename($inputFile)) . "\n\n"];
+
+    if (isset($result->metadata->authors)) {
+        $parts[] = "_Authors: " . implode(', ', $result->metadata->authors) . "_\n\n";
+    }
+
+    $parts[] = $result->content . "\n\n";
+
+    foreach ($result->tables as $index => $table) {
+        // Table headings are numbered from 1.
+        $parts[] = "## Table " . ($index + 1) . "\n\n";
+        $parts[] = $table->markdown . "\n\n";
+    }
+
+    return implode('', $parts);
+}
+
+echo "\nConverting to Markdown:\n";
+echo str_repeat('=', 60) . "\n";
+
+foreach (['document.pdf', 'document.docx'] as $file) {
+    if (!file_exists($file)) {
+        continue;
+    }
+
+    $markdown = convertToMarkdown($file);
+
+    // Keep the source extension in the output name: both inputs share the base
+    // name "document", so "<base>.md" would let the second conversion silently
+    // overwrite the first.
+    $outputFile = basename($file) . '.md';
+
+    // file_put_contents() returns false on failure; do not report success then.
+    if (file_put_contents($outputFile, $markdown) === false) {
+        echo "Failed to write: $outputFile\n";
+        continue;
+    }
+
+    echo "Converted: $file -> $outputFile\n";
+}
+
+/**
+ * Run extraction on an archive file and summarize the outcome.
+ *
+ * @param string $archiveFile Path of the archive to process.
+ * @return array Map with 'archive' (base name of the input), 'listing' (the
+ *               extraction result's content) and 'mime' (its MIME type).
+ */
+function extractFromArchive(string $archiveFile): array
+{
+    $extraction = extract_file($archiveFile);
+
+    $summary = [];
+    $summary['archive'] = basename($archiveFile);
+    $summary['listing'] = $extraction->content;
+    $summary['mime'] = $extraction->mimeType;
+
+    return $summary;
+}
+
+/**
+ * Dispatches extraction to a format-specific handler chosen by detected MIME type.
+ *
+ * PDF, DOCX, XLSX, PNG and JPEG inputs have dedicated handlers; every other
+ * MIME type goes through the generic handler.
+ */
+class UniversalExtractor
+{
+    /** Default-configured extractor used by the DOCX and generic handlers. */
+    private Kreuzberg $kreuzberg;
+
+    public function __construct()
+    {
+        $this->kreuzberg = new Kreuzberg();
+    }
+
+    /**
+     * Extract a file with the handler that matches its detected MIME type.
+     *
+     * @param string $file Path of the file to extract.
+     * @return array Handler-specific summary; every handler sets 'type' and 'content'.
+     */
+    public function extract(string $file): array
+    {
+        $mimeType = detect_mime_type_from_path($file);
+
+        return match ($mimeType) {
+            'application/pdf'
+                => $this->handlePDF($file, $mimeType),
+            'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+                => $this->handleDOCX($file, $mimeType),
+            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+                => $this->handleXLSX($file, $mimeType),
+            'image/png', 'image/jpeg'
+                => $this->handleImage($file, $mimeType),
+            default
+                => $this->handleGeneric($file, $mimeType),
+        };
+    }
+
+    /** PDF: extract with tables and images enabled; report counts and page count. */
+    private function handlePDF(string $file, string $mimeType): array
+    {
+        $extractor = new Kreuzberg(
+            new ExtractionConfig(extractTables: true, extractImages: true)
+        );
+        $result = $extractor->extractFile($file);
+
+        return [
+            'type' => 'PDF',
+            'content' => $result->content,
+            'tables' => count($result->tables),
+            'images' => count($result->images ?? []),
+            'pages' => $result->metadata->pageCount,
+        ];
+    }
+
+    /** DOCX: extract with the shared default extractor; report table count and authors. */
+    private function handleDOCX(string $file, string $mimeType): array
+    {
+        $result = $this->kreuzberg->extractFile($file);
+
+        return [
+            'type' => 'Word Document',
+            'content' => $result->content,
+            'tables' => count($result->tables),
+            'authors' => $result->metadata->authors,
+        ];
+    }
+
+    /** XLSX: extract with tables enabled; the table count is reported as 'sheets'. */
+    private function handleXLSX(string $file, string $mimeType): array
+    {
+        $extractor = new Kreuzberg(new ExtractionConfig(extractTables: true));
+        $result = $extractor->extractFile($file);
+
+        return [
+            'type' => 'Excel Spreadsheet',
+            'content' => $result->content,
+            'sheets' => count($result->tables),
+        ];
+    }
+
+    /** PNG/JPEG: extract through the Tesseract OCR backend with language 'eng'. */
+    private function handleImage(string $file, string $mimeType): array
+    {
+        $ocr = new \Kreuzberg\Config\OcrConfig(backend: 'tesseract', language: 'eng');
+        $extractor = new Kreuzberg(new ExtractionConfig(ocr: $ocr));
+        $result = $extractor->extractFile($file);
+
+        return [
+            'type' => 'Image (OCR)',
+            'content' => $result->content,
+            'ocr_length' => strlen($result->content),
+        ];
+    }
+
+    /** Fallback for any other MIME type; the detected type is included in the result. */
+    private function handleGeneric(string $file, string $mimeType): array
+    {
+        $result = $this->kreuzberg->extractFile($file);
+
+        return [
+            'type' => 'Generic',
+            'mime' => $mimeType,
+            'content' => $result->content,
+        ];
+    }
+}
+
+$extractor = new UniversalExtractor();
+
+echo "\nUniversal Extraction:\n";
+echo str_repeat('=', 60) . "\n";
+
+foreach ($mixedFiles as $file) {
+    $data = $extractor->extract($file);
+
+    echo basename($file) . " ({$data['type']}):\n";
+    // Print every entry except the extracted content itself.
+    print_r(array_diff_key($data, ['content' => true]));
+    echo "\n";
+}
+```
--- a/docs/snippets/php/extraction/pdf_extraction.php
+++ b/docs/snippets/php/extraction/pdf_extraction.php
@@ -0,0 +1,114 @@
+```php title="pdf_extraction.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * PDF Document Extraction
+ *
+ * Extract text, tables, and images from PDF files with various configurations.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\PdfConfig;
+
+// First pass: extract with a default-constructed extractor.
+$kreuzberg = new Kreuzberg();
+$result = $kreuzberg->extractFile('document.pdf');
+
+$contentLength = strlen($result->content);
+$tableCount = count($result->tables);
+$pageCount = $result->metadata->pageCount ?? 'unknown';
+
+echo "PDF Extraction Results:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Content length: ", $contentLength, " characters\n";
+echo "Tables found: ", $tableCount, "\n";
+echo "Pages: ", $pageCount, "\n\n";
+
+// Second pass: a different file, with table and image extraction switched on.
+$pdfOptions = new PdfConfig(
+    extractImages: true,
+    imageQuality: 85,
+);
+$config = new ExtractionConfig(
+    extractImages: true,
+    extractTables: true,
+    pdf: $pdfOptions,
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('report.pdf');
+
+echo "Extracted Tables:\n";
+echo str_repeat('=', 60) . "\n";
+foreach ($result->tables as $index => $table) {
+    echo "Table " . ($index + 1) . " (Page {$table->pageNumber}):\n";
+    echo "Rows: " . count($table->cells) . "\n";
+    // count() never returns null, so no fallback is needed after it.
+    echo "Columns: " . count($table->cells[0] ?? []) . "\n\n";
+
+    echo "Markdown format:\n";
+    echo $table->markdown . "\n\n";
+
+    // Each table is also written to its own CSV file, one table row per record.
+    $csvFile = "table_{$index}.csv";
+    $fp = fopen($csvFile, 'w');
+    if ($fp === false) {
+        // fopen() returns false on failure; handing that to fputcsv() would
+        // raise a TypeError, so report the problem and move to the next table.
+        echo "Could not open $csvFile for writing\n\n";
+        continue;
+    }
+
+    try {
+        foreach ($table->cells as $row) {
+            fputcsv($fp, $row);
+        }
+    } finally {
+        // Release the handle even if writing a row fails.
+        fclose($fp);
+    }
+    echo "Saved to: $csvFile\n\n";
+}
+
+echo "Extracted Images:\n";
+echo str_repeat('=', 60) . "\n";
+foreach ($result->images ?? [] as $image) {
+    // Output name pattern: page_<page>_image_<index>.<format>
+    $filename = sprintf(
+        'page_%d_image_%d.%s',
+        $image->pageNumber,
+        $image->imageIndex,
+        $image->format
+    );
+    $payload = $image->data;
+
+    // NOTE(review): the payload is written as-is here, while
+    // powerpoint_extraction.php base64-decodes the same field before writing.
+    // Confirm which encoding the binding actually returns.
+    file_put_contents($filename, $payload);
+
+    echo "Saved: $filename\n";
+    echo "  Size: {$image->width}x{$image->height}\n";
+    echo "  Format: {$image->format}\n";
+
+    $byteCount = strlen($payload);
+    echo "  Data size: ", $byteCount, " bytes\n\n";
+}
+
+// Third pass: request Markdown output with formatting preserved.
+$formattedConfig = new ExtractionConfig(
+    preserveFormatting: true,
+    outputFormat: 'markdown',
+);
+$kreuzberg = new Kreuzberg($formattedConfig);
+
+$result = $kreuzberg->extractFile('formatted.pdf');
+$formattedContent = $result->content;
+
+file_put_contents('output.md', $formattedContent);
+echo "Saved formatted output to: output.md\n";
+
+// Reuses the extractor configured above for Markdown output.
+$result = $kreuzberg->extractFile('document.pdf');
+
+// Split the content into sections keyed by heading text. Lines before the
+// first heading are not stored, and a repeated heading replaces the earlier
+// section of the same name.
+$sections = [];
+$heading = null;
+$body = [];
+
+foreach (explode("\n", $result->content) as $line) {
+    if (preg_match('/^#+\s+(.+)$/', $line, $matches) !== 1) {
+        // Not a heading: the line belongs to the section being collected.
+        $body[] = $line;
+        continue;
+    }
+
+    // A new heading closes the section collected so far, if there is one.
+    if ($heading !== null) {
+        $sections[$heading] = implode("\n", $body);
+    }
+    $heading = $matches[1];
+    $body = [];
+}
+
+// Store the section that was still open when the input ended.
+if ($heading !== null) {
+    $sections[$heading] = implode("\n", $body);
+}
+
+echo "\nDocument sections:\n";
+foreach ($sections as $title => $sectionText) {
+    $sectionLength = strlen($sectionText);
+    echo "  - $title (", $sectionLength, " chars)\n";
+}
+```
--- a/docs/snippets/php/extraction/powerpoint_extraction.php
+++ b/docs/snippets/php/extraction/powerpoint_extraction.php
@@ -0,0 +1,195 @@
+```php title="powerpoint_extraction.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * PowerPoint Presentation Extraction
+ *
+ * This example demonstrates extracting content from PowerPoint files (.pptx, .ppt),
+ * including text, notes, images, and tables from slides.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\ImageExtractionConfig;
+use Kreuzberg\Config\PageConfig;
+
+echo "Example 1: Basic PowerPoint Extraction\n";
+echo "======================================\n";
+
+$kreuzberg = new Kreuzberg();
+$result = $kreuzberg->extractFile('presentation.pptx');
+
+echo "Content:\n";
+echo $result->content . "\n\n";
+
+// Every metadata field falls back to 'N/A' when it is not set.
+$title = $result->metadata->title ?? 'N/A';
+$authorList = 'N/A';
+if (isset($result->metadata->authors)) {
+    $authorList = implode(', ', $result->metadata->authors);
+}
+$slideCount = $result->metadata->pageCount ?? 'N/A';
+
+echo "Metadata:\n";
+echo "- Title: ", $title, "\n";
+echo "- Author: ", $authorList, "\n";
+echo "- Slide Count: ", $slideCount, "\n\n";
+
+echo "Example 2: Extract Per-Slide Content\n";
+echo "====================================\n";
+
+// Enable per-page extraction and page markers with a custom marker format.
+$slideMarkers = new PageConfig(
+    extractPages: true,
+    insertPageMarkers: true,
+    markerFormat: '--- Slide {page_number} ---',
+);
+$config2 = new ExtractionConfig(page: $slideMarkers);
+
+$extractor2 = new Kreuzberg($config2);
+$result2 = $extractor2->extractFile('presentation.pptx');
+
+// The per-slide breakdown is printed only when page results are present.
+$slides = $result2->pages;
+if ($slides !== null) {
+    $slideTotal = count($slides);
+    echo "Total slides: ", $slideTotal, "\n\n";
+
+    foreach ($slides as $page) {
+        $textLength = strlen($page->content);
+        $tableCount = count($page->tables);
+        $imageCount = count($page->images);
+        // The preview is the first 100 bytes of the slide text.
+        $preview = substr($page->content, 0, 100);
+
+        echo "Slide {$page->pageNumber}:\n";
+        echo "- Text length: ", $textLength, " characters\n";
+        echo "- Tables: ", $tableCount, "\n";
+        echo "- Images: ", $imageCount, "\n";
+        echo "- Content preview: ", $preview, "...\n\n";
+    }
+}
+
+echo "Example 3: Extract Images from Slides\n";
+echo "=====================================\n";
+
+// Image extraction with a 100x100 minimum size.
+$imageFilter = new ImageExtractionConfig(
+    extractImages: true,
+    minWidth: 100,
+    minHeight: 100,
+);
+$config3 = new ExtractionConfig(imageExtraction: $imageFilter);
+
+$extractor3 = new Kreuzberg($config3);
+$result3 = $extractor3->extractFile('presentation.pptx');
+
+$slideImages = $result3->images;
+if ($slideImages !== null) {
+    $imageTotal = count($slideImages);
+    echo "Total images: ", $imageTotal, "\n\n";
+
+    foreach ($slideImages as $i => $image) {
+        echo "Image {$i}:\n";
+        echo "- Format: {$image->format}\n";
+        echo "- Size: {$image->width}x{$image->height}\n";
+        echo "- Slide: {$image->pageNumber}\n";
+
+        // NOTE(review): the data is base64-decoded here, while
+        // pdf_extraction.php writes the same field to disk unchanged.
+        // Confirm which encoding the binding actually returns.
+        $binary = base64_decode($image->data);
+        $filename = "slide_{$image->pageNumber}_image_{$i}.{$image->format}";
+        file_put_contents($filename, $binary);
+        echo "- Saved: {$filename}\n\n";
+    }
+}
+
+echo "Example 4: Extract Tables from Slides\n";
+echo "=====================================\n";
+
+$config4 = new ExtractionConfig(extractTables: true);
+$extractor4 = new Kreuzberg($config4);
+$result4 = $extractor4->extractFile('data_presentation.pptx');
+
+// Nothing is printed for a presentation without tables.
+$tableTotal = count($result4->tables);
+if ($tableTotal > 0) {
+    echo "Found ", $tableTotal, " table(s)\n\n";
+
+    foreach ($result4->tables as $i => $table) {
+        // Tables are numbered from 1 in the output.
+        $tableNumber = $i + 1;
+        echo "Table ", $tableNumber, " (Slide {$table->pageNumber}):\n";
+        echo $table->markdown . "\n\n";
+    }
+}
+
+echo "Example 5: Convert PowerPoint to Markdown\n";
+echo "=========================================\n";
+
+$config5 = new ExtractionConfig(
+    page: new PageConfig(
+        extractPages: true,
+        insertPageMarkers: true,
+        // Double quotes are required here: inside a single-quoted PHP string
+        // "\n" stays a literal backslash followed by "n" instead of a newline.
+        // "{page_number}" is left alone by PHP because it contains no "$".
+        markerFormat: "---\n\n## Slide {page_number}\n\n"
+    ),
+    outputFormat: 'markdown'
+);
+
+$result5 = (new Kreuzberg($config5))->extractFile('presentation.pptx');
+
+// Write the Markdown content to disk and show the first 500 bytes of it.
+$markdownContent = $result5->content;
+file_put_contents('presentation.md', $markdownContent);
+
+echo "Converted to Markdown\n";
+echo "Saved as: presentation.md\n";
+echo "Content preview:\n";
+echo substr($markdownContent, 0, 500) . "...\n\n";
+
+echo "Example 6: Generate Presentation Summary\n";
+echo "========================================\n";
+
+$config6 = new ExtractionConfig(page: new PageConfig(extractPages: true));
+$extractor6 = new Kreuzberg($config6);
+$result6 = $extractor6->extractFile('meeting_deck.pptx');
+
+$summaryTitle = $result6->metadata->title ?? 'Untitled';
+$summaryAuthors = 'Unknown';
+if (isset($result6->metadata->authors)) {
+    $summaryAuthors = implode(', ', $result6->metadata->authors);
+}
+// Use the metadata page count, or the number of extracted pages without one.
+$summarySlides = $result6->metadata->pageCount ?? count($result6->pages ?? []);
+$summaryTextLength = strlen($result6->content);
+$summaryTables = count($result6->tables);
+
+echo "Presentation Summary:\n";
+echo "====================\n";
+echo "Title: ", $summaryTitle, "\n";
+echo "Author: ", $summaryAuthors, "\n";
+echo "Total Slides: ", $summarySlides, "\n";
+echo "Total Text: ", $summaryTextLength, " characters\n";
+echo "Tables: ", $summaryTables, "\n";
+
+if ($result6->pages !== null) {
+    echo "\nSlide Breakdown:\n";
+    foreach ($result6->pages as $page) {
+        $wordCount = str_word_count($page->content);
+        $pageTables = count($page->tables);
+        echo "- Slide {$page->pageNumber}: {$wordCount} words, ", $pageTables, " tables\n";
+    }
+}
+
+echo "\n";
+
+echo "Example 7: Search Content in Slides\n";
+echo "===================================\n";
+
+$config7 = new ExtractionConfig(page: new PageConfig(extractPages: true));
+$extractor7 = new Kreuzberg($config7);
+$result7 = $extractor7->extractFile('presentation.pptx');
+
+$searchTerm = "revenue";
+
+if ($result7->pages !== null) {
+    echo "Searching for '{$searchTerm}':\n\n";
+
+    foreach ($result7->pages as $page) {
+        // Case-insensitive search; only the first hit on each slide is shown.
+        $pos = stripos($page->content, $searchTerm);
+        if ($pos === false) {
+            continue;
+        }
+
+        echo "Found in Slide {$page->pageNumber}:\n";
+
+        // Show 150 bytes of context, starting up to 50 bytes before the hit.
+        $contextStart = max(0, $pos - 50);
+        $context = substr($page->content, $contextStart, 150);
+        echo "- Context: ...{$context}...\n\n";
+    }
+}
+
+// Static reference text: the format list printed by this example.
+echo implode('', [
+    "\nSupported PowerPoint Formats:\n",
+    "=============================\n",
+    "- .pptx (PowerPoint 2007+)\n",
+    "- .ppt (PowerPoint 97-2003)\n",
+    "- .pptm (Macro-enabled)\n",
+    "- .potx (Template)\n",
+]);
+
+// Static reference text: a recap of the techniques shown in the examples.
+echo implode('', [
+    "\n\nBest Practices:\n",
+    "===============\n",
+    "1. Use page extraction to process individual slides\n",
+    "2. Extract images for visual content analysis\n",
+    "3. Extract tables for data analysis\n",
+    "4. Use metadata for presentation information\n",
+    "5. Convert to Markdown for documentation\n",
+    "6. Search across slides for specific content\n",
+    "7. Generate summaries for presentation overviews\n",
+]);
+```
--- a/docs/snippets/php/extraction/table_extraction.php
+++ b/docs/snippets/php/extraction/table_extraction.php
@@ -0,0 +1,217 @@
+```php title="table_extraction.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Table Extraction and Processing
+ *
+ * Extract tables from PDFs and other documents, process them,
+ * and export to various formats (CSV, JSON, HTML).
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\OcrConfig;
+use Kreuzberg\Config\TesseractConfig;
+
+// Extract with table extraction enabled.
+$config = new ExtractionConfig(extractTables: true);
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('financial_report.pdf');
+
+echo "Table Extraction:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Tables found: " . count($result->tables) . "\n\n";
+
+foreach ($result->tables as $index => $table) {
+    echo "Table " . ($index + 1) . " (Page {$table->pageNumber}):\n";
+    echo str_repeat('-', 60) . "\n";
+
+    echo "Markdown:\n";
+    echo $table->markdown . "\n\n";
+
+    // The column count is taken from the first row; an empty table reports 0.
+    $rowCount = count($table->cells);
+    $columnCount = count($table->cells[0] ?? []);
+
+    echo "Array format:\n";
+    echo "Rows: ", $rowCount, "\n";
+    echo "Columns: ", $columnCount, "\n\n";
+
+    // Render as HTML: the first row becomes header cells, the rest data cells.
+    echo "HTML:\n";
+    echo "<table>\n";
+    foreach ($table->cells as $rowIndex => $row) {
+        $tag = 'td';
+        if ($rowIndex === 0) {
+            $tag = 'th';
+        }
+
+        echo "  <tr>\n";
+        foreach ($row as $cell) {
+            $escaped = htmlspecialchars($cell);
+            echo "    <$tag>", $escaped, "</$tag>\n";
+        }
+        echo "  </tr>\n";
+    }
+    echo "</table>\n\n";
+}
+
+// Write every table to its own CSV file named after its position and page.
+foreach ($result->tables as $index => $table) {
+    $filename = "table_" . ($index + 1) . "_page_" . $table->pageNumber . ".csv";
+    $fp = fopen($filename, 'w');
+    if ($fp === false) {
+        // fopen() returns false on failure; handing that to fputcsv() would
+        // raise a TypeError, so report the problem and move to the next table.
+        echo "Could not open $filename for writing\n";
+        continue;
+    }
+
+    try {
+        foreach ($table->cells as $row) {
+            fputcsv($fp, $row);
+        }
+    } finally {
+        // Release the handle even if writing a row fails.
+        fclose($fp);
+    }
+
+    echo "Exported to: $filename\n";
+}
+echo "\n";
+
+// Build the OCR configuration from the inside out: Tesseract options first,
+// then the OCR backend settings, then the extraction configuration.
+$tesseractOptions = new TesseractConfig(
+    enableTableDetection: true,
+    psm: 6,
+);
+$ocrOptions = new OcrConfig(
+    backend: 'tesseract',
+    language: 'eng',
+    tesseractConfig: $tesseractOptions,
+);
+$ocrConfig = new ExtractionConfig(
+    extractTables: true,
+    ocr: $ocrOptions,
+);
+
+$kreuzberg = new Kreuzberg($ocrConfig);
+$result = $kreuzberg->extractFile('scanned_table.pdf');
+
+$ocrTableCount = count($result->tables);
+
+echo "OCR Table Extraction:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Tables with OCR: ", $ocrTableCount, "\n\n";
+
+/**
+ * Turn a grid of cells into a list of rows keyed by column header.
+ *
+ * The first row supplies the keys. Each remaining row becomes one map; a cell
+ * missing from a short row is filled with an empty string.
+ *
+ * @param array $cells Rows of cells, header row first.
+ * @return array One header => value map per data row; empty without data rows.
+ */
+function processTable(array $cells): array
+{
+    $headerRow = array_shift($cells);
+
+    $toRecord = static function ($row) use ($headerRow): array {
+        $record = [];
+        foreach ($headerRow as $position => $columnName) {
+            $record[$columnName] = $row[$position] ?? '';
+        }
+        return $record;
+    };
+
+    // array_values() re-indexes the result from 0.
+    return array_values(array_map($toRecord, $cells));
+}
+
+// Print every table as pretty-printed JSON keyed by column header.
+foreach ($result->tables as $table) {
+    $structured = processTable($table->cells);
+    $json = json_encode($structured, JSON_PRETTY_PRINT);
+
+    echo "Structured table data:\n";
+    echo $json . "\n\n";
+}
+
+/**
+ * Select the tables that contain a keyword in at least one cell.
+ *
+ * Matching is case-insensitive. A matching table appears once in the result,
+ * however many of its cells match.
+ *
+ * @param array  $tables  Objects exposing a `cells` property (rows of cells).
+ * @param string $keyword Text to look for inside the cells.
+ * @return array The matching tables, in input order, re-indexed from 0.
+ */
+function findTablesWithKeyword(array $tables, string $keyword): array
+{
+    $containsKeyword = static function ($table) use ($keyword): bool {
+        foreach ($table->cells as $row) {
+            foreach ($row as $cell) {
+                if (stripos($cell, $keyword) !== false) {
+                    // The first hit settles it for this table.
+                    return true;
+                }
+            }
+        }
+
+        return false;
+    };
+
+    // array_filter() keeps the input keys; re-index to get a plain list.
+    return array_values(array_filter($tables, $containsKeyword));
+}
+
+// Count the extracted tables that mention "sales" in any cell.
+$salesTables = findTablesWithKeyword($result->tables, 'sales');
+$salesTableCount = count($salesTables);
+echo "Tables containing 'sales': ", $salesTableCount, "\n";
+
+/**
+ * Convert a table into a list of rows keyed by the table's header row.
+ *
+ * The first row supplies the keys. A cell missing from a short row becomes
+ * null. A table without cells gives an empty list.
+ *
+ * @param \Kreuzberg\Types\Table $table Table whose `cells` rows are converted.
+ * @return array One header => value map per data row.
+ */
+function tableToAssociativeArray(\Kreuzberg\Types\Table $table): array
+{
+    $rows = $table->cells;
+    if (empty($rows)) {
+        return [];
+    }
+
+    $headerRow = array_shift($rows);
+
+    $toRecord = static function ($row) use ($headerRow): array {
+        $record = [];
+        foreach ($headerRow as $position => $columnName) {
+            $record[$columnName] = $row[$position] ?? null;
+        }
+        return $record;
+    };
+
+    // array_values() re-indexes the result from 0.
+    return array_values(array_map($toRecord, $rows));
+}
+
+$result = $kreuzberg->extractFile('quarterly_report.pdf');
+
+foreach ($result->tables as $index => $table) {
+    $data = tableToAssociativeArray($table);
+
+    echo "\nTable " . ($index + 1) . " data:\n";
+
+    // Sum the numeric cells of each column; non-numeric cells are skipped.
+    $totals = [];
+    foreach ($data as $row) {
+        foreach ($row as $key => $value) {
+            if (!is_numeric($value)) {
+                continue;
+            }
+            $totals[$key] = ($totals[$key] ?? 0) + floatval($value);
+        }
+    }
+
+    // Tables without any numeric cell print no totals section.
+    if (empty($totals)) {
+        continue;
+    }
+
+    echo "Column totals:\n";
+    foreach ($totals as $column => $total) {
+        $formatted = number_format($total, 2);
+        echo "  $column: ", $formatted, "\n";
+    }
+}
+
+// Describe every table (page, dimensions, keyed rows, Markdown) for JSON export.
+$allTablesJson = array_map(
+    static fn ($table) => [
+        'page' => $table->pageNumber,
+        'rows' => count($table->cells),
+        'columns' => count($table->cells[0] ?? []),
+        'data' => tableToAssociativeArray($table),
+        'markdown' => $table->markdown,
+    ],
+    $result->tables
+);
+
+$encodedTables = json_encode($allTablesJson, JSON_PRETTY_PRINT);
+file_put_contents('tables.json', $encodedTables);
+echo "\nAll tables exported to: tables.json\n";
+
+/**
+ * Concatenate the data rows of several tables under the first table's headers.
+ *
+ * Each table's first row is treated as its header row and left out of the
+ * data. Headers of later tables are not compared against the first table's.
+ *
+ * @param array $tables Objects exposing a `cells` property (rows of cells).
+ * @return array Empty array for empty input; otherwise
+ *               ['headers' => first table's first row, 'data' => all data rows].
+ */
+function mergeTables(array $tables): array
+{
+    if (empty($tables)) {
+        return [];
+    }
+
+    // Take the first element by position rather than key 0: arrays produced
+    // by e.g. array_filter() keep their original keys and may not start at 0,
+    // in which case `$tables[0]` would silently give empty headers.
+    $firstTable = $tables[array_key_first($tables)];
+    $headers = $firstTable->cells[0] ?? [];
+
+    $merged = [];
+    foreach ($tables as $table) {
+        $cells = $table->cells;
+        // Drop this table's header row from the local copy.
+        array_shift($cells);
+
+        foreach ($cells as $row) {
+            $merged[] = $row;
+        }
+    }
+
+    return ['headers' => $headers, 'data' => $merged];
+}
+
+// Merge every table that mentions "Quarter" and report the combined size.
+$reportTables = findTablesWithKeyword($result->tables, 'Quarter');
+if ($reportTables !== []) {
+    $merged = mergeTables($reportTables);
+    $mergedTableCount = count($reportTables);
+    $mergedRowCount = count($merged['data']);
+
+    echo "\nMerged ", $mergedTableCount, " tables\n";
+    echo "Total rows: ", $mergedRowCount, "\n";
+}
+```