Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,154 @@
```php title="batch_processing.php"
<?php
declare(strict_types=1);
/**
* Batch Document Processing
*
* Process multiple documents in parallel for maximum performance.
* Kreuzberg's batch API uses multiple threads to extract documents concurrently.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use function Kreuzberg\batch_extract_files;
use function Kreuzberg\batch_extract_bytes;
// Candidate inputs; anything not present on disk is dropped below.
$files = [
    'document1.pdf',
    'document2.docx',
    'document3.xlsx',
    'presentation.pptx',
];
// array_filter() keeps the original keys, so a missing file would leave a gap
// (e.g. keys 0, 2, 3). Reindex so $files[$index] in the report loop lines up.
// NOTE(review): assumes batch_extract_files() returns a 0-based list in input order — confirm.
$files = array_values(array_filter($files, 'file_exists'));
if ($files !== []) {
    $fileCount = count($files);
    echo "Processing " . $fileCount . " files in batch...\n\n";
    $start = microtime(true);
    $results = batch_extract_files($files);
    $elapsed = microtime(true) - $start;
    echo "Batch extraction completed in " . number_format($elapsed, 3) . " seconds\n";
    echo "Average: " . number_format($elapsed / $fileCount, 3) . " seconds per file\n\n";
    foreach ($results as $index => $result) {
        $filename = basename($files[$index]);
        echo "$filename:\n";
        echo " Content: " . strlen($result->content) . " chars\n";
        echo " Tables: " . count($result->tables) . "\n";
        echo " MIME: " . $result->mimeType . "\n\n";
    }
}
// Shared extractor for the rest of the script: tables on, embedded images off.
$config = new ExtractionConfig(extractTables: true, extractImages: false);
$kreuzberg = new Kreuzberg($config);

// Batch every PDF in the working directory and report timing plus totals.
$pdfFiles = glob('*.pdf');
if (!empty($pdfFiles)) {
    $pdfCount = count($pdfFiles);
    echo "Processing " . $pdfCount . " PDF files...\n";

    $start = microtime(true);
    $results = $kreuzberg->batchExtractFiles($pdfFiles, $config);
    $elapsed = microtime(true) - $start;

    echo "Completed in " . number_format($elapsed, 2) . " seconds\n";
    echo "Throughput: " . number_format($pdfCount / $elapsed, 2) . " files/second\n\n";

    // Accumulate content size (bytes, via strlen) and table count over all results.
    [$totalChars, $totalTables] = [0, 0];
    foreach ($results as $result) {
        $totalTables += count($result->tables);
        $totalChars += strlen($result->content);
    }

    echo "Total content: " . number_format($totalChars) . " characters\n";
    echo "Total tables: $totalTables\n";
}
// In-memory batch: read each source into a string and pair it with its MIME type.
$uploadSources = [
    'file1.pdf' => 'application/pdf',
    'file2.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
];
$uploadedFiles = [];
foreach ($uploadSources as $path => $mime) {
    // file_get_contents() returns false (with a warning) when the file cannot
    // be read; skip such entries instead of handing a non-string payload on.
    $data = is_readable($path) ? file_get_contents($path) : false;
    if ($data === false) {
        echo "Skipping unreadable file: $path\n";
        continue;
    }
    $uploadedFiles[] = ['data' => $data, 'mime' => $mime];
}
$dataList = array_column($uploadedFiles, 'data');
$mimeTypes = array_column($uploadedFiles, 'mime');
// Nothing readable means nothing to extract; avoid calling the batch API with empty input.
$results = $dataList === [] ? [] : batch_extract_bytes($dataList, $mimeTypes);
echo "\nProcessed " . count($results) . " files from memory\n";
/**
 * Recursively extract every supported document beneath a directory.
 *
 * Files are selected by extension (pdf, docx, xlsx, pptx, txt; case-insensitive)
 * and handed to the extractor in fixed-size batches. A progress line is echoed
 * before each batch.
 *
 * @param string    $dir       Root directory to scan.
 * @param Kreuzberg $kreuzberg Extractor whose batchExtractFiles() is called per batch.
 * @param int       $batchSize Files per batch; values below 1 are treated as 1.
 *
 * @return array All batch results concatenated with array_merge(); empty when
 *               no supported file is found.
 */
function processDirectory(string $dir, Kreuzberg $kreuzberg, int $batchSize = 10): array
{
    $supportedExtensions = ['pdf', 'docx', 'xlsx', 'pptx', 'txt'];

    $iterator = new RecursiveIteratorIterator(
        new RecursiveDirectoryIterator($dir)
    );
    $files = [];
    foreach ($iterator as $file) {
        if (!$file->isFile()) {
            continue;
        }
        if (in_array(strtolower($file->getExtension()), $supportedExtensions, true)) {
            $files[] = $file->getPathname();
        }
    }
    if ($files === []) {
        return [];
    }

    $batches = array_chunk($files, max(1, $batchSize));
    $batchCount = count($batches);

    // Collect per-batch results and merge once at the end; merging inside the
    // loop re-copies the growing accumulator on every iteration.
    $perBatch = [];
    foreach ($batches as $batchIndex => $batch) {
        echo "Processing batch " . ($batchIndex + 1) . "/" . $batchCount . "...\n";
        $perBatch[] = $kreuzberg->batchExtractFiles($batch);
    }

    return array_merge(...$perBatch);
}
// Walk ./documents (when present) through processDirectory() with the shared extractor.
$directory = './documents';
if (is_dir($directory)) {
    echo "\nProcessing directory: $directory\n";
    $results = processDirectory($directory, $kreuzberg);
    $processedCount = count($results);
    echo "Processed " . $processedCount . " files\n";
}

// Batch that includes a path named 'nonexistent.pdf'; a KreuzbergException
// raised by the call is reported instead of aborting the script.
$mixedFiles = ['valid.pdf', 'nonexistent.pdf', 'another.docx'];
try {
    $results = batch_extract_files($mixedFiles);
} catch (\Kreuzberg\Exceptions\KreuzbergException $e) {
    $reason = $e->getMessage();
    echo "Batch processing error: " . $reason . "\n";
}
// glob() returns false on error; fall back to an empty list so array_chunk()
// and count() below always receive an array.
$allFiles = glob('documents/*.{pdf,docx,xlsx}', GLOB_BRACE) ?: [];
$batchSize = 5;
$batches = array_chunk($allFiles, $batchSize);
$batchCount = count($batches);
$totalProcessed = 0;
echo "\nProcessing " . count($allFiles) . " files in " . $batchCount . " batches...\n";
foreach ($batches as $index => $batch) {
    $completed = $index + 1;
    $progress = ($completed / $batchCount) * 100;
    // "\r" returns to the start of the line so the progress display is rewritten in place.
    echo sprintf("\rProgress: %.1f%% [%d/%d batches]",
        $progress, $completed, $batchCount);
    $results = $kreuzberg->batchExtractFiles($batch);
    $totalProcessed += count($results);
}
echo "\n\nCompleted! Processed $totalProcessed files.\n";
```

View File

@@ -0,0 +1,118 @@
```php title="docx_extraction.php"
<?php
declare(strict_types=1);
/**
* DOCX (Word) Document Extraction
*
* Extract text, tables, and metadata from Microsoft Word documents.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use function Kreuzberg\extract_file;
// Extract with the procedural API and print the content followed by core metadata.
$result = extract_file('document.docx');
$divider = str_repeat('=', 60) . "\n";

echo "Word Document Extraction:\n", $divider;
echo "Content:\n", $result->content . "\n\n";

echo "Document Metadata:\n", $divider;
// Authors is a list when present; every other field falls back to 'N/A' via ??.
$authorText = isset($result->metadata->authors)
    ? implode(', ', $result->metadata->authors)
    : 'N/A';
$keywordText = implode(', ', $result->metadata->keywords ?? []);
echo "Title: " . ($result->metadata->title ?? 'N/A') . "\n";
echo "Authors: " . $authorText . "\n";
echo "Created: " . ($result->metadata->createdAt ?? 'N/A') . "\n";
echo "Modified: " . ($result->metadata->modifiedAt ?? 'N/A') . "\n";
echo "Subject: " . ($result->metadata->subject ?? 'N/A') . "\n";
echo "Keywords: " . $keywordText . "\n\n";
// Re-extract with table detection and formatting preservation enabled.
$config = new ExtractionConfig(extractTables: true, preserveFormatting: true);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('report.docx');

$rowRule = str_repeat('-', 60) . "\n";
foreach ($result->tables as $index => $table) {
    echo "Table " . ($index + 1) . ":\n", $rowRule;
    foreach ($table->cells as $rowIndex => $row) {
        echo implode(' | ', $row) . "\n";
        // A rule is printed under the first row only, setting it apart as a header.
        echo $rowIndex === 0 ? $rowRule : '';
    }
    echo "\n";
}
// Output label => outputFormat value (null leaves the option unset).
$conversions = ['plain' => null, 'markdown' => 'markdown'];
foreach ($conversions as $name => $format) {
    // Formatting is preserved exactly when an explicit output format is requested.
    $keepFormatting = $format !== null;
    $config = new ExtractionConfig(
        outputFormat: $format,
        preserveFormatting: $keepFormatting
    );
    $kreuzberg = new Kreuzberg($config);
    $result = $kreuzberg->extractFile('document.docx');

    $outputFile = "output_$name.txt";
    file_put_contents($outputFile, $result->content);
    echo "Saved $name format to: $outputFile\n";
}
use function Kreuzberg\batch_extract_files;
// Batch every .docx in the working directory and print a short per-file summary.
$docxFiles = glob('*.docx');
if (!empty($docxFiles)) {
    $docxCount = count($docxFiles);
    echo "\nBatch processing " . $docxCount . " DOCX files...\n";
    $results = batch_extract_files($docxFiles);
    foreach ($results as $index => $result) {
        // Result positions are used as indexes into the input list.
        $filename = basename($docxFiles[$index]);
        $authorText = isset($result->metadata->authors)
            ? implode(', ', $result->metadata->authors)
            : 'Unknown';
        echo "\n$filename:\n";
        echo " Characters: " . strlen($result->content) . "\n";
        echo " Tables: " . count($result->tables) . "\n";
        echo " Authors: " . $authorText . "\n";
    }
}
// Print creator/producer details only when the metadata carries non-empty values.
$result = extract_file('reviewed_document.docx');
$hasCreator = !empty($result->metadata->createdBy);
$hasProducer = !empty($result->metadata->producer);
if ($hasCreator) {
    echo "\nDocument Information:\n";
    echo "Created by: " . $result->metadata->createdBy . "\n";
}
if ($hasProducer) {
    echo "Producer: " . $result->metadata->producer . "\n";
}
// Compute simple text statistics for the extracted content.
$result = extract_file('document.docx');
$content = $result->content;

// Trailing line breaks do not start another line/paragraph, so strip them
// before counting separators.
$body = rtrim($content, "\r\n");
$paragraphs = $body === ''
    ? []
    : (preg_split('/\n\s*\n/', $body, -1, PREG_SPLIT_NO_EMPTY) ?: []);

$stats = [
    'characters' => mb_strlen($content),
    // Unicode-aware word match (letters plus inner apostrophes/hyphens);
    // str_word_count() only understands single-byte locale letters.
    // preg_match_all() returns false on error (e.g. invalid UTF-8); cast to 0.
    'words' => (int) preg_match_all('/\p{L}[\p{L}\'-]*/u', $content),
    // N separators delimit N + 1 lines: "a\nb" is two lines, not one.
    'lines' => $body === '' ? 0 : substr_count($body, "\n") + 1,
    'paragraphs' => count($paragraphs),
    // Heuristic: each run of terminal punctuation counts as one sentence end.
    'sentences' => (int) preg_match_all('/[.!?]+/', $content),
];

echo "\nDocument Statistics:\n";
echo str_repeat('=', 60) . "\n";
foreach ($stats as $metric => $value) {
    echo ucfirst($metric) . ": " . number_format($value) . "\n";
}
```

View File

@@ -0,0 +1,288 @@
```php title="excel_extraction.php"
<?php
declare(strict_types=1);
/**
* Excel Spreadsheet Extraction
*
* This example demonstrates extracting content from Excel files (.xlsx, .xls).
* Excel spreadsheets are automatically converted to tables and text.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
// Example 1: default extraction of a workbook, then content and basic metadata.
echo "Example 1: Basic Excel Extraction\n",
    "=================================\n";
$kreuzberg = new Kreuzberg();
$result = $kreuzberg->extractFile('financial_report.xlsx');

echo "Content:\n", $result->content . "\n\n";

$authorText = isset($result->metadata->authors)
    ? implode(', ', $result->metadata->authors)
    : 'N/A';
echo "Metadata:\n";
echo "- Title: " . ($result->metadata->title ?? 'N/A') . "\n";
echo "- Author: " . $authorText . "\n";
echo "- Created: " . ($result->metadata->createdAt ?? 'N/A') . "\n";
echo "- Modified: " . ($result->metadata->modifiedAt ?? 'N/A') . "\n\n";
// Example 2: extract with tables enabled and describe each detected table.
echo "Example 2: Extract Excel Tables\n",
    "===============================\n";
$config2 = new ExtractionConfig(extractTables: true);
$result2 = (new Kreuzberg($config2))->extractFile('data.xlsx');

$tableTotal = count($result2->tables);
if ($tableTotal > 0) {
    echo "Found " . $tableTotal . " table(s)\n\n";
    foreach ($result2->tables as $i => $table) {
        $rowTotal = count($table->cells);
        // Column count is taken from the first row; an empty table has none.
        $columnTotal = $rowTotal > 0 ? count($table->cells[0]) : 0;

        echo "Table " . ($i + 1) . " (Sheet/Page {$table->pageNumber}):\n";
        echo $table->markdown . "\n\n";
        echo "Raw data:\n";
        echo "Rows: " . $rowTotal . "\n";
        echo "Columns: " . $columnTotal . "\n\n";
    }
}
// Example 3: write each extracted table to its own CSV file.
echo "Example 3: Convert Excel to CSV\n";
echo "===============================\n";
$result3 = $kreuzberg->extractFile('spreadsheet.xlsx');
foreach ($result3->tables as $i => $table) {
    $csvFilename = "sheet_{$i}.csv";
    // fopen() returns false when the file cannot be created; passing that to
    // fputcsv()/fclose() would raise a TypeError, so skip the table instead.
    $fp = fopen($csvFilename, 'w');
    if ($fp === false) {
        echo "Could not open {$csvFilename} for writing\n";
        continue;
    }
    try {
        foreach ($table->cells as $row) {
            // Separator, enclosure and escape are spelled out with their default
            // values; relying on the default escape is deprecated as of PHP 8.4.
            fputcsv($fp, $row, ',', '"', '\\');
        }
    } finally {
        // Release the handle even if a row fails to serialize.
        fclose($fp);
    }
    echo "Saved: {$csvFilename}\n";
}
echo "\n";
// Example 4: turn each table into a list of header => value records and save as JSON.
echo "Example 4: Convert Excel to JSON\n";
echo "================================\n";
$result4 = $kreuzberg->extractFile('data.xlsx');
foreach ($result4->tables as $i => $table) {
    $jsonData = [];
    if (count($table->cells) > 0) {
        // The first row supplies the record keys; remaining rows are data.
        $headers = $table->cells[0];
        foreach (array_slice($table->cells, 1) as $row) {
            $rowData = [];
            foreach ($headers as $k => $header) {
                // Short rows are padded with empty strings.
                $rowData[$header] = $row[$k] ?? '';
            }
            $jsonData[] = $rowData;
        }
    }
    $jsonFilename = "sheet_{$i}.json";
    try {
        // Without JSON_THROW_ON_ERROR a failed encode returns false, which
        // file_put_contents() rejects with a TypeError under strict_types.
        $json = json_encode($jsonData, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR);
    } catch (\JsonException $e) {
        echo "Skipped {$jsonFilename}: {$e->getMessage()}\n";
        continue;
    }
    file_put_contents($jsonFilename, $json);
    echo "Saved: {$jsonFilename}\n";
}
echo "\n";
// Example 5: per-sheet dimensions, numeric-column detection, and a sample column sum.
echo "Example 5: Process Multiple Sheets\n";
echo "==================================\n";
$result5 = $kreuzberg->extractFile('multi_sheet_workbook.xlsx');
echo "Total sheets/tables: " . count($result5->tables) . "\n\n";
foreach ($result5->tables as $i => $table) {
    $rows = $table->cells;
    $rowCount = count($rows);
    $columnCount = $rowCount > 0 ? count($rows[0]) : 0;

    echo "Sheet " . ($i + 1) . ":\n";
    echo "- Rows: " . $rowCount . "\n";
    echo "- Columns: " . $columnCount . "\n";

    // Row 0 is treated as the header, so analysis needs at least one data row.
    if ($rowCount > 1) {
        $numericColumns = [];
        for ($col = 0; $col < $columnCount; $col++) {
            $isNumeric = true;
            $numericCells = 0;
            for ($row = 1; $row < $rowCount; $row++) {
                $value = trim((string) ($rows[$row][$col] ?? ''));
                if ($value === '') {
                    // Blank cells neither qualify nor disqualify a column.
                    continue;
                }
                if (!is_numeric($value)) {
                    $isNumeric = false;
                    break;
                }
                $numericCells++;
            }
            // Require at least one actual number: a column of blanks only is
            // not numeric and must not be picked for the sum below.
            if ($isNumeric && $numericCells > 0) {
                $numericColumns[] = $col;
            }
        }
        if ($numericColumns !== []) {
            echo "- Numeric columns: " . count($numericColumns) . "\n";
            // Sum the first numeric column as a demonstration.
            $col = $numericColumns[0];
            $sum = 0;
            for ($row = 1; $row < $rowCount; $row++) {
                $sum += (float) trim((string) ($rows[$row][$col] ?? '0'));
            }
            $columnName = $rows[0][$col] ?? "Column {$col}";
            echo "- Sum of '{$columnName}': {$sum}\n";
        }
    }
    echo "\n";
}
// Example 6: inspect the header row, first data row and one cell of the first table.
echo "Example 6: Extract Specific Data\n",
    "================================\n";
$result6 = $kreuzberg->extractFile('budget.xlsx');
if (count($result6->tables) > 0) {
    $table = $result6->tables[0];
    $rowTotal = count($table->cells);

    echo "Header row:\n";
    if ($rowTotal > 0) {
        print_r($table->cells[0]);
    }

    echo "\nFirst data row:\n";
    if ($rowTotal > 1) {
        print_r($table->cells[1]);
        // The third cell is shown only when that row has more than two cells.
        if (count($table->cells[1]) > 2) {
            $cellValue = $table->cells[1][2];
            echo "\nCell [1][2]: {$cellValue}\n";
        }
    }
}
echo "\n";
// Example 7: batch-extract several workbooks and total their table counts.
echo "Example 7: Batch Process Excel Files\n",
    "====================================\n";
$excelFiles = ['january_sales.xlsx', 'february_sales.xlsx', 'march_sales.xlsx'];
$results = $kreuzberg->batchExtractFiles($excelFiles);

$totalSheets = 0;
foreach ($results as $i => $result) {
    $sheetCount = count($result->tables);
    $textLength = strlen($result->content);
    echo "{$excelFiles[$i]}:\n";
    echo "- Sheets: {$sheetCount}\n";
    echo "- Text length: " . $textLength . " characters\n\n";
    $totalSheets += $sheetCount;
}
echo "Total sheets across all files: {$totalSheets}\n\n";
// Example 8: render each table as a standalone HTML <table> file.
echo "Example 8: Convert Excel to HTML\n",
    "================================\n";
$result8 = $kreuzberg->extractFile('report.xlsx');
foreach ($result8->tables as $i => $table) {
    $html = "<table border='1'>\n";
    foreach ($table->cells as $rowIndex => $row) {
        // The first row is emitted as header cells, all others as data cells.
        if ($rowIndex === 0) {
            $tag = 'th';
        } else {
            $tag = 'td';
        }
        $cellsHtml = '';
        foreach ($row as $cell) {
            $escapedCell = htmlspecialchars($cell);
            $cellsHtml .= " <{$tag}>{$escapedCell}</{$tag}>\n";
        }
        $html .= " <tr>\n" . $cellsHtml . " </tr>\n";
    }
    $html .= "</table>\n";

    $htmlFilename = "sheet_{$i}.html";
    file_put_contents($htmlFilename, $html);
    echo "Saved: {$htmlFilename}\n";
}
echo "\n";
// Example 9: print the workbook's standard metadata and any custom properties.
echo "Example 9: Excel Metadata Extraction\n",
    "====================================\n";
$result9 = $kreuzberg->extractFile('workbook.xlsx');

// List-valued fields are joined with ', ' when set, otherwise shown as 'N/A'.
$authorText = isset($result9->metadata->authors)
    ? implode(', ', $result9->metadata->authors)
    : 'N/A';
$keywordText = isset($result9->metadata->keywords)
    ? implode(', ', $result9->metadata->keywords)
    : 'N/A';

echo "File Metadata:\n";
echo "- Title: " . ($result9->metadata->title ?? 'N/A') . "\n";
echo "- Subject: " . ($result9->metadata->subject ?? 'N/A') . "\n";
echo "- Authors: " . $authorText . "\n";
echo "- Created: " . ($result9->metadata->createdAt ?? 'N/A') . "\n";
echo "- Modified: " . ($result9->metadata->modifiedAt ?? 'N/A') . "\n";
echo "- Created By: " . ($result9->metadata->createdBy ?? 'N/A') . "\n";
echo "- Keywords: " . $keywordText . "\n";

if (!empty($result9->metadata->custom)) {
    echo "\nCustom Properties:\n";
    foreach ($result9->metadata->custom as $key => $value) {
        echo "- {$key}: {$value}\n";
    }
}
echo "\n";
// Example 10: report a KreuzbergException from extraction instead of aborting.
echo "Example 10: Error Handling\n",
    "=========================\n";
use Kreuzberg\Exceptions\KreuzbergException;
try {
    $result = $kreuzberg->extractFile('protected.xlsx');
    $sheetTotal = count($result->tables);
    echo "Success: Extracted " . $sheetTotal . " sheets\n";
} catch (KreuzbergException $e) {
    echo "Error: {$e->getMessage()}\n",
        "Note: Password-protected files may require special handling\n";
}
// Static reference output: the format list, then the usage tips.
echo "\n\nSupported Excel Formats:\n",
    "========================\n",
    "- .xlsx (Office Open XML)\n",
    "- .xls (Legacy Excel format)\n",
    "- .xlsm (Macro-enabled)\n",
    "- .xlsb (Binary workbook)\n",
    "- .xltx (Template)\n";

echo "\n\nBest Practices:\n",
    "===============\n",
    "1. Excel tables are automatically detected as Table objects\n",
    "2. Each sheet becomes a separate table\n",
    "3. Use table->cells for programmatic access to cell data\n",
    "4. Use table->markdown for human-readable output\n",
    "5. First row is often headers - handle accordingly\n",
    "6. Check for numeric columns to perform calculations\n",
    "7. Export to CSV/JSON for database import\n",
    "8. Use batch processing for multiple Excel files\n";
```

View File

@@ -0,0 +1,159 @@
```php title="image_extraction.php"
<?php
declare(strict_types=1);
/**
* Image Extraction from Documents
*
* Extract embedded images from PDFs, Office documents, and other formats.
* Optionally perform OCR on extracted images.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ImageExtractionConfig;
use Kreuzberg\Config\OcrConfig;
// Extract embedded images of at least 100x100 pixels from a presentation.
$imageOptions = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 100,
    minHeight: 100
);
$config = new ExtractionConfig(imageExtraction: $imageOptions, extractImages: true);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('presentation.pptx');

echo "Image Extraction Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Images found: " . count($result->images ?? []) . "\n\n";

foreach ($result->images ?? [] as $image) {
    // File name encodes page, image index, dimensions and format.
    $filename = sprintf(
        'extracted_p%d_i%d_%dx%d.%s',
        $image->pageNumber,
        $image->imageIndex,
        $image->width,
        $image->height,
        $image->format
    );
    file_put_contents($filename, $image->data);

    $byteCount = number_format(strlen($image->data));
    echo "Saved: $filename\n";
    echo " Size: {$image->width}x{$image->height} pixels\n";
    echo " Format: {$image->format}\n";
    echo " Data: " . $byteCount . " bytes\n\n";
}
// Extract images of at least 200x100 pixels and run Tesseract OCR (English) on them.
$ocrConfig = new ExtractionConfig(
    imageExtraction: new ImageExtractionConfig(
        extractImages: true,
        performOcr: true,
        minWidth: 200,
        minHeight: 100
    ),
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng'
    )
);
$kreuzberg = new Kreuzberg($ocrConfig);
$result = $kreuzberg->extractFile('scanned_images.pdf');
echo "Images with OCR:\n";
echo str_repeat('=', 60) . "\n";
foreach ($result->images ?? [] as $image) {
    echo "Image {$image->imageIndex} from page {$image->pageNumber}:\n";
    if ($image->ocrResult !== null) {
        $text = $image->ocrResult->content;
        $length = mb_strlen($text);
        // Cut on character boundaries (byte-wise substr() can split a multibyte
        // character) and add the ellipsis only when text was actually dropped.
        $preview = $length > 100 ? mb_substr($text, 0, 100) . "..." : $text;
        echo " OCR Text: " . $preview . "\n";
        echo " OCR Length: " . $length . " chars\n";
    } else {
        echo " No OCR result\n";
    }
    echo "\n";
}
// Same extraction with a 500x500 minimum size, saving each image that is returned.
$largeImageOptions = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 500,
    minHeight: 500
);
$largeImageConfig = new ExtractionConfig(
    imageExtraction: $largeImageOptions,
    extractImages: true
);
$kreuzberg = new Kreuzberg($largeImageConfig);
$result = $kreuzberg->extractFile('photo_album.pdf');

echo "Large images (>500x500):\n";
foreach ($result->images ?? [] as $image) {
    $filename = "large_image_{$image->imageIndex}.{$image->format}";
    file_put_contents($filename, $image->data);
    $dimensions = "{$image->width}x{$image->height}";
    echo "Saved: $filename ($dimensions)\n";
}
// Group extracted images by format, then write each group into its own directory.
// NOTE(review): this reuses the extractor built from the 500x500 config above — confirm that is intended.
$result = $kreuzberg->extractFile('document.pdf');
$imageTypes = [];
foreach ($result->images ?? [] as $image) {
    // Appending to a missing key creates the bucket on first use.
    $imageTypes[$image->format][] = $image;
}

echo "\nImages by format:\n";
foreach ($imageTypes as $format => $images) {
    echo " $format: " . count($images) . " images\n";

    $dir = "images_$format";
    if (!is_dir($dir)) {
        mkdir($dir, 0755, true);
    }
    foreach ($images as $index => $image) {
        file_put_contents("$dir/image_$index.$format", $image->data);
    }
    echo " Saved to: $dir/\n";
}
// Create 200px-wide thumbnails for PNG/JPEG images when the GD extension is available.
if (extension_loaded('gd')) {
    $thumbWidth = 200;
    foreach ($result->images ?? [] as $image) {
        $format = strtolower((string) $image->format);
        $isPng = $format === 'png';
        // Accept both common JPEG spellings; previously 'jpeg' was skipped.
        if (!$isPng && $format !== 'jpg' && $format !== 'jpeg') {
            continue;
        }
        $gdImage = imagecreatefromstring($image->data);
        if ($gdImage === false) {
            continue;
        }
        $width = imagesx($gdImage);
        $height = imagesy($gdImage);
        // Keep the aspect ratio, but never go below 1px: a very wide source
        // truncates to 0, which imagecreatetruecolor() rejects.
        $thumbHeight = max(1, (int)(($height / $width) * $thumbWidth));
        $thumb = imagecreatetruecolor($thumbWidth, $thumbHeight);
        if ($thumb === false) {
            continue;
        }
        if ($isPng) {
            // Preserve the alpha channel; otherwise transparent areas turn black.
            imagealphablending($thumb, false);
            imagesavealpha($thumb, true);
        }
        imagecopyresampled($thumb, $gdImage, 0, 0, 0, 0,
            $thumbWidth, $thumbHeight, $width, $height);
        $thumbFile = "thumb_{$image->imageIndex}.{$image->format}";
        $written = $isPng
            ? imagepng($thumb, $thumbFile)
            : imagejpeg($thumb, $thumbFile, 85);
        // Only claim success when GD reports the file was written.
        if ($written) {
            echo "Created thumbnail: $thumbFile\n";
        }
        // No imagedestroy(): it has had no effect since PHP 8.0, where
        // GdImage objects are released automatically.
    }
}
```

View File

@@ -0,0 +1,196 @@
```php title="metadata_extraction.php"
<?php
declare(strict_types=1);
/**
* Metadata Extraction
*
* Extract and process document metadata including title, author,
* creation date, keywords, and custom properties.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use function Kreuzberg\extract_file;
// Extract one document and print every metadata field, with 'N/A' for unset scalars.
$result = extract_file('document.pdf');
$metadata = $result->metadata;
$divider = str_repeat('=', 60) . "\n";

$authorText = isset($metadata->authors) ? implode(', ', $metadata->authors) : 'N/A';
$keywordText = implode(', ', $metadata->keywords ?? []);

echo "Document Metadata:\n", $divider;
echo "Title: " . ($metadata->title ?? 'N/A') . "\n";
echo "Authors: " . $authorText . "\n";
echo "Subject: " . ($metadata->subject ?? 'N/A') . "\n";
echo "Created By: " . ($metadata->createdBy ?? 'N/A') . "\n";
echo "Producer: " . ($metadata->producer ?? 'N/A') . "\n";
echo "Created: " . ($metadata->createdAt ?? 'N/A') . "\n";
echo "Modified: " . ($metadata->modifiedAt ?? 'N/A') . "\n";
echo "Page Count: " . ($metadata->pageCount ?? 'N/A') . "\n";
echo "Keywords: " . $keywordText . "\n";
echo "Language: " . ($metadata->language ?? 'N/A') . "\n\n";
// Build one summary record per document found under documents/.
// glob() returns false on error; use an empty list so foreach gets an array.
$files = glob('documents/*.{pdf,docx,xlsx}', GLOB_BRACE) ?: [];
$metadataCollection = [];
foreach ($files as $file) {
    $result = extract_file($file);
    $metadataCollection[] = [
        'file' => basename($file),
        'title' => $result->metadata->title ?? 'Untitled',
        'author' => isset($result->metadata->authors) ? implode(', ', $result->metadata->authors) : 'Unknown',
        'created' => $result->metadata->createdAt ?? 'Unknown',
        'pages' => $result->metadata->pageCount ?? 0,
        // filesize() returns false on failure; record 0 so later size
        // arithmetic always sees an integer.
        'size' => filesize($file) ?: 0,
    ];
}
// Print every collected record; size is shown in KB with two decimals.
echo "Metadata Collection:\n", str_repeat('=', 60) . "\n";
foreach ($metadataCollection as $meta) {
    $sizeKb = number_format($meta['size'] / 1024, 2);
    echo "{$meta['file']}:\n",
        " Title: {$meta['title']}\n",
        " Author: {$meta['author']}\n",
        " Created: {$meta['created']}\n",
        " Pages: {$meta['pages']}\n",
        " Size: " . $sizeKb . " KB\n\n";
}
/**
 * Select records whose 'author' field contains the given text, ignoring case.
 *
 * @param array  $collection Records with an 'author' string entry.
 * @param string $author     Substring to look for.
 *
 * @return array Matching records, with their original keys preserved.
 */
function searchByAuthor(array $collection, string $author): array
{
    return array_filter(
        $collection,
        static fn ($entry): bool => stripos($entry['author'], $author) !== false
    );
}
/**
 * Select records whose creation date falls within an inclusive date range.
 *
 * Only the first ten characters of 'created' are compared, as plain strings,
 * against the bounds. Records marked 'Unknown' never match.
 *
 * @param array  $collection Records with a 'created' string entry.
 * @param string $start      Lower bound (the call sites use YYYY-MM-DD).
 * @param string $end        Upper bound (same format).
 *
 * @return array Matching records, with their original keys preserved.
 */
function searchByDateRange(array $collection, string $start, string $end): array
{
    $inRange = static function ($entry) use ($start, $end): bool {
        if ($entry['created'] === 'Unknown') {
            return false;
        }
        $day = substr($entry['created'], 0, 10);

        return !($day < $start || $day > $end);
    };

    return array_filter($collection, $inRange);
}
// Run both searches first, then report how many records each one matched.
$johnDocs = searchByAuthor($metadataCollection, 'John');
$recentDocs = searchByDateRange($metadataCollection, '2024-01-01', '2024-12-31');
echo "Documents by John: " . count($johnDocs) . "\n",
    "Documents from 2024: " . count($recentDocs) . "\n\n";
/**
 * Render the collection as a minimal HTML page with one table row per record.
 *
 * Every cell value is passed through htmlspecialchars().
 *
 * @param array $collection Records with 'file', 'title', 'author', 'created' and 'pages'.
 *
 * @return string Complete HTML document.
 */
function generateCatalog(array $collection): string
{
    $rowsHtml = '';
    foreach ($collection as $entry) {
        // 'pages' may be an integer, so it is cast before escaping.
        $values = [
            $entry['file'],
            $entry['title'],
            $entry['author'],
            $entry['created'],
            (string)$entry['pages'],
        ];
        $cells = '';
        foreach ($values as $value) {
            $cells .= "<td>" . htmlspecialchars($value) . "</td>";
        }
        $rowsHtml .= "<tr>" . $cells . "</tr>\n";
    }

    return "<html><head><title>Document Catalog</title></head><body>\n"
        . "<h1>Document Catalog</h1>\n"
        . "<table border='1'>\n"
        . "<tr><th>File</th><th>Title</th><th>Author</th><th>Created</th><th>Pages</th></tr>\n"
        . $rowsHtml
        . "</table>\n</body></html>";
}
// Write the HTML catalog; report success only when the write actually succeeded.
$catalog = generateCatalog($metadataCollection);
if (file_put_contents('catalog.html', $catalog) === false) {
    echo "Failed to write: catalog.html\n";
} else {
    echo "Catalog saved to: catalog.html\n";
}
/**
 * Write the collection to a CSV file with a header row.
 *
 * @param array  $collection Records with 'file', 'title', 'author', 'created', 'pages' and 'size' (bytes).
 * @param string $filename   Destination path; an existing file is overwritten.
 *
 * @throws \RuntimeException When the destination cannot be opened for writing.
 */
function exportMetadataToCSV(array $collection, string $filename): void
{
    $fp = fopen($filename, 'w');
    if ($fp === false) {
        // Fail with a clear message instead of a TypeError from fputcsv(false, ...).
        throw new \RuntimeException("Unable to open {$filename} for writing");
    }
    try {
        // Separator, enclosure and escape are spelled out with their default
        // values; relying on the default escape is deprecated as of PHP 8.4.
        fputcsv($fp, ['File', 'Title', 'Author', 'Created', 'Pages', 'Size (KB)'], ',', '"', '\\');
        foreach ($collection as $meta) {
            fputcsv($fp, [
                $meta['file'],
                $meta['title'],
                $meta['author'],
                $meta['created'],
                $meta['pages'],
                // No thousands separator: "1,234.56" is not machine-readable
                // as a number in a CSV column.
                number_format($meta['size'] / 1024, 2, '.', ''),
            ], ',', '"', '\\');
        }
    } finally {
        fclose($fp);
    }
}
exportMetadataToCSV($metadataCollection, 'metadata.csv');
echo "Metadata exported to: metadata.csv\n";

// Aggregate statistics over the whole collection.
$documentCount = count($metadataCollection);
$totalPages = array_sum(array_column($metadataCollection, 'pages'));
$totalSize = array_sum(array_column($metadataCollection, 'size'));
$authors = array_unique(array_column($metadataCollection, 'author'));
// An empty collection (no files matched) would otherwise divide by zero,
// which throws DivisionByZeroError on PHP 8.
$averagePages = $documentCount > 0 ? $totalPages / $documentCount : 0;
echo "\nCollection Statistics:\n";
echo str_repeat('=', 60) . "\n";
echo "Total documents: " . $documentCount . "\n";
echo "Total pages: " . number_format($totalPages) . "\n";
echo "Total size: " . number_format($totalSize / 1024 / 1024, 2) . " MB\n";
echo "Unique authors: " . count($authors) . "\n";
echo "Average pages per document: " . number_format($averagePages, 1) . "\n";
// Bucket the records by their author string and print the size of each bucket.
$byAuthor = [];
foreach ($metadataCollection as $meta) {
    $author = $meta['author'];
    // Appending to a missing key creates the bucket on first use.
    $byAuthor[$author][] = $meta;
}

echo "\nDocuments by Author:\n", str_repeat('=', 60) . "\n";
foreach ($byAuthor as $author => $docs) {
    $docTotal = count($docs);
    echo "$author: " . $docTotal . " documents\n";
}
/**
 * List the quality problems of a single metadata record.
 *
 * A text field is flagged when it is empty() or still holds the placeholder
 * assigned while the collection was built; 'pages' is flagged only when it is
 * the integer 0.
 *
 * @param array $meta Record with 'title', 'author', 'created' and 'pages'.
 *
 * @return array Issue descriptions in field order; empty when nothing is wrong.
 */
function validateMetadata(array $meta): array
{
    // field => [placeholder value, issue text]
    $textChecks = [
        'title' => ['Untitled', 'Missing title'],
        'author' => ['Unknown', 'Missing author'],
        'created' => ['Unknown', 'Missing creation date'],
    ];

    $issues = [];
    foreach ($textChecks as $field => [$placeholder, $issue]) {
        if (empty($meta[$field]) || $meta[$field] === $placeholder) {
            $issues[] = $issue;
        }
    }
    if ($meta['pages'] === 0) {
        $issues[] = 'Invalid page count';
    }

    return $issues;
}
// Validate every record, listing the ones with issues and counting them.
echo "\nMetadata Quality Check:\n", str_repeat('=', 60) . "\n";
$incomplete = 0;
foreach ($metadataCollection as $meta) {
    $issues = validateMetadata($meta);
    if (empty($issues)) {
        continue;
    }
    $incomplete++;
    echo "{$meta['file']}: " . implode(', ', $issues) . "\n";
}
$documentTotal = count($metadataCollection);
echo "\nIncomplete metadata: $incomplete/" . $documentTotal . " documents\n";
```

View File

@@ -0,0 +1,282 @@
```php title="multi_format.php"
<?php
declare(strict_types=1);
/**
* Multi-Format Document Extraction
*
* Handle various document formats (PDF, DOCX, XLSX, PPTX, images, etc.)
* with format-specific processing and unified output.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use function Kreuzberg\extract_file;
use function Kreuzberg\detect_mime_type_from_path;
// Label => sample file; each file that exists is extracted with the default configuration.
$formats = [
    'PDF' => 'document.pdf',
    'Word' => 'document.docx',
    'Excel' => 'spreadsheet.xlsx',
    'PowerPoint' => 'presentation.pptx',
    'Text' => 'readme.txt',
    'HTML' => 'page.html',
    'Markdown' => 'guide.md',
    'Image' => 'scan.png',
];
echo "Multi-Format Extraction:\n", str_repeat('=', 60) . "\n\n";

$kreuzberg = new Kreuzberg();
foreach ($formats as $type => $file) {
    if (file_exists($file)) {
        echo "Processing $type ($file):\n";
        $mimeType = detect_mime_type_from_path($file);
        echo " MIME type: $mimeType\n";

        $result = $kreuzberg->extractFile($file);
        echo " Content length: " . strlen($result->content) . " chars\n",
            " Tables: " . count($result->tables) . "\n",
            " Images: " . count($result->images ?? []) . "\n",
            " Pages: " . ($result->metadata->pageCount ?? 'N/A') . "\n",
            "\n";
    }
}
// Extract every file under documents/ and group the results by extension.
// glob() returns false on error, and '*.*' also matches directories whose name
// contains a dot, so normalise to a plain list of regular files.
$mixedFiles = array_values(array_filter(glob('documents/*.*') ?: [], 'is_file'));
$byFormat = [];
foreach ($mixedFiles as $file) {
    $mimeType = detect_mime_type_from_path($file);
    // Lower-case the extension so 'report.PDF' and 'report.pdf' share a group,
    // matching the lower-cased lookup used for the per-format configs.
    $extension = strtolower(pathinfo($file, PATHINFO_EXTENSION));
    $result = extract_file($file);
    $byFormat[$extension][] = [
        'file' => basename($file),
        'mime' => $mimeType,
        'size' => strlen($result->content),
        'tables' => count($result->tables),
    ];
}
// Summarise each extension group: file count, total content size and table count.
echo "Files by Format:\n", str_repeat('=', 60) . "\n";
foreach ($byFormat as $ext => $files) {
    $totalSize = array_sum(array_column($files, 'size'));
    $totalTables = array_sum(array_column($files, 'tables'));
    echo strtoupper($ext) . ": " . count($files) . " files\n",
        " Total content: " . number_format($totalSize) . " chars\n",
        " Total tables: $totalTables\n\n";
}
// Extension => configuration tuned for that format.
$formatConfigs = [
    'pdf' => new ExtractionConfig(
        extractTables: true,
        extractImages: true,
        pdf: new \Kreuzberg\Config\PdfConfig(extractImages: true, imageQuality: 85)
    ),
    'docx' => new ExtractionConfig(extractTables: true, preserveFormatting: true),
    'xlsx' => new ExtractionConfig(extractTables: true),
    'png' => new ExtractionConfig(
        ocr: new \Kreuzberg\Config\OcrConfig(backend: 'tesseract', language: 'eng')
    ),
];

// Process only the files whose extension has a dedicated configuration.
foreach ($mixedFiles as $file) {
    $ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));
    if (isset($formatConfigs[$ext])) {
        $config = $formatConfigs[$ext];
        $kreuzberg = new Kreuzberg($config);
        $result = $kreuzberg->extractFile($file);
        echo "Processed " . basename($file) . " with $ext config\n";
    }
}
/**
 * Extract a document as Markdown and wrap it with a title, authors line and tables.
 *
 * The heading uses the metadata title, falling back to the file's base name.
 * Each extracted table is appended under its own "## Table N" heading.
 *
 * @param string $inputFile Path of the document to convert.
 *
 * @return string Assembled Markdown text.
 */
function convertToMarkdown(string $inputFile): string
{
    $extractor = new Kreuzberg(new ExtractionConfig(
        preserveFormatting: true,
        outputFormat: 'markdown',
        extractTables: true
    ));
    $result = $extractor->extractFile($inputFile);

    $sections = [];
    $sections[] = "# " . ($result->metadata->title ?? basename($inputFile)) . "\n\n";
    if (isset($result->metadata->authors)) {
        $sections[] = "_Authors: " . implode(', ', $result->metadata->authors) . "_\n\n";
    }
    $sections[] = $result->content . "\n\n";
    foreach ($result->tables as $index => $table) {
        $sections[] = "## Table " . ($index + 1) . "\n\n";
        $sections[] = $table->markdown . "\n\n";
    }

    return implode('', $sections);
}
// Convert the sample documents to Markdown files.
echo "\nConverting to Markdown:\n";
echo str_repeat('=', 60) . "\n";
foreach (['document.pdf', 'document.docx'] as $file) {
    if (!file_exists($file)) {
        continue;
    }
    $markdown = convertToMarkdown($file);
    // Keep the source extension in the output name: stripping it maps both
    // 'document.pdf' and 'document.docx' to 'document.md', so the second
    // conversion would silently overwrite the first.
    $outputFile = basename($file) . '.md';
    if (file_put_contents($outputFile, $markdown) === false) {
        echo "Failed to write: $outputFile\n";
        continue;
    }
    echo "Converted: $file -> $outputFile\n";
}
/**
 * Extract an archive file and return a small summary of the result.
 *
 * @param string $archiveFile Path of the archive.
 *
 * @return array 'archive' (base name), 'listing' (extracted content) and 'mime' (result MIME type).
 */
function extractFromArchive(string $archiveFile): array
{
    $extraction = extract_file($archiveFile);

    $summary = [];
    $summary['archive'] = basename($archiveFile);
    $summary['listing'] = $extraction->content;
    $summary['mime'] = $extraction->mimeType;

    return $summary;
}
/**
 * Dispatches extraction to a format-specific handler chosen by MIME type.
 *
 * PDF, DOCX, XLSX and PNG/JPEG inputs have dedicated handlers; every other
 * MIME type goes through the generic handler. Each handler returns an array
 * holding a 'type' label, the extracted 'content' and format-specific extras.
 */
class UniversalExtractor
{
    /** Default-configured extractor, used by the DOCX and generic handlers. */
    private Kreuzberg $kreuzberg;

    public function __construct()
    {
        $this->kreuzberg = new Kreuzberg();
    }

    /**
     * Detect the file's MIME type and run the matching handler.
     *
     * @param string $file Path of the document.
     *
     * @return array Handler-specific summary (always includes 'type' and 'content').
     */
    public function extract(string $file): array
    {
        $mimeType = detect_mime_type_from_path($file);

        return match ($mimeType) {
            'application/pdf'
                => $this->handlePDF($file, $mimeType),
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                => $this->handleDOCX($file, $mimeType),
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
                => $this->handleXLSX($file, $mimeType),
            'image/png', 'image/jpeg'
                => $this->handleImage($file, $mimeType),
            default
                => $this->handleGeneric($file, $mimeType),
        };
    }

    /** PDF: extract with tables and images enabled; report their counts and the page count. */
    private function handlePDF(string $file, string $mimeType): array
    {
        $extractor = new Kreuzberg(
            new ExtractionConfig(extractTables: true, extractImages: true)
        );
        $result = $extractor->extractFile($file);

        return [
            'type' => 'PDF',
            'content' => $result->content,
            'tables' => count($result->tables),
            'images' => count($result->images ?? []),
            'pages' => $result->metadata->pageCount,
        ];
    }

    /** DOCX: extract with the default extractor; report table count and authors. */
    private function handleDOCX(string $file, string $mimeType): array
    {
        $result = $this->kreuzberg->extractFile($file);

        return [
            'type' => 'Word Document',
            'content' => $result->content,
            'tables' => count($result->tables),
            'authors' => $result->metadata->authors,
        ];
    }

    /** XLSX: extract with tables enabled; the table count is reported as 'sheets'. */
    private function handleXLSX(string $file, string $mimeType): array
    {
        $extractor = new Kreuzberg(new ExtractionConfig(extractTables: true));
        $result = $extractor->extractFile($file);

        return [
            'type' => 'Excel Spreadsheet',
            'content' => $result->content,
            'sheets' => count($result->tables),
        ];
    }

    /** PNG/JPEG: extract with Tesseract OCR (English); report the text length in bytes. */
    private function handleImage(string $file, string $mimeType): array
    {
        $ocr = new \Kreuzberg\Config\OcrConfig(backend: 'tesseract', language: 'eng');
        $extractor = new Kreuzberg(new ExtractionConfig(ocr: $ocr));
        $result = $extractor->extractFile($file);

        return [
            'type' => 'Image (OCR)',
            'content' => $result->content,
            'ocr_length' => strlen($result->content),
        ];
    }

    /** Fallback: extract with the default extractor and echo back the detected MIME type. */
    private function handleGeneric(string $file, string $mimeType): array
    {
        $result = $this->kreuzberg->extractFile($file);

        return [
            'type' => 'Generic',
            'mime' => $mimeType,
            'content' => $result->content,
        ];
    }
}
// Run every collected file through the dispatcher and dump its summary.
$extractor = new UniversalExtractor();
echo "\nUniversal Extraction:\n", str_repeat('=', 60) . "\n";
foreach ($mixedFiles as $file) {
    $data = $extractor->extract($file);
    echo basename($file) . " ({$data['type']}):\n";
    // Drop the (potentially large) 'content' entry before printing.
    print_r(array_diff_key($data, ['content' => true]));
    echo "\n";
}
```

View File

@@ -0,0 +1,114 @@
```php title="pdf_extraction.php"
<?php
declare(strict_types=1);
/**
* PDF Document Extraction
*
* Extract text, tables, and images from PDF files with various configurations.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\PdfConfig;
// Basic run: extractor built without a config; report content size, table
// count and page count ('unknown' when the metadata has no page count).
$kreuzberg = new Kreuzberg();
$result = $kreuzberg->extractFile('document.pdf');
echo "PDF Extraction Results:\n", str_repeat('=', 60) . "\n";
printf("Content length: %d characters\n", strlen($result->content));
printf("Tables found: %d\n", count($result->tables));
printf("Pages: %s\n\n", $result->metadata->pageCount ?? 'unknown');
// Extract report.pdf with image and table extraction enabled, print every
// table, and save each table's cells to its own CSV file.
$config = new ExtractionConfig(
    extractImages: true,
    extractTables: true,
    pdf: new PdfConfig(
        extractImages: true,
        imageQuality: 85
    )
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('report.pdf');
echo "Extracted Tables:\n";
echo str_repeat('=', 60) . "\n";
foreach ($result->tables as $index => $table) {
    echo "Table " . ($index + 1) . " (Page {$table->pageNumber}):\n";
    echo "Rows: " . count($table->cells) . "\n";
    // count() always returns an int, so the `?? []` on the first row is the only fallback needed.
    echo "Columns: " . count($table->cells[0] ?? []) . "\n\n";
    echo "Markdown format:\n";
    echo $table->markdown . "\n\n";
    $csvFile = "table_{$index}.csv";
    $fp = fopen($csvFile, 'w');
    if ($fp === false) {
        // fopen() returns false on failure; handing that to fputcsv()/fclose()
        // would raise a TypeError, so report the problem and move on.
        echo "Could not open $csvFile for writing\n\n";
        continue;
    }
    foreach ($table->cells as $row) {
        fputcsv($fp, $row);
    }
    fclose($fp);
    echo "Saved to: $csvFile\n\n";
}
// Save every extracted image to disk and print its properties.
// NOTE(review): the payload is written exactly as received; confirm whether
// $image->data holds raw bytes or encoded text.
echo "Extracted Images:\n", str_repeat('=', 60) . "\n";
$images = $result->images ?? [];
foreach ($images as $image) {
    $filename = sprintf(
        'page_%d_image_%d.%s',
        $image->pageNumber,
        $image->imageIndex,
        $image->format
    );
    file_put_contents($filename, $image->data);
    echo "Saved: $filename\n",
        " Size: {$image->width}x{$image->height}\n",
        " Format: {$image->format}\n",
        " Data size: " . strlen($image->data) . " bytes\n\n";
}
// Extract with markdown output, save it, then split a second document into
// sections keyed by heading text.
$formattedConfig = new ExtractionConfig(
    preserveFormatting: true,
    outputFormat: 'markdown'
);
$kreuzberg = new Kreuzberg($formattedConfig);
$result = $kreuzberg->extractFile('formatted.pdf');
file_put_contents('output.md', $result->content);
echo "Saved formatted output to: output.md\n";

$result = $kreuzberg->extractFile('document.pdf');
$content = $result->content;
$sections = [];
$heading = null;
$body = [];
foreach (explode("\n", $content) as $line) {
    // Lines that are not "#..." headings accumulate into the current body.
    if (preg_match('/^#+\s+(.+)$/', $line, $matches) !== 1) {
        $body[] = $line;
        continue;
    }

    // A new heading closes the previous section. Lines collected before the
    // first heading are discarded when $body is reset below.
    if ($heading !== null) {
        $sections[$heading] = implode("\n", $body);
    }
    $heading = $matches[1];
    $body = [];
}
// Flush the section that was still open when the input ended.
if ($heading !== null) {
    $sections[$heading] = implode("\n", $body);
}

echo "\nDocument sections:\n";
foreach ($sections as $title => $content) {
    printf(" - %s (%d chars)\n", $title, strlen($content));
}
```

View File

@@ -0,0 +1,195 @@
```php title="powerpoint_extraction.php"
<?php
declare(strict_types=1);
/**
* PowerPoint Presentation Extraction
*
* This example demonstrates extracting content from PowerPoint files (.pptx, .ppt),
* including text, notes, images, and tables from slides.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ImageExtractionConfig;
use Kreuzberg\Config\PageConfig;
// Example 1: extractor built without a config; print the text and the
// title/author/slide-count metadata, using 'N/A' for anything missing.
echo "Example 1: Basic PowerPoint Extraction\n",
    "======================================\n";
$kreuzberg = new Kreuzberg();
$result = $kreuzberg->extractFile('presentation.pptx');
echo "Content:\n", $result->content . "\n\n", "Metadata:\n";
printf("- Title: %s\n", $result->metadata->title ?? 'N/A');
if (isset($result->metadata->authors)) {
    $authorList = implode(', ', $result->metadata->authors);
} else {
    $authorList = 'N/A';
}
echo "- Author: " . $authorList . "\n";
printf("- Slide Count: %s\n\n", $result->metadata->pageCount ?? 'N/A');
// Example 2: request per-page results and print a short report per slide.
echo "Example 2: Extract Per-Slide Content\n",
    "====================================\n";
// NOTE(review): confirm that '{page_number}' is the placeholder name the
// library substitutes inside markerFormat.
$slidePageSettings = new PageConfig(
    extractPages: true,
    insertPageMarkers: true,
    markerFormat: '--- Slide {page_number} ---'
);
$config2 = new ExtractionConfig(page: $slidePageSettings);
$result2 = (new Kreuzberg($config2))->extractFile('presentation.pptx');
if ($result2->pages !== null) {
    printf("Total slides: %d\n\n", count($result2->pages));
    foreach ($result2->pages as $page) {
        echo "Slide {$page->pageNumber}:\n",
            "- Text length: " . strlen($page->content) . " characters\n",
            "- Tables: " . count($page->tables) . "\n",
            "- Images: " . count($page->images) . "\n",
            // Byte-based cut of the first 100 bytes of the slide text.
            "- Content preview: " . substr($page->content, 0, 100) . "...\n\n";
    }
}
// Example 3: extract images with a 100x100 minimum size and save each one.
echo "Example 3: Extract Images from Slides\n",
    "=====================================\n";
$imageSettings = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 100,
    minHeight: 100
);
$config3 = new ExtractionConfig(imageExtraction: $imageSettings);
$result3 = (new Kreuzberg($config3))->extractFile('presentation.pptx');
if ($result3->images !== null) {
    printf("Total images: %d\n\n", count($result3->images));
    foreach ($result3->images as $i => $image) {
        echo "Image {$i}:\n",
            "- Format: {$image->format}\n",
            "- Size: {$image->width}x{$image->height}\n",
            "- Slide: {$image->pageNumber}\n";
        $filename = "slide_{$image->pageNumber}_image_{$i}.{$image->format}";
        // NOTE(review): assumes $image->data is base64 text; confirm against the library.
        $decoded = base64_decode($image->data);
        file_put_contents($filename, $decoded);
        echo "- Saved: {$filename}\n\n";
    }
}
// Example 4: extract tables and print each one as Markdown, if any were found.
echo "Example 4: Extract Tables from Slides\n",
    "=====================================\n";
$config4 = new ExtractionConfig(extractTables: true);
$result4 = (new Kreuzberg($config4))->extractFile('data_presentation.pptx');
$tableTotal = count($result4->tables);
if ($tableTotal > 0) {
    echo "Found " . $tableTotal . " table(s)\n\n";
    foreach ($result4->tables as $i => $table) {
        printf("Table %d (Slide %s):\n", $i + 1, $table->pageNumber);
        echo $table->markdown . "\n\n";
    }
}
// Example 5: produce Markdown for the whole deck with a page marker inserted
// per slide, save it, and print the first 500 bytes as a preview.
echo "Example 5: Convert PowerPoint to Markdown\n";
echo "=========================================\n";
$config5 = new ExtractionConfig(
    page: new PageConfig(
        extractPages: true,
        insertPageMarkers: true,
        // Double-quoted so "\n" is a real newline. In a single-quoted PHP
        // string it stays a literal backslash followed by "n".
        // '{page_number}' is still passed through verbatim, because PHP only
        // interpolates braces of the form '{$'.
        // NOTE(review): confirm that '{page_number}' is the placeholder name
        // the library substitutes inside markerFormat.
        markerFormat: "---\n\n## Slide {page_number}\n\n"
    ),
    outputFormat: 'markdown'
);
$result5 = (new Kreuzberg($config5))->extractFile('presentation.pptx');
$markdownContent = $result5->content;
file_put_contents('presentation.md', $markdownContent);
echo "Converted to Markdown\n";
echo "Saved as: presentation.md\n";
echo "Content preview:\n";
echo substr($markdownContent, 0, 500) . "...\n\n";
// Example 6: print an overview of the deck plus a per-slide word/table count.
echo "Example 6: Generate Presentation Summary\n",
    "========================================\n";
$config6 = new ExtractionConfig(page: new PageConfig(extractPages: true));
$result6 = (new Kreuzberg($config6))->extractFile('meeting_deck.pptx');
echo "Presentation Summary:\n", "====================\n";
printf("Title: %s\n", $result6->metadata->title ?? 'Untitled');
if (isset($result6->metadata->authors)) {
    $authorList = implode(', ', $result6->metadata->authors);
} else {
    $authorList = 'Unknown';
}
echo "Author: " . $authorList . "\n";
// Prefer the metadata page count; otherwise count the extracted pages (0 when there are none).
printf("Total Slides: %s\n", $result6->metadata->pageCount ?? count($result6->pages ?? []));
printf("Total Text: %d characters\n", strlen($result6->content));
printf("Tables: %d\n", count($result6->tables));
if ($result6->pages !== null) {
    echo "\nSlide Breakdown:\n";
    foreach ($result6->pages as $page) {
        $wordCount = str_word_count($page->content);
        $slideTables = count($page->tables);
        echo "- Slide {$page->pageNumber}: {$wordCount} words, " . $slideTables . " tables\n";
    }
}
echo "\n";
// Example 7: case-insensitive search for a term in each slide, printing up to
// 150 bytes of context starting at most 50 bytes before the first hit.
echo "Example 7: Search Content in Slides\n",
    "===================================\n";
$config7 = new ExtractionConfig(page: new PageConfig(extractPages: true));
$result7 = (new Kreuzberg($config7))->extractFile('presentation.pptx');
$searchTerm = "revenue";
if ($result7->pages !== null) {
    echo "Searching for '{$searchTerm}':\n\n";
    foreach ($result7->pages as $page) {
        $pos = stripos($page->content, $searchTerm);
        if ($pos === false) {
            continue;
        }
        echo "Found in Slide {$page->pageNumber}:\n";
        $contextStart = max(0, $pos - 50);
        $context = substr($page->content, $contextStart, 150);
        echo "- Context: ...{$context}...\n\n";
    }
}
// Static help text: the supported formats, followed by usage tips.
$helpText = [
    "\nSupported PowerPoint Formats:\n",
    "=============================\n",
    "- .pptx (PowerPoint 2007+)\n",
    "- .ppt (PowerPoint 97-2003)\n",
    "- .pptm (Macro-enabled)\n",
    "- .potx (Template)\n",
    "\n\nBest Practices:\n",
    "===============\n",
    "1. Use page extraction to process individual slides\n",
    "2. Extract images for visual content analysis\n",
    "3. Extract tables for data analysis\n",
    "4. Use metadata for presentation information\n",
    "5. Convert to Markdown for documentation\n",
    "6. Search across slides for specific content\n",
    "7. Generate summaries for presentation overviews\n",
];
foreach ($helpText as $helpLine) {
    echo $helpLine;
}
```

View File

@@ -0,0 +1,217 @@
```php title="table_extraction.php"
<?php
declare(strict_types=1);
/**
* Table Extraction and Processing
*
* Extract tables from PDFs and other documents, process them,
* and export to various formats (CSV, JSON, HTML).
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\TesseractConfig;
// Extract tables from the report and print each one as Markdown, as
// row/column counts, and as an HTML table whose first row uses <th> cells.
$config = new ExtractionConfig(extractTables: true);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('financial_report.pdf');
echo "Table Extraction:\n", str_repeat('=', 60) . "\n";
printf("Tables found: %d\n\n", count($result->tables));
foreach ($result->tables as $index => $table) {
    echo "Table " . ($index + 1) . " (Page {$table->pageNumber}):\n",
        str_repeat('-', 60) . "\n",
        "Markdown:\n",
        $table->markdown . "\n\n",
        "Array format:\n";
    printf("Rows: %d\n", count($table->cells));
    // Column count comes from the first row; a table with no rows reports 0.
    printf("Columns: %d\n\n", count($table->cells[0] ?? []));
    echo "HTML:\n", "<table>\n";
    foreach ($table->cells as $rowIndex => $row) {
        if ($rowIndex === 0) {
            $tag = 'th';
        } else {
            $tag = 'td';
        }
        echo " <tr>\n";
        foreach ($row as $cell) {
            $escapedCell = htmlspecialchars($cell);
            echo " <$tag>" . $escapedCell . "</$tag>\n";
        }
        echo " </tr>\n";
    }
    echo "</table>\n\n";
}
// Write every table's cells to its own CSV file, named after the table's
// 1-based position and its page number.
foreach ($result->tables as $index => $table) {
    $filename = "table_" . ($index + 1) . "_page_" . $table->pageNumber . ".csv";
    $fp = fopen($filename, 'w');
    if ($fp === false) {
        // fopen() returns false on failure; handing that to fputcsv()/fclose()
        // would raise a TypeError, so report the problem and move on.
        echo "Could not open $filename for writing\n";
        continue;
    }
    try {
        foreach ($table->cells as $row) {
            fputcsv($fp, $row);
        }
    } finally {
        // Close the handle even if writing a row throws.
        fclose($fp);
    }
    echo "Exported to: $filename\n";
}
echo "\n";
// Extract tables from a scanned PDF with OCR configured (backend 'tesseract',
// language 'eng', Tesseract table detection enabled, psm 6).
$tesseractSettings = new TesseractConfig(
    enableTableDetection: true,
    psm: 6
);
$ocrSettings = new OcrConfig(
    backend: 'tesseract',
    language: 'eng',
    tesseractConfig: $tesseractSettings
);
$ocrConfig = new ExtractionConfig(
    extractTables: true,
    ocr: $ocrSettings
);
$kreuzberg = new Kreuzberg($ocrConfig);
$result = $kreuzberg->extractFile('scanned_table.pdf');
echo "OCR Table Extraction:\n", str_repeat('=', 60) . "\n";
printf("Tables with OCR: %d\n\n", count($result->tables));
/**
 * Turn a table given as rows of cells into a list of records keyed by the
 * header row.
 *
 * The first row supplies the keys. Each following row becomes one record, and
 * cells missing from a short row are filled with an empty string.
 *
 * @param array $cells Rows of cells; the first row is treated as the header.
 *
 * @return array List of header => value records; empty when there are no rows
 *               or only a header row.
 */
function processTable(array $cells): array
{
    // Without this guard array_shift() yields null for an empty table and the
    // header loop below would emit a "foreach() argument" warning.
    if ($cells === []) {
        return [];
    }

    $headers = array_shift($cells);
    $processed = [];
    foreach ($cells as $row) {
        $rowData = [];
        foreach ($headers as $index => $header) {
            $rowData[$header] = $row[$index] ?? '';
        }
        $processed[] = $rowData;
    }

    return $processed;
}
// Convert each table to header-keyed records and print them as pretty JSON.
foreach ($result->tables as $table) {
    $structured = processTable($table->cells);
    $structuredJson = json_encode($structured, JSON_PRETTY_PRINT);
    echo "Structured table data:\n", $structuredJson . "\n\n";
}
/**
 * Select the tables in which at least one cell contains the keyword,
 * compared case-insensitively.
 *
 * @param array  $tables  Objects exposing a `cells` property (rows of cells).
 * @param string $keyword Text to look for inside each cell.
 *
 * @return array Matching tables as a re-indexed list, in their original order.
 */
function findTablesWithKeyword(array $tables, string $keyword): array
{
    // True as soon as any cell of the table contains the keyword.
    $mentionsKeyword = static function ($table) use ($keyword): bool {
        foreach ($table->cells as $row) {
            foreach ($row as $cell) {
                if (stripos($cell, $keyword) !== false) {
                    return true;
                }
            }
        }

        return false;
    };

    // array_filter() keeps the input keys; array_values() restores a plain list.
    return array_values(array_filter($tables, $mentionsKeyword));
}
// Count the tables from the current result that mention 'sales'.
$salesTables = findTablesWithKeyword($result->tables, 'sales');
printf("Tables containing 'sales': %d\n", count($salesTables));
/**
 * Convert a table into a list of records keyed by the header row.
 *
 * The first row of `$table->cells` supplies the keys. Every later row becomes
 * one record, with null for any cell the row does not have.
 *
 * @param \Kreuzberg\Types\Table $table Table whose `cells` are read.
 *
 * @return array List of header => value records; empty when the table has no cells.
 */
function tableToAssociativeArray(\Kreuzberg\Types\Table $table): array
{
    $rows = $table->cells;
    if (!$rows) {
        return [];
    }

    $headers = array_shift($rows);
    $toRecord = static function ($row) use ($headers): array {
        $record = [];
        foreach ($headers as $position => $header) {
            $record[$header] = $row[$position] ?? null;
        }

        return $record;
    };

    // array_values() keeps the result a plain list regardless of row keys.
    return array_values(array_map($toRecord, $rows));
}
// Extract the quarterly report with the extractor currently held in
// $kreuzberg and, per table, sum every numeric cell by column name.
$result = $kreuzberg->extractFile('quarterly_report.pdf');
foreach ($result->tables as $index => $table) {
    $data = tableToAssociativeArray($table);
    printf("\nTable %d data:\n", $index + 1);

    $totals = [];
    foreach ($data as $row) {
        foreach ($row as $key => $value) {
            if (!is_numeric($value)) {
                continue;
            }
            // First numeric value for a column starts its total from 0.
            $totals[$key] = ($totals[$key] ?? 0) + floatval($value);
        }
    }

    // Nothing is printed for tables without any numeric cell.
    if ($totals === []) {
        continue;
    }
    echo "Column totals:\n";
    foreach ($totals as $column => $total) {
        echo " $column: " . number_format($total, 2) . "\n";
    }
}
// Describe every table (page, size, keyed rows, markdown) and write the whole
// list to tables.json as pretty-printed JSON.
$allTablesJson = array_map(
    static fn ($table) => [
        'page' => $table->pageNumber,
        'rows' => count($table->cells),
        'columns' => count($table->cells[0] ?? []),
        'data' => tableToAssociativeArray($table),
        'markdown' => $table->markdown,
    ],
    $result->tables
);
$encodedTables = json_encode($allTablesJson, JSON_PRETTY_PRINT);
file_put_contents('tables.json', $encodedTables);
echo "\nAll tables exported to: tables.json\n";
/**
 * Merge several tables into one header row plus a combined list of data rows.
 *
 * The header row is taken from the first table in iteration order. The first
 * row of every table is treated as its header and dropped; all remaining rows
 * are appended in order.
 *
 * @param array $tables Objects exposing a `cells` property (rows of cells).
 *
 * @return array Empty array when no tables are given, otherwise
 *               ['headers' => first table's first row (or []), 'data' => list of rows].
 */
function mergeTables(array $tables): array
{
    if (empty($tables)) {
        return [];
    }

    // Look up the first element by its actual key: arrays that went through
    // array_filter() or similar may not have an element at index 0, and
    // reading $tables[0] would then silently produce empty headers.
    $firstTable = $tables[array_key_first($tables)];
    $headers = $firstTable->cells[0] ?? [];

    $merged = [];
    foreach ($tables as $table) {
        $rows = $table->cells;
        array_shift($rows); // drop this table's own header row
        foreach ($rows as $row) {
            $merged[] = $row;
        }
    }

    return ['headers' => $headers, 'data' => $merged];
}
// Merge every table that mentions 'Quarter' and report how much was combined.
$reportTables = findTablesWithKeyword($result->tables, 'Quarter');
if ($reportTables !== []) {
    $merged = mergeTables($reportTables);
    printf("\nMerged %d tables\n", count($reportTables));
    printf("Total rows: %d\n", count($merged['data']));
}
```