Files
fil/docs/snippets/php/utils/tables.php
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

238 lines
6.1 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
```php title="tables.php"
<?php
declare(strict_types=1);
/**
* Table Extraction and Processing
*
* Extract tables from documents and convert them to various formats.
* Demonstrates table processing, formatting, and export capabilities.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Result\ExtractedTable;
// Build an extractor with table extraction switched on and run it on the sample PDF.
$config = new ExtractionConfig(extractTables: true);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

// Summary banner, followed by one report per extracted table.
echo "Table Extraction Results:\n";
echo str_repeat('=', 60) . "\n";
echo 'Tables found: ' . count($result->tables) . "\n\n";

foreach ($result->tables as $tableIndex => $table) {
    $separator = str_repeat('-', 40) . "\n";
    $tableNumber = $tableIndex + 1;

    echo "Table {$tableNumber}:\n";
    echo $separator;

    // The column count is read from the first row; a table without rows reports 0 columns.
    $rowCount = count($table->cells);
    $colCount = 0;
    if (!empty($table->cells)) {
        $colCount = count($table->cells[0]);
    }
    echo " Dimensions: {$rowCount} rows × {$colCount} columns\n";

    // The page number is only printed when the table carries one.
    if (isset($table->pageNumber)) {
        echo ' Page: ' . $table->pageNumber . "\n";
    }
    echo "\n";

    echo " Markdown representation:\n";
    echo $separator;
    echo $table->markdown . "\n\n";

    echo " Raw data preview:\n";
    echo $separator;

    // Preview at most the first three rows, then note how many were left out.
    foreach (array_slice($table->cells, 0, 3) as $rowIndex => $row) {
        $rowNumber = $rowIndex + 1;
        $joinedCells = implode(' | ', $row);
        echo " Row {$rowNumber}: [{$joinedCells}]\n";
    }
    $hiddenRows = $rowCount - 3;
    if ($hiddenRows > 0) {
        echo " ... and {$hiddenRows} more rows\n";
    }
    echo "\n";
}
// CSV export: write each extracted table to <outputDir>/table_<n>.csv.
echo "Exporting Tables to CSV:\n";
echo str_repeat('=', 60) . "\n";

// Also used as the target directory by the JSON and HTML export sections below.
$outputDir = './exported_tables';

// Create the directory when it is missing. The second is_dir() check tolerates the
// directory being created by someone else between the first check and mkdir().
if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
    echo "Error: Failed to create directory $outputDir\n";
}

foreach ($result->tables as $index => $table) {
    $filename = sprintf('table_%d.csv', $index + 1);
    $filepath = $outputDir . '/' . $filename;

    $fp = fopen($filepath, 'w');
    if ($fp === false) {
        echo "Error: Failed to create $filename\n";
        continue;
    }

    $written = true;
    foreach ($table->cells as $row) {
        // An empty escape character turns off PHP's non-standard backslash escaping so
        // embedded quotes are always doubled (RFC 4180). Passing it explicitly also
        // avoids the PHP 8.4 deprecation for relying on the default value.
        if (fputcsv($fp, $row, escape: '') === false) {
            $written = false;
            break;
        }
    }

    // Always close the handle; a failed close also means the file was not saved.
    $written = fclose($fp) && $written;

    echo $written ? "Saved: $filename\n" : "Error: Failed to write $filename\n";
}
echo "\n";
// JSON export: write each table (with its index, page, dimensions, cells and markdown)
// to <outputDir>/table_<n>.json.
echo "Exporting Tables to JSON:\n";
echo str_repeat('=', 60) . "\n";

foreach ($result->tables as $index => $table) {
    $filename = sprintf('table_%d.json', $index + 1);
    $filepath = $outputDir . '/' . $filename;

    $tableData = [
        'index' => $index + 1,
        'page' => $table->pageNumber ?? null,
        'dimensions' => [
            'rows' => count($table->cells),
            // Column count is taken from the first row; 0 when the table has no rows.
            'columns' => !empty($table->cells) ? count($table->cells[0]) : 0,
        ],
        'data' => $table->cells,
        'markdown' => $table->markdown,
    ];

    // Without JSON_THROW_ON_ERROR json_encode() returns false on failure (for example
    // on malformed UTF-8), which would be written out as an empty file.
    try {
        $json = json_encode(
            $tableData,
            JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
        );
    } catch (JsonException $e) {
        echo "Error: Failed to encode $filename: {$e->getMessage()}\n";
        continue;
    }

    // Report write failures the same way the CSV export does instead of claiming success.
    if (file_put_contents($filepath, $json) === false) {
        echo "Error: Failed to create $filename\n";
        continue;
    }

    echo "Saved: $filename\n";
}
echo "\n";
/**
 * Render an extracted table as an HTML <table> fragment.
 *
 * The row stored under key 0 is emitted with <th> cells; every other row uses <td>.
 * Each cell is HTML-escaped before being placed in the markup.
 *
 * @param ExtractedTable $table Table whose `cells` property is iterated row by row.
 *
 * @return string The <table> markup, without a trailing newline.
 */
function tableToHtml(ExtractedTable $table): string
{
    $html = "<table>\n";

    foreach ($table->cells as $rowIndex => $row) {
        $tag = $rowIndex === 0 ? 'th' : 'td';

        $html .= " <tr>\n";
        foreach ($row as $cell) {
            // ENT_SUBSTITUTE replaces invalid UTF-8 sequences with U+FFFD. With
            // ENT_QUOTES alone, htmlspecialchars() returns an empty string for such
            // input and the cell's content is silently lost.
            $escapedCell = htmlspecialchars($cell, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            $html .= " <$tag>$escapedCell</$tag>\n";
        }
        $html .= " </tr>\n";
    }

    $html .= "</table>";

    return $html;
}
// HTML export: wrap each table's markup in a minimal standalone page and write it to
// <outputDir>/table_<n>.html.
echo "Exporting Tables to HTML:\n";
echo str_repeat('=', 60) . "\n";

foreach ($result->tables as $index => $table) {
    $filename = sprintf('table_%d.html', $index + 1);
    $filepath = $outputDir . '/' . $filename;

    // Page skeleton with a small inline stylesheet; the table body comes from tableToHtml().
    $html = "<!DOCTYPE html>\n";
    $html .= "<html>\n<head>\n";
    $html .= " <meta charset=\"UTF-8\">\n";
    $html .= " <title>Table " . ($index + 1) . "</title>\n";
    $html .= " <style>\n";
    $html .= " table { border-collapse: collapse; width: 100%; }\n";
    $html .= " th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }\n";
    $html .= " th { background-color: #f2f2f2; }\n";
    $html .= " </style>\n";
    $html .= "</head>\n<body>\n";
    $html .= " <h1>Table " . ($index + 1) . "</h1>\n";
    $html .= tableToHtml($table) . "\n";
    $html .= "</body>\n</html>";

    // file_put_contents() returns false on failure; report it the same way the CSV
    // export does rather than printing "Saved" unconditionally.
    if (file_put_contents($filepath, $html) === false) {
        echo "Error: Failed to create $filename\n";
        continue;
    }

    echo "Saved: $filename\n";
}
echo "\n";
// Table analysis: count total, empty and numeric cells per table and classify the
// table by its share of numeric cells.
echo "Table Analysis:\n";
echo str_repeat('=', 60) . "\n";

foreach ($result->tables as $index => $table) {
    echo "Table " . ($index + 1) . " Analysis:\n";

    $totalCells = 0;
    $emptyCells = 0;
    $numericCells = 0;

    foreach ($table->cells as $row) {
        foreach ($row as $cell) {
            $totalCells++;

            // Compare against '' explicitly: empty('0') is true in PHP, so the former
            // empty(trim($cell)) check counted a cell containing "0" as empty.
            if (trim($cell) === '') {
                $emptyCells++;
            }
            if (is_numeric($cell)) {
                $numericCells++;
            }
        }
    }

    // Guard the divisions so a table without cells yields 0.0% instead of dividing by zero.
    $divisor = max($totalCells, 1);
    $emptyPercent = number_format(($emptyCells / $divisor) * 100, 1);
    $numericPercent = number_format(($numericCells / $divisor) * 100, 1);

    echo " Total cells: $totalCells\n";
    echo " Empty cells: $emptyCells ($emptyPercent%)\n";
    echo " Numeric cells: $numericCells ($numericPercent%)\n";

    // More than half numeric => data table; more than a fifth => mixed; otherwise text.
    $numericRatio = $numericCells / $divisor;
    $tableType = match (true) {
        $numericRatio > 0.5 => 'Data/Numeric Table',
        $numericRatio > 0.2 => 'Mixed Content Table',
        default => 'Text Table',
    };
    echo " Table type: $tableType\n\n";
}
/**
 * Convert a table into a list of records keyed by the values of its first row.
 *
 * The first row of `$table->cells` supplies the keys; each following row becomes one
 * associative array. A row with fewer cells than the header row is padded with empty
 * strings; cells beyond the header row's length are ignored.
 *
 * The table passed in is left unmodified.
 *
 * @param ExtractedTable $table Table whose `cells` property holds the rows.
 *
 * @return array List of header => cell maps, or an empty array when the table has no rows.
 */
function tableToAssociativeArray(ExtractedTable $table): array
{
    if (empty($table->cells)) {
        return [];
    }

    // Shift the header off a local copy. Objects are passed by handle, so calling
    // array_shift() on $table->cells directly would remove the header row from the
    // caller's table (or raise an Error if the property is readonly).
    $rows = $table->cells;
    $headers = array_shift($rows);

    $data = [];
    foreach ($rows as $row) {
        $rowData = [];
        foreach ($headers as $index => $header) {
            // Missing trailing cells default to an empty string.
            $rowData[$header] = $row[$index] ?? '';
        }
        $data[] = $rowData;
    }

    return $data;
}
// Show the first table (when there is one) as header-keyed records, previewing at most three.
if (!empty($result->tables)) {
    $records = tableToAssociativeArray($result->tables[0]);
    $recordCount = count($records);
    $previewJson = json_encode(
        array_slice($records, 0, 3),
        JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE
    );

    echo "First Table as Associative Array:\n";
    echo str_repeat('=', 60) . "\n";
    echo $previewJson . "\n";

    // Mention how many records were left out of the preview.
    if ($recordCount > 3) {
        $omitted = $recordCount - 3;
        echo "... and {$omitted} more records\n";
    }
}
```