```php title="table_extraction.php"
<?php

declare(strict_types=1);

/**
 * Table extraction examples.
 *
 * Walks through extracting tables from documents, printing them in several
 * formats (Markdown, row/column counts, HTML), exporting them to CSV and JSON,
 * running OCR-based table detection, and post-processing the extracted cells.
 */

require_once __DIR__ . '/vendor/autoload.php';

// NOTE(review): the Config namespace is assumed from the package layout
// (the script itself only shows \Kreuzberg\Types\Table) - confirm the import
// paths against the installed kreuzberg package.
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\TesseractConfig;
use Kreuzberg\Kreuzberg;

$config = new ExtractionConfig(
    extractTables: true,
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('financial_report.pdf');

echo "Table Extraction:\n";
echo str_repeat('=', 60) . "\n";
echo "Tables found: " . count($result->tables) . "\n\n";

foreach ($result->tables as $index => $table) {
    echo "Table " . ($index + 1) . " (Page {$table->pageNumber}):\n";
    echo str_repeat('-', 60) . "\n";

    echo "Markdown:\n";
    echo $table->markdown . "\n\n";

    echo "Array format:\n";
    echo "Rows: " . count($table->cells) . "\n";
    // An empty table has no first row, so fall back to an empty array.
    echo "Columns: " . count($table->cells[0] ?? []) . "\n\n";

    echo "HTML:\n";
    echo "<table>\n";
    foreach ($table->cells as $rowIndex => $row) {
        // The first row is treated as the header row.
        $tag = $rowIndex === 0 ? 'th' : 'td';
        echo "  <tr>\n";
        foreach ($row as $cell) {
            // Cast first: a null or numeric cell must not reach htmlspecialchars().
            $escaped = htmlspecialchars((string) $cell, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            echo "    <$tag>" . $escaped . "</$tag>\n";
        }
        echo "  </tr>\n";
    }
    echo "</table>\n\n";
}

// Export every table to its own CSV file.
foreach ($result->tables as $index => $table) {
    $filename = "table_" . ($index + 1) . "_page_" . $table->pageNumber . ".csv";

    $fp = fopen($filename, 'w');
    if ($fp === false) {
        // fopen() already emitted a warning; skip this table instead of
        // passing `false` to fputcsv()/fclose().
        echo "Could not open $filename for writing\n";
        continue;
    }

    foreach ($table->cells as $row) {
        fputcsv($fp, $row);
    }
    fclose($fp);

    echo "Exported to: $filename\n";
}
echo "\n";

// Table extraction from scanned documents via OCR.
$ocrConfig = new ExtractionConfig(
    extractTables: true,
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            enableTableDetection: true,
            psm: 6,
        ),
    ),
);

$kreuzberg = new Kreuzberg($ocrConfig);
$result = $kreuzberg->extractFile('scanned_table.pdf');

echo "OCR Table Extraction:\n";
echo str_repeat('=', 60) . "\n";
echo "Tables with OCR: " . count($result->tables) . "\n\n";

/**
 * Convert raw table cells into a list of rows keyed by header text.
 *
 * The first row supplies the keys; cells missing from a data row become ''.
 * Duplicate header texts collapse onto one key (the last column wins).
 *
 * @param array $cells Rows of cell values; the first row holds the headers.
 * @return array One associative array per data row.
 */
function processTable(array $cells): array
{
    // array_shift() yields null for an empty table; normalise to no headers.
    $headers = array_shift($cells) ?? [];

    $processed = [];
    foreach ($cells as $row) {
        $rowData = [];
        foreach ($headers as $index => $header) {
            $rowData[(string) $header] = $row[$index] ?? '';
        }
        $processed[] = $rowData;
    }

    return $processed;
}

foreach ($result->tables as $table) {
    $structured = processTable($table->cells);
    echo "Structured table data:\n";
    echo json_encode($structured, JSON_PRETTY_PRINT) . "\n\n";
}

/**
 * Return the tables in which at least one cell contains the keyword.
 *
 * Matching is case-insensitive. Each matching table appears once.
 *
 * @param array $tables Objects exposing a `cells` array of rows.
 * @param string $keyword Text to search for inside the cells.
 * @return array Matching tables, in their original order.
 */
function findTablesWithKeyword(array $tables, string $keyword): array
{
    $matching = [];

    foreach ($tables as $table) {
        foreach ($table->cells as $row) {
            foreach ($row as $cell) {
                // Cast first: null cells would otherwise trip stripos().
                if (stripos((string) $cell, $keyword) !== false) {
                    $matching[] = $table;
                    // One hit is enough; move on to the next table.
                    break 2;
                }
            }
        }
    }

    return $matching;
}

$salesTables = findTablesWithKeyword($result->tables, 'sales');
echo "Tables containing 'sales': " . count($salesTables) . "\n";

/**
 * Convert a table into a list of rows keyed by header text.
 *
 * The first row supplies the keys; cells missing from a data row become null.
 * Duplicate header texts collapse onto one key (the last column wins).
 *
 * @param \Kreuzberg\Types\Table $table Table whose `cells` are converted.
 * @return array One associative array per data row; [] for an empty table.
 */
function tableToAssociativeArray(\Kreuzberg\Types\Table $table): array
{
    $cells = $table->cells;
    if (empty($cells)) {
        return [];
    }

    $headers = array_shift($cells);

    $rows = [];
    foreach ($cells as $row) {
        $rowData = [];
        foreach ($headers as $index => $header) {
            $rowData[(string) $header] = $row[$index] ?? null;
        }
        $rows[] = $rowData;
    }

    return $rows;
}

$result = $kreuzberg->extractFile('quarterly_report.pdf');

foreach ($result->tables as $index => $table) {
    $data = tableToAssociativeArray($table);

    echo "\nTable " . ($index + 1) . " data:\n";

    // Sum every numeric cell per column; non-numeric cells are skipped.
    $totals = [];
    foreach ($data as $row) {
        foreach ($row as $key => $value) {
            if (is_numeric($value)) {
                $totals[$key] = ($totals[$key] ?? 0) + floatval($value);
            }
        }
    }

    if (!empty($totals)) {
        echo "Column totals:\n";
        foreach ($totals as $column => $total) {
            echo "  $column: " . number_format($total, 2) . "\n";
        }
    }
}

// Export a summary of every table as JSON.
$allTablesJson = array_map(
    static fn ($table): array => [
        'page' => $table->pageNumber,
        'rows' => count($table->cells),
        'columns' => count($table->cells[0] ?? []),
        'data' => tableToAssociativeArray($table),
        'markdown' => $table->markdown,
    ],
    $result->tables,
);

// JSON_THROW_ON_ERROR: without it a failed encode returns false and an empty
// file would be written while still reporting success.
$json = json_encode($allTablesJson, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR);
if (file_put_contents('tables.json', $json) === false) {
    echo "\nCould not write tables.json\n";
} else {
    echo "\nAll tables exported to: tables.json\n";
}

/**
 * Merge the data rows of several tables under the first table's headers.
 *
 * The first row of every table is dropped as its header row; the remaining
 * rows are appended in order. Headers of later tables are not compared with
 * the first table's headers.
 *
 * @param array $tables Objects exposing a `cells` array of rows.
 * @return array ['headers' => ..., 'data' => ...], or [] when no tables are given.
 */
function mergeTables(array $tables): array
{
    if ($tables === []) {
        return [];
    }

    // reset() rather than $tables[0] so non-list input (e.g. array_filter
    // output with preserved keys) still works.
    $first = reset($tables);
    $headers = $first->cells[0] ?? [];

    $merged = [];
    foreach ($tables as $table) {
        $cells = $table->cells;
        array_shift($cells);
        foreach ($cells as $row) {
            $merged[] = $row;
        }
    }

    return ['headers' => $headers, 'data' => $merged];
}

$reportTables = findTablesWithKeyword($result->tables, 'Quarter');
if (!empty($reportTables)) {
    $merged = mergeTables($reportTables);
    echo "\nMerged " . count($reportTables) . " tables\n";
    echo "Total rows: " . count($merged['data']) . "\n";
}
```