This commit is contained in:
154
docs/snippets/php/extraction/batch_processing.php
Normal file
154
docs/snippets/php/extraction/batch_processing.php
Normal file
@@ -0,0 +1,154 @@
|
||||
```php title="batch_processing.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Batch Document Processing
|
||||
*
|
||||
* Process multiple documents in parallel for maximum performance.
|
||||
* Kreuzberg's batch API uses multiple threads to extract documents concurrently.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use function Kreuzberg\batch_extract_files;
|
||||
use function Kreuzberg\batch_extract_bytes;
|
||||
|
||||
// Candidate documents for the first batch run.
$files = [
    'document1.pdf',
    'document2.docx',
    'document3.xlsx',
    'presentation.pptx',
];

// Keep only paths that exist. array_filter() preserves the original keys, so
// re-index with array_values(): the result loop below looks up $files[$index],
// which only lines up when both sequences are numbered 0..n-1.
// NOTE(review): assumes batch_extract_files() returns a zero-based list in
// input order - confirm against the library.
$files = array_values(array_filter($files, 'file_exists'));

if (!empty($files)) {
    $fileCount = count($files);
    echo "Processing " . $fileCount . " files in batch...\n\n";

    // Time the whole batch call so the per-file average can be reported.
    $start = microtime(true);
    $results = batch_extract_files($files);
    $elapsed = microtime(true) - $start;

    echo "Batch extraction completed in " . number_format($elapsed, 3) . " seconds\n";
    echo "Average: " . number_format($elapsed / $fileCount, 3) . " seconds per file\n\n";

    foreach ($results as $index => $result) {
        $filename = basename($files[$index]);
        echo "$filename:\n";
        echo "  Content: " . strlen($result->content) . " chars\n";
        echo "  Tables: " . count($result->tables) . "\n";
        echo "  MIME: " . $result->mimeType . "\n\n";
    }
}
|
||||
|
||||
// Shared configuration for the object-oriented API: tables on, images off.
$config = new ExtractionConfig(extractTables: true, extractImages: false);
$kreuzberg = new Kreuzberg($config);

// Every PDF in the current working directory.
$pdfFiles = glob('*.pdf');

if (!empty($pdfFiles)) {
    $pdfCount = count($pdfFiles);
    echo "Processing " . $pdfCount . " PDF files...\n";

    // Measure only the batch call itself.
    $start = microtime(true);
    $results = $kreuzberg->batchExtractFiles($pdfFiles, $config);
    $elapsed = microtime(true) - $start;

    echo "Completed in " . number_format($elapsed, 2) . " seconds\n";
    echo "Throughput: " . number_format($pdfCount / $elapsed, 2) . " files/second\n\n";

    // Aggregate content length (in bytes, via strlen) and table count over all results.
    $totalChars = 0;
    $totalTables = 0;
    foreach ($results as $result) {
        $totalChars += strlen($result->content);
        $totalTables += count($result->tables);
    }

    echo "Total content: " . number_format($totalChars) . " characters\n";
    echo "Total tables: $totalTables\n";
}
|
||||
|
||||
// Source path => MIME type for the documents that are extracted from memory.
$uploadSources = [
    'file1.pdf' => 'application/pdf',
    'file2.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
];

// file_get_contents() returns false when a file cannot be read; such entries
// are skipped so that only real byte strings are handed to the batch call.
$uploadedFiles = [];
foreach ($uploadSources as $path => $mime) {
    $data = is_readable($path) ? file_get_contents($path) : false;
    if ($data === false) {
        echo "Skipping unreadable file: $path\n";
        continue;
    }
    $uploadedFiles[] = ['data' => $data, 'mime' => $mime];
}

// Split into the two parallel lists passed to batch_extract_bytes().
$dataList = array_column($uploadedFiles, 'data');
$mimeTypes = array_column($uploadedFiles, 'mime');

// Nothing readable means nothing to extract; avoid calling the library with empty input.
$results = empty($dataList) ? [] : batch_extract_bytes($dataList, $mimeTypes);

echo "\nProcessed " . count($results) . " files from memory\n";
|
||||
|
||||
/**
 * Recursively extract every supported document below a directory, in batches.
 *
 * Files are selected by lower-cased extension, split into chunks of
 * $batchSize and each chunk is passed to Kreuzberg::batchExtractFiles().
 * A progress line is echoed before each batch.
 *
 * @param string    $dir        Directory to walk recursively.
 * @param Kreuzberg $kreuzberg  Extractor used for every batch.
 * @param int       $batchSize  Maximum number of files per batch call (>= 1).
 * @param string[]  $extensions Lower-case extensions, without dot, to include.
 *
 * @return array All batch results combined with array_merge(), in batch order.
 *
 * @throws \InvalidArgumentException If $batchSize is smaller than 1.
 */
function processDirectory(
    string $dir,
    Kreuzberg $kreuzberg,
    int $batchSize = 10,
    array $extensions = ['pdf', 'docx', 'xlsx', 'pptx', 'txt']
): array {
    if ($batchSize < 1) {
        throw new \InvalidArgumentException('Batch size must be at least 1.');
    }

    // SKIP_DOTS keeps "." and ".." out of the walk; isFile() below still
    // filters out directories and other non-file entries.
    $iterator = new RecursiveIteratorIterator(
        new RecursiveDirectoryIterator($dir, \FilesystemIterator::SKIP_DOTS)
    );

    $files = [];
    foreach ($iterator as $file) {
        if (!$file->isFile()) {
            continue;
        }
        if (in_array(strtolower($file->getExtension()), $extensions, true)) {
            $files[] = $file->getPathname();
        }
    }

    if (empty($files)) {
        return [];
    }

    $batches = array_chunk($files, $batchSize);
    $batchCount = count($batches);

    // Collect each batch's result and merge once at the end instead of
    // re-copying the growing result array on every iteration.
    $resultsPerBatch = [];
    foreach ($batches as $batchIndex => $batch) {
        echo "Processing batch " . ($batchIndex + 1) . "/" . $batchCount . "...\n";
        $resultsPerBatch[] = $kreuzberg->batchExtractFiles($batch);
    }

    return array_merge(...$resultsPerBatch);
}
|
||||
|
||||
// Walk ./documents (only when it exists) with the helper defined above.
$directory = './documents';

if (is_dir($directory)) {
    echo "\nProcessing directory: $directory\n";
    $results = processDirectory($directory, $kreuzberg);
    echo "Processed " . count($results) . " files\n";
}

// Batch call wrapped in error handling: a KreuzbergException raised by the
// batch is reported on stdout instead of terminating the script.
$mixedFiles = ['valid.pdf', 'nonexistent.pdf', 'another.docx'];

try {
    $results = batch_extract_files($mixedFiles);
} catch (\Kreuzberg\Exceptions\KreuzbergException $e) {
    echo "Batch processing error: " . $e->getMessage() . "\n";
}
|
||||
|
||||
// Gather documents one extension at a time rather than with GLOB_BRACE, which
// is not defined on every platform. glob() may also return false, so fall
// back to an empty list to keep array_chunk() and count() safe.
$allFiles = [];
foreach (['pdf', 'docx', 'xlsx'] as $extension) {
    $allFiles = array_merge($allFiles, glob("documents/*.$extension") ?: []);
}

$batchSize = 5;
$batches = array_chunk($allFiles, $batchSize);
$batchCount = count($batches);
$totalProcessed = 0;

echo "\nProcessing " . count($allFiles) . " files in " . $batchCount . " batches...\n";

foreach ($batches as $index => $batch) {
    $results = $kreuzberg->batchExtractFiles($batch);
    $totalProcessed += count($results);

    // Report after the batch has finished so that 100% is only printed once
    // all work is done. "\r" rewrites the same terminal line each time.
    $progress = (($index + 1) / $batchCount) * 100;
    printf("\rProgress: %.1f%% [%d/%d batches]", $progress, $index + 1, $batchCount);
}

echo "\n\nCompleted! Processed $totalProcessed files.\n";
|
||||
```
|
||||
118
docs/snippets/php/extraction/docx_extraction.php
Normal file
118
docs/snippets/php/extraction/docx_extraction.php
Normal file
@@ -0,0 +1,118 @@
|
||||
```php title="docx_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* DOCX (Word) Document Extraction
|
||||
*
|
||||
* Extract text, tables, and metadata from Microsoft Word documents.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
$result = extract_file('document.docx');
|
||||
|
||||
echo "Word Document Extraction:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Content:\n";
|
||||
echo $result->content . "\n\n";
|
||||
|
||||
echo "Document Metadata:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Title: " . ($result->metadata->title ?? 'N/A') . "\n";
|
||||
echo "Authors: " . (isset($result->metadata->authors) ? implode(', ', $result->metadata->authors) : 'N/A') . "\n";
|
||||
echo "Created: " . ($result->metadata->createdAt ?? 'N/A') . "\n";
|
||||
echo "Modified: " . ($result->metadata->modifiedAt ?? 'N/A') . "\n";
|
||||
echo "Subject: " . ($result->metadata->subject ?? 'N/A') . "\n";
|
||||
echo "Keywords: " . implode(', ', $result->metadata->keywords ?? []) . "\n\n";
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
extractTables: true,
|
||||
preserveFormatting: true
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('report.docx');
|
||||
|
||||
foreach ($result->tables as $index => $table) {
|
||||
echo "Table " . ($index + 1) . ":\n";
|
||||
echo str_repeat('-', 60) . "\n";
|
||||
|
||||
foreach ($table->cells as $rowIndex => $row) {
|
||||
echo implode(' | ', $row) . "\n";
|
||||
if ($rowIndex === 0) {
|
||||
echo str_repeat('-', 60) . "\n";
|
||||
}
|
||||
}
|
||||
echo "\n";
|
||||
}
|
||||
|
||||
$conversions = [
|
||||
'plain' => null,
|
||||
'markdown' => 'markdown',
|
||||
];
|
||||
|
||||
foreach ($conversions as $name => $format) {
|
||||
$config = new ExtractionConfig(
|
||||
outputFormat: $format,
|
||||
preserveFormatting: $format !== null
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('document.docx');
|
||||
|
||||
$outputFile = "output_$name.txt";
|
||||
file_put_contents($outputFile, $result->content);
|
||||
echo "Saved $name format to: $outputFile\n";
|
||||
}
|
||||
|
||||
use function Kreuzberg\batch_extract_files;
|
||||
|
||||
$docxFiles = glob('*.docx');
|
||||
if (!empty($docxFiles)) {
|
||||
echo "\nBatch processing " . count($docxFiles) . " DOCX files...\n";
|
||||
|
||||
$results = batch_extract_files($docxFiles);
|
||||
|
||||
foreach ($results as $index => $result) {
|
||||
$filename = basename($docxFiles[$index]);
|
||||
echo "\n$filename:\n";
|
||||
echo " Characters: " . strlen($result->content) . "\n";
|
||||
echo " Tables: " . count($result->tables) . "\n";
|
||||
echo " Authors: " . (isset($result->metadata->authors) ? implode(', ', $result->metadata->authors) : 'Unknown') . "\n";
|
||||
}
|
||||
}
|
||||
|
||||
$result = extract_file('reviewed_document.docx');
|
||||
|
||||
if (!empty($result->metadata->createdBy)) {
|
||||
echo "\nDocument Information:\n";
|
||||
echo "Created by: " . $result->metadata->createdBy . "\n";
|
||||
}
|
||||
|
||||
if (!empty($result->metadata->producer)) {
|
||||
echo "Producer: " . $result->metadata->producer . "\n";
|
||||
}
|
||||
|
||||
$result = extract_file('document.docx');
$content = $result->content;

// Leading/trailing whitespace must not create phantom lines or paragraphs.
$trimmed = trim($content);

// preg_match_all() returns false on failure (for example invalid UTF-8 with
// the /u flag); count that as 0 so number_format() always receives a number.
$countMatches = static fn (string $pattern): int => (int) preg_match_all($pattern, $content);

$stats = [
    'characters' => mb_strlen($content),
    // Unicode-aware word match; str_word_count() only recognises ASCII letters
    // and would split words such as "naïve" in two.
    'words' => $countMatches('/[\p{L}\p{N}]+(?:[\'’-][\p{L}\p{N}]+)*/u'),
    // N newline characters separate N + 1 lines.
    'lines' => $trimmed === '' ? 0 : substr_count($trimmed, "\n") + 1,
    // Paragraphs are the non-empty chunks between blank lines.
    'paragraphs' => $trimmed === ''
        ? 0
        : count(preg_split('/\n\s*\n/', $trimmed, -1, PREG_SPLIT_NO_EMPTY) ?: []),
    // Rough estimate: every run of terminal punctuation counts as one sentence.
    'sentences' => $countMatches('/[.!?]+/'),
];

echo "\nDocument Statistics:\n";
echo str_repeat('=', 60) . "\n";
foreach ($stats as $metric => $value) {
    echo ucfirst($metric) . ": " . number_format($value) . "\n";
}
|
||||
```
|
||||
288
docs/snippets/php/extraction/excel_extraction.php
Normal file
288
docs/snippets/php/extraction/excel_extraction.php
Normal file
@@ -0,0 +1,288 @@
|
||||
```php title="excel_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Excel Spreadsheet Extraction
|
||||
*
|
||||
* This example demonstrates extracting content from Excel files (.xlsx, .xls).
|
||||
* Excel spreadsheets are automatically converted to tables and text.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
|
||||
echo "Example 1: Basic Excel Extraction\n";
|
||||
echo "=================================\n";
|
||||
|
||||
$kreuzberg = new Kreuzberg();
|
||||
$result = $kreuzberg->extractFile('financial_report.xlsx');
|
||||
|
||||
echo "Content:\n";
|
||||
echo $result->content . "\n\n";
|
||||
|
||||
echo "Metadata:\n";
|
||||
echo "- Title: " . ($result->metadata->title ?? 'N/A') . "\n";
|
||||
echo "- Author: " . (isset($result->metadata->authors) ? implode(', ', $result->metadata->authors) : 'N/A') . "\n";
|
||||
echo "- Created: " . ($result->metadata->createdAt ?? 'N/A') . "\n";
|
||||
echo "- Modified: " . ($result->metadata->modifiedAt ?? 'N/A') . "\n\n";
|
||||
|
||||
echo "Example 2: Extract Excel Tables\n";
|
||||
echo "===============================\n";
|
||||
|
||||
$config2 = new ExtractionConfig(
|
||||
extractTables: true
|
||||
);
|
||||
|
||||
$result2 = (new Kreuzberg($config2))->extractFile('data.xlsx');
|
||||
|
||||
if (count($result2->tables) > 0) {
|
||||
echo "Found " . count($result2->tables) . " table(s)\n\n";
|
||||
|
||||
foreach ($result2->tables as $i => $table) {
|
||||
echo "Table " . ($i + 1) . " (Sheet/Page {$table->pageNumber}):\n";
|
||||
echo $table->markdown . "\n\n";
|
||||
|
||||
echo "Raw data:\n";
|
||||
echo "Rows: " . count($table->cells) . "\n";
|
||||
echo "Columns: " . (count($table->cells) > 0 ? count($table->cells[0]) : 0) . "\n\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "Example 3: Convert Excel to CSV\n";
|
||||
echo "===============================\n";
|
||||
|
||||
$result3 = $kreuzberg->extractFile('spreadsheet.xlsx');

// Write each extracted table to its own CSV file, one table row per CSV record.
foreach ($result3->tables as $i => $table) {
    $csvFilename = "sheet_{$i}.csv";

    // fopen() returns false on failure (permissions, missing directory, ...);
    // skip this table instead of passing false to fputcsv().
    $fp = fopen($csvFilename, 'w');
    if ($fp === false) {
        echo "Could not open {$csvFilename} for writing\n";
        continue;
    }

    try {
        foreach ($table->cells as $row) {
            fputcsv($fp, $row);
        }
    } finally {
        // Release the handle even if writing a row throws.
        fclose($fp);
    }

    echo "Saved: {$csvFilename}\n";
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo "Example 4: Convert Excel to JSON\n";
|
||||
echo "================================\n";
|
||||
|
||||
$result4 = $kreuzberg->extractFile('data.xlsx');

// Turn every table into a list of objects keyed by the first (header) row.
foreach ($result4->tables as $i => $table) {
    $headers = array_values($table->cells[0] ?? []);
    $jsonData = [];

    foreach (array_slice($table->cells, 1) as $row) {
        $rowData = [];
        foreach ($headers as $k => $header) {
            // Short rows are padded with empty strings. Duplicate header
            // names overwrite each other; the last column wins.
            $rowData[$header] = $row[$k] ?? '';
        }
        $jsonData[] = $rowData;
    }

    $jsonFilename = "sheet_{$i}.json";

    // Without JSON_THROW_ON_ERROR a failed encode returns false, which would
    // be written out as an empty file and still be reported as "Saved".
    try {
        $json = json_encode($jsonData, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR);
    } catch (\JsonException $e) {
        echo "Skipped {$jsonFilename}: {$e->getMessage()}\n";
        continue;
    }

    file_put_contents($jsonFilename, $json);
    echo "Saved: {$jsonFilename}\n";
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo "Example 5: Process Multiple Sheets\n";
|
||||
echo "==================================\n";
|
||||
|
||||
$result5 = $kreuzberg->extractFile('multi_sheet_workbook.xlsx');
|
||||
|
||||
echo "Total sheets/tables: " . count($result5->tables) . "\n\n";
|
||||
|
||||
foreach ($result5->tables as $i => $table) {
|
||||
echo "Sheet " . ($i + 1) . ":\n";
|
||||
echo "- Rows: " . count($table->cells) . "\n";
|
||||
echo "- Columns: " . (count($table->cells) > 0 ? count($table->cells[0]) : 0) . "\n";
|
||||
|
||||
if (count($table->cells) > 1) {
|
||||
$numericColumns = [];
|
||||
|
||||
for ($col = 0; $col < count($table->cells[0]); $col++) {
|
||||
$isNumeric = true;
|
||||
|
||||
for ($row = 1; $row < count($table->cells); $row++) {
|
||||
$value = $table->cells[$row][$col] ?? '';
|
||||
if (!is_numeric(trim($value)) && trim($value) !== '') {
|
||||
$isNumeric = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ($isNumeric) {
|
||||
$numericColumns[] = $col;
|
||||
}
|
||||
}
|
||||
|
||||
if (!empty($numericColumns)) {
|
||||
echo "- Numeric columns: " . count($numericColumns) . "\n";
|
||||
|
||||
$col = $numericColumns[0];
|
||||
$sum = 0;
|
||||
for ($row = 1; $row < count($table->cells); $row++) {
|
||||
$value = $table->cells[$row][$col] ?? '0';
|
||||
$sum += (float) $value;
|
||||
}
|
||||
|
||||
$columnName = $table->cells[0][$col] ?? "Column {$col}";
|
||||
echo "- Sum of '{$columnName}': {$sum}\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
}
|
||||
|
||||
echo "Example 6: Extract Specific Data\n";
|
||||
echo "================================\n";
|
||||
|
||||
$result6 = $kreuzberg->extractFile('budget.xlsx');
|
||||
|
||||
if (count($result6->tables) > 0) {
|
||||
$table = $result6->tables[0];
|
||||
|
||||
echo "Header row:\n";
|
||||
if (count($table->cells) > 0) {
|
||||
print_r($table->cells[0]);
|
||||
}
|
||||
|
||||
echo "\nFirst data row:\n";
|
||||
if (count($table->cells) > 1) {
|
||||
print_r($table->cells[1]);
|
||||
}
|
||||
|
||||
if (count($table->cells) > 1 && count($table->cells[1]) > 2) {
|
||||
$cellValue = $table->cells[1][2];
|
||||
echo "\nCell [1][2]: {$cellValue}\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo "Example 7: Batch Process Excel Files\n";
|
||||
echo "====================================\n";
|
||||
|
||||
$excelFiles = [
|
||||
'january_sales.xlsx',
|
||||
'february_sales.xlsx',
|
||||
'march_sales.xlsx',
|
||||
];
|
||||
|
||||
$results = $kreuzberg->batchExtractFiles($excelFiles);
|
||||
|
||||
$totalSheets = 0;
|
||||
foreach ($results as $i => $result) {
|
||||
$sheetCount = count($result->tables);
|
||||
$totalSheets += $sheetCount;
|
||||
|
||||
echo "{$excelFiles[$i]}:\n";
|
||||
echo "- Sheets: {$sheetCount}\n";
|
||||
echo "- Text length: " . strlen($result->content) . " characters\n\n";
|
||||
}
|
||||
|
||||
echo "Total sheets across all files: {$totalSheets}\n\n";
|
||||
|
||||
echo "Example 8: Convert Excel to HTML\n";
|
||||
echo "================================\n";
|
||||
|
||||
$result8 = $kreuzberg->extractFile('report.xlsx');
|
||||
|
||||
foreach ($result8->tables as $i => $table) {
|
||||
$html = "<table border='1'>\n";
|
||||
|
||||
foreach ($table->cells as $rowIndex => $row) {
|
||||
$html .= " <tr>\n";
|
||||
|
||||
$tag = $rowIndex === 0 ? 'th' : 'td';
|
||||
|
||||
foreach ($row as $cell) {
|
||||
$escapedCell = htmlspecialchars($cell);
|
||||
$html .= " <{$tag}>{$escapedCell}</{$tag}>\n";
|
||||
}
|
||||
|
||||
$html .= " </tr>\n";
|
||||
}
|
||||
|
||||
$html .= "</table>\n";
|
||||
|
||||
$htmlFilename = "sheet_{$i}.html";
|
||||
file_put_contents($htmlFilename, $html);
|
||||
echo "Saved: {$htmlFilename}\n";
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo "Example 9: Excel Metadata Extraction\n";
|
||||
echo "====================================\n";
|
||||
|
||||
$result9 = $kreuzberg->extractFile('workbook.xlsx');
|
||||
|
||||
echo "File Metadata:\n";
|
||||
echo "- Title: " . ($result9->metadata->title ?? 'N/A') . "\n";
|
||||
echo "- Subject: " . ($result9->metadata->subject ?? 'N/A') . "\n";
|
||||
echo "- Authors: " . (isset($result9->metadata->authors) ? implode(', ', $result9->metadata->authors) : 'N/A') . "\n";
|
||||
echo "- Created: " . ($result9->metadata->createdAt ?? 'N/A') . "\n";
|
||||
echo "- Modified: " . ($result9->metadata->modifiedAt ?? 'N/A') . "\n";
|
||||
echo "- Created By: " . ($result9->metadata->createdBy ?? 'N/A') . "\n";
|
||||
echo "- Keywords: " . (isset($result9->metadata->keywords) ? implode(', ', $result9->metadata->keywords) : 'N/A') . "\n";
|
||||
|
||||
if (!empty($result9->metadata->custom)) {
|
||||
echo "\nCustom Properties:\n";
|
||||
foreach ($result9->metadata->custom as $key => $value) {
|
||||
echo "- {$key}: {$value}\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo "Example 10: Error Handling\n";
|
||||
echo "=========================\n";
|
||||
|
||||
use Kreuzberg\Exceptions\KreuzbergException;
|
||||
|
||||
try {
|
||||
$result = $kreuzberg->extractFile('protected.xlsx');
|
||||
echo "Success: Extracted " . count($result->tables) . " sheets\n";
|
||||
} catch (KreuzbergException $e) {
|
||||
echo "Error: {$e->getMessage()}\n";
|
||||
echo "Note: Password-protected files may require special handling\n";
|
||||
}
|
||||
|
||||
echo "\n\nSupported Excel Formats:\n";
|
||||
echo "========================\n";
|
||||
echo "- .xlsx (Office Open XML)\n";
|
||||
echo "- .xls (Legacy Excel format)\n";
|
||||
echo "- .xlsm (Macro-enabled)\n";
|
||||
echo "- .xlsb (Binary workbook)\n";
|
||||
echo "- .xltx (Template)\n";
|
||||
|
||||
echo "\n\nBest Practices:\n";
|
||||
echo "===============\n";
|
||||
echo "1. Excel tables are automatically detected as Table objects\n";
|
||||
echo "2. Each sheet becomes a separate table\n";
|
||||
echo "3. Use table->cells for programmatic access to cell data\n";
|
||||
echo "4. Use table->markdown for human-readable output\n";
|
||||
echo "5. First row is often headers - handle accordingly\n";
|
||||
echo "6. Check for numeric columns to perform calculations\n";
|
||||
echo "7. Export to CSV/JSON for database import\n";
|
||||
echo "8. Use batch processing for multiple Excel files\n";
|
||||
```
|
||||
159
docs/snippets/php/extraction/image_extraction.php
Normal file
159
docs/snippets/php/extraction/image_extraction.php
Normal file
@@ -0,0 +1,159 @@
|
||||
```php title="image_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Image Extraction from Documents
|
||||
*
|
||||
* Extract embedded images from PDFs, Office documents, and other formats.
|
||||
* Optionally perform OCR on extracted images.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ImageExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
imageExtraction: new ImageExtractionConfig(
|
||||
extractImages: true,
|
||||
minWidth: 100,
|
||||
minHeight: 100
|
||||
),
|
||||
extractImages: true
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('presentation.pptx');
|
||||
|
||||
echo "Image Extraction Results:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Images found: " . count($result->images ?? []) . "\n\n";
|
||||
|
||||
foreach ($result->images ?? [] as $image) {
|
||||
$filename = sprintf(
|
||||
'extracted_p%d_i%d_%dx%d.%s',
|
||||
$image->pageNumber,
|
||||
$image->imageIndex,
|
||||
$image->width,
|
||||
$image->height,
|
||||
$image->format
|
||||
);
|
||||
|
||||
file_put_contents($filename, $image->data);
|
||||
echo "Saved: $filename\n";
|
||||
echo " Size: {$image->width}x{$image->height} pixels\n";
|
||||
echo " Format: {$image->format}\n";
|
||||
echo " Data: " . number_format(strlen($image->data)) . " bytes\n\n";
|
||||
}
|
||||
|
||||
$ocrConfig = new ExtractionConfig(
|
||||
imageExtraction: new ImageExtractionConfig(
|
||||
extractImages: true,
|
||||
performOcr: true,
|
||||
minWidth: 200,
|
||||
minHeight: 100
|
||||
),
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng'
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($ocrConfig);
|
||||
$result = $kreuzberg->extractFile('scanned_images.pdf');
|
||||
|
||||
echo "Images with OCR:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
foreach ($result->images ?? [] as $image) {
|
||||
echo "Image {$image->imageIndex} from page {$image->pageNumber}:\n";
|
||||
|
||||
if ($image->ocrResult !== null) {
|
||||
echo " OCR Text: " . substr($image->ocrResult->content, 0, 100) . "...\n";
|
||||
echo " OCR Length: " . strlen($image->ocrResult->content) . " chars\n";
|
||||
} else {
|
||||
echo " No OCR result\n";
|
||||
}
|
||||
echo "\n";
|
||||
}
|
||||
|
||||
$largeImageConfig = new ExtractionConfig(
|
||||
imageExtraction: new ImageExtractionConfig(
|
||||
extractImages: true,
|
||||
minWidth: 500,
|
||||
minHeight: 500
|
||||
),
|
||||
extractImages: true
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($largeImageConfig);
|
||||
$result = $kreuzberg->extractFile('photo_album.pdf');
|
||||
|
||||
echo "Large images (>500x500):\n";
|
||||
foreach ($result->images ?? [] as $image) {
|
||||
$filename = "large_image_{$image->imageIndex}.{$image->format}";
|
||||
file_put_contents($filename, $image->data);
|
||||
echo "Saved: $filename ({$image->width}x{$image->height})\n";
|
||||
}
|
||||
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
|
||||
$imageTypes = [];
|
||||
foreach ($result->images ?? [] as $image) {
|
||||
if (!isset($imageTypes[$image->format])) {
|
||||
$imageTypes[$image->format] = [];
|
||||
}
|
||||
$imageTypes[$image->format][] = $image;
|
||||
}
|
||||
|
||||
echo "\nImages by format:\n";
|
||||
foreach ($imageTypes as $format => $images) {
|
||||
echo " $format: " . count($images) . " images\n";
|
||||
|
||||
$dir = "images_$format";
|
||||
if (!is_dir($dir)) {
|
||||
mkdir($dir, 0755, true);
|
||||
}
|
||||
|
||||
foreach ($images as $index => $image) {
|
||||
$filename = "$dir/image_$index.$format";
|
||||
file_put_contents($filename, $image->data);
|
||||
}
|
||||
echo " Saved to: $dir/\n";
|
||||
}
|
||||
|
||||
// Create 200px-wide thumbnails for PNG/JPEG images when the GD extension is available.
if (extension_loaded('gd')) {
    $thumbWidth = 200;

    foreach ($result->images ?? [] as $image) {
        // NOTE(review): the exact format strings reported by the library are
        // not visible here; 'jpeg' is accepted next to 'jpg' in case the long
        // form is used - confirm.
        $isPng = $image->format === 'png';
        if (!$isPng && !in_array($image->format, ['jpg', 'jpeg'], true)) {
            continue;
        }

        $gdImage = imagecreatefromstring($image->data);
        if ($gdImage === false) {
            continue;
        }

        $width = imagesx($gdImage);
        $height = imagesy($gdImage);

        // Preserve the aspect ratio, but never go below 1px: for very wide
        // images the scaled height truncates to 0, which
        // imagecreatetruecolor() rejects.
        $thumbHeight = max(1, (int) (($height / $width) * $thumbWidth));

        $thumb = imagecreatetruecolor($thumbWidth, $thumbHeight);
        if ($thumb === false) {
            continue;
        }

        if ($isPng) {
            // Keep the alpha channel; otherwise transparent areas turn black.
            imagealphablending($thumb, false);
            imagesavealpha($thumb, true);
        }

        imagecopyresampled(
            $thumb,
            $gdImage,
            0,
            0,
            0,
            0,
            $thumbWidth,
            $thumbHeight,
            $width,
            $height
        );

        $thumbFile = "thumb_{$image->imageIndex}.{$image->format}";
        if ($isPng) {
            imagepng($thumb, $thumbFile);
        } else {
            imagejpeg($thumb, $thumbFile, 85);
        }

        echo "Created thumbnail: $thumbFile\n";

        // GdImage objects are released automatically as of PHP 8.0, where
        // imagedestroy() has no effect; dropping the references is enough.
        unset($gdImage, $thumb);
    }
}
|
||||
```
|
||||
196
docs/snippets/php/extraction/metadata_extraction.php
Normal file
196
docs/snippets/php/extraction/metadata_extraction.php
Normal file
@@ -0,0 +1,196 @@
|
||||
```php title="metadata_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Metadata Extraction
|
||||
*
|
||||
* Extract and process document metadata including title, author,
|
||||
* creation date, keywords, and custom properties.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
$result = extract_file('document.pdf');
|
||||
$metadata = $result->metadata;
|
||||
|
||||
echo "Document Metadata:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Title: " . ($metadata->title ?? 'N/A') . "\n";
|
||||
echo "Authors: " . (isset($metadata->authors) ? implode(', ', $metadata->authors) : 'N/A') . "\n";
|
||||
echo "Subject: " . ($metadata->subject ?? 'N/A') . "\n";
|
||||
echo "Created By: " . ($metadata->createdBy ?? 'N/A') . "\n";
|
||||
echo "Producer: " . ($metadata->producer ?? 'N/A') . "\n";
|
||||
echo "Created: " . ($metadata->createdAt ?? 'N/A') . "\n";
|
||||
echo "Modified: " . ($metadata->modifiedAt ?? 'N/A') . "\n";
|
||||
echo "Page Count: " . ($metadata->pageCount ?? 'N/A') . "\n";
|
||||
echo "Keywords: " . implode(', ', $metadata->keywords ?? []) . "\n";
|
||||
echo "Language: " . ($metadata->language ?? 'N/A') . "\n\n";
|
||||
|
||||
// Gather documents one extension at a time rather than with GLOB_BRACE, which
// is not defined on every platform. glob() may also return false, so fall
// back to an empty list to keep the loop below safe.
$files = [];
foreach (['pdf', 'docx', 'xlsx'] as $extension) {
    $files = array_merge($files, glob("documents/*.$extension") ?: []);
}

// One flat summary row per document; missing metadata gets a placeholder.
$metadataCollection = [];

foreach ($files as $file) {
    $result = extract_file($file);
    $metadataCollection[] = [
        'file' => basename($file),
        'title' => $result->metadata->title ?? 'Untitled',
        'author' => isset($result->metadata->authors) ? implode(', ', $result->metadata->authors) : 'Unknown',
        'created' => $result->metadata->createdAt ?? 'Unknown',
        'pages' => $result->metadata->pageCount ?? 0,
        // filesize() returns false on failure; store 0 so later arithmetic stays numeric.
        'size' => filesize($file) ?: 0,
    ];
}
|
||||
|
||||
echo "Metadata Collection:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
foreach ($metadataCollection as $meta) {
|
||||
echo "{$meta['file']}:\n";
|
||||
echo " Title: {$meta['title']}\n";
|
||||
echo " Author: {$meta['author']}\n";
|
||||
echo " Created: {$meta['created']}\n";
|
||||
echo " Pages: {$meta['pages']}\n";
|
||||
echo " Size: " . number_format($meta['size'] / 1024, 2) . " KB\n\n";
|
||||
}
|
||||
|
||||
/**
 * Select the entries whose 'author' field contains the given text.
 *
 * Matching is case-insensitive (stripos). Original array keys are preserved
 * because array_filter() does not re-index.
 *
 * @param array  $collection Entries that each carry an 'author' string.
 * @param string $author     Text to look for inside the author field.
 *
 * @return array Matching entries, keyed as in $collection.
 */
function searchByAuthor(array $collection, string $author): array
{
    $matchesAuthor = static fn ($meta) => stripos($meta['author'], $author) !== false;

    return array_filter($collection, $matchesAuthor);
}
|
||||
|
||||
/**
 * Select the entries whose creation date lies within [$start, $end].
 *
 * Only the first 10 characters of 'created' are compared, as plain strings,
 * against the bounds. Entries whose 'created' value is the placeholder
 * 'Unknown' never match. Original array keys are preserved.
 *
 * @param array  $collection Entries that each carry a 'created' string.
 * @param string $start      Inclusive lower bound.
 * @param string $end        Inclusive upper bound.
 *
 * @return array Matching entries, keyed as in $collection.
 */
function searchByDateRange(array $collection, string $start, string $end): array
{
    $isWithinRange = static function ($meta) use ($start, $end) {
        if ($meta['created'] === 'Unknown') {
            return false;
        }

        $day = substr($meta['created'], 0, 10);

        return $start <= $day && $day <= $end;
    };

    return array_filter($collection, $isWithinRange);
}
|
||||
|
||||
$johnDocs = searchByAuthor($metadataCollection, 'John');
|
||||
echo "Documents by John: " . count($johnDocs) . "\n";
|
||||
|
||||
$recentDocs = searchByDateRange($metadataCollection, '2024-01-01', '2024-12-31');
|
||||
echo "Documents from 2024: " . count($recentDocs) . "\n\n";
|
||||
|
||||
/**
 * Render the metadata collection as a minimal HTML page with one table.
 *
 * Each entry becomes a table row with its file, title, author, created and
 * pages values; every cell value is passed through htmlspecialchars().
 *
 * @param array $collection Entries with 'file', 'title', 'author', 'created' and 'pages' keys.
 *
 * @return string The complete HTML document.
 */
function generateCatalog(array $collection): string
{
    $rows = '';

    foreach ($collection as $meta) {
        // Column order must match the header row below.
        $cells = [
            $meta['file'],
            $meta['title'],
            $meta['author'],
            $meta['created'],
            (string)$meta['pages'],
        ];

        $rows .= "<tr>";
        foreach ($cells as $cell) {
            $rows .= "<td>" . htmlspecialchars($cell) . "</td>";
        }
        $rows .= "</tr>\n";
    }

    return "<html><head><title>Document Catalog</title></head><body>\n"
        . "<h1>Document Catalog</h1>\n"
        . "<table border='1'>\n"
        . "<tr><th>File</th><th>Title</th><th>Author</th><th>Created</th><th>Pages</th></tr>\n"
        . $rows
        . "</table>\n</body></html>";
}
|
||||
|
||||
$catalog = generateCatalog($metadataCollection);
|
||||
file_put_contents('catalog.html', $catalog);
|
||||
echo "Catalog saved to: catalog.html\n";
|
||||
|
||||
/**
 * Write the metadata collection to a CSV file, preceded by a header row.
 *
 * @param array  $collection Entries with 'file', 'title', 'author', 'created',
 *                           'pages' and 'size' (bytes) keys.
 * @param string $filename   Destination path; opened with mode 'w', so an
 *                           existing file is truncated.
 *
 * @throws \RuntimeException If the destination cannot be opened for writing.
 */
function exportMetadataToCSV(array $collection, string $filename): void
{
    $fp = fopen($filename, 'w');
    if ($fp === false) {
        // fopen() signals failure with false; fail loudly here instead of
        // letting fputcsv() raise a TypeError on a non-resource.
        throw new \RuntimeException("Unable to open {$filename} for writing");
    }

    // The escape character is passed explicitly because relying on
    // fputcsv()'s default is deprecated as of PHP 8.4.
    $writeRow = static fn (array $fields) => fputcsv($fp, $fields, escape: '\\');

    try {
        $writeRow(['File', 'Title', 'Author', 'Created', 'Pages', 'Size (KB)']);

        foreach ($collection as $meta) {
            $writeRow([
                $meta['file'],
                $meta['title'],
                $meta['author'],
                $meta['created'],
                $meta['pages'],
                // No thousands separator: the column must stay machine-readable.
                number_format($meta['size'] / 1024, 2, '.', ''),
            ]);
        }
    } finally {
        // Always release the handle, even when a row fails to serialise.
        fclose($fp);
    }
}
|
||||
|
||||
exportMetadataToCSV($metadataCollection, 'metadata.csv');
|
||||
echo "Metadata exported to: metadata.csv\n";
|
||||
|
||||
// Aggregate figures over the whole collection.
$documentCount = count($metadataCollection);
$totalPages = array_sum(array_column($metadataCollection, 'pages'));
$totalSize = array_sum(array_column($metadataCollection, 'size'));
$authors = array_unique(array_column($metadataCollection, 'author'));

// An empty collection would otherwise raise DivisionByZeroError below.
$averagePages = $documentCount > 0 ? $totalPages / $documentCount : 0;

echo "\nCollection Statistics:\n";
echo str_repeat('=', 60) . "\n";
echo "Total documents: " . $documentCount . "\n";
echo "Total pages: " . number_format($totalPages) . "\n";
echo "Total size: " . number_format($totalSize / 1024 / 1024, 2) . " MB\n";
echo "Unique authors: " . count($authors) . "\n";
echo "Average pages per document: " . number_format($averagePages, 1) . "\n";
|
||||
|
||||
$byAuthor = [];
|
||||
foreach ($metadataCollection as $meta) {
|
||||
$author = $meta['author'];
|
||||
if (!isset($byAuthor[$author])) {
|
||||
$byAuthor[$author] = [];
|
||||
}
|
||||
$byAuthor[$author][] = $meta;
|
||||
}
|
||||
|
||||
echo "\nDocuments by Author:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
foreach ($byAuthor as $author => $docs) {
|
||||
echo "$author: " . count($docs) . " documents\n";
|
||||
}
|
||||
|
||||
/**
 * List the quality problems found in one metadata entry.
 *
 * A text field counts as missing when it is empty or still holds its
 * placeholder value; a page count of exactly integer 0 is reported as invalid.
 *
 * @param array $meta Entry with 'title', 'author', 'created' and 'pages' keys.
 *
 * @return array Human-readable issue descriptions; empty when nothing is wrong.
 */
function validateMetadata(array $meta): array
{
    // field => [placeholder value, message reported when missing]
    $textChecks = [
        'title' => ['Untitled', 'Missing title'],
        'author' => ['Unknown', 'Missing author'],
        'created' => ['Unknown', 'Missing creation date'],
    ];

    $issues = [];

    foreach ($textChecks as $field => [$placeholder, $message]) {
        if (empty($meta[$field]) || $meta[$field] === $placeholder) {
            $issues[] = $message;
        }
    }

    if ($meta['pages'] === 0) {
        $issues[] = 'Invalid page count';
    }

    return $issues;
}
|
||||
|
||||
echo "\nMetadata Quality Check:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
$incomplete = 0;
|
||||
foreach ($metadataCollection as $meta) {
|
||||
$issues = validateMetadata($meta);
|
||||
if (!empty($issues)) {
|
||||
$incomplete++;
|
||||
echo "{$meta['file']}: " . implode(', ', $issues) . "\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "\nIncomplete metadata: $incomplete/" . count($metadataCollection) . " documents\n";
|
||||
```
|
||||
282
docs/snippets/php/extraction/multi_format.php
Normal file
282
docs/snippets/php/extraction/multi_format.php
Normal file
@@ -0,0 +1,282 @@
|
||||
```php title="multi_format.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Multi-Format Document Extraction
|
||||
*
|
||||
* Handle various document formats (PDF, DOCX, XLSX, PPTX, images, etc.)
|
||||
* with format-specific processing and unified output.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use function Kreuzberg\extract_file;
|
||||
use function Kreuzberg\detect_mime_type_from_path;
|
||||
|
||||
// Sample inputs, keyed by a human-readable format label.
$formats = [
    'PDF' => 'document.pdf',
    'Word' => 'document.docx',
    'Excel' => 'spreadsheet.xlsx',
    'PowerPoint' => 'presentation.pptx',
    'Text' => 'readme.txt',
    'HTML' => 'page.html',
    'Markdown' => 'guide.md',
    'Image' => 'scan.png',
];

echo "Multi-Format Extraction:\n";
echo str_repeat('=', 60) . "\n\n";

$kreuzberg = new Kreuzberg();

foreach ($formats as $type => $file) {
    // Only files that exist on disk are processed; the rest are skipped silently.
    if (file_exists($file)) {
        echo "Processing {$type} ({$file}):\n";

        $mimeType = detect_mime_type_from_path($file);
        echo " MIME type: {$mimeType}\n";

        $result = $kreuzberg->extractFile($file);

        echo ' Content length: ' . strlen($result->content) . " chars\n";
        echo ' Tables: ' . count($result->tables) . "\n";
        echo ' Images: ' . count($result->images ?? []) . "\n";
        echo ' Pages: ' . ($result->metadata->pageCount ?? 'N/A') . "\n";
        echo "\n";
    }
}
|
||||
|
||||
// glob() may return false instead of an empty array (on error, and on some
// systems when nothing matches), so normalise it before iterating.
$mixedFiles = glob('documents/*.*') ?: [];
$byFormat = [];

// Extract every matched file and group a small summary of it by extension.
foreach ($mixedFiles as $file) {
    $mimeType = detect_mime_type_from_path($file);

    // Lower-case the extension so "report.PDF" and "report.pdf" land in the
    // same group; the per-extension config lookup further down lower-cases too.
    $extension = strtolower(pathinfo($file, PATHINFO_EXTENSION));

    $result = extract_file($file);

    $byFormat[$extension][] = [
        'file' => basename($file),
        'mime' => $mimeType,
        'size' => strlen($result->content),
        'tables' => count($result->tables),
    ];
}

echo "Files by Format:\n";
echo str_repeat('=', 60) . "\n";

foreach ($byFormat as $ext => $files) {
    echo strtoupper($ext) . ": " . count($files) . " files\n";

    // Sum the per-file figures collected above.
    $totalSize = array_sum(array_column($files, 'size'));
    $totalTables = array_sum(array_column($files, 'tables'));

    echo " Total content: " . number_format($totalSize) . " chars\n";
    echo " Total tables: $totalTables\n\n";
}
|
||||
|
||||
// Extraction settings per file type, keyed by lower-case file extension.
$formatConfigs = [
    'pdf' => new ExtractionConfig(
        extractTables: true,
        extractImages: true,
        pdf: new \Kreuzberg\Config\PdfConfig(extractImages: true, imageQuality: 85)
    ),
    'docx' => new ExtractionConfig(extractTables: true, preserveFormatting: true),
    'xlsx' => new ExtractionConfig(extractTables: true),
    'png' => new ExtractionConfig(
        ocr: new \Kreuzberg\Config\OcrConfig(backend: 'tesseract', language: 'eng')
    ),
];

foreach ($mixedFiles as $file) {
    $ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));

    // Files whose extension has no dedicated configuration are left alone.
    if (isset($formatConfigs[$ext])) {
        $config = $formatConfigs[$ext];
        $kreuzberg = new Kreuzberg($config);
        $result = $kreuzberg->extractFile($file);

        echo 'Processed ' . basename($file) . " with {$ext} config\n";
    }
}
|
||||
|
||||
/**
 * Build a Markdown document from an input file.
 *
 * The output consists of a level-1 heading (the metadata title, or the file's
 * base name when no title is set), an optional authors line, the extracted
 * content, and one "## Table N" section per extracted table.
 *
 * @param string $inputFile Path of the document to extract.
 *
 * @return string The assembled Markdown text.
 */
function convertToMarkdown(string $inputFile): string
{
    $config = new ExtractionConfig(
        preserveFormatting: true,
        outputFormat: 'markdown',
        extractTables: true
    );

    $kreuzberg = new Kreuzberg($config);
    $result = $kreuzberg->extractFile($inputFile);

    $markdown = "# " . ($result->metadata->title ?? basename($inputFile)) . "\n\n";

    // empty() rather than isset(): an authors list that is set but empty would
    // otherwise produce a dangling "_Authors: _" line.
    if (!empty($result->metadata->authors)) {
        $markdown .= "_Authors: " . implode(', ', $result->metadata->authors) . "_\n\n";
    }

    $markdown .= $result->content . "\n\n";

    // Tables are numbered from 1 in the output.
    foreach ($result->tables as $index => $table) {
        $markdown .= "## Table " . ($index + 1) . "\n\n";
        $markdown .= $table->markdown . "\n\n";
    }

    return $markdown;
}
|
||||
|
||||
echo "\nConverting to Markdown:\n";
echo str_repeat('=', 60) . "\n";

foreach (['document.pdf', 'document.docx'] as $file) {
    if (!file_exists($file)) {
        continue;
    }

    $markdown = convertToMarkdown($file);

    // Keep the source extension in the output name. Both inputs share the base
    // name "document", so "<base>.md" alone would make the second conversion
    // overwrite the first.
    $outputFile = pathinfo($file, PATHINFO_FILENAME)
        . '_' . pathinfo($file, PATHINFO_EXTENSION)
        . '.md';

    // file_put_contents() returns false on failure; do not report success then.
    if (file_put_contents($outputFile, $markdown) === false) {
        echo "Failed to write: $outputFile\n";
        continue;
    }

    echo "Converted: $file -> $outputFile\n";
}
|
||||
|
||||
/**
 * Run extract_file() on an archive and summarise the outcome.
 *
 * @param string $archiveFile Path handed to extract_file().
 *
 * @return array Keys: 'archive' (base name of the path), 'listing' (the
 *               result's content) and 'mime' (the result's MIME type).
 */
function extractFromArchive(string $archiveFile): array
{
    $extracted = extract_file($archiveFile);

    $summary = [];
    $summary['archive'] = basename($archiveFile);
    $summary['listing'] = $extracted->content;
    $summary['mime'] = $extracted->mimeType;

    return $summary;
}
|
||||
|
||||
/**
 * Dispatches a file to a format-specific extraction routine chosen by its
 * detected MIME type.
 *
 * PDF, DOCX, XLSX, PNG and JPEG have dedicated handlers; any other MIME type
 * is processed by the generic handler.
 */
class UniversalExtractor
{
    /** Default-configured extractor used by the handlers that need no special options. */
    private Kreuzberg $kreuzberg;

    /** @var array<string, string> MIME type => name of the private handler method. */
    private array $formatHandlers = [];

    public function __construct()
    {
        $this->kreuzberg = new Kreuzberg();

        $this->formatHandlers = [
            'application/pdf' => 'handlePDF',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'handleDOCX',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => 'handleXLSX',
            'image/png' => 'handleImage',
            'image/jpeg' => 'handleImage',
        ];
    }

    /**
     * Extract a file with the handler registered for its MIME type.
     *
     * @param string $file Path of the file to extract.
     *
     * @return array Handler-specific summary; every handler includes 'type' and 'content'.
     */
    public function extract(string $file): array
    {
        $mimeType = detect_mime_type_from_path($file);

        // Unregistered MIME types fall back to the generic handler.
        $method = $this->formatHandlers[$mimeType] ?? 'handleGeneric';

        return $this->{$method}($file, $mimeType);
    }

    /** PDF: extraction with tables and images enabled. */
    private function handlePDF(string $file, string $mimeType): array
    {
        $extractor = new Kreuzberg(
            new ExtractionConfig(extractTables: true, extractImages: true)
        );
        $result = $extractor->extractFile($file);

        return [
            'type' => 'PDF',
            'content' => $result->content,
            'tables' => count($result->tables),
            'images' => count($result->images ?? []),
            'pages' => $result->metadata->pageCount,
        ];
    }

    /** DOCX: default extraction; the summary also carries the authors metadata. */
    private function handleDOCX(string $file, string $mimeType): array
    {
        $result = $this->kreuzberg->extractFile($file);

        return [
            'type' => 'Word Document',
            'content' => $result->content,
            'tables' => count($result->tables),
            'authors' => $result->metadata->authors,
        ];
    }

    /** XLSX: extraction with tables enabled; the table count is reported as 'sheets'. */
    private function handleXLSX(string $file, string $mimeType): array
    {
        $extractor = new Kreuzberg(new ExtractionConfig(extractTables: true));
        $result = $extractor->extractFile($file);

        return [
            'type' => 'Excel Spreadsheet',
            'content' => $result->content,
            'sheets' => count($result->tables),
        ];
    }

    /** PNG/JPEG: extraction with a Tesseract OCR configuration (language 'eng'). */
    private function handleImage(string $file, string $mimeType): array
    {
        $ocr = new \Kreuzberg\Config\OcrConfig(backend: 'tesseract', language: 'eng');
        $extractor = new Kreuzberg(new ExtractionConfig(ocr: $ocr));
        $result = $extractor->extractFile($file);

        return [
            'type' => 'Image (OCR)',
            'content' => $result->content,
            'ocr_length' => strlen($result->content),
        ];
    }

    /** Any other MIME type: default extraction; the MIME type is echoed back in the summary. */
    private function handleGeneric(string $file, string $mimeType): array
    {
        $result = $this->kreuzberg->extractFile($file);

        return [
            'type' => 'Generic',
            'mime' => $mimeType,
            'content' => $result->content,
        ];
    }
}
|
||||
|
||||
$extractor = new UniversalExtractor();

echo "\nUniversal Extraction:\n";
echo str_repeat('=', 60) . "\n";

foreach ($mixedFiles as $file) {
    $data = $extractor->extract($file);

    // Print every field except the extracted content itself.
    $summary = array_diff_key($data, ['content' => null]);

    echo basename($file) . ' (' . $data['type'] . "):\n";
    print_r($summary);
    echo "\n";
}
|
||||
```
|
||||
114
docs/snippets/php/extraction/pdf_extraction.php
Normal file
114
docs/snippets/php/extraction/pdf_extraction.php
Normal file
@@ -0,0 +1,114 @@
|
||||
```php title="pdf_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* PDF Document Extraction
|
||||
*
|
||||
* Extract text, tables, and images from PDF files with various configurations.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\PdfConfig;
|
||||
|
||||
// Extract document.pdf with the default configuration and print a short summary.
$kreuzberg = new Kreuzberg();
$result = $kreuzberg->extractFile('document.pdf');

$pageCount = $result->metadata->pageCount ?? 'unknown';

echo "PDF Extraction Results:\n";
echo str_repeat('=', 60) . "\n";
echo 'Content length: ' . strlen($result->content) . " characters\n";
echo 'Tables found: ' . count($result->tables) . "\n";
echo "Pages: {$pageCount}\n\n";
|
||||
|
||||
// Extract report.pdf with table and image extraction enabled.
$config = new ExtractionConfig(
    extractImages: true,
    extractTables: true,
    pdf: new PdfConfig(
        extractImages: true,
        imageQuality: 85
    )
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('report.pdf');

echo "Extracted Tables:\n";
echo str_repeat('=', 60) . "\n";

// Print each table and write its cells to table_<index>.csv.
foreach ($result->tables as $index => $table) {
    echo "Table " . ($index + 1) . " (Page {$table->pageNumber}):\n";
    echo "Rows: " . count($table->cells) . "\n";
    // count() never returns null, so the `?? []` fallback for a missing first
    // row is the only guard needed here.
    echo "Columns: " . count($table->cells[0] ?? []) . "\n\n";

    echo "Markdown format:\n";
    echo $table->markdown . "\n\n";

    $csvFile = "table_{$index}.csv";
    $fp = fopen($csvFile, 'w');

    // fopen() returns false on failure; handing that to fputcsv() would throw
    // a TypeError, so report the problem and move on to the next table.
    if ($fp === false) {
        echo "Could not open $csvFile for writing\n\n";
        continue;
    }

    foreach ($table->cells as $row) {
        fputcsv($fp, $row);
    }
    fclose($fp);

    echo "Saved to: $csvFile\n\n";
}
|
||||
|
||||
echo "Extracted Images:\n";
echo str_repeat('=', 60) . "\n";

// Write each extracted image to page_<page>_image_<index>.<format> and describe it.
foreach ($result->images ?? [] as $image) {
    $filename = sprintf(
        'page_%d_image_%d.%s',
        $image->pageNumber,
        $image->imageIndex,
        $image->format
    );

    // NOTE(review): the data is written exactly as returned — confirm whether
    // $image->data is raw binary or an encoded string that must be decoded first.
    file_put_contents($filename, $image->data);

    echo "Saved: {$filename}\n";
    echo ' Size: ' . $image->width . 'x' . $image->height . "\n";
    echo ' Format: ' . $image->format . "\n";
    echo ' Data size: ' . strlen($image->data) . " bytes\n\n";
}
|
||||
|
||||
// Extract formatted.pdf as Markdown with formatting preserved and save the result.
$formattedConfig = new ExtractionConfig(preserveFormatting: true, outputFormat: 'markdown');

$kreuzberg = new Kreuzberg($formattedConfig);
$result = $kreuzberg->extractFile('formatted.pdf');

file_put_contents('output.md', $result->content);
echo 'Saved formatted output to: output.md' . "\n";
|
||||
|
||||
// Extract document.pdf again and split its content into sections at
// Markdown-style heading lines ("#", "##", ... followed by whitespace and a title).
$result = $kreuzberg->extractFile('document.pdf');
$content = $result->content;

$sections = [];
$currentSection = null;
$currentContent = [];

foreach (explode("\n", $content) as $line) {
    // Non-heading lines accumulate in the buffer of the current section.
    if (!preg_match('/^#+\s+(.+)$/', $line, $matches)) {
        $currentContent[] = $line;
        continue;
    }

    // A heading closes the previous section, if there was one. Lines collected
    // before the first heading are discarded when the buffer is reset below.
    if ($currentSection !== null) {
        $sections[$currentSection] = implode("\n", $currentContent);
    }

    $currentSection = $matches[1];
    $currentContent = [];
}

// Store the section that was still open when the input ended.
if ($currentSection !== null) {
    $sections[$currentSection] = implode("\n", $currentContent);
}

echo "\nDocument sections:\n";
foreach ($sections as $title => $body) {
    echo " - {$title} (" . strlen($body) . " chars)\n";
}
|
||||
```
|
||||
195
docs/snippets/php/extraction/powerpoint_extraction.php
Normal file
195
docs/snippets/php/extraction/powerpoint_extraction.php
Normal file
@@ -0,0 +1,195 @@
|
||||
```php title="powerpoint_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* PowerPoint Presentation Extraction
|
||||
*
|
||||
* This example demonstrates extracting content from PowerPoint files (.pptx, .ppt),
|
||||
* including text, notes, images, and tables from slides.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ImageExtractionConfig;
|
||||
use Kreuzberg\Config\PageConfig;
|
||||
|
||||
echo "Example 1: Basic PowerPoint Extraction\n";
echo "======================================\n";

// Extract presentation.pptx with the default configuration.
$kreuzberg = new Kreuzberg();
$result = $kreuzberg->extractFile('presentation.pptx');

echo "Content:\n";
echo $result->content . "\n\n";

// Each metadata field falls back to 'N/A' when it is not set.
$authorLine = isset($result->metadata->authors)
    ? implode(', ', $result->metadata->authors)
    : 'N/A';

echo "Metadata:\n";
echo '- Title: ' . ($result->metadata->title ?? 'N/A') . "\n";
echo '- Author: ' . $authorLine . "\n";
echo '- Slide Count: ' . ($result->metadata->pageCount ?? 'N/A') . "\n\n";
|
||||
|
||||
echo "Example 2: Extract Per-Slide Content\n";
echo "====================================\n";

// Request per-page results and a marker line before each slide's content.
$pageOptions = new PageConfig(
    extractPages: true,
    insertPageMarkers: true,
    markerFormat: '--- Slide {page_number} ---'
);
$config2 = new ExtractionConfig(page: $pageOptions);

$result2 = (new Kreuzberg($config2))->extractFile('presentation.pptx');

if ($result2->pages !== null) {
    echo 'Total slides: ' . count($result2->pages) . "\n\n";

    foreach ($result2->pages as $page) {
        // The preview is the first 100 bytes of the slide text, always followed by "...".
        $preview = substr($page->content, 0, 100);

        echo "Slide {$page->pageNumber}:\n";
        echo '- Text length: ' . strlen($page->content) . " characters\n";
        echo '- Tables: ' . count($page->tables) . "\n";
        echo '- Images: ' . count($page->images) . "\n";
        echo '- Content preview: ' . $preview . "...\n\n";
    }
}
|
||||
|
||||
echo "Example 3: Extract Images from Slides\n";
echo "=====================================\n";

// Image extraction with a minimum width and height of 100.
$imageOptions = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 100,
    minHeight: 100
);
$config3 = new ExtractionConfig(imageExtraction: $imageOptions);

$result3 = (new Kreuzberg($config3))->extractFile('presentation.pptx');

if ($result3->images !== null) {
    echo 'Total images: ' . count($result3->images) . "\n\n";

    foreach ($result3->images as $i => $image) {
        echo "Image {$i}:\n";
        echo '- Format: ' . $image->format . "\n";
        echo '- Size: ' . $image->width . 'x' . $image->height . "\n";
        echo '- Slide: ' . $image->pageNumber . "\n";

        $filename = 'slide_' . $image->pageNumber . '_image_' . $i . '.' . $image->format;

        // NOTE(review): the data is base64-decoded before being written —
        // confirm that $image->data really is base64 text and not raw binary.
        file_put_contents($filename, base64_decode($image->data));
        echo "- Saved: {$filename}\n\n";
    }
}
|
||||
|
||||
echo "Example 4: Extract Tables from Slides\n";
echo "=====================================\n";

$config4 = new ExtractionConfig(extractTables: true);
$result4 = (new Kreuzberg($config4))->extractFile('data_presentation.pptx');

// Nothing is printed when the presentation contains no tables.
$tableCount = count($result4->tables);
if ($tableCount > 0) {
    echo "Found {$tableCount} table(s)\n\n";

    foreach ($result4->tables as $i => $table) {
        // Tables are numbered from 1 in the output.
        echo 'Table ' . ($i + 1) . " (Slide {$table->pageNumber}):\n";
        echo $table->markdown . "\n\n";
    }
}
|
||||
|
||||
echo "Example 5: Convert PowerPoint to Markdown\n";
echo "=========================================\n";

$config5 = new ExtractionConfig(
    page: new PageConfig(
        extractPages: true,
        insertPageMarkers: true,
        // Double quotes are required here: in a single-quoted PHP string "\n"
        // is a literal backslash followed by "n", not a newline, so the marker
        // would never form a rule and heading on separate lines.
        // "{page_number}" is not interpolated because it contains no "$".
        markerFormat: "---\n\n## Slide {page_number}\n\n"
    ),
    outputFormat: 'markdown'
);

$result5 = (new Kreuzberg($config5))->extractFile('presentation.pptx');

// Save the Markdown and show the first 500 bytes as a preview.
$markdownContent = $result5->content;
file_put_contents('presentation.md', $markdownContent);

echo "Converted to Markdown\n";
echo "Saved as: presentation.md\n";
echo "Content preview:\n";
echo substr($markdownContent, 0, 500) . "...\n\n";
|
||||
|
||||
echo "Example 6: Generate Presentation Summary\n";
echo "========================================\n";

$config6 = new ExtractionConfig(page: new PageConfig(extractPages: true));
$result6 = (new Kreuzberg($config6))->extractFile('meeting_deck.pptx');

// Missing metadata falls back to placeholders; the slide total falls back to
// the number of extracted pages when the metadata carries no page count.
$deckTitle = $result6->metadata->title ?? 'Untitled';
$deckAuthors = isset($result6->metadata->authors)
    ? implode(', ', $result6->metadata->authors)
    : 'Unknown';
$slideTotal = $result6->metadata->pageCount ?? count($result6->pages ?? []);

echo "Presentation Summary:\n";
echo "====================\n";
echo 'Title: ' . $deckTitle . "\n";
echo 'Author: ' . $deckAuthors . "\n";
echo 'Total Slides: ' . $slideTotal . "\n";
echo 'Total Text: ' . strlen($result6->content) . " characters\n";
echo 'Tables: ' . count($result6->tables) . "\n";

if ($result6->pages !== null) {
    echo "\nSlide Breakdown:\n";

    foreach ($result6->pages as $page) {
        $wordCount = str_word_count($page->content);
        $tablesOnSlide = count($page->tables);
        echo "- Slide {$page->pageNumber}: {$wordCount} words, {$tablesOnSlide} tables\n";
    }
}

echo "\n";
|
||||
|
||||
echo "Example 7: Search Content in Slides\n";
echo "===================================\n";

$config7 = new ExtractionConfig(page: new PageConfig(extractPages: true));
$result7 = (new Kreuzberg($config7))->extractFile('presentation.pptx');

$searchTerm = "revenue";

if ($result7->pages !== null) {
    echo "Searching for '{$searchTerm}':\n\n";

    foreach ($result7->pages as $page) {
        // Case-insensitive search; slides without a hit are skipped.
        $pos = stripos($page->content, $searchTerm);
        if ($pos === false) {
            continue;
        }

        echo "Found in Slide {$page->pageNumber}:\n";

        // Show 150 bytes of text starting up to 50 bytes before the first hit.
        $context = substr($page->content, max(0, $pos - 50), 150);
        echo "- Context: ...{$context}...\n\n";
    }
}
|
||||
|
||||
// Static reference output: the supported file formats, then a numbered list of tips.
$supportedFormats = [
    '.pptx (PowerPoint 2007+)',
    '.ppt (PowerPoint 97-2003)',
    '.pptm (Macro-enabled)',
    '.potx (Template)',
];

echo "\nSupported PowerPoint Formats:\n";
echo "=============================\n";
foreach ($supportedFormats as $formatLine) {
    echo '- ' . $formatLine . "\n";
}

$bestPractices = [
    'Use page extraction to process individual slides',
    'Extract images for visual content analysis',
    'Extract tables for data analysis',
    'Use metadata for presentation information',
    'Convert to Markdown for documentation',
    'Search across slides for specific content',
    'Generate summaries for presentation overviews',
];

echo "\n\nBest Practices:\n";
echo "===============\n";
foreach ($bestPractices as $position => $tip) {
    // Items are numbered from 1.
    echo ($position + 1) . '. ' . $tip . "\n";
}
|
||||
```
|
||||
217
docs/snippets/php/extraction/table_extraction.php
Normal file
217
docs/snippets/php/extraction/table_extraction.php
Normal file
@@ -0,0 +1,217 @@
|
||||
```php title="table_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Table Extraction and Processing
|
||||
*
|
||||
* Extract tables from PDFs and other documents, process them,
|
||||
* and export to various formats (CSV, JSON, HTML).
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\TesseractConfig;
|
||||
|
||||
// Extract financial_report.pdf with table extraction enabled.
$config = new ExtractionConfig(extractTables: true);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('financial_report.pdf');

echo "Table Extraction:\n";
echo str_repeat('=', 60) . "\n";
echo 'Tables found: ' . count($result->tables) . "\n\n";

// Show every table three ways: as Markdown, as row/column counts, and as HTML.
foreach ($result->tables as $index => $table) {
    $rows = $table->cells;

    echo 'Table ' . ($index + 1) . " (Page {$table->pageNumber}):\n";
    echo str_repeat('-', 60) . "\n";

    echo "Markdown:\n";
    echo $table->markdown . "\n\n";

    echo "Array format:\n";
    echo 'Rows: ' . count($rows) . "\n";
    // The column count is taken from the first row; a table without rows reports 0.
    echo 'Columns: ' . count($rows[0] ?? []) . "\n\n";

    echo "HTML:\n";
    echo "<table>\n";
    foreach ($rows as $rowIndex => $row) {
        // The first row is rendered as header cells, all others as data cells.
        $tag = ($rowIndex === 0) ? 'th' : 'td';

        echo " <tr>\n";
        foreach ($row as $cell) {
            echo " <{$tag}>" . htmlspecialchars($cell) . "</{$tag}>\n";
        }
        echo " </tr>\n";
    }
    echo "</table>\n\n";
}
|
||||
|
||||
// Write every table to table_<n>_page_<page>.csv, one CSV line per table row.
foreach ($result->tables as $index => $table) {
    $filename = "table_" . ($index + 1) . "_page_" . $table->pageNumber . ".csv";
    $fp = fopen($filename, 'w');

    // fopen() returns false on failure; handing that to fputcsv() would throw
    // a TypeError, so report the problem and continue with the next table.
    if ($fp === false) {
        echo "Could not open $filename for writing\n";
        continue;
    }

    foreach ($table->cells as $row) {
        fputcsv($fp, $row);
    }

    fclose($fp);
    echo "Exported to: $filename\n";
}
echo "\n";
|
||||
|
||||
// Table extraction on a scanned PDF, using a Tesseract OCR configuration
// with table detection enabled and psm set to 6.
$tesseractOptions = new TesseractConfig(enableTableDetection: true, psm: 6);
$ocrOptions = new OcrConfig(
    backend: 'tesseract',
    language: 'eng',
    tesseractConfig: $tesseractOptions
);
$ocrConfig = new ExtractionConfig(extractTables: true, ocr: $ocrOptions);

$kreuzberg = new Kreuzberg($ocrConfig);
$result = $kreuzberg->extractFile('scanned_table.pdf');

echo "OCR Table Extraction:\n";
echo str_repeat('=', 60) . "\n";
echo 'Tables with OCR: ' . count($result->tables) . "\n\n";
|
||||
|
||||
/**
 * Turn a table given as rows of cells into a list of records keyed by header.
 *
 * The first row supplies the keys. Every following row becomes one record;
 * a cell missing from a short row is filled with an empty string.
 *
 * @param array $cells Table rows, header row first.
 *
 * @return array One associative array per data row; empty when there are no data rows.
 */
function processTable(array $cells): array
{
    $headers = array_shift($cells);

    // array_values() keeps the result a plain list even if $cells had string keys.
    return array_values(array_map(
        static function ($row) use ($headers) {
            $record = [];
            foreach ($headers as $column => $name) {
                $record[$name] = $row[$column] ?? '';
            }

            return $record;
        },
        $cells
    ));
}
|
||||
|
||||
// Print every table as pretty-printed JSON records keyed by header.
foreach ($result->tables as $table) {
    $structured = processTable($table->cells);
    $encoded = json_encode($structured, JSON_PRETTY_PRINT);

    echo "Structured table data:\n";
    echo $encoded . "\n\n";
}
|
||||
|
||||
/**
 * Select the tables that contain a keyword in at least one cell.
 *
 * The comparison is case-insensitive and matches substrings. Each matching
 * table appears once in the result, in input order.
 *
 * @param array  $tables  Objects exposing a `cells` property (rows of cell strings).
 * @param string $keyword Text to look for.
 *
 * @return array List of the matching tables, re-indexed from 0.
 */
function findTablesWithKeyword(array $tables, string $keyword): array
{
    // True as soon as any cell of the table contains the keyword.
    $containsKeyword = static function ($table) use ($keyword) {
        foreach ($table->cells as $row) {
            foreach ($row as $cell) {
                if (stripos($cell, $keyword) !== false) {
                    return true;
                }
            }
        }

        return false;
    };

    return array_values(array_filter($tables, $containsKeyword));
}
|
||||
|
||||
// Count the tables that mention "sales" in any cell.
$salesTables = findTablesWithKeyword($result->tables, 'sales');
$salesTableCount = count($salesTables);
echo "Tables containing 'sales': {$salesTableCount}\n";
|
||||
|
||||
/**
 * Convert a table into a list of records keyed by the header row.
 *
 * The first row of `$table->cells` supplies the keys. A cell missing from a
 * short row is represented as null.
 *
 * @param \Kreuzberg\Types\Table $table Table whose `cells` property is read.
 *
 * @return array One associative array per data row; empty when the table has no rows.
 */
function tableToAssociativeArray(\Kreuzberg\Types\Table $table): array
{
    $rows = $table->cells;
    if (empty($rows)) {
        return [];
    }

    $headers = array_shift($rows);
    $records = [];

    foreach ($rows as $row) {
        $record = [];
        foreach ($headers as $column => $name) {
            $record[$name] = $row[$column] ?? null;
        }
        $records[] = $record;
    }

    return $records;
}
|
||||
|
||||
$result = $kreuzberg->extractFile('quarterly_report.pdf');

// For every table, add up the numeric cells of each column and print the sums.
foreach ($result->tables as $index => $table) {
    $data = tableToAssociativeArray($table);

    echo "\nTable " . ($index + 1) . " data:\n";

    $totals = [];
    foreach ($data as $row) {
        foreach ($row as $key => $value) {
            // Non-numeric cells (text, missing values) do not contribute.
            if (!is_numeric($value)) {
                continue;
            }

            $totals[$key] = ($totals[$key] ?? 0) + floatval($value);
        }
    }

    // Tables without any numeric cell print no totals section.
    if ($totals !== []) {
        echo "Column totals:\n";
        foreach ($totals as $column => $total) {
            echo " {$column}: " . number_format($total, 2) . "\n";
        }
    }
}
|
||||
|
||||
// Describe every table (page, dimensions, keyed records, Markdown) and save
// the whole list as pretty-printed JSON.
$allTablesJson = array_map(
    static fn ($table) => [
        'page' => $table->pageNumber,
        'rows' => count($table->cells),
        'columns' => count($table->cells[0] ?? []),
        'data' => tableToAssociativeArray($table),
        'markdown' => $table->markdown,
    ],
    $result->tables
);

$encodedTables = json_encode($allTablesJson, JSON_PRETTY_PRINT);
file_put_contents('tables.json', $encodedTables);
echo "\nAll tables exported to: tables.json\n";
|
||||
|
||||
/**
 * Concatenate the data rows of several tables that share one header layout.
 *
 * The header row is taken from the first table. The first row of every table
 * is treated as its header and left out of the merged data.
 *
 * @param array $tables Objects exposing a `cells` property (rows of cells,
 *                      header row first). Any array keys are accepted.
 *
 * @return array Empty array for empty input, otherwise
 *               ['headers' => first table's header row, 'data' => all data rows].
 */
function mergeTables(array $tables): array
{
    if (empty($tables)) {
        return [];
    }

    // Take the first element by position instead of by key 0: an array that
    // is not 0-indexed (for example the output of array_filter()) would
    // otherwise silently produce an empty header row.
    $first = reset($tables);
    $headers = $first->cells[0] ?? [];

    $merged = [];
    foreach ($tables as $table) {
        $cells = $table->cells;
        array_shift($cells); // drop this table's header row

        foreach ($cells as $row) {
            $merged[] = $row;
        }
    }

    return ['headers' => $headers, 'data' => $merged];
}
|
||||
|
||||
// Merge all tables that mention "Quarter" and report how many rows resulted.
$reportTables = findTablesWithKeyword($result->tables, 'Quarter');

if ($reportTables !== []) {
    $merged = mergeTables($reportTables);

    $mergedTableCount = count($reportTables);
    $mergedRowCount = count($merged['data']);

    echo "\nMerged {$mergedTableCount} tables\n";
    echo "Total rows: {$mergedRowCount}\n";
}
|
||||
```
|
||||
Reference in New Issue
Block a user