Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/php/extraction/multi_format.php
+++ b/docs/snippets/php/extraction/multi_format.php
@@ -0,0 +1,282 @@
+```php title="multi_format.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Multi-Format Document Extraction
+ *
+ * Handle various document formats (PDF, DOCX, XLSX, PPTX, images, etc.)
+ * with format-specific processing and unified output.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use function Kreuzberg\extract_file;
+use function Kreuzberg\detect_mime_type_from_path;
+
+$formats = [
+    'PDF' => 'document.pdf',
+    'Word' => 'document.docx',
+    'Excel' => 'spreadsheet.xlsx',
+    'PowerPoint' => 'presentation.pptx',
+    'Text' => 'readme.txt',
+    'HTML' => 'page.html',
+    'Markdown' => 'guide.md',
+    'Image' => 'scan.png',
+];
+
+echo "Multi-Format Extraction:\n";
+echo str_repeat('=', 60) . "\n\n";
+
+$kreuzberg = new Kreuzberg();
+
+foreach ($formats as $type => $file) {
+    if (!file_exists($file)) {
+        continue;
+    }
+
+    echo "Processing $type ($file):\n";
+
+    $mimeType = detect_mime_type_from_path($file);
+    echo "  MIME type: $mimeType\n";
+
+    $result = $kreuzberg->extractFile($file);
+
+    echo "  Content length: " . strlen($result->content) . " chars\n";
+    echo "  Tables: " . count($result->tables) . "\n";
+    echo "  Images: " . count($result->images ?? []) . "\n";
+    echo "  Pages: " . ($result->metadata->pageCount ?? 'N/A') . "\n";
+    echo "\n";
+}
+
+$mixedFiles = glob('documents/*.*');
+$byFormat = [];
+
+foreach ($mixedFiles as $file) {
+    $mimeType = detect_mime_type_from_path($file);
+    $extension = pathinfo($file, PATHINFO_EXTENSION);
+
+    if (!isset($byFormat[$extension])) {
+        $byFormat[$extension] = [];
+    }
+
+    $result = extract_file($file);
+    $byFormat[$extension][] = [
+        'file' => basename($file),
+        'mime' => $mimeType,
+        'size' => strlen($result->content),
+        'tables' => count($result->tables),
+    ];
+}
+
+echo "Files by Format:\n";
+echo str_repeat('=', 60) . "\n";
+foreach ($byFormat as $ext => $files) {
+    echo strtoupper($ext) . ": " . count($files) . " files\n";
+
+    $totalSize = array_sum(array_column($files, 'size'));
+    $totalTables = array_sum(array_column($files, 'tables'));
+
+    echo "  Total content: " . number_format($totalSize) . " chars\n";
+    echo "  Total tables: $totalTables\n\n";
+}
+
+$formatConfigs = [
+    'pdf' => new ExtractionConfig(
+        extractTables: true,
+        extractImages: true,
+        pdf: new \Kreuzberg\Config\PdfConfig(
+            extractImages: true,
+            imageQuality: 85
+        )
+    ),
+    'docx' => new ExtractionConfig(
+        extractTables: true,
+        preserveFormatting: true
+    ),
+    'xlsx' => new ExtractionConfig(
+        extractTables: true  
+    ),
+    'png' => new ExtractionConfig(
+        ocr: new \Kreuzberg\Config\OcrConfig(
+            backend: 'tesseract',
+            language: 'eng'
+        )
+    ),
+];
+
+foreach ($mixedFiles as $file) {
+    $ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));
+
+    if (!isset($formatConfigs[$ext])) {
+        continue;
+    }
+
+    $config = $formatConfigs[$ext];
+    $kreuzberg = new Kreuzberg($config);
+    $result = $kreuzberg->extractFile($file);
+
+    echo "Processed " . basename($file) . " with $ext config\n";
+}
+
+function convertToMarkdown(string $inputFile): string
+{
+    $config = new ExtractionConfig(
+        preserveFormatting: true,
+        outputFormat: 'markdown',
+        extractTables: true
+    );
+
+    $kreuzberg = new Kreuzberg($config);
+    $result = $kreuzberg->extractFile($inputFile);
+
+    $markdown = "# " . ($result->metadata->title ?? basename($inputFile)) . "\n\n";
+
+    if (isset($result->metadata->authors)) {
+        $markdown .= "_Authors: " . implode(', ', $result->metadata->authors) . "_\n\n";
+    }
+
+    $markdown .= $result->content . "\n\n";
+
+    foreach ($result->tables as $index => $table) {
+        $markdown .= "## Table " . ($index + 1) . "\n\n";
+        $markdown .= $table->markdown . "\n\n";
+    }
+
+    return $markdown;
+}
+
+echo "\nConverting to Markdown:\n";
+echo str_repeat('=', 60) . "\n";
+
+foreach (['document.pdf', 'document.docx'] as $file) {
+    if (!file_exists($file)) {
+        continue;
+    }
+
+    $markdown = convertToMarkdown($file);
+    $outputFile = pathinfo($file, PATHINFO_FILENAME) . '.md';
+
+    file_put_contents($outputFile, $markdown);
+    echo "Converted: $file -> $outputFile\n";
+}
+
+function extractFromArchive(string $archiveFile): array
+{
+    $result = extract_file($archiveFile);
+
+    return [
+        'archive' => basename($archiveFile),
+        'listing' => $result->content,
+        'mime' => $result->mimeType,
+    ];
+}
+
+class UniversalExtractor
+{
+    private Kreuzberg $kreuzberg;
+    private array $formatHandlers = [];
+
+    public function __construct()
+    {
+        $this->kreuzberg = new Kreuzberg();
+
+        $this->formatHandlers = [
+            'application/pdf' => [$this, 'handlePDF'],
+            'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => [$this, 'handleDOCX'],
+            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => [$this, 'handleXLSX'],
+            'image/png' => [$this, 'handleImage'],
+            'image/jpeg' => [$this, 'handleImage'],
+        ];
+    }
+
+    public function extract(string $file): array
+    {
+        $mimeType = detect_mime_type_from_path($file);
+        $handler = $this->formatHandlers[$mimeType] ?? [$this, 'handleGeneric'];
+
+        return $handler($file, $mimeType);
+    }
+
+    private function handlePDF(string $file, string $mimeType): array
+    {
+        $config = new ExtractionConfig(extractTables: true, extractImages: true);
+        $kreuzberg = new Kreuzberg($config);
+        $result = $kreuzberg->extractFile($file);
+
+        return [
+            'type' => 'PDF',
+            'content' => $result->content,
+            'tables' => count($result->tables),
+            'images' => count($result->images ?? []),
+            'pages' => $result->metadata->pageCount,
+        ];
+    }
+
+    private function handleDOCX(string $file, string $mimeType): array
+    {
+        $result = $this->kreuzberg->extractFile($file);
+
+        return [
+            'type' => 'Word Document',
+            'content' => $result->content,
+            'tables' => count($result->tables),
+            'authors' => $result->metadata->authors,
+        ];
+    }
+
+    private function handleXLSX(string $file, string $mimeType): array
+    {
+        $config = new ExtractionConfig(extractTables: true);
+        $kreuzberg = new Kreuzberg($config);
+        $result = $kreuzberg->extractFile($file);
+
+        return [
+            'type' => 'Excel Spreadsheet',
+            'content' => $result->content,
+            'sheets' => count($result->tables),  
+        ];
+    }
+
+    private function handleImage(string $file, string $mimeType): array
+    {
+        $config = new ExtractionConfig(
+            ocr: new \Kreuzberg\Config\OcrConfig(backend: 'tesseract', language: 'eng')
+        );
+        $kreuzberg = new Kreuzberg($config);
+        $result = $kreuzberg->extractFile($file);
+
+        return [
+            'type' => 'Image (OCR)',
+            'content' => $result->content,
+            'ocr_length' => strlen($result->content),
+        ];
+    }
+
+    private function handleGeneric(string $file, string $mimeType): array
+    {
+        $result = $this->kreuzberg->extractFile($file);
+
+        return [
+            'type' => 'Generic',
+            'mime' => $mimeType,
+            'content' => $result->content,
+        ];
+    }
+}
+
+$extractor = new UniversalExtractor();
+
+echo "\nUniversal Extraction:\n";
+echo str_repeat('=', 60) . "\n";
+
+foreach ($mixedFiles as $file) {
+    $data = $extractor->extract($file);
+    echo basename($file) . " ({$data['type']}):\n";
+    print_r(array_filter($data, fn($k) => $k !== 'content', ARRAY_FILTER_USE_KEY));
+    echo "\n";
+}
+```