Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/php/metadata/language_detection.md
+++ b/docs/snippets/php/metadata/language_detection.md
@@ -0,0 +1,30 @@
+```php title="PHP"
+<?php declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+use Kreuzberg\LanguageDetectionConfig;
+
+// Language detection settings: enabled, minConfidence 0.7, detectMultiple off
+$detection = new LanguageDetectionConfig(
+    enabled: true,
+    minConfidence: 0.7,
+    detectMultiple: false
+);
+
+$config = new ExtractionConfig();
+$config->language_detection = $detection;
+
+$extraction = Kreuzberg::extract_file_sync("document.pdf", null, $config);
+
+// Print each detected language code, followed by its confidence when one is set
+foreach ($extraction->languages ?? [] as $detected) {
+    echo "Detected language: {$detected->code}\n";
+    if ($detected->confidence === null) {
+        continue;
+    }
+
+    echo "Confidence: {$detected->confidence}\n";
+}
+?>
+```
--- a/docs/snippets/php/metadata/language_detection_multilingual.md
+++ b/docs/snippets/php/metadata/language_detection_multilingual.md
@@ -0,0 +1,37 @@
+```php title="PHP"
+<?php declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+use Kreuzberg\LanguageDetectionConfig;
+
+// Multilingual detection settings: enabled, minConfidence 0.6, detectMultiple on
+$detection = new LanguageDetectionConfig(
+    enabled: true,
+    minConfidence: 0.6,
+    detectMultiple: true
+);
+
+$config = new ExtractionConfig();
+$config->language_detection = $detection;
+
+$extraction = Kreuzberg::extract_file_sync("multilingual_document.pdf", null, $config);
+
+// Report every detected language, or say so when the list is empty
+if (empty($extraction->languages)) {
+    echo "No languages detected\n";
+} else {
+    printf("Detected %d language(s):\n", count($extraction->languages));
+
+    foreach ($extraction->languages as $language) {
+        echo "Language: {$language->code}\n";
+        if ($language->confidence !== null) {
+            echo "  Confidence: " . sprintf("%.1f", $language->confidence * 100) . "%\n";
+        }
+        if ($language->name !== null) {
+            echo "  Name: {$language->name}\n";
+        }
+    }
+}
+?>
+```
--- a/docs/snippets/php/metadata/metadata.md
+++ b/docs/snippets/php/metadata/metadata.md
@@ -0,0 +1,63 @@
+```php title="PHP"
+<?php declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+
+// PDF metadata: print whichever of page count, author and title are set
+$pdfResult = Kreuzberg::extract_file_sync("document.pdf", null, new ExtractionConfig());
+$pdfMeta = $pdfResult->metadata?->pdf;
+
+if ($pdfMeta) {
+    $pdfFields = [
+        "Pages" => $pdfMeta->page_count,
+        "Author" => $pdfMeta->author,
+        "Title" => $pdfMeta->title,
+    ];
+    foreach ($pdfFields as $label => $value) {
+        if ($value !== null) {
+            echo "{$label}: {$value}\n";
+        }
+    }
+}
+
+// HTML metadata: scalar fields, keywords, Open Graph image, language and headers
+$htmlResult = Kreuzberg::extract_file_sync("page.html", null, new ExtractionConfig());
+$htmlMeta = $htmlResult->metadata?->html;
+
+if ($htmlMeta) {
+    if ($htmlMeta->title !== null) {
+        echo "Title: {$htmlMeta->title}\n";
+    }
+    if ($htmlMeta->description !== null) {
+        echo "Description: {$htmlMeta->description}\n";
+    }
+
+    // The keywords line is always printed; a null list renders as an empty string
+    $keywords = implode(", ", $htmlMeta->keywords ?? []);
+    echo "Keywords: {$keywords}\n";
+
+    if ($htmlMeta->canonical_url !== null) {
+        echo "Canonical: {$htmlMeta->canonical_url}\n";
+    }
+
+    // Only the Open Graph "image" entry is shown, and only when it is present
+    $ogImage = $htmlMeta->open_graph["image"] ?? null;
+    if ($ogImage !== null) {
+        echo "OG Image: {$ogImage}\n";
+    }
+
+    if ($htmlMeta->language !== null) {
+        echo "Language: {$htmlMeta->language}\n";
+    }
+
+    // Each header is printed together with its level
+    foreach ($htmlMeta->headers ?? [] as $header) {
+        echo "Header (level {$header->level}): {$header->text}\n";
+    }
+}
+?>
+```
--- a/docs/snippets/php/metadata/metadata.php
+++ b/docs/snippets/php/metadata/metadata.php
@@ -0,0 +1,33 @@
+```php title="metadata.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Document Metadata Access
+ *
+ * Extract and access metadata from different document types including
+ * PDFs, HTML, and other formats.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use function Kreuzberg\extract_file;
+
+// PDF: print page count, author and title, with 'N/A' for any missing entry
+$pdfResult = extract_file('document.pdf');
+$pdfMeta = $pdfResult->metadata->pdf ?? null;
+
+if ($pdfMeta !== null) {
+    foreach (['Pages' => 'page_count', 'Author' => 'author', 'Title' => 'title'] as $label => $key) {
+        echo $label . ': ' . ($pdfMeta[$key] ?? 'N/A') . "\n";
+    }
+}
+
+// HTML: same treatment for title and description
+$htmlResult = extract_file('page.html');
+$htmlMeta = $htmlResult->metadata->html ?? null;
+
+if ($htmlMeta !== null) {
+    foreach (['Title' => 'title', 'Description' => 'description'] as $label => $key) {
+        echo $label . ': ' . ($htmlMeta[$key] ?? 'N/A') . "\n";
+    }
+}
+```
--- a/docs/snippets/php/metadata/page_boundaries.md
+++ b/docs/snippets/php/metadata/page_boundaries.md
@@ -0,0 +1,29 @@
+```php title="PHP"
+<?php declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+use Kreuzberg\PageConfig;
+
+// Enable page extraction and inline page markers with a custom marker format
+$config = new ExtractionConfig();
+$pageSettings = new PageConfig(
+    extractPages: true,
+    insertPageMarkers: true,
+    markerFormat: "\n\n=== PAGE {page_num} ===\n\n"
+);
+$config->pages = $pageSettings;
+
+$extraction = Kreuzberg::extract_file_sync("document.pdf", null, $config);
+
+// Whole-document content first
+echo "Full content with markers:\n", $extraction->content, "\n\n";
+
+// Then each page on its own, when per-page results are available
+foreach ($extraction->pages ?? [] as $page) {
+    echo "--- Page {$page->page_number} (boundary) ---\n";
+    echo "{$page->content}\n";
+}
+?>
+```
--- a/docs/snippets/php/metadata/page_boundaries.php
+++ b/docs/snippets/php/metadata/page_boundaries.php
@@ -0,0 +1,37 @@
+```php title="page_boundaries.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Page Boundary Tracking
+ *
+ * Access page boundary information to extract content from specific pages
+ * using byte offsets in the extracted content.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use function Kreuzberg\extract_file;
+
+$result = extract_file('document.pdf');
+
+if (!empty($result->metadata->pages->boundaries)) {
+    $boundaries = $result->metadata->pages->boundaries;
+    $content = $result->content;
+
+    // Show at most the first three pages
+    foreach (array_slice($boundaries, 0, 3) as $boundary) {
+        // Offsets are in bytes, so slice with byte-wise substr(); mb_substr()
+        // counts characters and picks the wrong range in multi-byte text.
+        $pageContent = substr(
+            $content,
+            $boundary->byteStart,
+            $boundary->byteEnd - $boundary->byteStart
+        );
+
+        echo "Page {$boundary->pageNumber}:\n";
+        echo "  Byte range: {$boundary->byteStart}-{$boundary->byteEnd}\n";
+        echo "  Preview: " . mb_substr($pageContent, 0, 100) . "...\n\n";
+    }
+}
+```
--- a/docs/snippets/php/metadata/page_tracking_basic.md
+++ b/docs/snippets/php/metadata/page_tracking_basic.md
@@ -0,0 +1,26 @@
+```php title="PHP"
+<?php declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+use Kreuzberg\PageConfig;
+
+$config = new ExtractionConfig();
+$config->pages = new PageConfig(
+    extractPages: true,
+    insertPageMarkers: false,
+    markerFormat: "\n\n<!-- PAGE {page_num} -->\n\n"
+);
+
+$result = Kreuzberg::extract_file_sync("document.pdf", null, $config);
+
+// Per-page summary; nothing is printed when no page results are returned
+if ($result->pages !== null) {
+    foreach ($result->pages as $page) {
+        echo "Page " . $page->page_number . ":\n";
+        // mb_strlen() counts characters; strlen() would report bytes for multi-byte text
+        echo "  Content: " . mb_strlen($page->content) . " chars\n";
+        echo "  Tables: " . count($page->tables ?? []) . "\n";
+        echo "  Images: " . count($page->images ?? []) . "\n";
+    }
+}
+?>
+```
--- a/docs/snippets/php/metadata/page_tracking_basic.php
+++ b/docs/snippets/php/metadata/page_tracking_basic.php
@@ -0,0 +1,36 @@
+```php title="page_tracking_basic.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Basic Page Tracking
+ *
+ * Extract individual pages with their content, tables, and images
+ * using page extraction configuration.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\PageConfig;
+
+$config = new ExtractionConfig(
+    pages: new PageConfig(
+        extractPages: true
+    )
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document.pdf');
+
+// Per-page summary; skipped entirely when no pages were extracted
+if (!empty($result->pages)) {
+    foreach ($result->pages as $page) {
+        echo "Page {$page->pageNumber}:\n";
+        // mb_strlen() counts characters; strlen() would report bytes for multi-byte text
+        echo "  Content: " . mb_strlen($page->content) . " chars\n";
+        echo "  Tables: " . count($page->tables) . "\n";
+        echo "  Images: " . count($page->images) . "\n\n";
+    }
+}
+```
--- a/docs/snippets/php/metadata/pdf_metadata_extractor.php
+++ b/docs/snippets/php/metadata/pdf_metadata_extractor.php
@@ -0,0 +1,114 @@
+```php title="pdf_metadata_extractor.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * PDF Metadata Extractor Post-Processor
+ *
+ * Custom post-processor that extracts and enriches PDF metadata
+ * during the extraction pipeline.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\PostProcessor\PostProcessorInterface;
+use Kreuzberg\Types\ExtractionResult;
+use Kreuzberg\Kreuzberg;
+
+/**
+ * Post-processor that tags PDF extraction results and counts how many it handled.
+ *
+ * The class is deliberately not `readonly`: process() increments the counter,
+ * which would throw "Cannot modify readonly property" on a readonly class.
+ */
+class PdfMetadataExtractor implements PostProcessorInterface
+{
+    /** Number of results passed through process() so far. */
+    private int $processedCount;
+
+    public function __construct()
+    {
+        $this->processedCount = 0;
+    }
+
+    /**
+     * Get the name of this post-processor
+     */
+    public function name(): string
+    {
+        return 'pdf_metadata_extractor';
+    }
+
+    /**
+     * Get the version of this post-processor
+     */
+    public function version(): string
+    {
+        return '1.0.0';
+    }
+
+    /**
+     * Get the description of this post-processor
+     */
+    public function description(): string
+    {
+        return 'Extracts and enriches PDF metadata';
+    }
+
+    /**
+     * Get the processing stage (early, normal, or late)
+     */
+    public function processingStage(): string
+    {
+        return 'early';
+    }
+
+    /**
+     * Determine if this processor should handle the result
+     *
+     * Only results whose MIME type is exactly 'application/pdf' are accepted.
+     */
+    public function shouldProcess(ExtractionResult $result): bool
+    {
+        return $result->mimeType === 'application/pdf';
+    }
+
+    /**
+     * Process the extraction result
+     *
+     * Increments the processed counter and records 'pdf_processed' and
+     * 'processor_version' in the result's custom metadata.
+     *
+     * @return ExtractionResult the same result instance, after modification
+     */
+    public function process(ExtractionResult $result): ExtractionResult
+    {
+        $this->processedCount++;
+
+        // Create the custom metadata map on first use
+        $result->metadata->custom ??= [];
+        $result->metadata->custom['pdf_processed'] = true;
+        $result->metadata->custom['processor_version'] = $this->version();
+
+        return $result;
+    }
+
+    /**
+     * Initialize the post-processor
+     */
+    public function initialize(): void
+    {
+        error_log("PDF metadata extractor initialized");
+    }
+
+    /**
+     * Shutdown the post-processor
+     *
+     * Logs how many results were processed.
+     */
+    public function shutdown(): void
+    {
+        error_log("Processed {$this->processedCount} PDFs");
+    }
+
+    /**
+     * Get the number of processed documents
+     */
+    public function getProcessedCount(): int
+    {
+        return $this->processedCount;
+    }
+}
+
+$processor = new PdfMetadataExtractor();
+Kreuzberg::registerPostProcessor($processor);
+```
--- a/docs/snippets/php/metadata/tables.md
+++ b/docs/snippets/php/metadata/tables.md
@@ -0,0 +1,22 @@
+```php title="PHP"
+<?php declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+
+$extraction = Kreuzberg::extract_file_sync("document.pdf", null, new ExtractionConfig());
+
+// For every table: a one-line summary, its markdown form, then each cell by row/column key
+foreach ($extraction->tables as $table) {
+    printf("Table on page %s with %d rows\n", $table->page_number, count($table->cells));
+    echo "Markdown representation:\n", $table->markdown, "\n";
+
+    foreach ($table->cells as $rowKey => $cells) {
+        foreach ($cells as $colKey => $text) {
+            echo "Cell[{$rowKey}][{$colKey}]: {$text}\n";
+        }
+    }
+}
+?>
+```
--- a/docs/snippets/php/metadata/vector_database_integration.md
+++ b/docs/snippets/php/metadata/vector_database_integration.md
@@ -0,0 +1,50 @@
+```php title="PHP"
+<?php declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+use Kreuzberg\ChunkingConfig;
+use Kreuzberg\EmbeddingConfig;
+
+// Configure chunking with embedding generation for vector database
+$chunkConfig = new ChunkingConfig(
+    enableChunking: true,
+    chunkSize: 512,
+    chunkOverlap: 50,
+    chunker: "semantic"
+);
+
+$embeddingConfig = new EmbeddingConfig(
+    generateEmbeddings: true,
+    modelName: "all-minilm-l6-v2"
+);
+
+$config = new ExtractionConfig();
+$config->chunking = $chunkConfig;
+$config->embeddings = $embeddingConfig;
+
+$result = Kreuzberg::extract_file_sync("document.pdf", null, $config);
+
+// Build one record per chunk; nothing happens when no chunks are returned
+if ($result->chunks !== null) {
+    foreach ($result->chunks as $chunk) {
+        // A chunk without an embedding is stored with an empty vector
+        $embedding = $chunk->embedding ?? [];
+
+        $vectorRecord = [
+            "text" => $chunk->text,
+            "embedding" => $embedding,
+            "metadata" => [
+                "source" => "document.pdf",
+                "page" => $chunk->page_number ?? null,
+                "chunk_id" => $chunk->chunk_id ?? null,
+            ]
+        ];
+
+        // Insert into vector DB (e.g., Pinecone, Weaviate, Milvus)
+        // storeInVectorDB($vectorRecord);
+
+        // mb_substr() cuts on character boundaries; byte-wise substr() could
+        // split a multi-byte character and emit invalid UTF-8 in the preview.
+        echo "Chunk: " . mb_substr($chunk->text, 0, 50) . "...\n";
+        echo "Embedding dimensions: " . count($embedding) . "\n";
+    }
+}
+?>
+```