Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/php/embeddings/basic_embeddings.php
+++ b/docs/snippets/php/embeddings/basic_embeddings.php
@@ -0,0 +1,215 @@
+```php title="basic_embeddings.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Basic Embedding Generation
+ *
+ * Generate vector embeddings for semantic search and similarity matching.
+ * Requires ONNX Runtime to be installed.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\ChunkingConfig;
+use Kreuzberg\Config\EmbeddingConfig;
+
+// Request chunking plus embedding generation with the 'all-MiniLM-L6-v2' model.
+// NOTE(review): the unit of maxChunkSize/chunkOverlap is not visible here — confirm against the library docs.
+$config = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        maxChunkSize: 512,
+        chunkOverlap: 50
+    ),
+    embedding: new EmbeddingConfig(
+        model: 'all-MiniLM-L6-v2',
+        normalize: true
+    )
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document.pdf');
+
+$chunks = $result->chunks ?? [];
+
+// Count only the chunks that actually carry a vector so the summary line matches its label.
+$embeddedChunks = array_filter(
+    $chunks,
+    static fn($chunk): bool => $chunk->embedding !== null
+);
+
+echo "Embedding Generation Results:\n";
+echo str_repeat('=', 60) . "\n";
+echo "Chunks with embeddings: " . count($embeddedChunks) . "\n\n";
+
+foreach ($chunks as $chunk) {
+    echo "Chunk {$chunk->metadata->chunkIndex}:\n";
+    // mb_strlen counts characters; strlen would report bytes for multibyte text.
+    echo "  Content length: " . mb_strlen($chunk->content) . " chars\n";
+
+    if ($chunk->embedding !== null) {
+        echo "  Embedding dimension: " . count($chunk->embedding) . "\n";
+        // Preview only the first five components, formatted to four decimals.
+        echo "  First 5 values: [" . implode(', ', array_map(
+            static fn($v) => number_format($v, 4),
+            array_slice($chunk->embedding, 0, 5)
+        )) . "...]\n";
+    }
+    echo "\n";
+}
+
+// Models to compare; each one is run against the same test document.
+$models = [
+    'all-MiniLM-L6-v2',
+    'all-mpnet-base-v2',
+    'paraphrase-multilingual-MiniLM-L12-v2',
+];
+
+foreach ($models as $model) {
+    echo "Testing model: $model\n";
+
+    $config = new ExtractionConfig(
+        chunking: new ChunkingConfig(maxChunkSize: 256),
+        embedding: new EmbeddingConfig(model: $model, normalize: true)
+    );
+    $kreuzberg = new Kreuzberg($config);
+
+    // Time only the extraction call itself.
+    $startedAt = microtime(true);
+    $result = $kreuzberg->extractFile('test_doc.pdf');
+    $seconds = microtime(true) - $startedAt;
+
+    $allChunks = $result->chunks ?? [];
+    $firstChunk = $allChunks[0] ?? null;
+
+    // Nothing is reported for a model when the first chunk is missing or has no vector.
+    if (!$firstChunk || !$firstChunk->embedding) {
+        continue;
+    }
+
+    echo "  Dimension: " . count($firstChunk->embedding) . "\n";
+    echo "  Time: " . number_format($seconds, 3) . "s\n";
+    echo "  Chunks: " . count($allChunks) . "\n\n";
+}
+
+/**
+ * Compute the cosine similarity of two numeric vectors.
+ *
+ * @param array<int, float|int> $a First vector, indexed 0..n-1.
+ * @param array<int, float|int> $b Second vector, indexed 0..n-1; must be as long as $a.
+ *
+ * @return float Dot product divided by the product of the magnitudes, or 0.0 when
+ *               either vector has zero magnitude (which includes two empty vectors).
+ *
+ * @throws InvalidArgumentException When the vectors have different lengths.
+ */
+function cosineSimilarity(array $a, array $b): float
+{
+    $dimension = count($a);
+    if ($dimension !== count($b)) {
+        throw new InvalidArgumentException(
+            'Vectors must have the same length, got ' . $dimension . ' and ' . count($b)
+        );
+    }
+
+    $dotProduct = 0.0;
+    $magnitudeA = 0.0;
+    $magnitudeB = 0.0;
+
+    for ($i = 0; $i < $dimension; $i++) {
+        $dotProduct += $a[$i] * $b[$i];
+        $magnitudeA += $a[$i] * $a[$i];
+        $magnitudeB += $b[$i] * $b[$i];
+    }
+
+    $denominator = sqrt($magnitudeA) * sqrt($magnitudeB);
+
+    // A zero vector has no direction; return 0.0 rather than raising DivisionByZeroError.
+    return $denominator > 0.0 ? $dotProduct / $denominator : 0.0;
+}
+
+// Re-extract the document and compare its first chunk against the next few chunks.
+$config = new ExtractionConfig(
+    chunking: new ChunkingConfig(maxChunkSize: 512),
+    embedding: new EmbeddingConfig(model: 'all-MiniLM-L6-v2', normalize: true)
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile('document.pdf');
+
+echo "Chunk Similarity Analysis:\n";
+echo str_repeat('=', 60) . "\n";
+
+$chunks = $result->chunks ?? [];
+
+// The comparison needs at least two chunks: chunk 0 is the reference and
+// up to five of the chunks that follow it are the candidates.
+$referenceChunk = $chunks[0] ?? null;
+$candidates = count($chunks) >= 2 ? array_slice($chunks, 1, 5) : [];
+
+foreach ($candidates as $chunk) {
+    // Skip any pair in which one side has no vector.
+    if (!$referenceChunk->embedding || !$chunk->embedding) {
+        continue;
+    }
+
+    $similarity = cosineSimilarity($referenceChunk->embedding, $chunk->embedding);
+
+    echo "Chunk 0 vs Chunk {$chunk->metadata->chunkIndex}: " . number_format($similarity, 4) . "\n";
+}
+echo "\n";
+
+/**
+ * Minimal in-memory vector store with brute-force cosine-similarity search.
+ *
+ * Every search scans all stored vectors.
+ */
+class SimpleVectorDB
+{
+    /** @var array<string|int, array{embedding: array<int, float|int>, content: string}> */
+    private array $vectors = [];
+
+    /**
+     * Store (or overwrite) a vector and its source text under the given id.
+     *
+     * @param string                $id        Unique identifier; an existing entry with the same id is replaced.
+     * @param array<int, float|int> $embedding Vector, indexed 0..n-1.
+     * @param string                $content   Text the vector was derived from.
+     */
+    public function add(string $id, array $embedding, string $content): void
+    {
+        $this->vectors[$id] = [
+            'embedding' => $embedding,
+            'content' => $content,
+        ];
+    }
+
+    /**
+     * Return the stored entries most similar to the query vector.
+     *
+     * @param array<int, float|int> $queryEmbedding Query vector, same length as the stored vectors.
+     * @param int                   $k              Maximum number of results; values below 1 yield an empty list.
+     *
+     * @return list<array{id: string, similarity: float, content: string}> Sorted by descending similarity.
+     *
+     * @throws InvalidArgumentException When a stored vector and the query differ in length.
+     */
+    public function search(array $queryEmbedding, int $k = 5): array
+    {
+        $results = [];
+
+        foreach ($this->vectors as $id => $data) {
+            $results[] = [
+                // PHP turns numeric-string array keys into ints; restore the string id.
+                'id' => (string) $id,
+                'similarity' => $this->cosineSimilarity($queryEmbedding, $data['embedding']),
+                'content' => $data['content'],
+            ];
+        }
+
+        usort($results, static fn($a, $b) => $b['similarity'] <=> $a['similarity']);
+
+        // Clamp so that a negative $k does not slice from the end of the list.
+        return array_slice($results, 0, max(0, $k));
+    }
+
+    /**
+     * Cosine similarity of two equally long vectors; 0.0 when either has zero magnitude.
+     *
+     * @param array<int, float|int> $a
+     * @param array<int, float|int> $b
+     *
+     * @throws InvalidArgumentException When the vectors differ in length.
+     */
+    private function cosineSimilarity(array $a, array $b): float
+    {
+        $dimension = count($a);
+        if ($dimension !== count($b)) {
+            throw new InvalidArgumentException(
+                'Vectors must have the same length, got ' . $dimension . ' and ' . count($b)
+            );
+        }
+
+        $dotProduct = 0.0;
+        $magA = 0.0;
+        $magB = 0.0;
+
+        for ($i = 0; $i < $dimension; $i++) {
+            $dotProduct += $a[$i] * $b[$i];
+            $magA += $a[$i] * $a[$i];
+            $magB += $b[$i] * $b[$i];
+        }
+
+        $denominator = sqrt($magA) * sqrt($magB);
+
+        // Guard against DivisionByZeroError for all-zero (or empty) vectors.
+        return $denominator > 0.0 ? $dotProduct / $denominator : 0.0;
+    }
+}
+
+// Fill the in-memory store with every embedded chunk of the documents that exist on disk.
+$db = new SimpleVectorDB();
+
+$candidateFiles = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];
+foreach ($candidateFiles as $path) {
+    // Missing documents are skipped without any message.
+    if (!file_exists($path)) {
+        continue;
+    }
+
+    $result = $kreuzberg->extractFile($path);
+
+    foreach ($result->chunks ?? [] as $chunk) {
+        if (!$chunk->embedding) {
+            continue;
+        }
+
+        // The id combines file name and chunk index, e.g. "doc1.pdf_chunk_0".
+        $db->add(
+            $path . '_chunk_' . $chunk->metadata->chunkIndex,
+            $chunk->embedding,
+            $chunk->content
+        );
+    }
+}
+
+echo "Vector database built\n", "Ready for semantic search!\n";
+
+// Export every embedded chunk of one document as a JSON array of records.
+$sourceFile = 'export_doc.pdf';
+$exportPath = 'embeddings_export.json';
+
+$config = new ExtractionConfig(
+    chunking: new ChunkingConfig(maxChunkSize: 512),
+    embedding: new EmbeddingConfig(model: 'all-MiniLM-L6-v2', normalize: true)
+);
+
+$kreuzberg = new Kreuzberg($config);
+$result = $kreuzberg->extractFile($sourceFile);
+
+// One timestamp for the whole export so all records of a run agree.
+$exportedAt = time();
+
+$exportData = [];
+foreach ($result->chunks ?? [] as $chunk) {
+    // A chunk without a vector is not an embedding; keep it out of the export.
+    if ($chunk->embedding === null) {
+        continue;
+    }
+
+    $exportData[] = [
+        'id' => uniqid('vec_', true),
+        'text' => $chunk->content,
+        'embedding' => $chunk->embedding,
+        'metadata' => [
+            'chunk_index' => $chunk->metadata->chunkIndex,
+            'source' => $sourceFile,
+            'timestamp' => $exportedAt,
+        ],
+    ];
+}
+
+// JSON_THROW_ON_ERROR raises JsonException (e.g. on NaN/INF or invalid UTF-8)
+// instead of returning false, which would otherwise be written out as an empty file.
+$json = json_encode($exportData, JSON_THROW_ON_ERROR);
+
+if (file_put_contents($exportPath, $json) === false) {
+    throw new RuntimeException("Failed to write {$exportPath}");
+}
+
+echo "\nExported " . count($exportData) . " embeddings to {$exportPath}\n";
+```
--- a/docs/snippets/php/embeddings/semantic_search.php
+++ b/docs/snippets/php/embeddings/semantic_search.php
@@ -0,0 +1,221 @@
+```php title="semantic_search.php"
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Semantic Search with Embeddings
+ *
+ * Build a semantic search system using document embeddings.
+ * Find relevant content based on meaning, not just keywords.
+ */
+
+require_once __DIR__ . '/vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\Config\ExtractionConfig;
+use Kreuzberg\Config\ChunkingConfig;
+use Kreuzberg\Config\EmbeddingConfig;
+
+// Extraction settings shared by indexing and by query embedding further down.
+$config = new ExtractionConfig(
+    chunking: new ChunkingConfig(
+        maxChunkSize: 512,
+        chunkOverlap: 50,
+        respectSentences: true
+    ),
+    embedding: new EmbeddingConfig(
+        model: 'all-MiniLM-L6-v2',
+        normalize: true
+    )
+);
+
+$kreuzberg = new Kreuzberg($config);
+
+echo "Building document index...\n";
+$documentIndex = [];
+
+// glob() returns false on error (and on some systems when nothing matches);
+// normalise that to an empty list so the loop and count() below stay valid.
+$files = glob('knowledge_base/*.pdf') ?: [];
+foreach ($files as $file) {
+    $fileName = basename($file);
+    echo "Indexing: " . $fileName . "\n";
+
+    $result = $kreuzberg->extractFile($file);
+
+    foreach ($result->chunks ?? [] as $chunk) {
+        // Only chunks that received a vector are searchable.
+        if ($chunk->embedding) {
+            $documentIndex[] = [
+                'file' => $fileName,
+                'chunk_index' => $chunk->metadata->chunkIndex,
+                'content' => $chunk->content,
+                'embedding' => $chunk->embedding,
+                'metadata' => [
+                    // Fall back to the file name / 'Unknown' when the document has no such metadata.
+                    'title' => $result->metadata->title ?? $fileName,
+                    'author' => $result->metadata->author ?? 'Unknown',
+                ],
+            ];
+        }
+    }
+}
+
+echo "Indexed " . count($documentIndex) . " chunks from " . count($files) . " documents\n\n";
+
+/**
+ * Rank index entries by cosine similarity to a query vector.
+ *
+ * @param array $index          Entries that each contain an 'embedding' key.
+ * @param array $queryEmbedding Query vector.
+ * @param int   $topK           Number of entries to keep after sorting.
+ *
+ * @return array The best-scoring entries, each extended with a 'similarity' key,
+ *               ordered from most to least similar.
+ */
+function semanticSearch(array $index, array $queryEmbedding, int $topK = 5): array
+{
+    // Attach a similarity score to every entry.
+    $scored = array_map(
+        static fn($entry) => array_merge(
+            $entry,
+            ['similarity' => cosineSimilarity($queryEmbedding, $entry['embedding'])]
+        ),
+        $index
+    );
+
+    // Highest similarity first; usort also reindexes the list from 0.
+    usort($scored, static fn($left, $right) => $right['similarity'] <=> $left['similarity']);
+
+    return array_slice($scored, 0, $topK);
+}
+
+/**
+ * Cosine similarity between two numeric vectors of equal length.
+ *
+ * @param array<int, float|int> $a First vector, indexed 0..n-1.
+ * @param array<int, float|int> $b Second vector, indexed 0..n-1.
+ *
+ * @return float Dot product over the product of magnitudes; 0.0 if either
+ *               vector has zero magnitude (including empty vectors).
+ *
+ * @throws InvalidArgumentException When the vectors differ in length.
+ */
+function cosineSimilarity(array $a, array $b): float
+{
+    $dimension = count($a);
+    if ($dimension !== count($b)) {
+        throw new InvalidArgumentException(
+            'Vectors must have the same length, got ' . $dimension . ' and ' . count($b)
+        );
+    }
+
+    $dotProduct = $magnitudeA = $magnitudeB = 0.0;
+
+    for ($i = 0; $i < $dimension; $i++) {
+        $dotProduct += $a[$i] * $b[$i];
+        $magnitudeA += $a[$i] * $a[$i];
+        $magnitudeB += $b[$i] * $b[$i];
+    }
+
+    $denominator = sqrt($magnitudeA) * sqrt($magnitudeB);
+
+    // Avoid DivisionByZeroError: a zero vector is treated as "not similar".
+    return $denominator > 0.0 ? $dotProduct / $denominator : 0.0;
+}
+
+/**
+ * Embed a free-text query by writing it to a temporary file and extracting that file.
+ *
+ * @param Kreuzberg $kreuzberg Extractor used to process the temporary file.
+ * @param string    $query     Query text to embed.
+ *
+ * @return array|null Embedding of the first chunk, or null when extraction yields
+ *                    no chunk or the first chunk has no embedding.
+ *
+ * @throws RuntimeException When the temporary file cannot be created or written.
+ */
+function getQueryEmbedding(Kreuzberg $kreuzberg, string $query): ?array
+{
+    $tempFile = tempnam(sys_get_temp_dir(), 'query_');
+    if ($tempFile === false) {
+        throw new RuntimeException('Unable to create a temporary file for the query');
+    }
+
+    try {
+        // Written inside the try block so the file is removed even if the write fails.
+        if (file_put_contents($tempFile, $query) === false) {
+            throw new RuntimeException("Unable to write the query to {$tempFile}");
+        }
+
+        $result = $kreuzberg->extractFile($tempFile);
+        $chunk = ($result->chunks ?? [])[0] ?? null;
+
+        return $chunk?->embedding;
+    } finally {
+        // Always clean up; skip unlink if the file has already disappeared.
+        if (is_file($tempFile)) {
+            unlink($tempFile);
+        }
+    }
+}
+
+// Example questions to run against the index built above.
+$queries = [
+    "What are the key features of the product?",
+    "How do I install and configure the system?",
+    "What are the pricing options?",
+    "How does authentication work?",
+    "What are the performance benchmarks?",
+];
+
+foreach ($queries as $query) {
+    echo "Query: \"$query\"\n";
+    echo str_repeat('=', 60) . "\n";
+
+    $queryEmbedding = getQueryEmbedding($kreuzberg, $query);
+
+    // No results are printed when the query could not be embedded.
+    if ($queryEmbedding) {
+        $results = semanticSearch($documentIndex, $queryEmbedding, 3);
+
+        foreach ($results as $index => $result) {
+            echo "\nResult " . ($index + 1) . " (similarity: " .
+                number_format($result['similarity'], 4) . "):\n";
+            echo "File: {$result['file']}\n";
+            echo "Title: {$result['metadata']['title']}\n";
+            // mb_substr cuts on character boundaries; substr could split a
+            // multibyte character and print invalid UTF-8.
+            echo "Content: " . mb_substr($result['content'], 0, 200) . "...\n";
+        }
+    }
+
+    echo "\n" . str_repeat('-', 60) . "\n\n";
+}
+
+/**
+ * Assemble a prompt context from search results within a rough token budget.
+ *
+ * Results are taken in the given order. Assembly stops at the first result that
+ * would push the running estimate above $maxTokens; later results are not considered.
+ *
+ * @param array $searchResults Entries with 'file' and 'content' keys.
+ * @param int   $maxTokens     Upper bound for the estimated token total.
+ *
+ * @return string The header "Relevant context:" followed by one section per included result.
+ */
+function buildRAGContext(array $searchResults, int $maxTokens = 2000): string
+{
+    $sections = [];
+    $usedTokens = 0;
+
+    foreach ($searchResults as $hit) {
+        // Token estimate: byte length of the content divided by four.
+        $estimate = strlen($hit['content']) / 4;
+
+        if ($usedTokens + $estimate > $maxTokens) {
+            break;
+        }
+
+        $sections[] = "From {$hit['file']}:\n" . $hit['content'] . "\n\n";
+        $usedTokens += $estimate;
+    }
+
+    return "Relevant context:\n\n" . implode('', $sections);
+}
+
+// Build an LLM-ready context for a single question from the five best matches.
+$userQuestion = "How do I optimize performance?";
+$queryEmbedding = getQueryEmbedding($kreuzberg, $userQuestion);
+
+if ($queryEmbedding) {
+    $results = semanticSearch($documentIndex, $queryEmbedding, 5);
+    $context = buildRAGContext($results);
+
+    echo "RAG Context for: \"$userQuestion\"\n";
+    echo str_repeat('=', 60) . "\n";
+    echo $context;
+    echo "\nContext ready for LLM prompt!\n";
+}
+
+// Persist the index. JSON_THROW_ON_ERROR raises JsonException (e.g. on NaN/INF
+// or invalid UTF-8) instead of returning false, which would be written as an empty file.
+$indexPath = 'document_index.json';
+$indexJson = json_encode($documentIndex, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR);
+
+if (file_put_contents($indexPath, $indexJson) === false) {
+    throw new RuntimeException("Failed to write {$indexPath}");
+}
+echo "\nSaved document index to: {$indexPath}\n";
+
+/**
+ * Run several related queries and merge their results into one ranking.
+ *
+ * Each query is embedded and searched separately. Hits for the same chunk
+ * (same 'file' and 'chunk_index') are grouped, and the group is scored by the
+ * mean of the similarities from the queries that returned it.
+ *
+ * @param array     $index     Entries with 'file', 'chunk_index' and 'embedding' keys.
+ * @param array     $queries   Query strings; a query that yields no embedding is skipped.
+ * @param Kreuzberg $kreuzberg Extractor used to embed each query.
+ * @param int       $perQueryK Number of hits to take from each query (default 10).
+ * @param int       $topK      Number of merged results to return (default 5).
+ *
+ * @return array Merged entries ordered by descending 'avg_similarity'. Each entry is the
+ *               first hit seen for its chunk (its 'similarity' comes from that query)
+ *               plus the 'avg_similarity' key.
+ */
+function multiQuerySearch(
+    array $index,
+    array $queries,
+    Kreuzberg $kreuzberg,
+    int $perQueryK = 10,
+    int $topK = 5
+): array {
+    $grouped = [];
+
+    foreach ($queries as $query) {
+        $queryEmbedding = getQueryEmbedding($kreuzberg, $query);
+        if (!$queryEmbedding) {
+            continue;
+        }
+
+        // Group hits as they arrive instead of re-merging one growing array per query.
+        foreach (semanticSearch($index, $queryEmbedding, $perQueryK) as $result) {
+            $key = $result['file'] . '_' . $result['chunk_index'];
+            $grouped[$key] ??= [
+                'result' => $result,
+                'similarities' => [],
+            ];
+            $grouped[$key]['similarities'][] = $result['similarity'];
+        }
+    }
+
+    $final = [];
+    foreach ($grouped as $data) {
+        // Every group holds at least one similarity, so the division is safe.
+        $avgSimilarity = array_sum($data['similarities']) / count($data['similarities']);
+        $final[] = array_merge($data['result'], ['avg_similarity' => $avgSimilarity]);
+    }
+
+    usort($final, static fn($a, $b) => $b['avg_similarity'] <=> $a['avg_similarity']);
+
+    return array_slice($final, 0, $topK);
+}
+
+// Several phrasings of the same information need, merged into a single ranking.
+$relatedQueries = [
+    "system requirements",
+    "installation steps",
+    "getting started guide",
+];
+
+echo "\nMulti-query search results:\n";
+echo str_repeat('=', 60) . "\n";
+
+$results = multiQuerySearch($documentIndex, $relatedQueries, $kreuzberg);
+
+foreach ($results as $index => $result) {
+    echo "\n" . ($index + 1) . ". {$result['file']}\n";
+    echo "   Average similarity: " . number_format($result['avg_similarity'], 4) . "\n";
+    // mb_substr keeps the preview on character boundaries for multibyte text.
+    echo "   " . mb_substr($result['content'], 0, 150) . "...\n";
+}
+```