This commit is contained in:
215
docs/snippets/php/embeddings/basic_embeddings.php
Normal file
215
docs/snippets/php/embeddings/basic_embeddings.php
Normal file
@@ -0,0 +1,215 @@
|
||||
```php title="basic_embeddings.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Basic Embedding Generation
|
||||
*
|
||||
* Generate vector embeddings for semantic search and similarity matching.
|
||||
* Requires ONNX Runtime to be installed.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
use Kreuzberg\Config\EmbeddingConfig;
|
||||
|
||||
// Extraction configuration: chunking settings plus an embedding model.
$chunkingConfig = new ChunkingConfig(maxChunkSize: 512, chunkOverlap: 50);
$embeddingConfig = new EmbeddingConfig(model: 'all-MiniLM-L6-v2', normalize: true);
$config = new ExtractionConfig(chunking: $chunkingConfig, embedding: $embeddingConfig);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

// A missing chunk list is treated the same as an empty one.
$extractedChunks = $result->chunks ?? [];

echo "Embedding Generation Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Chunks with embeddings: " . count($extractedChunks) . "\n\n";

foreach ($extractedChunks as $chunk) {
    echo "Chunk {$chunk->metadata->chunkIndex}:\n";
    echo " Content length: " . strlen($chunk->content) . " chars\n";

    if ($chunk->embedding !== null) {
        $dimension = count($chunk->embedding);
        echo " Embedding dimension: " . $dimension . "\n";

        // Show only the leading components, formatted to four decimals.
        $preview = [];
        foreach (array_slice($chunk->embedding, 0, 5) as $component) {
            $preview[] = number_format($component, 4);
        }
        echo " First 5 values: [" . implode(', ', $preview) . "...]\n";
    }

    echo "\n";
}
|
||||
|
||||
// Embedding models to compare against the same input document.
$models = [
    'all-MiniLM-L6-v2',
    'all-mpnet-base-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
];

foreach ($models as $model) {
    echo "Testing model: $model\n";

    $chunkingConfig = new ChunkingConfig(maxChunkSize: 256);
    $embeddingConfig = new EmbeddingConfig(model: $model, normalize: true);
    $config = new ExtractionConfig(chunking: $chunkingConfig, embedding: $embeddingConfig);
    $kreuzberg = new Kreuzberg($config);

    // Wall-clock time of the extraction call only.
    $startedAt = microtime(true);
    $result = $kreuzberg->extractFile('test_doc.pdf');
    $elapsed = microtime(true) - $startedAt;

    $allChunks = $result->chunks ?? [];
    $firstChunk = $allChunks[0] ?? null;

    // Nothing to report when the first chunk is missing or carries no embedding.
    if (!$firstChunk || !$firstChunk->embedding) {
        continue;
    }

    echo " Dimension: " . count($firstChunk->embedding) . "\n";
    echo " Time: " . number_format($elapsed, 3) . "s\n";
    echo " Chunks: " . count($allChunks) . "\n\n";
}
|
||||
|
||||
/**
 * Compute the cosine similarity between two numeric vectors.
 *
 * @param list<float|int> $a First vector.
 * @param list<float|int> $b Second vector; must contain as many elements as $a.
 *
 * @return float Dot product divided by the product of both magnitudes, or 0.0
 *               when either vector has zero magnitude (this includes empty vectors).
 *
 * @throws InvalidArgumentException When the two vectors differ in length.
 */
function cosineSimilarity(array $a, array $b): float
{
    $length = count($a);
    if ($length !== count($b)) {
        throw new InvalidArgumentException(
            sprintf('Vectors must have the same length, got %d and %d.', $length, count($b))
        );
    }

    // Re-index so the positional loop is also correct for arrays whose keys are not 0..n-1.
    $a = array_values($a);
    $b = array_values($b);

    $dotProduct = 0.0;
    $magnitudeA = 0.0;
    $magnitudeB = 0.0;

    for ($i = 0; $i < $length; $i++) {
        $dotProduct += $a[$i] * $b[$i];
        $magnitudeA += $a[$i] * $a[$i];
        $magnitudeB += $b[$i] * $b[$i];
    }

    $denominator = sqrt($magnitudeA) * sqrt($magnitudeB);

    // A zero-magnitude vector has no direction; dividing by zero would throw
    // DivisionByZeroError on PHP 8, so report "no similarity" instead.
    if ($denominator === 0.0) {
        return 0.0;
    }

    return $dotProduct / $denominator;
}
|
||||
|
||||
$chunkingConfig = new ChunkingConfig(maxChunkSize: 512);
$embeddingConfig = new EmbeddingConfig(model: 'all-MiniLM-L6-v2', normalize: true);
$config = new ExtractionConfig(chunking: $chunkingConfig, embedding: $embeddingConfig);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

echo "Chunk Similarity Analysis:\n";
echo str_repeat('=', 60) . "\n";

$chunks = $result->chunks ?? [];

// A comparison needs at least two chunks: the first one is the reference,
// the next (up to) five are compared against it.
$hasPair = count($chunks) >= 2;
$referenceChunk = $hasPair ? $chunks[0] : null;
$candidates = $hasPair ? array_slice($chunks, 1, 5) : [];

foreach ($candidates as $chunk) {
    // Skip pairs where either side has no embedding.
    if (!$referenceChunk->embedding || !$chunk->embedding) {
        continue;
    }

    $similarity = cosineSimilarity($referenceChunk->embedding, $chunk->embedding);

    echo "Chunk 0 vs Chunk {$chunk->metadata->chunkIndex}: " . number_format($similarity, 4) . "\n";
}

echo "\n";
|
||||
|
||||
/**
 * Minimal in-memory vector store: keeps embeddings with their text under an id
 * and ranks them against a query vector by cosine similarity.
 */
class SimpleVectorDB
{
    /** @var array<array-key, array{embedding: array, content: string}> Stored entries keyed by id. */
    private array $vectors = [];

    /**
     * Store (or overwrite) an entry.
     *
     * @param string $id        Identifier of the entry; an existing entry with the same id is replaced.
     * @param array  $embedding Embedding vector of the content.
     * @param string $content   Text the embedding belongs to.
     */
    public function add(string $id, array $embedding, string $content): void
    {
        $this->vectors[$id] = [
            'embedding' => $embedding,
            'content' => $content,
        ];
    }

    /**
     * Return the stored entries most similar to the query vector.
     *
     * @param array $queryEmbedding Query vector; must have the same length as the stored vectors.
     * @param int   $k              Maximum number of results; values <= 0 yield an empty list.
     *
     * @return list<array{id: string, similarity: float, content: string}> Best matches first.
     *
     * @throws InvalidArgumentException When a stored vector and the query differ in length.
     */
    public function search(array $queryEmbedding, int $k = 5): array
    {
        // A negative length would make array_slice() drop items from the end instead of limiting.
        if ($k <= 0) {
            return [];
        }

        $results = [];

        foreach ($this->vectors as $id => $data) {
            $results[] = [
                // PHP turns numeric-string array keys into ints; hand back the string that was stored.
                'id' => (string) $id,
                'similarity' => $this->cosineSimilarity($queryEmbedding, $data['embedding']),
                'content' => $data['content'],
            ];
        }

        usort($results, fn($a, $b) => $b['similarity'] <=> $a['similarity']);

        return array_slice($results, 0, $k);
    }

    /**
     * Cosine similarity of two equally long vectors; 0.0 when either has zero magnitude.
     *
     * @throws InvalidArgumentException When the vectors differ in length.
     */
    private function cosineSimilarity(array $a, array $b): float
    {
        $length = count($a);
        if ($length !== count($b)) {
            throw new InvalidArgumentException(
                sprintf('Vectors must have the same length, got %d and %d.', $length, count($b))
            );
        }

        // Re-index so positional access is valid for any key layout.
        $a = array_values($a);
        $b = array_values($b);

        $dotProduct = 0.0;
        $magA = 0.0;
        $magB = 0.0;

        for ($i = 0; $i < $length; $i++) {
            $dotProduct += $a[$i] * $b[$i];
            $magA += $a[$i] * $a[$i];
            $magB += $b[$i] * $b[$i];
        }

        $denominator = sqrt($magA) * sqrt($magB);

        // Avoid DivisionByZeroError (PHP 8) for zero-magnitude vectors.
        if ($denominator === 0.0) {
            return 0.0;
        }

        return $dotProduct / $denominator;
    }
}
|
||||
|
||||
$db = new SimpleVectorDB();

// Candidate documents; files that do not exist are silently skipped.
$files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];
foreach ($files as $file) {
    if (file_exists($file)) {
        // Reuses the $kreuzberg instance created earlier in this script.
        $result = $kreuzberg->extractFile($file);
        $fileChunks = $result->chunks ?? [];

        foreach ($fileChunks as $chunk) {
            // Only chunks that actually carry an embedding are stored.
            if (!$chunk->embedding) {
                continue;
            }

            $db->add(
                "{$file}_chunk_{$chunk->metadata->chunkIndex}",
                $chunk->embedding,
                $chunk->content
            );
        }
    }
}

echo "Vector database built\n";
echo "Ready for semantic search!\n";
|
||||
|
||||
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(maxChunkSize: 512),
    embedding: new EmbeddingConfig(model: 'all-MiniLM-L6-v2', normalize: true)
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('export_doc.pdf');

// One export record per chunk: generated id, text, embedding and a little provenance.
$exportData = [];
foreach ($result->chunks ?? [] as $chunk) {
    $exportData[] = [
        'id' => uniqid('vec_', true),
        'text' => $chunk->content,
        'embedding' => $chunk->embedding,
        'metadata' => [
            'chunk_index' => $chunk->metadata->chunkIndex,
            'source' => 'export_doc.pdf',
            'timestamp' => time(),
        ],
    ];
}

// JSON_THROW_ON_ERROR: without it json_encode() returns false on failure (e.g. invalid
// UTF-8 in a chunk) and an empty file would be written while success is reported.
$exportJson = json_encode($exportData, JSON_THROW_ON_ERROR);

// file_put_contents() returns false when the file cannot be written.
if (file_put_contents('embeddings_export.json', $exportJson) === false) {
    throw new RuntimeException('Failed to write embeddings_export.json');
}

echo "\nExported " . count($exportData) . " embeddings to embeddings_export.json\n";
|
||||
```
|
||||
221
docs/snippets/php/embeddings/semantic_search.php
Normal file
221
docs/snippets/php/embeddings/semantic_search.php
Normal file
@@ -0,0 +1,221 @@
|
||||
```php title="semantic_search.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Semantic Search with Embeddings
|
||||
*
|
||||
* Build a semantic search system using document embeddings.
|
||||
* Find relevant content based on meaning, not just keywords.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
use Kreuzberg\Config\EmbeddingConfig;
|
||||
|
||||
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true
    ),
    embedding: new EmbeddingConfig(
        model: 'all-MiniLM-L6-v2',
        normalize: true
    )
);

$kreuzberg = new Kreuzberg($config);

echo "Building document index...\n";
$documentIndex = [];

// glob() returns false on error; fall back to an empty list so that the loop
// and the count() in the summary below keep working (count(false) throws on PHP 8).
$files = glob('knowledge_base/*.pdf') ?: [];
foreach ($files as $file) {
    $fileName = basename($file);
    echo "Indexing: " . $fileName . "\n";

    $result = $kreuzberg->extractFile($file);

    foreach ($result->chunks ?? [] as $chunk) {
        // Chunks without an embedding cannot be searched, so they are left out of the index.
        if ($chunk->embedding) {
            $documentIndex[] = [
                'file' => $fileName,
                'chunk_index' => $chunk->metadata->chunkIndex,
                'content' => $chunk->content,
                'embedding' => $chunk->embedding,
                'metadata' => [
                    // Fall back to the file name / 'Unknown' when the document metadata is missing.
                    'title' => $result->metadata->title ?? $fileName,
                    'author' => $result->metadata->author ?? 'Unknown',
                ],
            ];
        }
    }
}

echo "Indexed " . count($documentIndex) . " chunks from " . count($files) . " documents\n\n";
|
||||
|
||||
/**
 * Rank index entries by cosine similarity to a query embedding.
 *
 * @param array $index          Entries, each with an 'embedding' key.
 * @param array $queryEmbedding Embedding vector of the query.
 * @param int   $topK           Number of entries to keep.
 *
 * @return array The best-scoring entries, each extended with a 'similarity' key, highest first.
 */
function semanticSearch(array $index, array $queryEmbedding, int $topK = 5): array
{
    // Attach a similarity score to every entry.
    $scored = array_map(
        fn($entry) => array_merge(
            $entry,
            ['similarity' => cosineSimilarity($queryEmbedding, $entry['embedding'])]
        ),
        $index
    );

    // Descending by score.
    usort($scored, fn($left, $right) => $right['similarity'] <=> $left['similarity']);

    return array_slice($scored, 0, $topK);
}
|
||||
|
||||
/**
 * Compute the cosine similarity between two numeric vectors.
 *
 * @param list<float|int> $a First vector.
 * @param list<float|int> $b Second vector; must contain as many elements as $a.
 *
 * @return float Dot product divided by the product of both magnitudes, or 0.0
 *               when either vector has zero magnitude (this includes empty vectors).
 *
 * @throws InvalidArgumentException When the two vectors differ in length.
 */
function cosineSimilarity(array $a, array $b): float
{
    $length = count($a);
    if ($length !== count($b)) {
        throw new InvalidArgumentException(
            sprintf('Vectors must have the same length, got %d and %d.', $length, count($b))
        );
    }

    // Re-index so the positional loop is also correct for arrays whose keys are not 0..n-1.
    $a = array_values($a);
    $b = array_values($b);

    $dotProduct = $magnitudeA = $magnitudeB = 0.0;

    for ($i = 0; $i < $length; $i++) {
        $dotProduct += $a[$i] * $b[$i];
        $magnitudeA += $a[$i] * $a[$i];
        $magnitudeB += $b[$i] * $b[$i];
    }

    $denominator = sqrt($magnitudeA) * sqrt($magnitudeB);

    // A zero-magnitude vector has no direction; dividing by zero would throw
    // DivisionByZeroError on PHP 8, so report "no similarity" instead.
    if ($denominator === 0.0) {
        return 0.0;
    }

    return $dotProduct / $denominator;
}
|
||||
|
||||
/**
 * Obtain an embedding for a free-text query by writing it to a temporary file
 * and running that file through the extractor.
 *
 * NOTE(review): the temporary file has no extension; how the extractor detects
 * the format of such a file is not visible here — confirm plain text is handled.
 *
 * @param Kreuzberg $kreuzberg Extractor instance used to process the query text.
 * @param string    $query     Query text to embed.
 *
 * @return array|null Embedding of the first resulting chunk, or null when no chunk
 *                    (or no embedding on it) was produced.
 *
 * @throws RuntimeException When the temporary file cannot be created or written.
 */
function getQueryEmbedding(Kreuzberg $kreuzberg, string $query): ?array
{
    $tempFile = tempnam(sys_get_temp_dir(), 'query_');
    if ($tempFile === false) {
        throw new RuntimeException('Unable to create a temporary file for the query text.');
    }

    // Everything that touches the file lives inside the try so it is always removed,
    // including when the write itself fails.
    try {
        if (file_put_contents($tempFile, $query) === false) {
            throw new RuntimeException('Unable to write the query text to ' . $tempFile);
        }

        $result = $kreuzberg->extractFile($tempFile);
        $chunk = ($result->chunks ?? [])[0] ?? null;

        return $chunk?->embedding;
    } finally {
        unlink($tempFile);
    }
}
|
||||
|
||||
// Example questions to run against the document index.
$queries = [
    "What are the key features of the product?",
    "How do I install and configure the system?",
    "What are the pricing options?",
    "How does authentication work?",
    "What are the performance benchmarks?",
];

foreach ($queries as $query) {
    printf("Query: \"%s\"\n", $query);
    echo str_repeat('=', 60) . "\n";

    $queryEmbedding = getQueryEmbedding($kreuzberg, $query);

    // Without an embedding there is nothing to search with; only the separator is printed.
    if ($queryEmbedding) {
        $results = semanticSearch($documentIndex, $queryEmbedding, 3);

        foreach ($results as $index => $result) {
            $rank = $index + 1;
            $score = number_format($result['similarity'], 4);
            // First 200 bytes of the chunk as a preview.
            $excerpt = substr($result['content'], 0, 200);

            echo "\nResult " . $rank . " (similarity: " . $score . "):\n";
            echo "File: {$result['file']}\n";
            echo "Title: {$result['metadata']['title']}\n";
            echo "Content: " . $excerpt . "...\n";
        }
    }

    echo "\n" . str_repeat('-', 60) . "\n\n";
}
|
||||
|
||||
/**
 * Concatenate search results into a context string, stopping at a token budget.
 *
 * Token usage is estimated as byte length / 4. Results are taken in the given
 * order; the first result that would exceed the budget ends the process.
 *
 * @param array $searchResults Results, each with 'file' and 'content' keys.
 * @param int   $maxTokens     Estimated token budget for the whole context.
 *
 * @return string Header line followed by one "From <file>:" section per included result.
 */
function buildRAGContext(array $searchResults, int $maxTokens = 2000): string
{
    $sections = ["Relevant context:\n\n"];
    $usedTokens = 0;

    foreach ($searchResults as $hit) {
        $estimate = strlen($hit['content']) / 4;

        $overBudget = $usedTokens + $estimate > $maxTokens;
        if ($overBudget) {
            break;
        }

        $sections[] = "From {$hit['file']}:\n";
        $sections[] = $hit['content'] . "\n\n";
        $usedTokens += $estimate;
    }

    return implode('', $sections);
}
|
||||
|
||||
$userQuestion = "How do I optimize performance?";
$queryEmbedding = getQueryEmbedding($kreuzberg, $userQuestion);

// Only build a context when the question could be embedded.
if ($queryEmbedding) {
    $results = semanticSearch($documentIndex, $queryEmbedding, 5);
    $context = buildRAGContext($results);

    echo "RAG Context for: \"$userQuestion\"\n";
    echo str_repeat('=', 60) . "\n";
    echo $context;
    echo "\nContext ready for LLM prompt!\n";
}

// JSON_THROW_ON_ERROR: without it json_encode() returns false on failure (e.g. invalid
// UTF-8 in a chunk) and an empty file would be written while success is reported.
$indexJson = json_encode($documentIndex, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR);

// file_put_contents() returns false when the file cannot be written.
if (file_put_contents('document_index.json', $indexJson) === false) {
    throw new RuntimeException('Failed to write document_index.json');
}

echo "\nSaved document index to: document_index.json\n";
|
||||
|
||||
/**
 * Run several queries against the index and rank chunks by their average similarity.
 *
 * Each query contributes its best matches; a chunk matched by more than one query
 * is scored by the mean of the similarities from the queries that returned it.
 * Queries for which no embedding could be obtained are skipped.
 *
 * @param array     $index         Index entries with 'file', 'chunk_index' and 'embedding' keys.
 * @param array     $queries       Query strings.
 * @param Kreuzberg $kreuzberg     Extractor used to embed each query.
 * @param int       $perQueryLimit Number of matches taken from each individual query.
 * @param int       $limit         Number of combined results returned.
 *
 * @return array Top entries, each extended with an 'avg_similarity' key, highest first.
 *               The 'similarity' key still holds the score from the first query that matched.
 */
function multiQuerySearch(
    array $index,
    array $queries,
    Kreuzberg $kreuzberg,
    int $perQueryLimit = 10,
    int $limit = 5
): array {
    // Group matches per chunk while searching, keyed by "<file>_<chunk index>".
    $grouped = [];

    foreach ($queries as $query) {
        $queryEmbedding = getQueryEmbedding($kreuzberg, $query);
        if (!$queryEmbedding) {
            continue;
        }

        foreach (semanticSearch($index, $queryEmbedding, $perQueryLimit) as $result) {
            $key = $result['file'] . '_' . $result['chunk_index'];

            // The first match of a chunk provides the entry data; later ones only add scores.
            $grouped[$key] ??= [
                'result' => $result,
                'similarities' => [],
            ];
            $grouped[$key]['similarities'][] = $result['similarity'];
        }
    }

    $final = [];
    foreach ($grouped as $data) {
        $avgSimilarity = array_sum($data['similarities']) / count($data['similarities']);
        $final[] = array_merge($data['result'], ['avg_similarity' => $avgSimilarity]);
    }

    usort($final, fn($a, $b) => $b['avg_similarity'] <=> $a['avg_similarity']);

    return array_slice($final, 0, $limit);
}
|
||||
|
||||
// Related phrasings that are searched together and ranked by average similarity.
$relatedQueries = [
    "system requirements",
    "installation steps",
    "getting started guide",
];

echo "\nMulti-query search results:\n";
echo str_repeat('=', 60) . "\n";

$results = multiQuerySearch($documentIndex, $relatedQueries, $kreuzberg);

foreach ($results as $index => $result) {
    $rank = $index + 1;
    $average = number_format($result['avg_similarity'], 4);
    // First 150 bytes of the chunk as a preview.
    $excerpt = substr($result['content'], 0, 150);

    echo "\n" . $rank . ". {$result['file']}\n";
    echo " Average similarity: " . $average . "\n";
    echo " " . $excerpt . "...\n";
}
|
||||
```
|
||||
Reference in New Issue
Block a user