This commit is contained in:
284
docs/snippets/php/utils/vector_database_integration.php
Normal file
284
docs/snippets/php/utils/vector_database_integration.php
Normal file
@@ -0,0 +1,284 @@
|
||||
```php title="vector_database_integration.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Vector Database Integration
|
||||
*
|
||||
* Extract documents with chunking and embeddings for vector database storage.
|
||||
* Demonstrates preparing data for semantic search and RAG applications.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
use Kreuzberg\Config\EmbeddingConfig;
|
||||
|
||||
// Build the extraction configuration from the inside out: embedding settings
// are passed to the chunking settings, which are passed to the extraction
// config. NOTE(review): 'balanced' looks like a model preset name - confirm
// against the Kreuzberg documentation.
$embeddingSettings = new EmbeddingConfig(
    model: 'balanced',
    normalize: true
);

$chunkingSettings = new ChunkingConfig(
    maxChars: 512,
    maxOverlap: 50,
    embedding: $embeddingSettings
);

$config = new ExtractionConfig(chunking: $chunkingSettings);

// Extract a single document; $result->chunks is read by the code that follows.
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');
|
||||
|
||||
// Report header for the single-document example.
echo "Vector Database Integration:\n", str_repeat('=', 60), "\n";
echo "Document: document.pdf\n";
echo "Total chunks: ", count($result->chunks ?? []), "\n\n";

// One record per chunk that actually carries an embedding.
$vectorRecords = [];

// The document hash is the same for every chunk, so compute it once.
$documentHash = md5('document.pdf');

foreach ($result->chunks ?? [] as $index => $chunk) {
    // Chunks without an embedding cannot be stored as vectors; skip them.
    if ($chunk->embedding !== null) {
        $vectorRecords[] = [
            // Record id combines the document hash with the chunk's index.
            'id' => sprintf('doc_%s_chunk_%d', $documentHash, $index),
            'content' => $chunk->content,
            'embedding' => $chunk->embedding,
            'metadata' => [
                'source_file' => 'document.pdf',
                'chunk_index' => $index,
                // strlen() counts bytes, not characters.
                'chunk_length' => strlen($chunk->content),
                'embedding_model' => 'balanced',
                'created_at' => date('c'),
            ],
        ];
    }
}

echo "Prepared ", count($vectorRecords), " records for vector database\n\n";
|
||||
|
||||
// Show the shape of the first prepared record, if there is one.
if (!empty($vectorRecords)) {
    $sample = $vectorRecords[0];

    echo "Sample Vector Record Structure:\n", str_repeat('-', 40), "\n";

    echo "ID: ", $sample['id'], "\n";
    // Preview is the first 100 bytes; the ellipsis is appended unconditionally.
    echo "Content preview: ", substr($sample['content'], 0, 100), "...\n";
    echo "Embedding dimensions: ", count($sample['embedding']), "\n";
    echo "Metadata keys: ", implode(', ', array_keys($sample['metadata'])), "\n\n";
}
|
||||
|
||||
/**
 * Simulate a batched upsert of vector records into a Pinecone namespace.
 *
 * No network call is made here: the function only prints what would be
 * sent, splitting the records into batches of at most 100.
 *
 * @param array  $records   Vector records to upsert.
 * @param string $namespace Namespace name shown in the progress output.
 */
function insertIntoPinecone(array $records, string $namespace = 'default'): void
{
    $total = count($records);

    echo "Inserting into Pinecone:\n", str_repeat('-', 40), "\n";

    $batchNumber = 0;
    foreach (array_chunk($records, 100) as $vectors) {
        $batchNumber++;
        printf(
            "Batch %d: Upserting %d vectors to namespace '%s'...\n",
            $batchNumber,
            count($vectors),
            $namespace
        );
    }

    echo "Completed inserting ", $total, " vectors\n\n";
}
|
||||
|
||||
/**
 * Simulate inserting vector records into a Weaviate class, one object at a time.
 *
 * No network call is made here: each record is mapped to an object payload
 * and a progress line is printed after every 10th object.
 *
 * Progress is tracked with an explicit counter rather than the array keys,
 * so the output stays correct when $records has sparse integer keys (for
 * example after array_filter()) or string keys.
 *
 * @param array  $records   Vector records; each must provide 'content',
 *                          'embedding' and 'metadata' with 'source_file',
 *                          'chunk_index' and 'created_at'.
 * @param string $className Class name placed in each object payload.
 */
function insertIntoWeaviate(array $records, string $className = 'Document'): void
{
    echo "Inserting into Weaviate:\n";
    echo str_repeat('-', 40) . "\n";

    $total = count($records);
    $inserted = 0;

    foreach ($records as $record) {
        // Payload a real integration would send to the client; it is only
        // built here to illustrate the mapping and is not transmitted.
        $object = [
            'class' => $className,
            'properties' => [
                'content' => $record['content'],
                'sourceFile' => $record['metadata']['source_file'],
                'chunkIndex' => $record['metadata']['chunk_index'],
                'createdAt' => $record['metadata']['created_at'],
            ],
            'vector' => $record['embedding'],
        ];

        $inserted++;

        if ($inserted % 10 === 0) {
            echo sprintf("Inserted %d/%d objects\n", $inserted, $total);
        }
    }

    echo "Completed inserting " . $total . " objects\n\n";
}
|
||||
|
||||
/**
 * Simulate upserting vector records as points into a Qdrant collection.
 *
 * No network call is made here: the records are mapped to point payloads
 * and only a summary of the would-be upsert is printed.
 *
 * NOTE(review): Qdrant's documentation describes point IDs as unsigned
 * integers or UUIDs; confirm the record ids passed in match that before
 * wiring this to a real client.
 *
 * @param array  $records        Vector records; each must provide 'id',
 *                               'embedding', 'content' and 'metadata'.
 * @param string $collectionName Collection name shown in the output.
 */
function insertIntoQdrant(
    array $records,
    string $collectionName = 'documents'
): void {
    echo "Inserting into Qdrant:\n", str_repeat('-', 40), "\n";

    // Map every record to a point: id, vector and a payload with the rest.
    $points = array_map(
        static fn ($record) => [
            'id' => $record['id'],
            'vector' => $record['embedding'],
            'payload' => [
                'content' => $record['content'],
                'metadata' => $record['metadata'],
            ],
        ],
        $records
    );

    printf(
        "Upserting %d points to collection '%s'...\n",
        count($points),
        $collectionName
    );

    echo "Completed\n\n";
}
|
||||
|
||||
// Run the three simulated inserts against the records prepared from
// document.pdf, each with its own target name.
echo "Vector Database Integration Examples:\n", str_repeat('=', 60), "\n\n";

insertIntoPinecone($vectorRecords, 'documents');
insertIntoWeaviate($vectorRecords, 'DocumentChunk');
insertIntoQdrant($vectorRecords, 'document_chunks');
|
||||
|
||||
// Files handled by the batch example; missing files are reported and skipped
// by the loop that follows.
$documents = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];

echo "Batch Processing for Vector Database:\n", str_repeat('=', 60), "\n";

// Accumulates the records of every document in the batch.
$allVectorRecords = [];

// Same chunking and embedding settings as the single-document example.
$vectorConfig = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChars: 512,
        maxOverlap: 50,
        embedding: new EmbeddingConfig(model: 'balanced', normalize: true)
    )
);

$kreuzberg = new Kreuzberg($vectorConfig);
|
||||
|
||||
foreach ($documents as $document) {
    // Skip anything that is not on disk instead of failing the whole batch.
    if (!file_exists($document)) {
        echo basename($document), ": File not found\n";
        continue;
    }

    $result = $kreuzberg->extractFile($document);

    echo basename($document), ":\n";
    echo " Chunks: ", count($result->chunks ?? []), "\n";

    // The hash of the path as given is the same for every chunk of this file.
    $documentHash = md5($document);

    foreach ($result->chunks ?? [] as $index => $chunk) {
        // Only chunks that carry an embedding become vector records.
        if ($chunk->embedding !== null) {
            $allVectorRecords[] = [
                'id' => sprintf('doc_%s_chunk_%d', $documentHash, $index),
                'content' => $chunk->content,
                'embedding' => $chunk->embedding,
                'metadata' => [
                    // The id hashes the path as given; metadata keeps only the base name.
                    'source_file' => basename($document),
                    'chunk_index' => $index,
                    // strlen() counts bytes, not characters.
                    'chunk_length' => strlen($chunk->content),
                    'embedding_model' => 'balanced',
                    'created_at' => date('c'),
                ],
            ];
        }
    }
}

echo "\nTotal records prepared: ", count($allVectorRecords), "\n\n";
|
||||
|
||||
/**
 * Simulate a semantic search over prepared vector records.
 *
 * No similarity is computed: the query is only echoed, the first $topK
 * records are treated as the hits, and each is printed with a placeholder
 * score that starts at 0.9 and drops by 0.05 per rank.
 *
 * The rank is tracked with an explicit counter rather than the array keys,
 * because array_slice() preserves string keys and records keyed by id would
 * otherwise break the rank arithmetic.
 *
 * @param string $query   Query text, used for display only.
 * @param array  $records Vector records; each must provide 'content' and
 *                        'metadata' with 'source_file'.
 * @param int    $topK    Number of leading records to return.
 *
 * @return array The first $topK records, as returned by array_slice().
 */
function simulateSemanticSearch(string $query, array $records, int $topK = 5): array
{
    echo "Simulating semantic search:\n";
    echo " Query: \"$query\"\n";
    echo " Searching " . count($records) . " vectors...\n";
    echo " Top $topK results:\n\n";

    $results = array_slice($records, 0, $topK);

    // Zero-based rank of the current hit, independent of the array keys.
    $position = 0;

    foreach ($results as $result) {
        echo sprintf(
            " %d. %s (score: %.3f)\n",
            $position + 1,
            // Preview is the first 60 bytes; the ellipsis is always appended.
            substr($result['content'], 0, 60) . '...',
            0.9 - ($position * 0.05)
        );
        echo sprintf(" Source: %s\n", $result['metadata']['source_file']);
        echo "\n";

        $position++;
    }

    return $results;
}
|
||||
|
||||
// Only demonstrate the search when the batch produced at least one record.
if (!empty($allVectorRecords)) {
    echo "Semantic Search Example:\n", str_repeat('=', 60), "\n";

    simulateSemanticSearch(
        query: "How to configure document extraction?",
        records: $allVectorRecords,
        topK: 3
    );
}
|
||||
|
||||
/**
 * Write vector records to a pretty-printed JSON file.
 *
 * The file contains an envelope with a format version, the record count,
 * a generation timestamp and the records themselves. Encoding happens
 * before the file is touched, so a record that cannot be encoded never
 * results in an empty or truncated export.
 *
 * @param array  $records  Vector records to export.
 * @param string $filename Destination path; an existing file is overwritten.
 *
 * @throws \JsonException    If the records cannot be encoded as JSON
 *                           (for example invalid UTF-8 text or NAN/INF
 *                           values in an embedding).
 * @throws \RuntimeException If the file cannot be written.
 */
function exportVectorRecordsToJson(array $records, string $filename): void
{
    $data = [
        'version' => '1.0',
        'count' => count($records),
        'generated_at' => date('c'),
        'records' => $records,
    ];

    // JSON_THROW_ON_ERROR: without it json_encode() returns false on failure,
    // which would be written out as an empty file and reported as success.
    $json = json_encode(
        $data,
        JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
    );

    if (file_put_contents($filename, $json) === false) {
        throw new \RuntimeException("Failed to write vector records to: $filename");
    }

    echo "Exported " . count($records) . " vector records to: $filename\n";
}
|
||||
|
||||
// Persist the batch records only when there is something to export.
if (!empty($allVectorRecords)) {
    exportVectorRecordsToJson(
        records: $allVectorRecords,
        filename: 'vector_records.json'
    );
}
|
||||
```
|
||||
Reference in New Issue
Block a user