Files
fil/docs/snippets/php/utils/chunking_rag.php
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

81 lines
2.3 KiB
PHP

```php title="chunking_rag.php"
<?php
declare(strict_types=1);
/**
* Chunking for RAG (Retrieval-Augmented Generation)
*
* Advanced chunking configuration optimized for RAG systems with embeddings.
* Demonstrates how to process documents into chunks with embeddings for
* vector database storage and semantic search.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ChunkingConfig;
use Kreuzberg\Config\EmbeddingConfig;
// Embedding options passed to the chunker via its `embedding` argument.
$embeddingConfig = new EmbeddingConfig(
    model: 'balanced',
    normalize: true,
    batchSize: 16,
);

// Chunking options: size limit, overlap limit, and the embedding options above.
$chunkingConfig = new ChunkingConfig(
    maxChars: 500,
    maxOverlap: 50,
    embedding: $embeddingConfig,
);

// Top-level extraction configuration; only chunking is customised here.
$config = new ExtractionConfig(chunking: $chunkingConfig);

// Run the extraction on the sample document with the configuration above.
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('research_paper.pdf');
// Report header.
echo "RAG Chunking Results:\n";
echo str_repeat('=', 60) . "\n";

// Maximum number of characters shown in a chunk's content preview.
$previewLength = 100;

// Collect every chunk that carries an embedding, keeping both a short preview
// (for console output) and the full content + vector (for storage).
$chunksWithEmbeddings = [];
foreach ($result->chunks ?? [] as $chunk) {
    // Chunks without an embedding cannot be stored in a vector database.
    if ($chunk->embedding === null) {
        continue;
    }

    // Truncate by characters rather than bytes so a multi-byte UTF-8 character
    // is never cut in half, and only add the ellipsis when text was actually
    // dropped. Assumes the extracted content is UTF-8 encoded.
    $isTruncated = mb_strlen($chunk->content, 'UTF-8') > $previewLength;
    $preview = $isTruncated
        ? mb_substr($chunk->content, 0, $previewLength, 'UTF-8') . '...'
        : $chunk->content;

    $chunksWithEmbeddings[] = [
        'content' => $preview,
        'embedding_dims' => count($chunk->embedding),
        'full_content' => $chunk->content,
        'embedding' => $chunk->embedding,
    ];
}

echo "Chunks with embeddings: " . count($chunksWithEmbeddings) . "\n\n";
// Section header for the sample listing.
echo "Sample chunks for vector database:\n";
echo str_repeat('=', 60) . "\n";

// Show at most the first three collected chunks, numbered from 1.
$sampleChunks = array_slice($chunksWithEmbeddings, 0, 3);
foreach ($sampleChunks as $position => $sample) {
    $ordinal = $position + 1;

    echo "Chunk " . $ordinal . ":\n";
    echo " Content preview: " . $sample['content'] . "\n";
    echo " Embedding dimensions: " . $sample['embedding_dims'] . "\n";
    echo " Ready for vector DB: Yes\n\n";
}
// Source document the records are derived from; its MD5 hash is used as a
// stable prefix for record ids and is computed once instead of once per chunk.
$sourceFile = 'research_paper.pdf';
$documentHash = md5($sourceFile);

// Turn each collected chunk into a record with id, content, embedding and
// metadata. The chunk's position in $chunksWithEmbeddings is used as its index.
$vectorDbRecords = array_map(
    static fn (array $chunk, int $idx): array => [
        'id' => sprintf('doc_%s_chunk_%d', $documentHash, $idx),
        'content' => $chunk['full_content'],
        'embedding' => $chunk['embedding'],
        'metadata' => [
            'source' => $sourceFile,
            'chunk_index' => $idx,
            // Count characters, not bytes: strlen() over-counts multi-byte
            // UTF-8 text. Assumes the extracted content is UTF-8 encoded.
            'char_count' => mb_strlen($chunk['full_content'], 'UTF-8'),
        ],
    ],
    $chunksWithEmbeddings,
    array_keys($chunksWithEmbeddings),
);

echo "Prepared " . count($vectorDbRecords) . " records for vector database\n";
echo "Each record contains: id, content, embedding, and metadata\n";
```