208 lines
6.7 KiB
PHP
208 lines
6.7 KiB
PHP
```php title="chunking_config.php"
|
|
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
/**
|
|
* Text Chunking Configuration
|
|
*
|
|
* This example demonstrates how to configure text chunking for RAG (Retrieval-Augmented Generation)
|
|
* applications. Chunking splits long documents into smaller, semantically meaningful segments.
|
|
*/
|
|
|
|
require_once __DIR__ . '/vendor/autoload.php';
|
|
|
|
use Kreuzberg\Kreuzberg;
|
|
use Kreuzberg\Config\ExtractionConfig;
|
|
use Kreuzberg\Config\ChunkingConfig;
|
|
|
|
// Example 1: extract a PDF with a default-constructed ChunkingConfig and
// print the metadata recorded for every chunk.
echo "Example 1: Basic Chunking\n";
echo "=========================\n";

$config1 = new ExtractionConfig(
    chunking: new ChunkingConfig(),
);

$kreuzberg = new Kreuzberg($config1);
$result = $kreuzberg->extractFile('long_document.pdf');

// The whole report is skipped when the result carries no chunk list.
if ($result->chunks !== null) {
    echo 'Total chunks: ', count($result->chunks), "\n";

    foreach ($result->chunks as $i => $chunk) {
        $meta = $chunk->metadata;

        echo "\nChunk {$i}:\n";
        echo "- Text length: {$meta->charCount} characters\n";
        echo "- Byte range: {$meta->byteStart}-{$meta->byteEnd}\n";

        // The page range line is only printed when firstPage is set.
        if ($meta->firstPage === null) {
            continue;
        }

        echo "- Pages: {$meta->firstPage}-{$meta->lastPage}\n";
    }
}

echo "\n\n";
|
|
|
|
// Example 2: small chunks (max size 256, overlap 25) that respect sentence
// boundaries but not paragraph boundaries.
echo "Example 2: Custom Chunk Size (Small chunks for fine-grained retrieval)\n";
echo "======================================================================\n";

$config2 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 256,
        chunkOverlap: 25,
        respectSentences: true,
        respectParagraphs: false,
    ),
);

$result2 = (new Kreuzberg($config2))->extractFile('document.pdf');

// A missing or null chunk list is reported as zero chunks.
echo 'Chunks created: ', count($result2->chunks ?? []), "\n\n";
|
|
|
|
// Example 3: large chunks (max size 2000, overlap 200) that respect both
// sentence and paragraph boundaries.
echo "Example 3: Large Chunks (More context per chunk)\n";
echo "================================================\n";

$config3 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 2000,
        chunkOverlap: 200,
        respectSentences: true,
        respectParagraphs: true,
    ),
);

$result3 = (new Kreuzberg($config3))->extractFile('document.pdf');

// A missing or null chunk list is reported as zero chunks.
echo 'Chunks created: ', count($result3->chunks ?? []), "\n\n";
|
|
|
|
// Example 4: a mid-sized configuration (max size 512, overlap 50, sentence
// boundaries respected) followed by simple size statistics over the chunks.
echo "Example 4: RAG-Optimized Configuration\n";
echo "=====================================\n";

$config4 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true,
        respectParagraphs: false,
    ),
);

$result4 = (new Kreuzberg($config4))->extractFile('document.pdf');

if ($result4->chunks !== null) {
    $chunkTotal = count($result4->chunks);
    echo "Total chunks: " . $chunkTotal . "\n";

    // Statistics are only meaningful for a non-empty list: with zero chunks the
    // average would divide by zero (DivisionByZeroError) and min()/max() throw
    // ValueError on an empty array in PHP 8.
    if ($chunkTotal > 0) {
        $chunkSizes = array_map(
            static fn ($chunk) => $chunk->metadata->charCount,
            $result4->chunks,
        );

        echo "Average chunk size: " . round(array_sum($chunkSizes) / $chunkTotal) . " characters\n";
        echo "Min chunk size: " . min($chunkSizes) . " characters\n";
        echo "Max chunk size: " . max($chunkSizes) . " characters\n";
    }
}

echo "\n\n";
|
|
|
|
// Example 5: turn every chunk into a flat associative array of the kind that
// could be stored next to an embedding in a vector database.
echo "Example 5: Processing Chunks for Vector Database\n";
echo "================================================\n";

$config5 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true,
    ),
);

$result5 = (new Kreuzberg($config5))->extractFile('document.pdf');

if ($result5->chunks !== null) {
    // Hard-coded placeholder id; it is the same for every chunk, so it is
    // assigned once instead of on every loop iteration.
    $documentId = "doc_123";

    foreach ($result5->chunks as $i => $chunk) {
        $chunkData = [
            'document_id' => $documentId,
            'chunk_index' => $i,
            'text' => $chunk->content,
            'char_count' => $chunk->metadata->charCount,
            'byte_start' => $chunk->metadata->byteStart,
            'byte_end' => $chunk->metadata->byteEnd,
            // null when the chunk has no first page recorded.
            'page_range' => $chunk->metadata->firstPage !== null
                ? "{$chunk->metadata->firstPage}-{$chunk->metadata->lastPage}"
                : null,
        ];

        // NOTE: $chunkData is only built here, never sent anywhere; a real
        // integration would hand it to its storage layer at this point.
        echo "Prepared chunk {$i} for database insertion\n";
    }
}

echo "\n\n";
|
|
|
|
// Example 6: markdown chunker with tokenizer-based sizing; prints a short
// preview of every chunk plus the headings attached to its metadata.
echo "Example 6: Markdown Chunker with Token-Based Sizing and Heading Context\n";
echo "========================================================================\n";

$config6 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        chunkerType: 'markdown',
        sizing: [
            'type' => 'tokenizer',
            'model' => 'Xenova/gpt-4o',
        ],
    ),
);

$result6 = (new Kreuzberg($config6))->extractFile('document.md');

if ($result6->chunks !== null) {
    echo "Total chunks: " . count($result6->chunks) . "\n";

    foreach ($result6->chunks as $i => $chunk) {
        echo "\nChunk {$i}:\n";

        // mb_substr() cuts on character boundaries; byte-wise substr() could
        // split a multibyte character and emit an invalid sequence.
        // Assumes chunk content is UTF-8 - TODO confirm against the library.
        echo "- Text preview: " . mb_substr($chunk->content, 0, 60, 'UTF-8') . "...\n";

        // Heading context is optional; a missing or null link in the chain
        // yields null here, exactly like the isset() check it replaces.
        $headings = $chunk->metadata->headingContext->headings ?? null;

        if ($headings !== null) {
            echo "- Headings in context:\n";

            foreach ($headings as $heading) {
                echo " - Level {$heading->level}: {$heading->text}\n";
            }
        }
    }
}
|
|
|
|
// Legend describing the ChunkingConfig parameters used by the examples.
// Each entry is printed on its own line, in the order listed.
$parameterNotes = [
    '- maxChunkSize: Maximum number of characters per chunk',
    '- chunkOverlap: Number of overlapping characters between chunks',
    '- respectSentences: Split at sentence boundaries when possible',
    '- respectParagraphs: Split at paragraph boundaries when possible',
    "- chunkerType: Type of chunker ('simple' or 'markdown')",
    '- sizing: Sizing strategy configuration',
    " - type: 'character' or 'tokenizer'",
    " - model: Tokenizer model (e.g., 'Xenova/gpt-4o')",
];

echo "\n\nChunking Configuration Parameters:\n";
echo "==================================\n";
echo implode("\n", $parameterNotes), "\n";
echo "\n\n";
|
|
|
|
// Example 7: markdown chunker with prependHeadingContext enabled; prints a
// short preview of every chunk.
echo "Example 7: Prepend Heading Context\n";
echo "====================================\n";

$config7 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        chunkerType: 'markdown',
        prependHeadingContext: true,
    ),
);

$result7 = (new Kreuzberg($config7))->extractFile('document.md');

if ($result7->chunks !== null) {
    echo "Total chunks: " . count($result7->chunks) . "\n";

    foreach ($result7->chunks as $i => $chunk) {
        // Each chunk's content is prefixed with its heading breadcrumb,
        // e.g. "# Section > ## Subsection\n\nActual content..."
        //
        // mb_substr() cuts on character boundaries; byte-wise substr() could
        // split a multibyte character and emit an invalid sequence.
        // Assumes chunk content is UTF-8 - TODO confirm against the library.
        echo "\nChunk {$i} preview: " . mb_substr($chunk->content, 0, 80, 'UTF-8') . "...\n";
    }
}
|
|
|
|
// Closing summary: one recommendation per output line, in the order listed.
echo "\nBest Practices:\n";

$bestPractices = [
    '- Use 256-512 chars for fine-grained retrieval',
    '- Use 1000-2000 chars for more context',
    '- Set overlap to ~10% of chunk size',
    '- Enable respectSentences for better coherence',
    '- Use markdown chunker for structured documents with headings',
    '- Use token-based sizing for LLM token budgets',
    '- Enable prependHeadingContext to embed heading breadcrumbs in chunk content',
];

foreach ($bestPractices as $tip) {
    echo $tip, "\n";
}
|
|
```
|