docs/snippets/php/configuration/chunking_config.php

```php title="chunking_config.php"
<?php

declare(strict_types=1);

/**
 * Text Chunking Configuration
 *
 * This example demonstrates how to configure text chunking for RAG (Retrieval-Augmented Generation)
 * applications. Chunking splits long documents into smaller, semantically meaningful segments.
 */

require_once __DIR__ . '/vendor/autoload.php';

use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ChunkingConfig;

// Example 1: chunk with an argument-less ChunkingConfig and report each chunk's
// character count, byte span and (when present) page span.
echo "Example 1: Basic Chunking\n";
echo "=========================\n";

// No arguments are passed, so every chunking option keeps its constructor default.
$config1 = new ExtractionConfig(chunking: new ChunkingConfig());

$kreuzberg = new Kreuzberg($config1);
$result = $kreuzberg->extractFile('long_document.pdf');

// A null chunk list means there is nothing to report; the whole report is skipped.
$chunks = $result->chunks;
if ($chunks !== null) {
    echo "Total chunks: " . count($chunks) . "\n";

    foreach ($chunks as $index => $piece) {
        $meta = $piece->metadata;

        echo "\nChunk {$index}:\n";
        echo "- Text length: {$meta->charCount} characters\n";
        echo "- Byte range: {$meta->byteStart}-{$meta->byteEnd}\n";

        // The page span is printed only when the chunk carries a first page.
        if ($meta->firstPage === null) {
            continue;
        }
        echo "- Pages: {$meta->firstPage}-{$meta->lastPage}\n";
    }
}

echo "\n\n";

echo "Example 2: Custom Chunk Size (Small chunks for fine-grained retrieval)\n";
echo "======================================================================\n";

// Small chunks with a small overlap; sentence boundaries on, paragraph boundaries off.
$chunking2 = new ChunkingConfig(
    maxChunkSize: 256,
    chunkOverlap: 25,
    respectSentences: true,
    respectParagraphs: false,
);
$config2 = new ExtractionConfig(chunking: $chunking2);

$extractor2 = new Kreuzberg($config2);
$result2 = $extractor2->extractFile('document.pdf');

// `??` treats an unset or null chunk list as empty, so the count falls back to 0.
$chunkCount2 = count($result2->chunks ?? []);
echo "Chunks created: " . $chunkCount2 . "\n\n";

echo "Example 3: Large Chunks (More context per chunk)\n";
echo "================================================\n";

// Large chunks with a larger overlap; sentence and paragraph boundaries are both respected.
$config3 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 2000,
        chunkOverlap: 200,
        respectSentences: true,
        respectParagraphs: true,
    ),
);

$extractor3 = new Kreuzberg($config3);
$result3 = $extractor3->extractFile('document.pdf');

// An unset or null chunk list is reported as zero chunks.
$created3 = $result3->chunks ?? [];
echo "Chunks created: " . count($created3) . "\n\n";

// Example 4: extract with a 512 / 50 configuration and print size statistics
// (average, minimum, maximum character count) over the resulting chunks.
echo "Example 4: RAG-Optimized Configuration\n";
echo "=====================================\n";

$config4 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true,
        respectParagraphs: false,
    ),
);

$result4 = (new Kreuzberg($config4))->extractFile('document.pdf');

if ($result4->chunks !== null) {
    echo "Total chunks: " . count($result4->chunks) . "\n";

    // One character count per chunk, in chunk order.
    $chunkSizes = array_map(
        static fn ($chunk) => $chunk->metadata->charCount,
        $result4->chunks,
    );

    // An empty (but non-null) chunk list would otherwise divide by zero in the
    // average and make min()/max() throw ValueError, so statistics are printed
    // only when there is at least one chunk.
    if ($chunkSizes !== []) {
        $averageSize = round(array_sum($chunkSizes) / count($chunkSizes));

        echo "Average chunk size: " . $averageSize . " characters\n";
        echo "Min chunk size: " . min($chunkSizes) . " characters\n";
        echo "Max chunk size: " . max($chunkSizes) . " characters\n";
    }
}

echo "\n\n";

// Example 5: turn every chunk into an associative array shaped like a database row.
echo "Example 5: Processing Chunks for Vector Database\n";
echo "================================================\n";

$config5 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true,
    ),
);

$result5 = (new Kreuzberg($config5))->extractFile('document.pdf');

if ($result5->chunks !== null) {
    // The identifier is the same for every chunk, so it is assigned once
    // instead of being re-assigned on each loop iteration.
    $documentId = "doc_123";

    foreach ($result5->chunks as $i => $chunk) {
        $metadata = $chunk->metadata;

        $chunkData = [
            'document_id' => $documentId,
            'chunk_index' => $i,
            'text' => $chunk->content,
            'char_count' => $metadata->charCount,
            'byte_start' => $metadata->byteStart,
            'byte_end' => $metadata->byteEnd,
            // "first-last" page span, or null when the chunk has no first page.
            'page_range' => $metadata->firstPage !== null
                ? "{$metadata->firstPage}-{$metadata->lastPage}"
                : null,
        ];

        // $chunkData is only built here; this example does not store it anywhere.
        // A real pipeline would hand it to its database client at this point.
        echo "Prepared chunk {$i} for database insertion\n";
    }
}

echo "\n\n";

// Example 6: markdown chunker with tokenizer-based sizing; prints a short preview
// of each chunk and, when present, the headings attached to it.
echo "Example 6: Markdown Chunker with Token-Based Sizing and Heading Context\n";
echo "========================================================================\n";

$config6 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        chunkerType: 'markdown',
        sizing: [
            'type' => 'tokenizer',
            'model' => 'Xenova/gpt-4o',
        ],
    ),
);

$result6 = (new Kreuzberg($config6))->extractFile('document.md');

if ($result6->chunks !== null) {
    echo "Total chunks: " . count($result6->chunks) . "\n";

    foreach ($result6->chunks as $i => $chunk) {
        echo "\nChunk {$i}:\n";

        // Cut the preview by characters rather than bytes: substr() can stop in the
        // middle of a multibyte UTF-8 sequence and print a broken character.
        // mb_substr() needs the mbstring extension.
        // NOTE(review): assumes chunk content is UTF-8 — confirm.
        echo "- Text preview: " . mb_substr($chunk->content, 0, 60, 'UTF-8') . "...\n";

        // Heading context is optional; the list is printed only when it is set.
        if (isset($chunk->metadata->headingContext->headings)) {
            $headings = $chunk->metadata->headingContext->headings;
            echo "- Headings in context:\n";
            foreach ($headings as $heading) {
                echo "  - Level {$heading->level}: {$heading->text}\n";
            }
        }
    }
}

// Printed reference list of the ChunkingConfig options used in this script.
echo "\n\nChunking Configuration Parameters:\n",
    "==================================\n",
    "- maxChunkSize: Maximum number of characters per chunk\n",
    "- chunkOverlap: Number of overlapping characters between chunks\n",
    "- respectSentences: Split at sentence boundaries when possible\n",
    "- respectParagraphs: Split at paragraph boundaries when possible\n",
    "- chunkerType: Type of chunker ('simple' or 'markdown')\n",
    "- sizing: Sizing strategy configuration\n",
    "  - type: 'character' or 'tokenizer'\n",
    "  - model: Tokenizer model (e.g., 'Xenova/gpt-4o')\n",
    "\n\n";

// Example 7: markdown chunker with prependHeadingContext enabled; prints a short
// preview of every chunk.
echo "Example 7: Prepend Heading Context\n";
echo "====================================\n";

$config7 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        chunkerType: 'markdown',
        prependHeadingContext: true,
    ),
);

$result7 = (new Kreuzberg($config7))->extractFile('document.md');

if ($result7->chunks !== null) {
    echo "Total chunks: " . count($result7->chunks) . "\n";

    foreach ($result7->chunks as $i => $chunk) {
        // Each chunk's content is prefixed with its heading breadcrumb,
        // e.g. "# Section > ## Subsection\n\nActual content..."
        //
        // The preview is cut by characters rather than bytes: substr() can stop in
        // the middle of a multibyte UTF-8 sequence and print a broken character.
        // mb_substr() needs the mbstring extension.
        // NOTE(review): assumes chunk content is UTF-8 — confirm.
        $preview = mb_substr($chunk->content, 0, 80, 'UTF-8');
        echo "\nChunk {$i} preview: " . $preview . "...\n";
    }
}

// Closing summary: each entry already ends with its own newline, so the list is
// joined without a separator and written in one go.
$bestPractices = [
    "\nBest Practices:\n",
    "- Use 256-512 chars for fine-grained retrieval\n",
    "- Use 1000-2000 chars for more context\n",
    "- Set overlap to ~10% of chunk size\n",
    "- Enable respectSentences for better coherence\n",
    "- Use markdown chunker for structured documents with headings\n",
    "- Use token-based sizing for LLM token budgets\n",
    "- Enable prependHeadingContext to embed heading breadcrumbs in chunk content\n",
];

echo implode('', $bestPractices);
```
Nomad changes 2026-06-01 23:40:55 +02:00

```php title="chunking_config.php"
<?php

declare(strict_types=1);

/**
 * Text Chunking Configuration
 *
 * This example demonstrates how to configure text chunking for RAG (Retrieval-Augmented Generation)
 * applications. Chunking splits long documents into smaller, semantically meaningful segments.
 */

require_once __DIR__ . '/vendor/autoload.php';

use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ChunkingConfig;

echo "Example 1: Basic Chunking\n";
echo "=========================\n";

$config1 = new ExtractionConfig(
    chunking: new ChunkingConfig()
);

$kreuzberg = new Kreuzberg($config1);
$result = $kreuzberg->extractFile('long_document.pdf');

if ($result->chunks !== null) {
    echo "Total chunks: " . count($result->chunks) . "\n";
    foreach ($result->chunks as $i => $chunk) {
        echo "\nChunk {$i}:\n";
        echo "- Text length: {$chunk->metadata->charCount} characters\n";
        echo "- Byte range: {$chunk->metadata->byteStart}-{$chunk->metadata->byteEnd}\n";
        if ($chunk->metadata->firstPage !== null) {
            echo "- Pages: {$chunk->metadata->firstPage}-{$chunk->metadata->lastPage}\n";
        }
    }
}

echo "\n\n";

echo "Example 2: Custom Chunk Size (Small chunks for fine-grained retrieval)\n";
echo "======================================================================\n";

$config2 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 256,
        chunkOverlap: 25,
        respectSentences: true,
        respectParagraphs: false
    )
);

$result2 = (new Kreuzberg($config2))->extractFile('document.pdf');
echo "Chunks created: " . (isset($result2->chunks) ? count($result2->chunks) : 0) . "\n\n";

echo "Example 3: Large Chunks (More context per chunk)\n";
echo "================================================\n";

$config3 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 2000,
        chunkOverlap: 200,
        respectSentences: true,
        respectParagraphs: true
    )
);

$result3 = (new Kreuzberg($config3))->extractFile('document.pdf');
echo "Chunks created: " . (isset($result3->chunks) ? count($result3->chunks) : 0) . "\n\n";

echo "Example 4: RAG-Optimized Configuration\n";
echo "=====================================\n";

$config4 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true,
        respectParagraphs: false
    )
);

$result4 = (new Kreuzberg($config4))->extractFile('document.pdf');

if ($result4->chunks !== null) {
    echo "Total chunks: " . count($result4->chunks) . "\n";

    $chunkSizes = array_map(fn($chunk) => $chunk->metadata->charCount, $result4->chunks);
    echo "Average chunk size: " . round(array_sum($chunkSizes) / count($chunkSizes)) . " characters\n";
    echo "Min chunk size: " . min($chunkSizes) . " characters\n";
    echo "Max chunk size: " . max($chunkSizes) . " characters\n";
}

echo "\n\n";

echo "Example 5: Processing Chunks for Vector Database\n";
echo "================================================\n";

$config5 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true
    )
);

$result5 = (new Kreuzberg($config5))->extractFile('document.pdf');

if ($result5->chunks !== null) {
    foreach ($result5->chunks as $i => $chunk) {
        $documentId = "doc_123";
        $chunkData = [
            'document_id' => $documentId,
            'chunk_index' => $i,
            'text' => $chunk->content,
            'char_count' => $chunk->metadata->charCount,
            'byte_start' => $chunk->metadata->byteStart,
            'byte_end' => $chunk->metadata->byteEnd,
            'page_range' => $chunk->metadata->firstPage !== null
                ? "{$chunk->metadata->firstPage}-{$chunk->metadata->lastPage}"
                : null,
        ];


        echo "Prepared chunk {$i} for database insertion\n";
    }
}

echo "\n\n";

echo "Example 6: Markdown Chunker with Token-Based Sizing and Heading Context\n";
echo "========================================================================\n";

$config6 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        chunkerType: 'markdown',
        sizing: [
            'type' => 'tokenizer',
            'model' => 'Xenova/gpt-4o'
        ]
    )
);

$result6 = (new Kreuzberg($config6))->extractFile('document.md');

if ($result6->chunks !== null) {
    echo "Total chunks: " . count($result6->chunks) . "\n";

    foreach ($result6->chunks as $i => $chunk) {
        echo "\nChunk {$i}:\n";
        echo "- Text preview: " . substr($chunk->content, 0, 60) . "...\n";

        if (isset($chunk->metadata->headingContext->headings)) {
            $headings = $chunk->metadata->headingContext->headings;
            echo "- Headings in context:\n";
            foreach ($headings as $heading) {
                echo "  - Level {$heading->level}: {$heading->text}\n";
            }
        }
    }
}

echo "\n\nChunking Configuration Parameters:\n";
echo "==================================\n";
echo "- maxChunkSize: Maximum number of characters per chunk\n";
echo "- chunkOverlap: Number of overlapping characters between chunks\n";
echo "- respectSentences: Split at sentence boundaries when possible\n";
echo "- respectParagraphs: Split at paragraph boundaries when possible\n";
echo "- chunkerType: Type of chunker ('simple' or 'markdown')\n";
echo "- sizing: Sizing strategy configuration\n";
echo "  - type: 'character' or 'tokenizer'\n";
echo "  - model: Tokenizer model (e.g., 'Xenova/gpt-4o')\n";
echo "\n\n";

echo "Example 7: Prepend Heading Context\n";
echo "====================================\n";

$config7 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        chunkerType: 'markdown',
        prependHeadingContext: true
    )
);

$result7 = (new Kreuzberg($config7))->extractFile('document.md');

if ($result7->chunks !== null) {
    echo "Total chunks: " . count($result7->chunks) . "\n";

    foreach ($result7->chunks as $i => $chunk) {
        // Each chunk's content is prefixed with its heading breadcrumb,
        // e.g. "# Section > ## Subsection\n\nActual content..."
        echo "\nChunk {$i} preview: " . substr($chunk->content, 0, 80) . "...\n";
    }
}

echo "\nBest Practices:\n";
echo "- Use 256-512 chars for fine-grained retrieval\n";
echo "- Use 1000-2000 chars for more context\n";
echo "- Set overlap to ~10% of chunk size\n";
echo "- Enable respectSentences for better coherence\n";
echo "- Use markdown chunker for structured documents with headings\n";
echo "- Use token-based sizing for LLM token budgets\n";
echo "- Enable prependHeadingContext to embed heading breadcrumbs in chunk content\n";
```