```php title="basic_chunking.php" extractFile('long_document.pdf'); echo "Document Chunking Results:\n"; echo str_repeat('=', 60) . "\n"; echo "Total chunks: " . count($result->chunks ?? []) . "\n"; echo "Total content length: " . strlen($result->content) . "\n\n"; foreach ($result->chunks ?? [] as $chunk) { echo "Chunk {$chunk->metadata->chunkIndex}:\n"; echo str_repeat('-', 60) . "\n"; echo "Length: " . strlen($chunk->content) . " chars\n"; echo "Content: " . substr($chunk->content, 0, 100) . "...\n\n"; } $sizes = [ 'Small (256)' => 256, 'Medium (512)' => 512, 'Large (1024)' => 1024, 'XLarge (2048)' => 2048, ]; foreach ($sizes as $name => $size) { $config = new ExtractionConfig( chunking: new ChunkingConfig( maxChunkSize: $size, chunkOverlap: (int)($size * 0.1) ) ); $kreuzberg = new Kreuzberg($config); $result = $kreuzberg->extractFile('document.pdf'); echo "$name chunks:\n"; echo " Total: " . count($result->chunks ?? []) . "\n"; echo " Avg size: " . number_format( array_sum(array_map( fn($c) => strlen($c->content), $result->chunks ?? [] )) / count($result->chunks ?? [1]) ) . " chars\n\n"; } $sentenceConfig = new ExtractionConfig( chunking: new ChunkingConfig( maxChunkSize: 512, chunkOverlap: 50, respectSentences: true, respectParagraphs: false ) ); $kreuzberg = new Kreuzberg($sentenceConfig); $result = $kreuzberg->extractFile('article.pdf'); echo "Sentence-respecting chunks:\n"; echo str_repeat('=', 60) . "\n"; foreach ($result->chunks ?? [] as $chunk) { $sentences = preg_match_all('/[.!?]+/', $chunk->content); echo "Chunk {$chunk->metadata->chunkIndex}: $sentences sentences\n"; echo " Starts with: " . substr($chunk->content, 0, 50) . "...\n"; echo " Ends with: ..." . substr($chunk->content, -50) . "\n\n"; } $paragraphConfig = new ExtractionConfig( chunking: new ChunkingConfig( maxChunkSize: 1000, chunkOverlap: 100, respectSentences: true, respectParagraphs: true ) ); $kreuzberg = new Kreuzberg($paragraphConfig); $result = $kreuzberg->extractFile('essay.pdf'); echo "Paragraph-respecting chunks:\n"; echo str_repeat('=', 60) . "\n"; foreach ($result->chunks ?? [] as $chunk) { $paragraphs = substr_count($chunk->content, "\n\n"); echo "Chunk {$chunk->metadata->chunkIndex}: ~$paragraphs paragraphs\n"; echo " " . strlen($chunk->content) . " characters\n\n"; } $config = new ExtractionConfig( chunking: new ChunkingConfig( maxChunkSize: 512, chunkOverlap: 50, respectSentences: true ) ); $kreuzberg = new Kreuzberg($config); $result = $kreuzberg->extractFile('knowledge_base.pdf'); $chunksForDb = []; foreach ($result->chunks ?? [] as $chunk) { $chunksForDb[] = [ 'id' => uniqid('chunk_', true), 'document_id' => 'doc_' . md5($result->content), 'chunk_index' => $chunk->metadata->chunkIndex, 'content' => $chunk->content, 'length' => strlen($chunk->content), 'metadata' => [ 'source_file' => 'knowledge_base.pdf', 'mime_type' => $result->mimeType, 'created_at' => date('Y-m-d H:i:s'), ], ]; } echo "Prepared " . count($chunksForDb) . " chunks for database:\n"; foreach (array_slice($chunksForDb, 0, 3) as $chunk) { echo " ID: {$chunk['id']}\n"; echo " Index: {$chunk['chunk_index']}\n"; echo " Length: {$chunk['length']} chars\n\n"; } file_put_contents( 'chunks.json', json_encode($chunksForDb, JSON_PRETTY_PRINT) ); echo "Saved chunks to: chunks.json\n"; ```