Files
fil/docs/snippets/wasm/advanced/chunking_rag.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

2.2 KiB

import init, { extractBytes } from "kreuzberg-wasm";

// Await the package's default-exported init() before using extractBytes.
await init();

// Chunking options, kept in their own object so they can be tuned separately
// from the rest of the extraction config.
const chunking = {
  maxChars: 512, // Smaller chunks for vector DB efficiency
  chunkOverlap: 100,
  trim: true,
};
const config = { chunking, includeDocumentStructure: true };

// `buffer` is not declared in this snippet; the surrounding code must provide it.
// NOTE(review): presumably an ArrayBuffer holding the PDF file's bytes — confirm.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Prepare chunks for vector database ingestion.
// Each record carries an id, the chunk text, and flat provenance metadata.
// When the extraction result has no chunks, this is an empty array.
const ragChunks = (result.chunks ?? []).map((chunk, idx) => {
  const source = result.metadata?.filename;
  const byteStart = chunk.metadata?.byteStart;
  const byteEnd = chunk.metadata?.byteEnd;

  return {
    // `||` rather than `??` on purpose: an empty filename also falls back to "doc".
    id: `${source || "doc"}_chunk_${idx}`,
    text: chunk.content,
    metadata: {
      source,
      chunk_index: chunk.metadata?.chunkIndex,
      total_chunks: chunk.metadata?.totalChunks,
      first_page: chunk.metadata?.firstPage,
      last_page: chunk.metadata?.lastPage,
      // Emit the range only when both offsets are known; interpolating missing
      // values would store the literal string "undefined-undefined".
      byte_position:
        byteStart != null && byteEnd != null
          ? `${byteStart}-${byteEnd}`
          : undefined,
    },
    // Embedding would be added by vector DB embedding model
  };
});

// Report how many chunks were prepared, then preview up to the first three.
console.log(`Prepared ${ragChunks.length} chunks for RAG ingestion`);
for (const { id, text } of ragChunks.slice(0, 3)) {
  // Only the first 50 characters of each chunk's text are printed.
  console.log(`Chunk ${id}: ${text.substring(0, 50)}...`);
}

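// Example: Send to vector database (e.g., Pinecone, Weaviate, Milvus)
// Embed every chunk first (the map callback must be async to use await),
// then upsert the resolved vectors in one call.
// const vectors = await Promise.all(
//   ragChunks.map(async (c) => ({
//     id: c.id,
//     values: await embedModel.embed(c.text),
//     metadata: c.metadata,
//   })),
// );
// const vectorResults = await vectorDb.upsert(vectors);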
import init, { extractBytes } from "kreuzberg-wasm";

// Await the package's default-exported init() before using extractBytes.
await init();

// Chunking options for this example, named separately from the enclosing config.
const headingAwareChunking = {
  maxChars: 768,
  chunkOverlap: 200,
  prependHeadingContext: true, // For markdown/docs
};
const config = { chunking: headingAwareChunking };

// `buffer` is not declared in this snippet; the surrounding code must provide it.
// NOTE(review): presumably an ArrayBuffer holding the markdown file's bytes — confirm.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/markdown", config);

// Build retrieval augmented context
/**
 * One retrieval unit built from a single extracted chunk.
 */
interface RagDocument {
  /** Identifier of the document within the built list. */
  id: string;
  /** Text to embed and search. */
  query_text: string;
  /** Full context for the chunk. */
  context: string;
  /** Page associated with the chunk, or `null` when none is available. */
  page: number | null;
}

// Turn every extracted chunk into a RagDocument; a result without chunks
// leaves the list empty.
const ragDocs: RagDocument[] = [];
for (const [idx, chunk] of (result.chunks ?? []).entries()) {
  const text = chunk.content;
  ragDocs.push({
    id: `chunk_${idx}`,
    query_text: text, // Text to embed and search
    context: text, // Full context (includes heading)
    page: chunk.metadata?.firstPage ?? null,
  });
}

console.log(`Built RAG documents: ${ragDocs.length}`);