This commit is contained in:
80
docs/snippets/wasm/advanced/chunking_rag.md
Normal file
80
docs/snippets/wasm/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,80 @@
|
||||
```typescript title="WASM - Chunking for RAG Pipeline"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
// Instantiate the WASM module before calling any extraction function.
await init();

// Extraction options for this pipeline: chunking is enabled and document
// structure is requested alongside the extracted text.
const config = {
  chunking: {
    maxChars: 512, // Smaller chunks for vector DB efficiency
    chunkOverlap: 100,
    trim: true,
  },
  includeDocumentStructure: true,
};

// NOTE: `buffer` is not declared in this snippet. It is assumed to hold the raw
// PDF bytes (e.g. an ArrayBuffer from `fetch` or a file input) obtained elsewhere.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// `||` (not `??`) is deliberate: an empty filename should also fall back to "doc".
const docName = result.metadata?.filename || "doc";

// Prepare chunks for vector database ingestion
const ragChunks = (result.chunks ?? []).map((chunk, idx) => {
  const meta = chunk.metadata;
  const byteStart = meta?.byteStart;
  const byteEnd = meta?.byteEnd;

  return {
    id: `${docName}_chunk_${idx}`,
    text: chunk.content,
    metadata: {
      source: result.metadata?.filename,
      chunk_index: meta?.chunkIndex,
      total_chunks: meta?.totalChunks,
      first_page: meta?.firstPage,
      last_page: meta?.lastPage,
      // Only emit a range when both offsets are known; interpolating missing
      // values would store the literal string "undefined-undefined".
      byte_position:
        byteStart != null && byteEnd != null
          ? `${byteStart}-${byteEnd}`
          : undefined,
    },
    // Embedding would be added by vector DB embedding model
  };
});

console.log(`Prepared ${ragChunks.length} chunks for RAG ingestion`);

// Preview the first three chunks, truncated to 50 characters each.
ragChunks.slice(0, 3).forEach((c) => {
  console.log(`Chunk ${c.id}: ${c.text.substring(0, 50)}...`);
});

// Example: Send to vector database (e.g., Pinecone, Weaviate, Milvus)
// `await` is only valid inside an async callback, and the mapped promises must
// be resolved with Promise.all before the records are passed to the client.
// const vectorResults = await vectorDb.upsert(
//   await Promise.all(
//     ragChunks.map(async (c) => ({
//       id: c.id,
//       values: await embedModel.embed(c.text),
//       metadata: c.metadata,
//     })),
//   ),
// );
|
||||
```
|
||||
|
||||
```typescript title="WASM - RAG with Retrieval Context"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
// The WASM module has to be initialised once before extraction is used.
await init();

// Chunking options for this example; heading context is prepended to chunks.
const config = {
  chunking: {
    maxChars: 768,
    chunkOverlap: 200,
    prependHeadingContext: true, // For markdown/docs
  },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/markdown", config);

// Build retrieval augmented context
/** One retrievable unit produced from a single extracted chunk. */
interface RagDocument {
  id: string;
  query_text: string;
  context: string;
  page: number | null;
}

// Collect one RagDocument per chunk; the list stays empty when the result
// carries no chunks.
const ragDocs: RagDocument[] = [];
(result.chunks ?? []).forEach((chunk, idx) => {
  const body = chunk.content;
  ragDocs.push({
    id: `chunk_${idx}`,
    query_text: body, // Text to embed and search
    context: body, // Full context (includes heading)
    page: chunk.metadata?.firstPage ?? null,
  });
});

console.log(`Built RAG documents: ${ragDocs.length}`);
|
||||
```
|
||||
Reference in New Issue
Block a user