81 lines
2.2 KiB
Markdown
81 lines
2.2 KiB
Markdown
```typescript title="WASM - Chunking for RAG Pipeline"
import init, { extractBytes } from "kreuzberg-wasm";

// Call init() before the first extraction call.
await init();

// Options handed to extractBytes: chunking parameters plus document structure output.
const config = {
  chunking: {
    maxChars: 512, // Smaller chunks for vector DB efficiency
    chunkOverlap: 100,
    trim: true,
  },
  includeDocumentStructure: true,
};

// NOTE: `buffer` is not defined in this snippet; it must be supplied by the
// surrounding application (presumably the raw PDF bytes — confirm at the call site).
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Prepare chunks for vector database ingestion
const source = result.metadata?.filename;
// `||` (not `??`) on purpose: an empty filename should also fall back to "doc".
const idPrefix = source || "doc";

const ragChunks = (result.chunks ?? []).map((chunk, idx) => {
  const meta = chunk.metadata;
  return {
    id: `${idPrefix}_chunk_${idx}`,
    text: chunk.content,
    metadata: {
      source,
      chunk_index: meta?.chunkIndex,
      total_chunks: meta?.totalChunks,
      first_page: meta?.firstPage,
      last_page: meta?.lastPage,
      // Build the range only when both offsets exist; interpolating missing
      // values would store the literal string "undefined-undefined".
      byte_position:
        meta?.byteStart != null && meta?.byteEnd != null
          ? `${meta.byteStart}-${meta.byteEnd}`
          : undefined,
    },
    // Embedding would be added by vector DB embedding model
  };
});

console.log(`Prepared ${ragChunks.length} chunks for RAG ingestion`);
ragChunks.slice(0, 3).forEach((c) => {
  console.log(`Chunk ${c.id}: ${c.text.substring(0, 50)}...`);
});

// Example: Send to vector database (e.g., Pinecone, Weaviate, Milvus)
// The map callback must be async to use `await`, and the resulting promises
// must be resolved with Promise.all before they are passed to upsert.
// const vectorResults = await vectorDb.upsert(
//   await Promise.all(
//     ragChunks.map(async (c) => ({
//       id: c.id,
//       values: await embedModel.embed(c.text),
//       metadata: c.metadata,
//     })),
//   ),
// );
```
|
|
|
|
```typescript title="WASM - RAG with Retrieval Context"
import init, { extractBytes } from "kreuzberg-wasm";

/** One retrieval record derived from a single extracted chunk. */
interface RagDocument {
  id: string;
  query_text: string;
  context: string;
  page: number | null;
}

// Call init() before extracting.
await init();

const config = {
  chunking: {
    maxChars: 768,
    chunkOverlap: 200,
    prependHeadingContext: true, // For markdown/docs
  },
};

// `buffer` is expected to come from the surrounding application.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/markdown", config);

// Build retrieval augmented context
const ragDocs: RagDocument[] = [];
(result.chunks ?? []).forEach((chunk, idx) => {
  const firstPage = chunk.metadata?.firstPage;
  ragDocs.push({
    id: `chunk_${idx}`,
    // Text to embed and search
    query_text: chunk.content,
    // Full context (includes heading)
    context: chunk.content,
    page: firstPage ?? null,
  });
});

console.log(`Built RAG documents: ${ragDocs.length}`);
```
|