2.2 KiB
2.2 KiB
import init, { extractBytes } from "kreuzberg-wasm";
// Run the package's init step before calling extractBytes.
await init();

// Extraction options: chunking limits plus document structure output.
const config = {
  chunking: {
    maxChars: 512, // Smaller chunks for vector DB efficiency
    chunkOverlap: 100,
    trim: true,
  },
  includeDocumentStructure: true,
};

// NOTE: `buffer` is not declared in this snippet; it must be supplied by the
// surrounding code (presumably the raw PDF contents — confirm at the call site).
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Prepare chunks for vector database ingestion.
// Each record gets a stable id, the chunk text, and flat metadata. When the
// extraction result carries no chunk list, ragChunks is an empty array.
const ragChunks =
  result.chunks?.map((chunk, idx) => {
    const byteStart = chunk.metadata?.byteStart;
    const byteEnd = chunk.metadata?.byteEnd;

    return {
      // `||` (not `??`) so an empty filename also falls back to "doc".
      id: `${result.metadata?.filename || "doc"}_chunk_${idx}`,
      text: chunk.content,
      metadata: {
        source: result.metadata?.filename,
        chunk_index: chunk.metadata?.chunkIndex,
        total_chunks: chunk.metadata?.totalChunks,
        first_page: chunk.metadata?.firstPage,
        last_page: chunk.metadata?.lastPage,
        // Only build the range when both offsets are known; interpolating a
        // missing offset would store the literal text "undefined-undefined".
        byte_position:
          byteStart != null && byteEnd != null
            ? `${byteStart}-${byteEnd}`
            : undefined,
      },
      // Embedding would be added by vector DB embedding model
    };
  }) ?? [];

console.log(`Prepared ${ragChunks.length} chunks for RAG ingestion`);

// Preview the first three chunks; the ellipsis is added only when the text
// was actually cut at 50 characters.
for (const c of ragChunks.slice(0, 3)) {
  const preview =
    c.text.length > 50 ? `${c.text.substring(0, 50)}...` : c.text;
  console.log(`Chunk ${c.id}: ${preview}`);
}
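// Example: Send to vector database (e.g., Pinecone, Weaviate, Milvus)
// Embeddings are computed asynchronously, so the callback must be `async`
// and the resulting promises resolved with Promise.all before upserting.
// const vectorResults = await vectorDb.upsert(
//   await Promise.all(
//     ragChunks.map(async (c) => ({
//       id: c.id,
//       values: await embedModel.embed(c.text),
//       metadata: c.metadata,
//     })),
//   ),
// );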
import init, { extractBytes } from "kreuzberg-wasm";
// Shape of one retrieval record built from an extracted chunk.
interface RagDocument {
  id: string;
  query_text: string;
  context: string;
  page: number | null;
}

// Run the package's init step before calling extractBytes.
await init();

// Extraction options for markdown input.
const config = {
  chunking: {
    maxChars: 768,
    chunkOverlap: 200,
    prependHeadingContext: true, // For markdown/docs
  },
};

// NOTE: `buffer` is not declared in this snippet; it must come from the
// surrounding code.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/markdown", config);

// Build retrieval augmented context: one RagDocument per chunk. The list
// stays empty when the extraction result has no chunk list.
const ragDocs: RagDocument[] = [];
result.chunks?.forEach((chunk, idx) => {
  const firstPage = chunk.metadata?.firstPage;
  ragDocs.push({
    id: `chunk_${idx}`,
    query_text: chunk.content, // Text to embed and search
    context: chunk.content, // Full context (includes heading)
    page: firstPage ?? null, // null when the chunk reports no first page
  });
});

console.log(`Built RAG documents: ${ragDocs.length}`);