This commit is contained in:
66
docs/snippets/wasm/config/chunking-config.ts
Normal file
66
docs/snippets/wasm/config/chunking-config.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Downloads `book.pdf`, extracts it with chunking enabled, and logs a short
 * preview of the first three chunks.
 *
 * The chunking options are passed to `extractBytes` through
 * `ExtractionConfig.chunking`. For the heading-aware ("markdown") chunker
 * variant, see `extractWithPrependHeadingContext` in this file.
 *
 * @returns A promise that settles once the preview has been logged.
 * @throws Error if the download responds with a non-2xx HTTP status, so an
 *   error page is never handed to the extractor as if it were PDF data.
 */
async function extractWithChunking(): Promise<void> {
  await initWasm();

  // `fetch` resolves even on 404/500, so the status has to be checked by hand.
  const response = await fetch("book.pdf");
  if (!response.ok) {
    throw new Error(`Failed to fetch book.pdf: ${response.status} ${response.statusText}`);
  }
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    chunking: {
      maxChars: 800,
      chunkOverlap: 150,
      splitOnNewlines: true,
      splitOnSentences: true,
    },
  };

  const result = await extractBytes(bytes, "application/pdf", config);

  // `chunks` may be absent on the result, hence the guard.
  if (result.chunks) {
    console.log(`Total chunks: ${result.chunks.length}`);

    // Only the first three chunks are printed to keep the output short.
    result.chunks.slice(0, 3).forEach((chunk, i) => {
      console.log(`\nChunk ${i}:`);
      console.log(`Chars: ${chunk.metadata.charStart}-${chunk.metadata.charEnd}`);
      console.log(`Content: ${chunk.content.substring(0, 100)}...`);
    });
  }
}
|
||||
|
||||
/**
 * Downloads `document.md`, extracts it with the "markdown" chunker and
 * `prependHeadingContext` enabled, and logs a preview of the first three
 * chunks.
 *
 * @returns A promise that settles once the preview has been logged.
 * @throws Error if the download responds with a non-2xx HTTP status, so an
 *   error page is never handed to the extractor as if it were Markdown.
 */
async function extractWithPrependHeadingContext(): Promise<void> {
  await initWasm();

  // `fetch` resolves even on 404/500, so the status has to be checked by hand.
  const response = await fetch("document.md");
  if (!response.ok) {
    throw new Error(`Failed to fetch document.md: ${response.status} ${response.statusText}`);
  }
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    chunking: {
      chunkerType: "markdown",
      maxChars: 800,
      prependHeadingContext: true,
    },
  };

  const result = await extractBytes(bytes, "text/markdown", config);

  // `chunks` may be absent on the result, hence the guard.
  if (result.chunks) {
    console.log(`Total chunks: ${result.chunks.length}`);

    result.chunks.slice(0, 3).forEach((chunk, i) => {
      // Each chunk's content is prefixed with its heading breadcrumb
      console.log(`\nChunk ${i}: ${chunk.content.substring(0, 100)}...`);
    });
  }
}
|
||||
|
||||
// Start both examples in declaration order without awaiting either one;
// a rejection from either is reported through console.error.
for (const runExample of [extractWithChunking, extractWithPrependHeadingContext]) {
  runExample().catch((error: unknown) => console.error(error));
}
|
||||
Reference in New Issue
Block a user