This commit is contained in:
44
docs/snippets/wasm/advanced/chunk_page_mapping.md
Normal file
44
docs/snippets/wasm/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 1500,
|
||||
chunkOverlap: 300,
|
||||
},
|
||||
includeDocumentStructure: true,
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Map chunks back to pages for source attribution
|
||||
const chunkPageMap = new Map<number, number[]>();
|
||||
|
||||
result.chunks?.forEach((chunk, chunkIndex) => {
|
||||
const firstPage = chunk.metadata?.firstPage;
|
||||
const lastPage = chunk.metadata?.lastPage;
|
||||
|
||||
if (firstPage !== undefined && lastPage !== undefined) {
|
||||
for (let page = firstPage; page <= lastPage; page++) {
|
||||
if (!chunkPageMap.has(page)) {
|
||||
chunkPageMap.set(page, []);
|
||||
}
|
||||
chunkPageMap.get(page)!.push(chunkIndex);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Use the mapping for source attribution
|
||||
chunkPageMap.forEach((chunkIndices, pageNum) => {
|
||||
console.log(`Page ${pageNum}: Chunks ${chunkIndices.join(", ")}`);
|
||||
chunkIndices.forEach((idx) => {
|
||||
const chunk = result.chunks![idx];
|
||||
console.log(` Content: "${chunk.content.substring(0, 60)}..."`);
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
**Snippet:syntax-only** - Requires document structure parsing during extraction.
|
||||
Reference in New Issue
Block a user