This commit is contained in:
36
docs/snippets/wasm/metadata/page_boundaries.md
Normal file
36
docs/snippets/wasm/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```typescript title="WASM"
|
||||
// Example: print the total page count and a short text preview of every page
// found in a document extracted with the kreuzberg WASM bindings.
import init, { extractBytes, ExtractionConfig, PageConfig } from "kreuzberg-wasm";

// Upper bound, in bytes, on the amount of page text shown in each preview.
const PREVIEW_BYTES = 100;

// Initialise the WASM module before calling extractBytes.
await init();

const fileBuffer = new Uint8Array(/* your file bytes */);
const mimeType = "application/pdf";

// Request page extraction; the resulting page metadata is read below.
const config = new ExtractionConfig({
  pages: new PageConfig({
    extract_pages: true,
  }),
});

const result = await extractBytes(fileBuffer, mimeType, config);

const pageStructure = result.metadata?.pages;
if (pageStructure) {
  console.log(`Total pages: ${pageStructure.total_count}`);

  if (pageStructure.boundaries) {
    // byte_start/byte_end are named as byte offsets, whereas
    // String.prototype.substring counts UTF-16 code units. Slicing the UTF-8
    // encoding keeps previews aligned when the content contains non-ASCII text.
    // NOTE(review): assumes the offsets index the UTF-8 encoding of
    // result.content — confirm against the library documentation.
    const contentBytes = new TextEncoder().encode(result.content);
    const decoder = new TextDecoder();

    // Iterate through page boundaries to map content to pages
    for (const boundary of pageStructure.boundaries) {
      let previewEnd = Math.min(boundary.byte_end, boundary.byte_start + PREVIEW_BYTES);

      // If truncation landed on a UTF-8 continuation byte (0b10xxxxxx), back up
      // to the start of that character so the preview does not end in U+FFFD.
      while (previewEnd > boundary.byte_start && (contentBytes[previewEnd] & 0xc0) === 0x80) {
        previewEnd -= 1;
      }

      const pageText = decoder.decode(contentBytes.subarray(boundary.byte_start, previewEnd));

      console.log(`Page ${boundary.page_number}:`);
      console.log(` Byte range: ${boundary.byte_start}-${boundary.byte_end}`);
      console.log(` Preview: ${pageText}...`);
    }
  }
}
|
||||
```
|
||||
Reference in New Issue
Block a user