37 lines
1.1 KiB
Markdown
37 lines
1.1 KiB
Markdown
|
|
```typescript title="WASM"
|
||
|
|
import init, { extractBytes } from "kreuzberg-wasm";
|
||
|
|
import { PageConfig, ExtractionConfig } from "kreuzberg-wasm";
|
||
|
|
|
||
|
|
await init();
|
||
|
|
|
||
|
|
const fileBuffer = new Uint8Array(/* your file bytes */);
|
||
|
|
const mimeType = "application/pdf";
|
||
|
|
|
||
|
|
const config = new ExtractionConfig({
|
||
|
|
pages: new PageConfig({
|
||
|
|
extract_pages: true,
|
||
|
|
}),
|
||
|
|
});
|
||
|
|
|
||
|
|
const result = await extractBytes(fileBuffer, mimeType, config);
|
||
|
|
|
||
|
|
if (result.metadata && result.metadata.pages) {
|
||
|
|
const pageStructure = result.metadata.pages;
|
||
|
|
console.log(`Total pages: ${pageStructure.total_count}`);
|
||
|
|
|
||
|
|
if (pageStructure.boundaries) {
|
||
|
|
// Iterate through page boundaries to map content to pages
|
||
|
|
pageStructure.boundaries.forEach((boundary) => {
|
||
|
|
const pageText = result.content.substring(
|
||
|
|
boundary.byte_start,
|
||
|
|
Math.min(boundary.byte_end, boundary.byte_start + 100),
|
||
|
|
);
|
||
|
|
|
||
|
|
console.log(`Page ${boundary.page_number}:`);
|
||
|
|
console.log(` Byte range: ${boundary.byte_start}-${boundary.byte_end}`);
|
||
|
|
console.log(` Preview: ${pageText}...`);
|
||
|
|
});
|
||
|
|
}
|
||
|
|
}
|
||
|
|
```
|