30 lines
733 B
TypeScript
30 lines
733 B
TypeScript
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
|
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
|
|
async function extractWithConfig() {
|
|
await initWasm();
|
|
|
|
const bytes = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
|
|
|
const config: ExtractionConfig = {
|
|
ocr: {
|
|
backend: "tesseract-wasm",
|
|
language: "eng",
|
|
},
|
|
images: {
|
|
extractImages: true,
|
|
targetDpi: 200,
|
|
},
|
|
chunking: {
|
|
maxChars: 1000,
|
|
chunkOverlap: 100,
|
|
},
|
|
};
|
|
|
|
const result = await extractBytes(bytes, "application/pdf", config);
|
|
console.log("Extraction complete");
|
|
console.log("Content length:", result.content.length);
|
|
}
|
|
|
|
extractWithConfig().catch(console.error);
|