51 lines
1.1 KiB
TypeScript
51 lines
1.1 KiB
TypeScript
|
|
import type { ExtractionResult } from "@kreuzberg/wasm";
|
||
|
|
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||
|
|
|
||
|
|
/**
 * A single named stage that can be applied to an extraction result.
 *
 * Stages are asynchronous: each one receives an `ExtractionResult` and
 * resolves to the `ExtractionResult` that should be used from then on.
 */
interface ProcessingStep {
  /** Human-readable label for the stage (used in log output). */
  name: string;

  /**
   * Transforms an extraction result.
   *
   * @param result - The extraction result to work on.
   * @returns A promise for the (possibly unchanged) result.
   */
  process: (result: ExtractionResult) => Promise<ExtractionResult>;
}
|
||
|
|
|
||
|
|
/**
 * Extracts content from a raw document and runs it through a sequence of
 * post-processing steps.
 *
 * Despite the "create" in its name, this function executes the pipeline
 * immediately: it awaits `initWasm()`, extracts `bytes` with `extractBytes`,
 * and then feeds the result through every entry of `steps` one after another,
 * each step receiving the output of the previous one.
 *
 * `initWasm()` is awaited on every call.
 * NOTE(review): presumably repeated initialisation is cheap/idempotent in
 * `@kreuzberg/wasm` — confirm against the library documentation.
 *
 * @param steps - Stages to apply, in order. The array is only read, never
 *   mutated. An empty array yields the raw extraction result.
 * @param bytes - Raw document contents.
 * @param mimeType - MIME type passed to `extractBytes` alongside `bytes`.
 * @returns The result produced by the last step (or by `extractBytes` when
 *   `steps` is empty).
 * @throws Any rejection from `initWasm`, `extractBytes`, or a step's
 *   `process` propagates unchanged; steps after a failing one do not run.
 */
async function createExtractionPipeline(
  steps: readonly ProcessingStep[],
  bytes: Uint8Array,
  mimeType: string,
): Promise<ExtractionResult> {
  await initWasm();

  let result: ExtractionResult = await extractBytes(bytes, mimeType);

  // Steps run strictly sequentially because each one consumes the previous
  // step's output; they must not be parallelised.
  for (const step of steps) {
    console.log(`Executing step: ${step.name}`);
    result = await step.process(result);
  }

  return result;
}
|
||
|
|
|
||
|
|
/**
 * Example list of processing stages, in the order they are declared.
 *
 * Only "Text Normalization" changes anything; "Language Detection" and
 * "Chunking" currently resolve to their input unchanged.
 */
const pipeline: ProcessingStep[] = [
  {
    name: "Text Normalization",
    // Collapse every run of whitespace (including newlines) into one space
    // and strip leading/trailing whitespace; all other fields are copied.
    process: async (extracted) => {
      const normalized = extracted.content.replace(/\s+/g, " ").trim();
      return { ...extracted, content: normalized };
    },
  },
  {
    name: "Language Detection",
    // Pass-through: returns the input as-is.
    process: async (extracted) => extracted,
  },
  {
    name: "Chunking",
    // Pass-through: returns the input as-is.
    process: async (extracted) => extracted,
  },
];
|
||
|
|
|
||
|
|
// Script entry point: download "doc.pdf", run it through the pipeline, and
// log the outcome.
//
// Everything lives in one try/catch so that download failures are reported
// through the same console.error path as pipeline failures. Awaiting the
// fetch inside the call's argument list would let a network or body-read
// error escape as an unhandled module-evaluation failure, because the
// argument is evaluated before any promise chain exists to catch it.
try {
  const response = await fetch("doc.pdf");

  // fetch() only rejects on network errors; an HTTP error status still
  // resolves, and its body must not be handed to the extractor as a PDF.
  if (!response.ok) {
    throw new Error(
      `Failed to fetch doc.pdf: ${response.status} ${response.statusText}`,
    );
  }

  const bytes = new Uint8Array(await response.arrayBuffer());
  const result = await createExtractionPipeline(
    pipeline,
    bytes,
    "application/pdf",
  );
  console.log("Pipeline complete:", result);
} catch (error: unknown) {
  console.error(error);
}
|