Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
<!-- snippet:syntax-only -->
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Cloud OCR backends cannot be used directly from WASM: the module runs in a
// sandboxed environment without direct network access. To reach a cloud OCR
// service, implement a wrapper on your server or use a cloud platform with
// built-in OCR integration.
const cloudOcrConfig = {
  ocr: {
    // Custom backends must be registered via the native runtime.
    backend: "custom",
    language: "eng",
  },
};

// The object above only illustrates the configuration structure.
// In production, route cloud OCR requests through your backend service.
```

View File

@@ -0,0 +1,27 @@
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Enables OCR and prints the text recognised in "scanned-page.png".
 *
 * If OCR cannot be enabled the failure is logged and the function returns
 * without attempting any extraction.
 */
async function extractWithOcr() {
  await initWasm();

  try {
    await enableOcr();
    console.log("OCR enabled successfully");
  } catch (error) {
    console.error("Failed to enable OCR:", error);
    return;
  }

  // Download the image and hand its bytes to the extractor as a Uint8Array.
  const response = await fetch("scanned-page.png");
  const buffer = await response.arrayBuffer();
  const imageBytes = new Uint8Array(buffer);

  const extraction = await extractBytes(imageBytes, "image/png", {
    ocr: { backend: "tesseract-wasm", language: "eng" },
  });

  console.log("Extracted text:");
  console.log(extraction.content);
}

extractWithOcr().catch(console.error);

View File

@@ -0,0 +1,15 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Download the PDF and stop on an HTTP error so that an error page is never
// handed to the extractor as if it were the document.
const response = await fetch("document.pdf");
if (!response.ok) {
  throw new Error(`Failed to fetch document.pdf: HTTP ${response.status}`);
}

// Wrap the downloaded buffer in a Uint8Array: extractBytes is called with a
// byte array, not a raw ArrayBuffer.
const imageData = new Uint8Array(await response.arrayBuffer());

// Request that embedded images are extracted alongside the text.
const result = await extractBytes(imageData, "application/pdf", {
  images: {
    extract_images: true,
  },
});
console.log(result.images);
```

View File

@@ -0,0 +1,17 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Download the PDF and stop on an HTTP error so that an error page is never
// handed to the extractor as if it were the document.
const response = await fetch("document.pdf");
if (!response.ok) {
  throw new Error(`Failed to fetch document.pdf: HTTP ${response.status}`);
}

// Wrap the downloaded buffer in a Uint8Array: extractBytes is called with a
// byte array, not a raw ArrayBuffer.
const documentData = new Uint8Array(await response.arrayBuffer());

// Image extraction options: enable extraction and set the DPI target and the
// maximum image dimension used for the extracted images.
const result = await extractBytes(documentData, "application/pdf", {
  images: {
    extract_images: true,
    target_dpi: 300,
    max_image_dimension: 2000,
  },
});
console.log(result.content);
```

View File

@@ -0,0 +1,29 @@
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Runs OCR over a fixed list of PNG images, each with its own language code,
 * and prints the recognised text for every image in turn.
 *
 * Images are processed one after another, in list order.
 */
async function extractMultilingualDocument() {
  await initWasm();
  await enableOcr();

  // Each entry pairs an image file with the OCR language code used for it.
  const documents = [
    { name: "english.png", lang: "eng" },
    { name: "german.png", lang: "deu" },
    { name: "spanish.png", lang: "spa" },
  ];

  for (const { name, lang } of documents) {
    const response = await fetch(name);
    const imageBytes = new Uint8Array(await response.arrayBuffer());

    const extraction = await extractBytes(imageBytes, "image/png", {
      ocr: { backend: "tesseract-wasm", language: lang },
    });

    console.log(`${name} (${lang}):`);
    console.log(extraction.content);
    console.log("---");
  }
}

extractMultilingualDocument().catch(console.error);

View File

@@ -0,0 +1,30 @@
import {
initWasm,
listOcrBackends,
registerOcrBackend,
TesseractWasmBackend,
unregisterOcrBackend,
} from "@kreuzberg/wasm";
/**
 * Demonstrates the OCR backend registry: registers a Tesseract WASM backend,
 * lists the registered backends, then unregisters it and lists them again.
 */
async function manageOcrBackends() {
  // Name under which the backend is looked up and unregistered below.
  const backendName = "tesseract-wasm";

  await initWasm();

  // The backend is initialized before it is registered.
  const tesseract = new TesseractWasmBackend();
  await tesseract.initialize();
  registerOcrBackend(tesseract);

  const registered = listOcrBackends();
  console.log("Available OCR backends:", registered);
  if (registered.includes(backendName)) {
    console.log("Tesseract WASM backend is registered");
  }

  unregisterOcrBackend(backendName);
  const remaining = listOcrBackends();
  console.log("Backends after unregister:", remaining);
}

manageOcrBackends().catch(console.error);

View File

@@ -0,0 +1,37 @@
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts text from "document.png" with explicit handling for each failure
 * stage.
 *
 * - A WASM initialization failure is logged and ends the run.
 * - An `enableOcr` failure whose message mentions "browser" is treated as
 *   "OCR unavailable": a warning is logged and extraction continues without an
 *   OCR configuration. Any other `enableOcr` failure is rethrown to the caller.
 * - A download or extraction failure is logged.
 */
async function extractWithErrorHandling() {
  try {
    await initWasm();
  } catch (error) {
    console.error("Failed to initialize WASM:", error);
    return;
  }

  // Track whether OCR actually started, so the extraction below does not
  // request a backend after announcing that it proceeds without OCR.
  let ocrAvailable = false;
  try {
    await enableOcr();
    ocrAvailable = true;
  } catch (error) {
    if (error instanceof Error && error.message.includes("browser")) {
      console.warn("OCR not available in this environment, proceeding without OCR");
    } else {
      throw error;
    }
  }

  try {
    const response = await fetch("document.png");
    if (!response.ok) {
      // fetch resolves on HTTP error statuses; reject them here so an error
      // page is not passed to the extractor as image data.
      throw new Error(`Failed to fetch document.png: HTTP ${response.status}`);
    }
    const bytes = new Uint8Array(await response.arrayBuffer());

    // Only ask for the OCR backend when it was enabled successfully.
    const result = await extractBytes(
      bytes,
      "image/png",
      ocrAvailable ? { ocr: { backend: "tesseract-wasm", language: "eng" } } : {},
    );
    console.log("Extraction successful:", result.content.length, "chars");
  } catch (error) {
    console.error("Extraction failed:", error);
  }
}

extractWithErrorHandling().catch(console.error);

View File

@@ -0,0 +1,20 @@
<!-- snippet:syntax-only -->
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// The EasyOCR backend is not supported in WASM: it requires PyTorch and a
// Python runtime, neither of which is available in browser/WASM.
// Use the Tesseract-WASM backend instead, or route requests through a backend
// service.
const easyOcrConfig = {
  ocr: {
    // Not supported in WASM.
    backend: "easyocr",
    language: "en",
  },
};

// The object above shows the configuration structure for reference only.
```

View File

@@ -0,0 +1,24 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Download the PDF and stop on an HTTP error so that an error page is never
// handed to the extractor as if it were the document.
const response = await fetch("scanned.pdf");
if (!response.ok) {
  throw new Error(`Failed to fetch scanned.pdf: HTTP ${response.status}`);
}

// Wrap the downloaded buffer in a Uint8Array: extractBytes is called with a
// byte array, not a raw ArrayBuffer.
const documentData = new Uint8Array(await response.arrayBuffer());

// Ask the OCR backend to include per-element results in the output.
const result = await extractBytes(documentData, "application/pdf", {
  ocr: {
    backend: "tesseract",
    language: "eng",
    element_config: {
      include_elements: true,
    },
  },
});

// ocr_elements is checked before iterating because it may be absent.
if (result.ocr_elements) {
  for (const element of result.ocr_elements) {
    console.log("Text:", element.text);
    console.log("Confidence:", element.confidence);
  }
}
```

View File

@@ -0,0 +1,34 @@
```typescript title="WASM (Browser)"
import { enableOcr, extractFromFile, initWasm } from "@kreuzberg/wasm";
await initWasm();
await enableOcr();

// Take the first file chosen in the element with id "file", if there is one.
const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];

// Nothing is extracted when no file has been selected.
if (file) {
  // The file's own reported type is passed as the MIME type.
  const result = await extractFromFile(file, file.type, {
    ocr: { backend: "kreuzberg-tesseract", language: "eng" },
  });
  console.log(result.content);
}
```
```typescript title="WASM (Node.js / Deno / Bun)"
import { enableOcr, extractFile, initWasm } from "@kreuzberg/wasm";
await initWasm();
// Uses native kreuzberg-tesseract backend
await enableOcr();

// Run OCR on a PNG read from the given path and print the recognised text.
const result = await extractFile("./scanned_document.png", "image/png", {
  ocr: { backend: "kreuzberg-tesseract", language: "eng" },
});
console.log(result.content);
```

View File

@@ -0,0 +1,17 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Download the PDF and stop on an HTTP error so that an error page is never
// handed to the extractor as if it were the document.
const response = await fetch("document.pdf");
if (!response.ok) {
  throw new Error(`Failed to fetch document.pdf: HTTP ${response.status}`);
}

// Wrap the downloaded buffer in a Uint8Array: extractBytes is called with a
// byte array, not a raw ArrayBuffer.
const documentData = new Uint8Array(await response.arrayBuffer());

// force_ocr is set alongside the OCR backend and language configuration.
const result = await extractBytes(documentData, "application/pdf", {
  force_ocr: true,
  ocr: {
    backend: "tesseract",
    language: "eng",
  },
});
console.log(result.content);
```

View File

@@ -0,0 +1,16 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Download the PDF and stop on an HTTP error so that an error page is never
// handed to the extractor as if it were the document.
const response = await fetch("multilingual.pdf");
if (!response.ok) {
  throw new Error(`Failed to fetch multilingual.pdf: HTTP ${response.status}`);
}

// Wrap the downloaded buffer in a Uint8Array: extractBytes is called with a
// byte array, not a raw ArrayBuffer.
const multilingualData = new Uint8Array(await response.arrayBuffer());

// Several OCR languages are requested at once by joining their codes with "+".
const result = await extractBytes(multilingualData, "application/pdf", {
  ocr: {
    backend: "tesseract",
    language: "eng+deu+fra",
  },
});
console.log(result.content);
```

View File

@@ -0,0 +1,20 @@
<!-- snippet:syntax-only -->
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// The PaddleOCR backend is not supported in WASM: it requires ONNX Runtime and
// native C++ dependencies that are unavailable in browser/WASM.
// Use the Tesseract-WASM backend instead, or implement a backend wrapper
// service.
const paddleOcrConfig = {
  ocr: {
    // Not supported in WASM.
    backend: "paddleocr",
    language: "en",
  },
};

// The object above shows the configuration structure for reference only.
```

View File

@@ -0,0 +1,32 @@
import { extractBytes, initWasm, registerOcrBackend, TesseractWasmBackend } from "@kreuzberg/wasm";
/**
 * Runs OCR on "document.png" with a Tesseract WASM backend and mirrors the
 * backend's progress reports into the element with id "progress".
 */
async function extractWithProgressTracking() {
  await initWasm();

  const tesseract = new TesseractWasmBackend();

  // The callback is installed before the backend is initialized. The element
  // is looked up on every report and the report is ignored if it is missing.
  tesseract.setProgressCallback((progress: number) => {
    const bar = document.getElementById("progress");
    if (!bar) {
      return;
    }
    const label = `${progress}%`;
    bar.style.width = label;
    bar.textContent = label;
  });

  await tesseract.initialize();
  registerOcrBackend(tesseract);

  const response = await fetch("document.png");
  const imageBytes = new Uint8Array(await response.arrayBuffer());

  const extraction = await extractBytes(imageBytes, "image/png", {
    ocr: { backend: "tesseract-wasm", language: "eng" },
  });

  console.log("OCR complete");
  console.log(extraction.content);
}

extractWithProgressTracking().catch(console.error);