This commit is contained in:
22
docs/snippets/wasm/ocr/cloud_ocr_backend.md
Normal file
22
docs/snippets/wasm/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,22 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();

// Cloud OCR backends cannot be used from WASM directly: the WASM module runs
// inside a sandbox and has no direct network access of its own.
// To reach a cloud OCR service, implement a wrapper on your server,
// or use a cloud platform that ships with built-in OCR integration.

const cloudOcrConfig = {
  ocr: {
    // Custom backends must be registered via native runtime
    backend: "custom",
    language: "eng",
  },
};

// The object above only illustrates the configuration structure.
// In production, route cloud OCR requests through your backend service.
|
||||
```
|
||||
27
docs/snippets/wasm/ocr/enable-ocr.ts
Normal file
27
docs/snippets/wasm/ocr/enable-ocr.ts
Normal file
@@ -0,0 +1,27 @@
|
||||
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Initializes the WASM module, enables OCR, and extracts text from a scanned PNG.
 *
 * If OCR cannot be enabled, the failure is logged and the function returns
 * without attempting extraction. A non-2xx response for the image is raised as
 * an error instead of being handed to the extractor as if it were image data.
 *
 * @throws Error when the image cannot be downloaded; extraction errors propagate.
 */
async function extractWithOcr(): Promise<void> {
  await initWasm();

  try {
    await enableOcr();
    console.log("OCR enabled successfully");
  } catch (error) {
    console.error("Failed to enable OCR:", error);
    return;
  }

  const response = await fetch("scanned-page.png");
  if (!response.ok) {
    // fetch() only rejects on network failure; an HTTP error still resolves,
    // so the status has to be checked before the body is treated as a PNG.
    throw new Error(`Failed to fetch scanned-page.png: ${response.status} ${response.statusText}`);
  }
  const bytes = new Uint8Array(await response.arrayBuffer());

  const result = await extractBytes(bytes, "image/png", {
    ocr: {
      backend: "tesseract-wasm",
      language: "eng",
    },
  });

  console.log("Extracted text:");
  console.log(result.content);
}

extractWithOcr().catch(console.error);
|
||||
15
docs/snippets/wasm/ocr/image_extraction.md
Normal file
15
docs/snippets/wasm/ocr/image_extraction.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();

// Download the PDF and hand it to extractBytes as a Uint8Array rather than a
// bare ArrayBuffer.
const response = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await response.arrayBuffer());

// Request extraction of the images embedded in the document.
const result = await extractBytes(pdfBytes, "application/pdf", {
  images: {
    extract_images: true,
  },
});

console.log(result.images);
|
||||
```
|
||||
17
docs/snippets/wasm/ocr/image_preprocessing.md
Normal file
17
docs/snippets/wasm/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();

// Download the PDF and hand it to extractBytes as a Uint8Array rather than a
// bare ArrayBuffer.
const response = await fetch("document.pdf");
const documentData = new Uint8Array(await response.arrayBuffer());

// Image options: extract images, with a target DPI and a cap on image dimension.
const result = await extractBytes(documentData, "application/pdf", {
  images: {
    extract_images: true,
    target_dpi: 300,
    max_image_dimension: 2000,
  },
});

console.log(result.content);
|
||||
```
|
||||
29
docs/snippets/wasm/ocr/multi-language-ocr.ts
Normal file
29
docs/snippets/wasm/ocr/multi-language-ocr.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Runs OCR over a fixed list of PNG files, one after another, using the
 * language code paired with each file, and logs the extracted text.
 */
async function extractMultilingualDocument() {
  await initWasm();
  await enableOcr();

  // Each entry pairs an image path with the OCR language code to use for it.
  const documents = [
    { name: "english.png", lang: "eng" },
    { name: "german.png", lang: "deu" },
    { name: "spanish.png", lang: "spa" },
  ];

  for (const { name, lang } of documents) {
    const response = await fetch(name);
    const bytes = new Uint8Array(await response.arrayBuffer());

    const result = await extractBytes(bytes, "image/png", {
      ocr: {
        backend: "tesseract-wasm",
        language: lang,
      },
    });

    console.log(`${name} (${lang}):`);
    console.log(result.content);
    console.log("---");
  }
}

extractMultilingualDocument().catch(console.error);
|
||||
30
docs/snippets/wasm/ocr/ocr-backend-registration.ts
Normal file
30
docs/snippets/wasm/ocr/ocr-backend-registration.ts
Normal file
@@ -0,0 +1,30 @@
|
||||
import {
|
||||
initWasm,
|
||||
listOcrBackends,
|
||||
registerOcrBackend,
|
||||
TesseractWasmBackend,
|
||||
unregisterOcrBackend,
|
||||
} from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Walks through the OCR backend registry: registers an initialized
 * TesseractWasmBackend, lists the registered backends, then unregisters
 * "tesseract-wasm" and lists them again.
 */
async function manageOcrBackends() {
  await initWasm();

  // The backend is initialized before it is handed to the registry.
  const tesseract = new TesseractWasmBackend();
  await tesseract.initialize();
  registerOcrBackend(tesseract);

  const registered = listOcrBackends();
  console.log("Available OCR backends:", registered);

  const hasTesseract = registered.includes("tesseract-wasm");
  if (hasTesseract) {
    console.log("Tesseract WASM backend is registered");
  }

  unregisterOcrBackend("tesseract-wasm");

  const remaining = listOcrBackends();
  console.log("Backends after unregister:", remaining);
}

manageOcrBackends().catch(console.error);
|
||||
37
docs/snippets/wasm/ocr/ocr-error-handling.ts
Normal file
37
docs/snippets/wasm/ocr/ocr-error-handling.ts
Normal file
@@ -0,0 +1,37 @@
|
||||
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Extracts text from a PNG with error handling at each stage.
 *
 * - WASM initialization failure: logged, then the function returns.
 * - OCR enable failure whose message mentions "browser": a warning is logged
 *   and extraction continues without an OCR configuration. Any other failure
 *   is rethrown.
 * - Download or extraction failure: logged.
 */
async function extractWithErrorHandling(): Promise<void> {
  try {
    await initWasm();
  } catch (error) {
    console.error("Failed to initialize WASM:", error);
    return;
  }

  // Records whether enableOcr() succeeded, so the extraction call below only
  // asks for an OCR backend when one was actually enabled.
  let ocrEnabled = false;
  try {
    await enableOcr();
    ocrEnabled = true;
  } catch (error) {
    if (error instanceof Error && error.message.includes("browser")) {
      console.warn("OCR not available in this environment, proceeding without OCR");
    } else {
      throw error;
    }
  }

  try {
    const response = await fetch("document.png");
    const bytes = new Uint8Array(await response.arrayBuffer());

    const result = await extractBytes(
      bytes,
      "image/png",
      // "Proceeding without OCR" must not still request the OCR backend.
      ocrEnabled
        ? {
            ocr: {
              backend: "tesseract-wasm",
              language: "eng",
            },
          }
        : {},
    );

    console.log("Extraction successful:", result.content.length, "chars");
  } catch (error) {
    console.error("Extraction failed:", error);
  }
}

extractWithErrorHandling().catch(console.error);
|
||||
20
docs/snippets/wasm/ocr/ocr_easyocr.md
Normal file
20
docs/snippets/wasm/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();

// The EasyOCR backend is not supported in WASM: it requires PyTorch and a
// Python runtime, which are unavailable in browser/WASM.
// Use the Tesseract-WASM backend instead, or route requests through a backend service.

const easyOcrConfig = {
  ocr: {
    // Not supported in WASM
    backend: "easyocr",
    language: "en",
  },
};

// Shown for reference only, to illustrate the configuration structure.
|
||||
```
|
||||
24
docs/snippets/wasm/ocr/ocr_elements.md
Normal file
24
docs/snippets/wasm/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();

// Download the scanned PDF and hand it to extractBytes as a Uint8Array rather
// than a bare ArrayBuffer.
const response = await fetch("scanned.pdf");
const documentData = new Uint8Array(await response.arrayBuffer());

const result = await extractBytes(documentData, "application/pdf", {
  ocr: {
    backend: "tesseract",
    language: "eng",
    element_config: {
      include_elements: true,
    },
  },
});

// ocr_elements is only iterated when present on the result.
if (result.ocr_elements) {
  for (const element of result.ocr_elements) {
    console.log("Text:", element.text);
    console.log("Confidence:", element.confidence);
  }
}
|
||||
```
|
||||
34
docs/snippets/wasm/ocr/ocr_extraction.md
Normal file
34
docs/snippets/wasm/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```typescript title="WASM (Browser)"
|
||||
import { enableOcr, extractFromFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
await enableOcr();

// Take the first file selected in the element with id "file", if any.
const selectedFile = (document.getElementById("file") as HTMLInputElement).files?.[0];

if (selectedFile) {
  const { content } = await extractFromFile(selectedFile, selectedFile.type, {
    ocr: {
      backend: "kreuzberg-tesseract",
      language: "eng",
    },
  });
  console.log(content);
}
|
||||
```
|
||||
|
||||
```typescript title="WASM (Node.js / Deno / Bun)"
|
||||
import { enableOcr, extractFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
await enableOcr(); // Uses native kreuzberg-tesseract backend

// Extract directly from a file path, with the MIME type given explicitly.
const { content } = await extractFile("./scanned_document.png", "image/png", {
  ocr: {
    backend: "kreuzberg-tesseract",
    language: "eng",
  },
});
console.log(content);
|
||||
```
|
||||
17
docs/snippets/wasm/ocr/ocr_force_all_pages.md
Normal file
17
docs/snippets/wasm/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();

// Download the PDF and hand it to extractBytes as a Uint8Array rather than a
// bare ArrayBuffer.
const response = await fetch("document.pdf");
const documentData = new Uint8Array(await response.arrayBuffer());

// force_ocr is set alongside the OCR backend configuration.
const result = await extractBytes(documentData, "application/pdf", {
  force_ocr: true,
  ocr: {
    backend: "tesseract",
    language: "eng",
  },
});

console.log(result.content);
|
||||
```
|
||||
16
docs/snippets/wasm/ocr/ocr_multi_language.md
Normal file
16
docs/snippets/wasm/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();

// Download the PDF and hand it to extractBytes as a Uint8Array rather than a
// bare ArrayBuffer.
const response = await fetch("multilingual.pdf");
const multilingualData = new Uint8Array(await response.arrayBuffer());

// Several language codes are combined into one string with "+".
const result = await extractBytes(multilingualData, "application/pdf", {
  ocr: {
    backend: "tesseract",
    language: "eng+deu+fra",
  },
});

console.log(result.content);
|
||||
```
|
||||
20
docs/snippets/wasm/ocr/ocr_paddleocr.md
Normal file
20
docs/snippets/wasm/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();

// The PaddleOCR backend is not supported in WASM: it requires ONNX Runtime and
// native C++ dependencies that are unavailable in browser/WASM.
// Use the Tesseract-WASM backend instead, or implement a backend wrapper service.

const paddleOcrConfig = {
  ocr: {
    // Not supported in WASM
    backend: "paddleocr",
    language: "en",
  },
};

// Shown for reference only, to illustrate the configuration structure.
|
||||
```
|
||||
32
docs/snippets/wasm/ocr/progress-tracking.ts
Normal file
32
docs/snippets/wasm/ocr/progress-tracking.ts
Normal file
@@ -0,0 +1,32 @@
|
||||
import { extractBytes, initWasm, registerOcrBackend, TesseractWasmBackend } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Runs OCR on a PNG with a TesseractWasmBackend whose progress callback
 * mirrors the reported value into the element with id "progress".
 */
async function extractWithProgressTracking() {
  await initWasm();

  const backend = new TesseractWasmBackend();

  // The element is looked up on every callback, so a missing element simply
  // skips that update.
  backend.setProgressCallback((progress: number) => {
    const progressBar = document.getElementById("progress");
    if (!progressBar) {
      return;
    }
    const label = `${progress}%`;
    progressBar.style.width = label;
    progressBar.textContent = label;
  });

  await backend.initialize();
  registerOcrBackend(backend);

  const response = await fetch("document.png");
  const bytes = new Uint8Array(await response.arrayBuffer());

  const result = await extractBytes(bytes, "image/png", {
    ocr: {
      backend: "tesseract-wasm",
      language: "eng",
    },
  });

  console.log("OCR complete");
  console.log(result.content);
}

extractWithProgressTracking().catch(console.error);
|
||||
Reference in New Issue
Block a user