This commit is contained in:
20
docs/snippets/wasm/api/batch_extract_bytes_sync.md
Normal file
20
docs/snippets/wasm/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```typescript title="WASM"
|
||||
// WASM has no batch helper; await extractBytes for each input (in parallel via Promise.all).
import init, { extractBytes } from "kreuzberg-wasm";

// Initialize the WASM module before the first extractBytes call.
await init();

const urls = ["document1.pdf", "document2.pdf"];

// Each URL is downloaded and extracted independently; Promise.all rejects as
// soon as any single download or extraction fails.
const results = await Promise.all(
  urls.map(async (url) => {
    const resp = await fetch(url);
    // fetch() resolves on HTTP error statuses too, so fail loudly here instead
    // of handing an error page to the PDF extractor.
    if (!resp.ok) {
      throw new Error(`Failed to fetch ${url}: ${resp.status} ${resp.statusText}`);
    }
    const bytes = new Uint8Array(await resp.arrayBuffer());
    return extractBytes(bytes, "application/pdf", undefined);
  }),
);

// Results come back in the same order as `urls`.
results.forEach((result, i) => {
  console.log(`Document ${i + 1}: ${result.content.length} characters`);
});
|
||||
```
|
||||
20
docs/snippets/wasm/api/batch_extract_files_sync.md
Normal file
20
docs/snippets/wasm/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```typescript title="WASM"
|
||||
// WASM has no batch helper; call extractBytes once per selected file and join the calls with Promise.all.
import init, { extractBytes } from "kreuzberg-wasm";

await init();

// Snapshot the <input id="files"> selection as a plain array (empty when nothing is selected).
const picker = document.getElementById("files") as HTMLInputElement;
const selected = Array.from(picker.files ?? []);

// Kick off one extraction per file; the async callbacks all start immediately.
const pending = selected.map(async (file) => {
  const buffer = await file.arrayBuffer();
  // Fall back to PDF when the browser reports no MIME type for the file.
  const mimeType = file.type || "application/pdf";
  return extractBytes(new Uint8Array(buffer), mimeType, undefined);
});
const results = await Promise.all(pending);

for (const [index, extraction] of results.entries()) {
  console.log(`File ${index + 1}: ${extraction.content.length} characters`);
}
|
||||
```
|
||||
28
docs/snippets/wasm/api/client_chunk_text.md
Normal file
28
docs/snippets/wasm/api/client_chunk_text.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```typescript title="WASM"
|
||||
// HTTP client approach for chunking text via the REST API
// Useful in browsers where WASM extraction is called server-side

const text = "This is a long document that needs to be split into semantic chunks.";

// POST the text and the chunking settings as a JSON body.
const response = await fetch("http://localhost:8000/chunk", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    text,
    chunker_type: "text",
    config: {
      chunking: {
        strategy: "semantic",
        max_chunk_size: 512,
        overlap: 50,
      },
    },
  }),
});

// fetch() resolves on HTTP error statuses too. Without this guard an error
// response would be reported as "Created 0 chunks" instead of a failure.
if (!response.ok) {
  throw new Error(`Chunk request failed: ${response.status} ${response.statusText}`);
}

const result = await response.json();
console.log(`Created ${result.chunks?.length ?? 0} chunks`);
// Print a 50-character preview of every returned chunk.
result.chunks?.forEach((chunk) => {
  console.log(`Chunk: ${chunk.content.substring(0, 50)}...`);
});
|
||||
```
|
||||
22
docs/snippets/wasm/api/client_extract_single_file.md
Normal file
22
docs/snippets/wasm/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```typescript title="WASM"
|
||||
// HTTP client approach for file extraction via multipart upload
// Use this when uploading files from a browser form

const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];

// Nothing to upload when the user has not selected a file.
if (file) {
  // Multipart form: the file itself plus its MIME type (generic binary fallback).
  const formData = new FormData();
  formData.append("file", file);
  formData.append("mime_type", file.type || "application/octet-stream");

  const response = await fetch("http://localhost:8000/extract/file", {
    method: "POST",
    body: formData,
  });

  // fetch() resolves on HTTP error statuses too. Without this guard an error
  // body would surface as an unrelated TypeError on `result.content.length`.
  if (!response.ok) {
    throw new Error(`Extraction request failed: ${response.status} ${response.statusText}`);
  }

  const result = await response.json();
  console.log(`Extracted ${result.content.length} characters`);
  console.log(`Title: ${result.metadata?.title ?? "Unknown"}`);
}
|
||||
```
|
||||
69
docs/snippets/wasm/api/combining_all_features.md
Normal file
69
docs/snippets/wasm/api/combining_all_features.md
Normal file
@@ -0,0 +1,69 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const picker = document.getElementById("file") as HTMLInputElement;
const selected = picker.files?.[0];

if (selected) {
  const payload = new Uint8Array(await selected.arrayBuffer());

  // Chunking configuration
  const chunking = {
    strategy: "semantic",
    max_chunk_size: 1024,
    overlap: 100,
  };

  // Image extraction configuration
  const images = {
    extract_images: true,
    extract_base64: false,
    extract_raw_bytes: false,
  };

  // OCR configuration
  const ocr = {
    backend: "tesseract",
    languages: ["eng"],
    enabled: true,
  };

  // PDF-specific options
  const pdfOptions = {
    ocr_strategy: "auto",
    preserve_images: true,
  };

  // Security limits
  const securityLimits = {
    max_archive_size: 524288000,
    max_file_count: 10000,
    max_compression_ratio: 100,
  };

  // Assemble the full extraction config from the sections above.
  const config = {
    use_cache: true,
    enable_quality_processing: true,
    output_format: "markdown",
    include_document_structure: true,
    chunking,
    images,
    ocr,
    // HTML-specific extraction options
    // NOTE(review): this is a plain string while the other *_options are objects — confirm against the config schema.
    html_options: "article, main, .content",
    pdf_options: pdfOptions,
    security_limits: securityLimits,
  };

  try {
    // Fall back to a generic binary MIME type when the browser reports none.
    const extraction = await extractBytes(
      payload,
      selected.type || "application/octet-stream",
      config,
    );

    const preview = extraction.content.substring(0, 100);
    console.log(`Content: ${preview}...`);
    console.log(`Language: ${extraction.metadata?.language ?? "Unknown"}`);
    console.log(`Chunks: ${extraction.chunks?.length ?? 0}`);

    // Only report images when at least one was extracted.
    const imageCount = extraction.images?.length;
    if (imageCount) {
      console.log(`Images: ${imageCount}`);
    }
  } catch (failure) {
    console.error("Extraction failed:", failure);
  }
}
|
||||
```
|
||||
19
docs/snippets/wasm/api/error_handling.md
Normal file
19
docs/snippets/wasm/api/error_handling.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const picker = document.getElementById("file") as HTMLInputElement;
const selected = picker.files?.[0];

if (selected) {
  try {
    // Both reading the file and extracting it are covered by the try block.
    const buffer = await selected.arrayBuffer();
    const mimeType = selected.type || "application/pdf";
    const extraction = await extractBytes(new Uint8Array(buffer), mimeType, undefined);
    console.log(`Extracted: ${extraction.content.length} characters`);
  } catch (failure) {
    // Thrown values need not be Error instances; stringify anything else.
    let message: string;
    if (failure instanceof Error) {
      message = failure.message;
    } else {
      message = String(failure);
    }
    console.error("Extraction failed:", message);
  }
}
|
||||
```
|
||||
40
docs/snippets/wasm/api/error_handling_extract.md
Normal file
40
docs/snippets/wasm/api/error_handling_extract.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";

await init();

// Note: WASM has no native batch API; use Promise.all with per-item error handling
const files = document.getElementById("files") as HTMLInputElement;
// `files.files` is null when the input holds no FileList; treat that as "no files".
const fileList = Array.from(files.files ?? []);

// Extract multiple files concurrently (simulated batch).
// Each promise resolves to a tagged outcome instead of rejecting, so one bad
// file cannot reject the whole Promise.all below.
const extractionPromises = fileList.map(async (file) => {
  try {
    const bytes = new Uint8Array(await file.arrayBuffer());
    const result = await extractBytes(bytes, file.type || "application/octet-stream", undefined);
    // `as const` keeps the literal `true`, making `success` a real discriminant.
    return { file: file.name, success: true as const, result };
  } catch (err) {
    return {
      file: file.name,
      success: false as const,
      error: err instanceof Error ? err.message : String(err),
    };
  }
});

const results = await Promise.all(extractionPromises);

// Process results with per-item error handling.
// Checking `success` narrows the union, so `result` / `error` are known to be present.
results.forEach((item) => {
  if (item.success) {
    console.log(`✓ ${item.file}: ${item.result.content.length} characters`);
  } else {
    console.error(`✗ ${item.file}: ${item.error}`);
  }
});

// Summary
const succeeded = results.filter((r) => r.success).length;
const failed = results.length - succeeded;
console.log(`Extracted ${succeeded}/${results.length} files (${failed} errors)`);
|
||||
```
|
||||
17
docs/snippets/wasm/api/error_handling_wasm.md
Normal file
17
docs/snippets/wasm/api/error_handling_wasm.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const response = await fetch("document.pdf");
const data = new Uint8Array(await response.arrayBuffer());

try {
  const result = await extractBytes(data, "application/pdf", undefined);
  console.log(`Success: ${result.content.length} characters`);
} catch (error) {
  // A thrown value is not guaranteed to be an Error instance. Report every
  // failure rather than silently dropping the ones that are not.
  const message = error instanceof Error ? error.message : String(error);
  console.error("Extraction error:", message);
}
|
||||
```
|
||||
12
docs/snippets/wasm/api/extract_bytes_async.md
Normal file
12
docs/snippets/wasm/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";

await init();

// Download the document and view its body as raw bytes.
const download = await fetch("document.pdf");
const buffer = await download.arrayBuffer();
const pdfBytes = new Uint8Array(buffer);

const extraction = await extractBytes(pdfBytes, "application/pdf", undefined);

const characterCount = extraction.content.length;
console.log(`Extracted: ${characterCount} characters`);
console.log(`Metadata: ${JSON.stringify(extraction.metadata)}`);
|
||||
```
|
||||
12
docs/snippets/wasm/api/extract_bytes_sync.md
Normal file
12
docs/snippets/wasm/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```typescript title="WASM"
|
||||
// The WASM build offers only the async extractBytes; await it from any async context.
import init, { extractBytes } from "kreuzberg-wasm";

await init();

// Fetch the PDF and wrap the response body in a Uint8Array.
const download = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await download.arrayBuffer());

const extraction = await extractBytes(pdfBytes, "application/pdf", undefined);
console.log(extraction.content);
|
||||
```
|
||||
14
docs/snippets/wasm/api/extract_file_async.md
Normal file
14
docs/snippets/wasm/api/extract_file_async.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const picker = document.getElementById("file") as HTMLInputElement;
const selected = picker.files?.[0];

// Only extract when the user has actually picked a file.
if (selected) {
  const buffer = await selected.arrayBuffer();
  // Fall back to PDF when the browser reports no MIME type.
  const mimeType = selected.type || "application/pdf";
  const extraction = await extractBytes(new Uint8Array(buffer), mimeType, undefined);

  console.log(`Content length: ${extraction.content.length} characters`);
  console.log(`Tables: ${extraction.tables?.length ?? 0}`);
}
|
||||
```
|
||||
15
docs/snippets/wasm/api/extract_file_sync.md
Normal file
15
docs/snippets/wasm/api/extract_file_sync.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```typescript title="WASM"
|
||||
// The WASM build has no synchronous extraction: load the file's bytes, then await extractBytes.
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const chooser = document.getElementById("file") as HTMLInputElement;
const chosenFile = chooser.files?.[0];

if (chosenFile) {
  const fileBytes = new Uint8Array(await chosenFile.arrayBuffer());
  // Use the browser-reported MIME type, defaulting to PDF when it is empty.
  const extraction = await extractBytes(
    fileBytes,
    chosenFile.type || "application/pdf",
    undefined,
  );

  console.log(extraction.content);
  const tableCount = extraction.tables?.length ?? 0;
  console.log(`Tables: ${tableCount}`);
}
|
||||
```
|
||||
21
docs/snippets/wasm/api/tables.md
Normal file
21
docs/snippets/wasm/api/tables.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const picker = document.getElementById("file") as HTMLInputElement;
const selected = picker.files?.[0];

if (selected) {
  const payload = new Uint8Array(await selected.arrayBuffer());
  const extraction = await extractBytes(payload, selected.type || "application/pdf", undefined);

  // A missing table list is treated the same as an empty one.
  for (const table of extraction.tables ?? []) {
    // `cells` is iterated as an array of rows, each row an array of cell values.
    const rows = table.cells ?? [];
    console.log(`Table with ${rows.length} rows`);

    // Print the markdown rendering first when one is present.
    if (table.markdown) {
      console.log(table.markdown);
    }

    for (const row of rows) {
      console.log(row.join(" | "));
    }
  }
}
|
||||
```
|
||||
Reference in New Issue
Block a user