Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,20 @@
```typescript title="WASM"
// WASM has no batch helper; call extractBytes once per input and run the calls
// in parallel via Promise.all.
import init, { extractBytes } from "kreuzberg-wasm";

// Initialise the WASM module before the first extractBytes call.
await init();

const urls = ["document1.pdf", "document2.pdf"];

const results = await Promise.all(
  urls.map(async (url) => {
    const resp = await fetch(url);
    // fetch() resolves even for 4xx/5xx responses; without this check an HTTP
    // error body would be handed to the extractor as if it were a PDF.
    if (!resp.ok) {
      throw new Error(`Failed to fetch ${url}: ${resp.status} ${resp.statusText}`);
    }
    const bytes = new Uint8Array(await resp.arrayBuffer());
    return extractBytes(bytes, "application/pdf", undefined);
  }),
);

// Results come back in the same order as `urls`.
results.forEach((result, i) => {
  console.log(`Document ${i + 1}: ${result.content.length} characters`);
});
```

View File

@@ -0,0 +1,20 @@
```typescript title="WASM"
// The WASM build ships no batch helper, so each selected file gets its own
// extractBytes call and Promise.all runs those calls concurrently.
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const picker = document.getElementById("files") as HTMLInputElement;
// `files` is null when nothing is attached to the input; treat that as "no files".
const selected = picker.files ? Array.from(picker.files) : [];

// Reads one file into memory and extracts it.
const extractOne = async (file: File) => {
  const buffer = await file.arrayBuffer();
  // file.type may be empty; default to PDF in that case.
  const mimeType = file.type || "application/pdf";
  return extractBytes(new Uint8Array(buffer), mimeType, undefined);
};

const results = await Promise.all(selected.map((file) => extractOne(file)));

for (const [index, extraction] of results.entries()) {
  console.log(`File ${index + 1}: ${extraction.content.length} characters`);
}
```

View File

@@ -0,0 +1,28 @@
```typescript title="WASM"
// HTTP client approach: chunk text through the REST API rather than in-process.
// Useful in browsers when the chunking work is delegated to a server.
const text = "This is a long document that needs to be split into semantic chunks.";

const response = await fetch("http://localhost:8000/chunk", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    text,
    chunker_type: "text",
    config: {
      chunking: {
        strategy: "semantic",
        max_chunk_size: 512,
        overlap: 50,
      },
    },
  }),
});

// fetch() resolves for 4xx/5xx responses too; fail loudly instead of trying to
// read chunks out of an error payload.
if (!response.ok) {
  throw new Error(`Chunk request failed: ${response.status} ${response.statusText}`);
}

const result = await response.json();
console.log(`Created ${result.chunks?.length ?? 0} chunks`);
// Print a short preview (first 50 characters) of every chunk.
result.chunks?.forEach((chunk: { content: string }) => {
  console.log(`Chunk: ${chunk.content.substring(0, 50)}...`);
});
```

View File

@@ -0,0 +1,22 @@
```typescript title="WASM"
// HTTP client approach for file extraction via multipart upload.
// Use this when uploading files from a browser form.
const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
  const formData = new FormData();
  formData.append("file", file);
  // file.type may be empty; send a generic binary type in that case.
  formData.append("mime_type", file.type || "application/octet-stream");

  // No Content-Type header is set here: the multipart boundary is generated
  // from the FormData body by fetch itself.
  const response = await fetch("http://localhost:8000/extract/file", {
    method: "POST",
    body: formData,
  });

  // fetch() resolves for 4xx/5xx responses too; without this check the
  // `result.content.length` access below would fail with an unrelated TypeError.
  if (!response.ok) {
    throw new Error(`Extraction request failed: ${response.status} ${response.statusText}`);
  }

  const result = await response.json();
  console.log(`Extracted ${result.content.length} characters`);
  console.log(`Title: ${result.metadata?.title ?? "Unknown"}`);
}
```

View File

@@ -0,0 +1,69 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const picker = document.getElementById("file") as HTMLInputElement;
const selected = picker.files?.[0];

if (selected) {
  const payload = new Uint8Array(await selected.arrayBuffer());

  // Per-feature settings, declared one by one and assembled into the full
  // extraction config below.

  // Chunking configuration
  const chunking = { strategy: "semantic", max_chunk_size: 1024, overlap: 100 };

  // Image extraction configuration
  const images = { extract_images: true, extract_base64: false, extract_raw_bytes: false };

  // OCR configuration
  const ocr = { backend: "tesseract", languages: ["eng"], enabled: true };

  // PDF-specific options
  const pdfOptions = { ocr_strategy: "auto", preserve_images: true };

  // Security limits
  const securityLimits = {
    max_archive_size: 524288000,
    max_file_count: 10000,
    max_compression_ratio: 100,
  };

  // Comprehensive extraction config
  const extractionConfig = {
    use_cache: true,
    enable_quality_processing: true,
    output_format: "markdown",
    include_document_structure: true,
    chunking,
    images,
    ocr,
    // HTML-specific extraction options
    html_options: "article, main, .content",
    pdf_options: pdfOptions,
    security_limits: securityLimits,
  };

  try {
    // file.type may be empty; use a generic binary type in that case.
    const mimeType = selected.type || "application/octet-stream";
    const extraction = await extractBytes(payload, mimeType, extractionConfig);

    console.log(`Content: ${extraction.content.substring(0, 100)}...`);
    console.log(`Language: ${extraction.metadata?.language ?? "Unknown"}`);
    console.log(`Chunks: ${extraction.chunks?.length ?? 0}`);

    // Only mention images when at least one was returned.
    const imageCount = extraction.images?.length;
    if (imageCount) {
      console.log(`Images: ${imageCount}`);
    }
  } catch (failure) {
    console.error("Extraction failed:", failure);
  }
}
```

View File

@@ -0,0 +1,19 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const picker = document.getElementById("file") as HTMLInputElement;
const selected = picker.files?.[0];

if (selected) {
  try {
    // Reading the file and extracting it share one try block, so a failure in
    // either step is reported the same way.
    const buffer = await selected.arrayBuffer();
    const mimeType = selected.type || "application/pdf";
    const extraction = await extractBytes(new Uint8Array(buffer), mimeType, undefined);
    console.log(`Extracted: ${extraction.content.length} characters`);
  } catch (failure) {
    // Thrown values are not guaranteed to be Error instances; stringify the rest.
    let reason: string;
    if (failure instanceof Error) {
      reason = failure.message;
    } else {
      reason = String(failure);
    }
    console.error("Extraction failed:", reason);
  }
}
```

View File

@@ -0,0 +1,40 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";

await init();

// Note: WASM has no native batch API; use Promise.all with per-item error handling.
const files = document.getElementById("files") as HTMLInputElement;
const fileList = files.files ?? [];

// Extract multiple files concurrently (simulated batch). Each promise resolves
// to a tagged outcome instead of rejecting, so one bad file cannot fail the
// whole Promise.all.
const extractionPromises = Array.from(fileList).map(async (file) => {
  try {
    const bytes = new Uint8Array(await file.arrayBuffer());
    const result = await extractBytes(bytes, file.type || "application/octet-stream", undefined);
    // `as const` keeps `success` as the literal `true` so it acts as a
    // discriminant; a plain `true` would widen to `boolean` and the
    // `item.success` check below would not narrow to the branch carrying `result`.
    return { file: file.name, success: true as const, result };
  } catch (err) {
    return {
      file: file.name,
      success: false as const,
      error: err instanceof Error ? err.message : String(err),
    };
  }
});

const results = await Promise.all(extractionPromises);

// Process results with per-item error handling.
results.forEach((item) => {
  if (item.success) {
    console.log(`✓ ${item.file}: ${item.result.content.length} characters`);
  } else {
    console.error(`✗ ${item.file}: ${item.error}`);
  }
});

// Summary: every item is either a success or a failure, so the failure count
// is derived from the success count.
const succeeded = results.filter((r) => r.success).length;
const failed = results.length - succeeded;
console.log(`Extracted ${succeeded}/${results.length} files (${failed} errors)`);
```

View File

@@ -0,0 +1,17 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const response = await fetch("document.pdf");
// fetch() resolves for 4xx/5xx responses too; do not feed an HTTP error body
// to the extractor as if it were a PDF.
if (!response.ok) {
  throw new Error(`Failed to fetch document.pdf: ${response.status} ${response.statusText}`);
}
const data = new Uint8Array(await response.arrayBuffer());

try {
  const result = await extractBytes(data, "application/pdf", undefined);
  console.log(`Success: ${result.content.length} characters`);
} catch (error) {
  // Thrown values are not guaranteed to be Error instances; report those as
  // well instead of dropping them silently.
  const message = error instanceof Error ? error.message : String(error);
  console.error("Extraction error:", message);
}
```

View File

@@ -0,0 +1,12 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const response = await fetch("document.pdf");
// fetch() resolves for 4xx/5xx responses too; stop here rather than extracting
// from an HTTP error body.
if (!response.ok) {
  throw new Error(`Failed to fetch document.pdf: ${response.status} ${response.statusText}`);
}

const data = new Uint8Array(await response.arrayBuffer());
const result = await extractBytes(data, "application/pdf", undefined);

console.log(`Extracted: ${result.content.length} characters`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
```

View File

@@ -0,0 +1,12 @@
```typescript title="WASM"
// WASM exposes only async extractBytes; await it from any async context.
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const response = await fetch("document.pdf");
// fetch() resolves for 4xx/5xx responses too; stop here rather than extracting
// from an HTTP error body.
if (!response.ok) {
  throw new Error(`Failed to fetch document.pdf: ${response.status} ${response.statusText}`);
}

const data = new Uint8Array(await response.arrayBuffer());
const result = await extractBytes(data, "application/pdf", undefined);
console.log(result.content);
```

View File

@@ -0,0 +1,14 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const picker = document.getElementById("file") as HTMLInputElement;
const selected = picker.files?.[0];

// Nothing to do unless a file has been chosen.
if (selected) {
  const buffer = await selected.arrayBuffer();
  // file.type may be empty; default to PDF in that case.
  const mimeType = selected.type || "application/pdf";
  const extraction = await extractBytes(new Uint8Array(buffer), mimeType, undefined);

  const tableCount = extraction.tables?.length ?? 0;
  console.log(`Content length: ${extraction.content.length} characters`);
  console.log(`Tables: ${tableCount}`);
}
```

View File

@@ -0,0 +1,15 @@
```typescript title="WASM"
// WASM exposes only async extraction. Read the file as bytes and call extractBytes.
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
const bytes = new Uint8Array(await file.arrayBuffer());
const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
console.log(result.content);
console.log(`Tables: ${result.tables?.length ?? 0}`);
}
```

View File

@@ -0,0 +1,21 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const picker = document.getElementById("file") as HTMLInputElement;
const selected = picker.files?.[0];

if (selected) {
  const payload = new Uint8Array(await selected.arrayBuffer());
  const extraction = await extractBytes(payload, selected.type || "application/pdf", undefined);

  // For each table: row count, then the markdown rendering when present, then
  // the raw cells one row per line.
  for (const table of extraction.tables ?? []) {
    const rows = table.cells ?? [];
    console.log(`Table with ${rows.length} rows`);

    if (table.markdown) {
      console.log(table.markdown);
    }

    for (const row of rows) {
      console.log(row.join(" | "));
    }
  }
}
```