Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/wasm/api/batch_extract_bytes_sync.md
+++ b/docs/snippets/wasm/api/batch_extract_bytes_sync.md
@@ -0,0 +1,20 @@
+```typescript title="WASM"
+// WASM has no batch helper; await extractBytes for each input (in parallel via Promise.all).
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const urls = ["document1.pdf", "document2.pdf"];
+// fetch() resolves even on HTTP error statuses, so each response is checked before extraction.
+const results = await Promise.all(
+  urls.map(async (url) => {
+    const resp = await fetch(url);
+    if (!resp.ok) throw new Error(`Failed to fetch ${url}: HTTP ${resp.status}`);
+    return extractBytes(new Uint8Array(await resp.arrayBuffer()), "application/pdf", undefined);
+  }),
+);
+
+results.forEach((result, i) => {
+  console.log(`Document ${i + 1}: ${result.content.length} characters`);
+});
+```
--- a/docs/snippets/wasm/api/batch_extract_files_sync.md
+++ b/docs/snippets/wasm/api/batch_extract_files_sync.md
@@ -0,0 +1,20 @@
+```typescript title="WASM"
+// WASM has no batch helper; await extractBytes for each file (in parallel via Promise.all).
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const picker = document.getElementById("files") as HTMLInputElement;
+
+// Read one File into memory and extract it; the MIME type defaults to PDF when the File has none.
+const extractOne = async (file: File) => {
+  const payload = new Uint8Array(await file.arrayBuffer());
+  return extractBytes(payload, file.type || "application/pdf", undefined);
+};
+
+const extracted = await Promise.all(Array.from(picker.files ?? []).map((file) => extractOne(file)));
+
+for (const [index, doc] of extracted.entries()) {
+  console.log(`File ${index + 1}: ${doc.content.length} characters`);
+}
+```
--- a/docs/snippets/wasm/api/client_chunk_text.md
+++ b/docs/snippets/wasm/api/client_chunk_text.md
@@ -0,0 +1,28 @@
+```typescript title="WASM"
+// HTTP client approach: chunk text via the REST API (useful in browsers when the work is delegated to a server).
+// fetch() resolves even on HTTP error statuses, so the status is checked before parsing the body.
+
+const text = "This is a long document that needs to be split into semantic chunks.";
+
+const response = await fetch("http://localhost:8000/chunk", {
+  method: "POST",
+  headers: { "Content-Type": "application/json" },
+  body: JSON.stringify({
+    text,
+    chunker_type: "text",
+    config: {
+      chunking: {
+        strategy: "semantic",
+        max_chunk_size: 512,
+        overlap: 50,
+      },
+    },
+  }),
+});
+if (!response.ok) throw new Error(`Chunk request failed: HTTP ${response.status}`);
+const result = await response.json();
+console.log(`Created ${result.chunks?.length ?? 0} chunks`);
+result.chunks?.forEach((chunk) => {
+  console.log(`Chunk: ${chunk.content.substring(0, 50)}...`);
+});
+```
--- a/docs/snippets/wasm/api/client_extract_single_file.md
+++ b/docs/snippets/wasm/api/client_extract_single_file.md
@@ -0,0 +1,22 @@
+```typescript title="WASM"
+// HTTP client approach: upload a file from a browser form as multipart data for extraction.
+// fetch() resolves even on HTTP error statuses, so the status is checked before parsing the body.
+
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+
+if (file) {
+  const formData = new FormData();
+  formData.append("file", file);
+  formData.append("mime_type", file.type || "application/octet-stream");
+
+  const response = await fetch("http://localhost:8000/extract/file", {
+    method: "POST",
+    body: formData,
+  });
+  if (!response.ok) throw new Error(`Extraction request failed: HTTP ${response.status}`);
+  const result = await response.json();
+  console.log(`Extracted ${result.content.length} characters`);
+  console.log(`Title: ${result.metadata?.title ?? "Unknown"}`);
+}
+```
--- a/docs/snippets/wasm/api/combining_all_features.md
+++ b/docs/snippets/wasm/api/combining_all_features.md
@@ -0,0 +1,69 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const picker = document.getElementById("file") as HTMLInputElement;
+const upload = picker.files?.[0];
+
+if (upload) {
+  const payload = new Uint8Array(await upload.arrayBuffer());
+
+  // A single config object that combines every feature used in this example
+  const extractionConfig = {
+    use_cache: true,
+    enable_quality_processing: true,
+    output_format: "markdown",
+    include_document_structure: true,
+
+    // Chunking: semantic strategy with a size cap and overlap
+    chunking: {
+      strategy: "semantic",
+      max_chunk_size: 1024,
+      overlap: 100,
+    },
+
+    // Images: extraction on, base64 and raw-byte output off
+    images: {
+      extract_images: true,
+      extract_base64: false,
+      extract_raw_bytes: false,
+    },
+
+    // OCR: Tesseract backend with the "eng" language
+    ocr: {
+      backend: "tesseract",
+      languages: ["eng"],
+      enabled: true,
+    },
+
+    // HTML options. NOTE(review): given as a selector string here; confirm the expected shape.
+    html_options: "article, main, .content",
+
+    // PDF options
+    pdf_options: {
+      ocr_strategy: "auto",
+      preserve_images: true,
+    },
+
+    // Security limits for archive handling
+    security_limits: {
+      max_archive_size: 524288000,
+      max_file_count: 10000,
+      max_compression_ratio: 100,
+    },
+  };
+
+  try {
+    const extraction = await extractBytes(payload, upload.type || "application/octet-stream", extractionConfig);
+    console.log(`Content: ${extraction.content.substring(0, 100)}...`);
+    console.log(`Language: ${extraction.metadata?.language ?? "Unknown"}`);
+    console.log(`Chunks: ${extraction.chunks?.length ?? 0}`);
+    if (extraction.images?.length) {
+      console.log(`Images: ${extraction.images.length}`);
+    }
+  } catch (cause) {
+    console.error("Extraction failed:", cause);
+  }
+}
+```
--- a/docs/snippets/wasm/api/error_handling.md
+++ b/docs/snippets/wasm/api/error_handling.md
@@ -0,0 +1,19 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Take the first file from the <input id="file"> element, if the user picked one.
+const selected = (document.getElementById("file") as HTMLInputElement).files?.[0];
+
+if (selected) {
+  try {
+    const payload = new Uint8Array(await selected.arrayBuffer());
+    const extraction = await extractBytes(payload, selected.type || "application/pdf", undefined);
+    console.log(`Extracted: ${extraction.content.length} characters`);
+  } catch (thrown) {
+    // Thrown values may not be Error instances, so fall back to their string form.
+    console.error("Extraction failed:", thrown instanceof Error ? thrown.message : String(thrown));
+  }
+}
+```
--- a/docs/snippets/wasm/api/error_handling_extract.md
+++ b/docs/snippets/wasm/api/error_handling_extract.md
@@ -0,0 +1,40 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Note: WASM has no native batch API; use Promise.all with per-item error handling
+const files = document.getElementById("files") as HTMLInputElement;
+const fileList = files.files ?? [];
+
+// Extract files concurrently; `as const` keeps `success` a literal discriminant for narrowing.
+const extractionPromises = Array.from(fileList).map(async (file) => {
+  try {
+    const bytes = new Uint8Array(await file.arrayBuffer());
+    const result = await extractBytes(bytes, file.type || "application/octet-stream", undefined);
+    return { file: file.name, success: true as const, result };
+  } catch (err) {
+    return {
+      file: file.name,
+      success: false as const,
+      error: err instanceof Error ? err.message : String(err),
+    };
+  }
+});
+
+const results = await Promise.all(extractionPromises);
+
+// Process results; checking `success` narrows each item to its `result` or `error` shape
+results.forEach((item) => {
+  if (item.success) {
+    console.log(`✓ ${item.file}: ${item.result.content.length} characters`);
+  } else {
+    console.error(`✗ ${item.file}: ${item.error}`);
+  }
+});
+
+// Summary
+const succeeded = results.filter((r) => r.success).length;
+const failed = results.filter((r) => !r.success).length;
+console.log(`Extracted ${succeeded}/${results.length} files (${failed} errors)`);
+```
--- a/docs/snippets/wasm/api/error_handling_wasm.md
+++ b/docs/snippets/wasm/api/error_handling_wasm.md
@@ -0,0 +1,17 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+try {
+  const response = await fetch("document.pdf");
+  if (!response.ok) throw new Error(`Failed to fetch document.pdf: HTTP ${response.status}`);
+  const data = new Uint8Array(await response.arrayBuffer());
+  const result = await extractBytes(data, "application/pdf", undefined);
+  console.log(`Success: ${result.content.length} characters`);
+} catch (error) {
+  // Thrown values are not guaranteed to be Error instances; report them instead of dropping them.
+  const message = error instanceof Error ? error.message : String(error);
+  console.error("Extraction error:", message);
+}
+```
--- a/docs/snippets/wasm/api/extract_bytes_async.md
+++ b/docs/snippets/wasm/api/extract_bytes_async.md
@@ -0,0 +1,12 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+// fetch() resolves even on HTTP error statuses, so reject explicitly before extracting.
+const response = await fetch("document.pdf");
+if (!response.ok) throw new Error(`Failed to fetch document.pdf: HTTP ${response.status}`);
+const data = new Uint8Array(await response.arrayBuffer());
+const result = await extractBytes(data, "application/pdf", undefined);
+console.log(`Extracted: ${result.content.length} characters`);
+console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
+```
--- a/docs/snippets/wasm/api/extract_bytes_sync.md
+++ b/docs/snippets/wasm/api/extract_bytes_sync.md
@@ -0,0 +1,12 @@
+```typescript title="WASM"
+// WASM exposes only async extractBytes; await it from any async context.
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+// fetch() resolves even on HTTP error statuses, so reject explicitly before extracting.
+const response = await fetch("document.pdf");
+if (!response.ok) throw new Error(`Failed to fetch document.pdf: HTTP ${response.status}`);
+const data = new Uint8Array(await response.arrayBuffer());
+const result = await extractBytes(data, "application/pdf", undefined);
+console.log(result.content);
+```
--- a/docs/snippets/wasm/api/extract_file_async.md
+++ b/docs/snippets/wasm/api/extract_file_async.md
@@ -0,0 +1,14 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Extract the first file chosen in the <input id="file"> element, if any.
+const chosen = (document.getElementById("file") as HTMLInputElement).files?.[0];
+if (chosen) {
+  const data = new Uint8Array(await chosen.arrayBuffer());
+  const extracted = await extractBytes(data, chosen.type || "application/pdf", undefined);
+  console.log(`Content length: ${extracted.content.length} characters`);
+  console.log(`Tables: ${extracted.tables?.length ?? 0}`);
+}
+```
--- a/docs/snippets/wasm/api/extract_file_sync.md
+++ b/docs/snippets/wasm/api/extract_file_sync.md
@@ -0,0 +1,15 @@
+```typescript title="WASM"
+// WASM exposes only async extraction. Read the file as bytes and call extractBytes.
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// The selected file is read into bytes and passed to the awaited extractBytes call.
+const upload = (document.getElementById("file") as HTMLInputElement).files?.[0];
+if (upload) {
+  const raw = new Uint8Array(await upload.arrayBuffer());
+  const extraction = await extractBytes(raw, upload.type || "application/pdf", undefined);
+  console.log(extraction.content);
+  console.log(`Tables: ${extraction.tables?.length ?? 0}`);
+}
+```
--- a/docs/snippets/wasm/api/tables.md
+++ b/docs/snippets/wasm/api/tables.md
@@ -0,0 +1,21 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Extract the selected file and print every table: row count, Markdown (if any), then each row.
+const source = (document.getElementById("file") as HTMLInputElement).files?.[0];
+
+if (source) {
+  const payload = new Uint8Array(await source.arrayBuffer());
+  const { tables } = await extractBytes(payload, source.type || "application/pdf", undefined);
+  for (const table of tables ?? []) {
+    const rows = table.cells ?? [];
+    console.log(`Table with ${rows.length} rows`);
+    if (table.markdown) {
+      console.log(table.markdown);
+    }
+    rows.forEach((cells) => console.log(cells.join(" | ")));
+  }
+}
+```