Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/wasm/config/advanced_config.md
+++ b/docs/snippets/wasm/config/advanced_config.md
@@ -0,0 +1,35 @@
+```typescript title="WASM"
+import { extractFromFile, initWasm } from "@kreuzberg/wasm";
+
+// Initialise the WASM module before extracting.
+await initWasm();
+
+// OCR, chunking, language detection and quality processing in one config.
+const config = {
+  ocr: { backend: "tesseract-wasm", language: "eng" },
+  chunking: { maxChars: 1000, chunkOverlap: 100 },
+  // camelCase keys, like every other option of the @kreuzberg/wasm config.
+  languageDetection: { enabled: true },
+  enableQualityProcessing: true,
+};
+
+// First file selected in the element with id "file".
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+
+if (file) {
+  const result = await extractFromFile(file, file.type, config);
+
+  // Preview the first 100 characters of every chunk.
+  if (result.chunks) {
+    for (const chunk of result.chunks) {
+      console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
+    }
+  }
+
+  // detectedLanguages may be absent, hence the guard.
+  if (result.detectedLanguages) {
+    console.log(`Languages: ${result.detectedLanguages.join(", ")}`);
+  }
+}
+```
--- a/docs/snippets/wasm/config/basic-config.ts
+++ b/docs/snippets/wasm/config/basic-config.ts
@@ -0,0 +1,29 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Extracts document.pdf using OCR, image and chunking options together.
+ * Logs the length of the extracted content.
+ */
+async function extractWithConfig() {
+  await initWasm();
+
+  // Load the document as raw bytes.
+  const response = await fetch("document.pdf");
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  // One sub-object per feature.
+  const config: ExtractionConfig = {
+    ocr: { backend: "tesseract-wasm", language: "eng" },
+    images: { extractImages: true, targetDpi: 200 },
+    chunking: { maxChars: 1000, chunkOverlap: 100 },
+  };
+
+  const { content } = await extractBytes(bytes, "application/pdf", config);
+
+  console.log("Extraction complete");
+  console.log("Content length:", content.length);
+}
+
+// Run the example and log any rejection.
+extractWithConfig().catch(console.error);
--- a/docs/snippets/wasm/config/chunking-config.ts
+++ b/docs/snippets/wasm/config/chunking-config.ts
@@ -0,0 +1,66 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Extracts book.pdf with character-based chunking and previews the first
+ * three chunks.
+ *
+ * Heading-aware chunking (`prependHeadingContext`) is demonstrated in
+ * `extractWithPrependHeadingContext`, so no second config is built here.
+ */
+async function extractWithChunking() {
+  await initWasm();
+
+  const bytes = new Uint8Array(await fetch("book.pdf").then((r) => r.arrayBuffer()));
+
+  // maxChars / chunkOverlap size the chunks; the splitOn* flags are both on.
+  const config: ExtractionConfig = {
+    chunking: {
+      maxChars: 800,
+      chunkOverlap: 150,
+      splitOnNewlines: true,
+      splitOnSentences: true,
+    },
+  };
+
+  const result = await extractBytes(bytes, "application/pdf", config);
+
+  if (result.chunks) {
+    console.log(`Total chunks: ${result.chunks.length}`);
+
+    // Preview only the first three chunks.
+    for (const [i, chunk] of result.chunks.slice(0, 3).entries()) {
+      console.log(`\nChunk ${i}:`);
+      console.log(`Chars: ${chunk.metadata.charStart}-${chunk.metadata.charEnd}`);
+      console.log(`Content: ${chunk.content.substring(0, 100)}...`);
+    }
+  }
+}
+
+/** Chunks document.md with the markdown chunker and prepended heading context. */
+async function extractWithPrependHeadingContext() {
+  await initWasm();
+
+  const response = await fetch("document.md");
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  const config: ExtractionConfig = {
+    chunking: { chunkerType: "markdown", maxChars: 800, prependHeadingContext: true },
+  };
+
+  const result = await extractBytes(bytes, "text/markdown", config);
+  if (!result.chunks) {
+    return;
+  }
+
+  console.log(`Total chunks: ${result.chunks.length}`);
+
+  // Each chunk's content is prefixed with its heading breadcrumb
+  for (const [i, chunk] of result.chunks.slice(0, 3).entries()) {
+    console.log(`\nChunk ${i}: ${chunk.content.substring(0, 100)}...`);
+  }
+}
+
+// Run both examples; rejections are logged rather than left unhandled.
+extractWithChunking().catch(console.error);
+extractWithPrependHeadingContext().catch(console.error);
--- a/docs/snippets/wasm/config/chunking_config.md
+++ b/docs/snippets/wasm/config/chunking_config.md
@@ -0,0 +1,71 @@
+```typescript title="WASM"
+import { initWasm, extractBytes } from "@kreuzberg/wasm";
+
+await initWasm();
+
+// Character-based chunking: maxChars of 1000 with a chunkOverlap of 100.
+const config = {
+  chunking: { maxChars: 1000, chunkOverlap: 100 },
+};
+
+// `buffer` is assumed to hold the document's bytes (e.g. an ArrayBuffer).
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+result.chunks?.forEach((chunk, idx) => {
+  console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
+  // Chunk metadata fields are camelCase, like headingContext.
+  console.log(`Tokens: ${chunk.metadata?.tokenCount}`);
+});
+```
+
+```typescript title="WASM - Markdown with Heading Context"
+import { initWasm, extractBytes } from "@kreuzberg/wasm";
+
+await initWasm();
+
+// Token-based sizing is not available in WASM builds, so the chunk size
+// is expressed in characters.
+const config = {
+  chunking: { chunkerType: "markdown", maxChars: 2000 },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/markdown", config);
+
+for (const [idx, chunk] of (result.chunks ?? []).entries()) {
+  console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
+
+  // Heading context is optional; skip chunks that carry none.
+  const headings = chunk.metadata?.headingContext?.headings;
+  if (!headings) {
+    continue;
+  }
+  console.log("Headings:");
+  for (const heading of headings) {
+    console.log(`  Level ${heading.level}: ${heading.text}`);
+  }
+}
+```
+
+```typescript title="WASM - Prepend Heading Context"
+import { initWasm, extractBytes } from "@kreuzberg/wasm";
+
+await initWasm();
+
+// Markdown chunker; prependHeadingContext puts the heading breadcrumb at the
+// start of each chunk's content.
+const config = {
+  chunking: { chunkerType: "markdown", maxChars: 2000, prependHeadingContext: true },
+};
+
+// `buffer` is assumed to hold the Markdown document's bytes.
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/markdown", config);
+
+// Each chunk's content is prefixed with its heading breadcrumb
+const chunks = result.chunks ?? [];
+for (const [idx, chunk] of chunks.entries()) {
+  console.log(`Chunk ${idx}: ${chunk.content.substring(0, 80)}...`);
+}
+```
--- a/docs/snippets/wasm/config/combined-config.ts
+++ b/docs/snippets/wasm/config/combined-config.ts
@@ -0,0 +1,34 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Extracts complex.pdf with OCR, chunking and image extraction enabled, then
+ * prints a summary of the result.
+ */
+async function extractWithFullConfig() {
+  await initWasm();
+
+  const bytes = new Uint8Array(await fetch("complex.pdf").then((r) => r.arrayBuffer()));
+
+  // OCR with the "deu" language data, plus chunking and image extraction.
+  const config: ExtractionConfig = {
+    ocr: { backend: "tesseract-wasm", language: "deu" },
+    chunking: { maxChars: 1000, chunkOverlap: 200 },
+    images: { extractImages: true, targetDpi: 200 },
+  };
+
+  const result = await extractBytes(bytes, "application/pdf", config);
+
+  // Optional fields get a fallback so the summary never prints "undefined".
+  const languages = result.detectedLanguages?.join(", ") ?? "none detected";
+
+  console.log("=== Extraction Results ===");
+  console.log(`Content: ${result.content.length} chars`);
+  console.log(`Chunks: ${result.chunks?.length ?? 0}`);
+  console.log(`Images: ${result.images?.length ?? 0}`);
+  console.log(`Tables: ${result.tables.length}`);
+  console.log(`Languages: ${languages}`);
+}
+
+// Run the example and log any rejection.
+extractWithFullConfig().catch(console.error);
--- a/docs/snippets/wasm/config/conditional-config.ts
+++ b/docs/snippets/wasm/config/conditional-config.ts
@@ -0,0 +1,34 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Builds an extraction config from the given file size (in bytes) and
+ * extracts document.pdf with it.
+ *
+ * @param fileSize - Size used to pick the options; thresholds are in MiB.
+ */
+async function extractWithConditionalConfig(fileSize: number) {
+  await initWasm();
+
+  const mib = 1024 * 1024;
+  const config: ExtractionConfig = {};
+
+  // Larger than 10 MiB: chunk with maxChars 500 and an overlap of 50.
+  if (fileSize > 10 * mib) {
+    config.chunking = { maxChars: 500, chunkOverlap: 50 };
+  }
+
+  // Smaller than 1 MiB: also extract images with a 300 DPI target.
+  if (fileSize < mib) {
+    config.images = { extractImages: true, targetDpi: 300 };
+  }
+
+  // OCR stays enabled only below 50 MiB.
+  config.ocr = { enabled: fileSize < 50 * mib };
+
+  const bytes = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+  return extractBytes(bytes, "application/pdf", config);
+}
+
+extractWithConditionalConfig(5 * 1024 * 1024).then(() => console.log("Done")).catch(console.error);
--- a/docs/snippets/wasm/config/config_basic.md
+++ b/docs/snippets/wasm/config/config_basic.md
@@ -0,0 +1,19 @@
+```typescript title="WASM"
+import { initWasm, extractBytes } from "@kreuzberg/wasm";
+
+await initWasm();
+
+// OCR backend and language, plus image extraction.
+const config = {
+  ocr: {
+    backend: "tesseract-wasm",
+    language: "eng",
+  },
+  images: { extractImages: true },
+};
+
+// `buffer` is assumed to hold the PDF's bytes.
+const bytes = new Uint8Array(buffer);
+const { content } = await extractBytes(bytes, "application/pdf", config);
+console.log(content);
+```
--- a/docs/snippets/wasm/config/config_discover.md
+++ b/docs/snippets/wasm/config/config_discover.md
@@ -0,0 +1,18 @@
+```typescript title="WASM"
+import { initWasm, extractBytes } from "@kreuzberg/wasm";
+
+await initWasm();
+
+// Option keys are camelCase in the @kreuzberg/wasm config object, so the
+// top-level flags are spelled useCache / enableQualityProcessing.
+const config = {
+  useCache: true,
+  enableQualityProcessing: true,
+  ocr: { backend: "tesseract-wasm", language: "eng" },
+};
+
+// `buffer` is assumed to hold the PDF's bytes.
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+console.log(result.content);
+```
--- a/docs/snippets/wasm/config/config_ocr.md
+++ b/docs/snippets/wasm/config/config_ocr.md
@@ -0,0 +1,18 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Download the scanned PDF and wrap its bytes for the extractor.
+const response = await fetch("scanned.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+// OCR through the "tesseract" backend with the "eng" language.
+const config = {
+  ocr: { backend: "tesseract", language: "eng" },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Content length: ${result.content.length}`);
+console.log(`Tables detected: ${result.tables?.length ?? 0}`);
+```
--- a/docs/snippets/wasm/config/config_programmatic.md
+++ b/docs/snippets/wasm/config/config_programmatic.md
@@ -0,0 +1,26 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Load the document to extract as raw bytes.
+const response = await fetch("document.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+// Configuration assembled in code: caching, OCR, chunking, quality processing.
+const config = {
+  use_cache: true,
+  ocr: {
+    backend: "tesseract",
+    language: "eng+deu",
+    tesseract_config: {
+      psm: 6,
+    },
+  },
+  chunking: { max_characters: 1000, overlap: 200 },
+  enable_quality_processing: true,
+};
+
+const { content } = await extractBytes(data, "application/pdf", config);
+console.log(`Content length: ${content.length}`);
+```
--- a/docs/snippets/wasm/config/document_structure_config.md
+++ b/docs/snippets/wasm/config/document_structure_config.md
@@ -0,0 +1,15 @@
+```typescript title="Document Structure Config (WASM)"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = { includeDocumentStructure: true };
+
+// extractBytes is asynchronous; `fileBuffer` is assumed to hold the PDF bytes.
+const result = await extractBytes(fileBuffer, "application/pdf", config);
+
+// Print the node type of every node in result.document, when it is present.
+for (const node of result.document?.nodes ?? []) {
+  console.log(`[${node.content.nodeType}]`);
+}
+```
--- a/docs/snippets/wasm/config/element_based_output.md
+++ b/docs/snippets/wasm/config/element_based_output.md
@@ -0,0 +1,35 @@
+```typescript title="Element-Based Output (WASM)"
+import { extractFileSync, type ExtractionConfig } from "kreuzberg-wasm";
+
+// Configure element-based output
+const config: ExtractionConfig = { outputFormat: "element_based" };
+
+// Extract document (`fileBuffer` is assumed to hold the PDF's bytes)
+const result = extractFileSync(fileBuffer, "application/pdf", config);
+
+// Access elements
+for (const element of result.elements) {
+  const { elementType, text, metadata } = element;
+  console.log(`Type: ${elementType}`);
+  console.log(`Text: ${text.slice(0, 100)}`);
+
+  if (metadata.pageNumber) {
+    console.log(`Page: ${metadata.pageNumber}`);
+  }
+
+  if (metadata.coordinates) {
+    const { left, top, right, bottom } = metadata.coordinates;
+    console.log(`Coords: (${left}, ${top}) - (${right}, ${bottom})`);
+  }
+
+  console.log("---");
+}
+
+// Filter by element type
+const titles = result.elements.filter((e) => e.elementType === "title");
+for (const title of titles) {
+  // `??` keeps a legitimate falsy level (e.g. 0) instead of replacing it.
+  const level = title.metadata.additional?.level ?? "unknown";
+  console.log(`[${level}] ${title.text}`);
+}
+```
--- a/docs/snippets/wasm/config/hierarchy_config.ts
+++ b/docs/snippets/wasm/config/hierarchy_config.ts
@@ -0,0 +1,105 @@
+import {
+  type ExtractionConfig,
+  type HierarchyConfig,
+  Kreuzberg,
+  type PdfConfig,
+} from "kreuzberg-wasm";
+
+// Example 1: Basic hierarchy extraction
+// Hierarchy detection is switched on with six font-size clusters, bounding
+// boxes are included, and no OCR coverage threshold is set.
+const hierarchyConfigBasic: HierarchyConfig = {
+  enabled: true,
+  // Default: creates 6 font size clusters (H1-H6 structure).
+  kClusters: 6,
+  // Include bounding box coordinates.
+  includeBbox: true,
+  // No OCR coverage threshold.
+  ocrCoverageThreshold: undefined,
+};
+
+// Nest the hierarchy settings under the PDF options of an extraction config.
+const pdfConfigBasic: PdfConfig = { hierarchy: hierarchyConfigBasic };
+const extractionConfigBasic: ExtractionConfig = { pdfOptions: pdfConfigBasic };
+
+// Usage:
+// const kreuzberg = new Kreuzberg(extractionConfigBasic);
+// const result = await kreuzberg.extractFile("document.pdf");
+
+// Example 2: Custom kClusters for minimal structure
+// Three clusters give a simpler hierarchy, enough when only the major
+// section divisions (Main, Subsection, Detail) are needed.
+const hierarchyConfigMinimal: HierarchyConfig = {
+  enabled: true,
+  // Minimal clustering: just 3 levels.
+  kClusters: 3,
+  includeBbox: true,
+  ocrCoverageThreshold: undefined,
+};
+
+// Nest the hierarchy settings under the PDF options.
+const pdfConfigMinimal: PdfConfig = { hierarchy: hierarchyConfigMinimal };
+const _extractionConfigMinimal: ExtractionConfig = {
+  pdfOptions: pdfConfigMinimal,
+};
+
+// Usage (with a Kreuzberg instance built from this config):
+// const result = await kreuzberg.extractFile("document.pdf");
+
+// Example 3: With OCR coverage threshold
+// A threshold of 0.5 triggers OCR when less than 50% of the text has font
+// data, which helps with documents mixing digital and scanned content.
+const hierarchyConfigOcr: HierarchyConfig = {
+  enabled: true,
+  kClusters: 6,
+  includeBbox: true,
+  // Trigger OCR if text coverage < 50%.
+  ocrCoverageThreshold: 0.5,
+};
+
+// Nest the hierarchy settings under the PDF options.
+const pdfConfigOcr: PdfConfig = { hierarchy: hierarchyConfigOcr };
+const _extractionConfigOcr: ExtractionConfig = {
+  pdfOptions: pdfConfigOcr,
+};
+
+// Usage (with a Kreuzberg instance built from this config):
+// const result = await kreuzberg.extractFile("document.pdf");
+
+/**
+ * Integration with a Kreuzberg WASM instance: extracts document.pdf with the
+ * basic hierarchy config and logs either the result or the failure.
+ */
+async function _extractWithHierarchy(): Promise<void> {
+  const kreuzberg = new Kreuzberg(extractionConfigBasic);
+  try {
+    // Extract from file (requires file input or fetch)
+    const extracted = await kreuzberg.extractFile("document.pdf");
+    console.log("Extraction complete:", extracted);
+  } catch (failure) {
+    console.error("Extraction failed:", failure);
+  }
+}
+
+// Field descriptions:
+//
+// enabled: boolean (default: true)
+//   - Enable or disable hierarchy extraction
+//   - When false, hierarchy structure is not analyzed
+//
+// kClusters: number (default: 6, valid: 1-7)
+//   - Number of font size clusters for hierarchy levels
+//   - 6 provides H1-H6 heading levels with body text
+//   - Higher values create more fine-grained hierarchy
+//   - Lower values create simpler structure
+//
+// includeBbox: boolean (default: true)
+//   - Include bounding box coordinates in hierarchy blocks
+//   - Required for spatial layout awareness and document structure
+//   - Set to false only if space optimization is critical
+//
+// ocrCoverageThreshold: number | undefined (default: undefined)
+//   - Range: 0.0 to 1.0
+//   - Triggers OCR when text block coverage falls below this fraction
+//   - Example: 0.5 means "run OCR if less than 50% of page has text data"
+//   - undefined means no OCR coverage-based triggering
+//
--- a/docs/snippets/wasm/config/html_output.md
+++ b/docs/snippets/wasm/config/html_output.md
@@ -0,0 +1,17 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const response = await fetch("document.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+// Request HTML output with the "github" theme.
+const config = {
+  output_format: "html",
+  html_output: { theme: "github" },
+};
+
+const { content } = await extractBytes(data, "application/pdf", config);
+console.log(content); // HTML with kb-* classes
+```
--- a/docs/snippets/wasm/config/image-config.ts
+++ b/docs/snippets/wasm/config/image-config.ts
@@ -0,0 +1,31 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/** Extracts images from document.pdf and logs each one's size, format and description. */
+async function extractImagesWithConfig() {
+  await initWasm();
+
+  const bytes = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+  const config: ExtractionConfig = {
+    images: {
+      extractImages: true,
+      targetDpi: 300,
+      maxDimension: 2048,
+      preserveAspectRatio: true,
+    },
+  };
+
+  const result = await extractBytes(bytes, "application/pdf", config);
+
+  if (result.images) {
+    console.log(`Extracted ${result.images.length} images`);
+    for (const image of result.images) {
+      // The label matches the field actually printed (image.description).
+      const { width, height, format, description } = image;
+      console.log(`Image: ${width}x${height}, Format: ${format}, Description: ${description}`);
+    }
+  }
+}
+
+extractImagesWithConfig().catch(console.error);
--- a/docs/snippets/wasm/config/keyword_config.ts
+++ b/docs/snippets/wasm/config/keyword_config.ts
@@ -0,0 +1,65 @@
+import { type ExtractionConfig, extractFile } from "@kreuzberg/wasm";
+
+// Example 1: Basic YAKE configuration
+// YAKE algorithm with default parameters and English stopword filtering
+async function basicYake(): Promise<void> {
+  const keywords: ExtractionConfig["keywords"] = {
+    algorithm: "yake",
+    maxKeywords: 10,
+    minScore: 0.0,
+    ngramRange: [1, 3],
+    language: "en",
+    yakeParams: null,
+    rakeParams: null,
+  };
+  const config: ExtractionConfig = { keywords };
+
+  // Second argument (presumably the MIME type) is left null.
+  const { keywords: extracted } = await extractFile("document.pdf", null, config);
+  console.log("Keywords:", extracted);
+}
+
+// Example 2: Advanced YAKE with custom parameters
+// Tunes YAKE through yakeParams: a custom window size for co-occurrence
+// analysis.
+async function _advancedYake(): Promise<void> {
+  const keywords: ExtractionConfig["keywords"] = {
+    algorithm: "yake",
+    maxKeywords: 15,
+    minScore: 0.1,
+    ngramRange: [1, 2],
+    language: "en",
+    yakeParams: { windowSize: 1 },
+    rakeParams: null,
+  };
+
+  const config: ExtractionConfig = { keywords };
+
+  // Second argument (presumably the MIME type) is left null.
+  const { keywords: extracted } = await extractFile("document.pdf", null, config);
+  console.log("Keywords:", extracted);
+}
+
+// Example 3: RAKE configuration
+// RAKE keyword extraction with phrase constraints supplied via rakeParams
+async function _rakeConfig(): Promise<void> {
+  const keywords: ExtractionConfig["keywords"] = {
+    algorithm: "rake",
+    maxKeywords: 10,
+    minScore: 5.0,
+    ngramRange: [1, 3],
+    language: "en",
+    yakeParams: null,
+    rakeParams: {
+      minWordLength: 1,
+      maxWordsPerPhrase: 3,
+    },
+  };
+  const config: ExtractionConfig = { keywords };
+
+  // Second argument (presumably the MIME type) is left null.
+  const { keywords: extracted } = await extractFile("document.pdf", null, config);
+  console.log("Keywords:", extracted);
+}
+
+basicYake().catch(console.error); // log a rejection instead of leaving it unhandled
--- a/docs/snippets/wasm/config/keyword_extraction_config.md
+++ b/docs/snippets/wasm/config/keyword_extraction_config.md
@@ -0,0 +1,20 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+// YAKE keyword extraction settings, passed under the `keywords` key.
+const keywords = {
+  algorithm: "yake",
+  max_keywords: 10,
+  min_score: 0.1,
+  ngram_range: [1, 3],
+  language: "en",
+};
+const config = { keywords };
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Keywords: ${JSON.stringify(result.keywords)}`);
+```
--- a/docs/snippets/wasm/config/language_detection_config.md
+++ b/docs/snippets/wasm/config/language_detection_config.md
@@ -0,0 +1,19 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const response = await fetch("document.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+// Language detection switched on, with min_confidence 0.8 and
+// detect_multiple enabled.
+const config = {
+  language_detection: { enabled: true, min_confidence: 0.8, detect_multiple: true },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+
+console.log(`Detected language: ${result.language}`);
+console.log(`Confidence: ${result.language_confidence}`);
+```
--- a/docs/snippets/wasm/config/ocr-config.ts
+++ b/docs/snippets/wasm/config/ocr-config.ts
@@ -0,0 +1,26 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/** Extracts scanned.pdf with an OCR config and prints the extracted text. */
+async function extractWithOcr() {
+  await initWasm();
+
+  const response = await fetch("scanned.pdf");
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  const config: ExtractionConfig = {
+    ocr: { backend: "tesseract-wasm", language: "eng" },
+  };
+
+  const { content, detectedLanguages } = await extractBytes(bytes, "application/pdf", config);
+
+  console.log("Extracted text from scanned document:");
+  console.log(content);
+
+  // detectedLanguages may be absent, hence the guard.
+  if (detectedLanguages) {
+    console.log("Detected languages:", detectedLanguages);
+  }
+}
+
+extractWithOcr().catch(console.error);
--- a/docs/snippets/wasm/config/ocr_dpi_config.md
+++ b/docs/snippets/wasm/config/ocr_dpi_config.md
@@ -0,0 +1,21 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+// DPI settings: 300 target, auto-adjust on, 150 / 600 as min / max.
+const images = {
+  extract_images: true,
+  target_dpi: 300,
+  max_image_dimension: 4096,
+  auto_adjust_dpi: true,
+  min_dpi: 150,
+  max_dpi: 600,
+};
+const config = { images };
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Extracted images: ${result.images?.length ?? 0}`);
+```
--- a/docs/snippets/wasm/config/pdf_config.md
+++ b/docs/snippets/wasm/config/pdf_config.md
@@ -0,0 +1,20 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("encrypted.pdf").then((r) => r.arrayBuffer()));
+
+// PDF options: a password list for the encrypted file, image and metadata extraction.
+const config = {
+  pdf_options: {
+    extract_images: true,
+    passwords: ["password123"],
+    extract_metadata: true,
+    hierarchy: {},
+  },
+};
+const { metadata } = await extractBytes(data, "application/pdf", config);
+console.log(`Title: ${metadata?.title}`);
+console.log(`Authors: ${metadata?.authors}`);
+```
--- a/docs/snippets/wasm/config/pdf_hierarchy_config.md
+++ b/docs/snippets/wasm/config/pdf_hierarchy_config.md
@@ -0,0 +1,22 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const response = await fetch("document.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+// Hierarchy settings, nested under pdf_options.hierarchy below.
+const hierarchy = {
+  enabled: true,
+  detection_threshold: 0.75,
+  ocr_coverage_threshold: 0.8,
+  min_level: 1,
+  max_level: 5,
+};
+
+const config = { pdf_options: { hierarchy } };
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Hierarchy levels: ${result.hierarchy?.length ?? 0}`);
+```
--- a/docs/snippets/wasm/config/postprocessor_config.md
+++ b/docs/snippets/wasm/config/postprocessor_config.md
@@ -0,0 +1,17 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const response = await fetch("document.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+// Post-processing enabled, with two processors listed in enabled_processors.
+const processors = ["whitespace_normalizer", "unicode_normalizer"];
+const config = {
+  postprocessor: { enabled: true, enabled_processors: processors },
+};
+
+const { content } = await extractBytes(data, "application/pdf", config);
+console.log(`Processed content: ${content}`);
+```
--- a/docs/snippets/wasm/config/quality_processing_config.md
+++ b/docs/snippets/wasm/config/quality_processing_config.md
@@ -0,0 +1,16 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const response = await fetch("document.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+// Quality processing and caching both switched on.
+const config = { enable_quality_processing: true, use_cache: true };
+
+const result = await extractBytes(data, "application/pdf", config);
+
+console.log(`Quality score: ${result.quality_score}`);
+console.log(`Processing time: ${result.processing_time}`);
+```
--- a/docs/snippets/wasm/config/tesseract_config.md
+++ b/docs/snippets/wasm/config/tesseract_config.md
@@ -0,0 +1,21 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const response = await fetch("scanned.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+// Tesseract OCR with the "eng+deu" language setting.
+const config = {
+  ocr: {
+    backend: "tesseract",
+    language: "eng+deu",
+    // psm / oem: Tesseract's page segmentation and OCR engine modes.
+    tesseract_config: { psm: 6, oem: 3 },
+  },
+};
+
+const { content } = await extractBytes(data, "application/pdf", config);
+console.log(`OCR text: ${content}`);
+```
--- a/docs/snippets/wasm/config/token_reduction_config.md
+++ b/docs/snippets/wasm/config/token_reduction_config.md
@@ -0,0 +1,18 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const response = await fetch("document.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+// Token reduction in "moderate" mode with preserve_important_words on.
+const config = {
+  token_reduction: { mode: "moderate", preserve_important_words: true },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+
+console.log(`Original tokens: ${result.token_count}`);
+console.log(`Reduced content: ${result.content}`);
+```