Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/wasm/advanced/chunk_page_mapping.md
+++ b/docs/snippets/wasm/advanced/chunk_page_mapping.md
@@ -0,0 +1,44 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  chunking: {
+    maxChars: 1500,
+    chunkOverlap: 300,
+  },
+  includeDocumentStructure: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Map chunks back to pages for source attribution
+const chunkPageMap = new Map<number, number[]>();
+
+result.chunks?.forEach((chunk, chunkIndex) => {
+  const firstPage = chunk.metadata?.firstPage;
+  const lastPage = chunk.metadata?.lastPage;
+
+  if (firstPage !== undefined && lastPage !== undefined) {
+    for (let page = firstPage; page <= lastPage; page++) {
+      if (!chunkPageMap.has(page)) {
+        chunkPageMap.set(page, []);
+      }
+      chunkPageMap.get(page)!.push(chunkIndex);
+    }
+  }
+});
+
+// Use the mapping for source attribution
+chunkPageMap.forEach((chunkIndices, pageNum) => {
+  console.log(`Page ${pageNum}: Chunks ${chunkIndices.join(", ")}`);
+  chunkIndices.forEach((idx) => {
+    const chunk = result.chunks![idx];
+    console.log(`  Content: "${chunk.content.substring(0, 60)}..."`);
+  });
+});
+```
+
+<!-- snippet:syntax-only --> - Requires document structure parsing during extraction.
--- a/docs/snippets/wasm/advanced/chunking_config.md
+++ b/docs/snippets/wasm/advanced/chunking_config.md
@@ -0,0 +1,70 @@
+```typescript title="WASM - Fixed-Size Chunks"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  chunking: {
+    maxChars: 2000,
+    chunkOverlap: 400,
+    trim: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+result.chunks?.forEach((chunk, idx) => {
+  console.log(`Chunk ${chunk.metadata?.chunkIndex}/${chunk.metadata?.totalChunks}`);
+  console.log(`  Position: ${chunk.metadata?.byteStart}-${chunk.metadata?.byteEnd}`);
+  console.log(`  Content: "${chunk.content.substring(0, 50)}..."`);
+});
+```
+
+```typescript title="WASM - Markdown-Aware Chunking"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  chunking: {
+    chunkerType: "markdown",
+    maxChars: 1500,
+    prependHeadingContext: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/markdown", config);
+
+result.chunks?.forEach((chunk) => {
+  // Content already includes heading context prepended
+  console.log(chunk.content.substring(0, 80));
+  console.log(
+    `  Heading path: ${chunk.metadata?.headingContext?.headings?.map((h) => `${"#".repeat(h.level)} ${h.text}`).join(" > ")}`,
+  );
+});
+```
+
+```typescript title="WASM - Semantic Chunking with Topic Threshold"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  chunking: {
+    chunkerType: "semantic",
+    maxChars: 1000,
+    topicThreshold: 0.5, // Boundary detection at 50% topic change
+    chunkOverlap: 100,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/markdown", config);
+
+console.log(`Generated ${result.chunks?.length} semantic chunks`);
+result.chunks?.forEach((chunk) => {
+  console.log(`Chunk ${chunk.metadata?.chunkIndex}: ${chunk.content.length} chars`);
+});
+```
--- a/docs/snippets/wasm/advanced/chunking_rag.md
+++ b/docs/snippets/wasm/advanced/chunking_rag.md
@@ -0,0 +1,80 @@
+```typescript title="WASM - Chunking for RAG Pipeline"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  chunking: {
+    maxChars: 512, // Smaller chunks for vector DB efficiency
+    chunkOverlap: 100,
+    trim: true,
+  },
+  includeDocumentStructure: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Prepare chunks for vector database ingestion
+const ragChunks =
+  result.chunks?.map((chunk, idx) => ({
+    id: `${result.metadata?.filename || "doc"}_chunk_${idx}`,
+    text: chunk.content,
+    metadata: {
+      source: result.metadata?.filename,
+      chunk_index: chunk.metadata?.chunkIndex,
+      total_chunks: chunk.metadata?.totalChunks,
+      first_page: chunk.metadata?.firstPage,
+      last_page: chunk.metadata?.lastPage,
+      byte_position: `${chunk.metadata?.byteStart}-${chunk.metadata?.byteEnd}`,
+    },
+    // Embedding would be added by vector DB embedding model
+  })) || [];
+
+console.log(`Prepared ${ragChunks.length} chunks for RAG ingestion`);
+ragChunks.slice(0, 3).forEach((c) => {
+  console.log(`Chunk ${c.id}: ${c.text.substring(0, 50)}...`);
+});
+
+// Example: Send to vector database (e.g., Pinecone, Weaviate, Milvus)
+// const vectorResults = await vectorDb.upsert(ragChunks.map(c => ({
+//   id: c.id,
+//   values: await embedModel.embed(c.text),
+//   metadata: c.metadata,
+// })));
+```
+
+```typescript title="WASM - RAG with Retrieval Context"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  chunking: {
+    maxChars: 768,
+    chunkOverlap: 200,
+    prependHeadingContext: true, // For markdown/docs
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/markdown", config);
+
+// Build retrieval augmented context
+interface RagDocument {
+  id: string;
+  query_text: string;
+  context: string;
+  page: number | null;
+}
+
+const ragDocs: RagDocument[] =
+  result.chunks?.map((chunk, idx) => ({
+    id: `chunk_${idx}`,
+    query_text: chunk.content, // Text to embed and search
+    context: chunk.content, // Full context (includes heading)
+    page: chunk.metadata?.firstPage ?? null,
+  })) || [];
+
+console.log(`Built RAG documents: ${ragDocs.length}`);
+```
--- a/docs/snippets/wasm/advanced/custom-pipeline.ts
+++ b/docs/snippets/wasm/advanced/custom-pipeline.ts
@@ -0,0 +1,50 @@
+import type { ExtractionResult } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+interface ProcessingStep {
+  name: string;
+  process: (result: ExtractionResult) => Promise<ExtractionResult>;
+}
+
+/**
+ * Extracts a document and then feeds the result through each processing step in order.
+ *
+ * The WASM module is initialised first. Every step receives the output of the step
+ * before it, and the output of the last step is returned.
+ */
+async function createExtractionPipeline(
+  steps: ProcessingStep[],
+  bytes: Uint8Array,
+  mimeType: string,
+) {
+  await initWasm();
+
+  let current = await extractBytes(bytes, mimeType);
+
+  for (let position = 0; position < steps.length; position += 1) {
+    const stage = steps[position];
+    console.log(`Executing step: ${stage.name}`);
+    current = await stage.process(current);
+  }
+
+  return current;
+}
+
+// Example step list; each entry pairs a display name with an async transform.
+const pipeline: ProcessingStep[] = [
+  {
+    name: "Text Normalization",
+    // Collapses every whitespace run to a single space and trims both ends.
+    process: async (result) => ({
+      ...result,
+      content: result.content.replace(/\s+/g, " ").trim(),
+    }),
+  },
+  {
+    name: "Language Detection",
+    // Pass-through: returns the result unchanged.
+    process: async (result) => result,
+  },
+  {
+    name: "Chunking",
+    // Pass-through: returns the result unchanged.
+    process: async (result) => result,
+  },
+];
+
+createExtractionPipeline(
+  pipeline,
+  new Uint8Array(await fetch("doc.pdf").then((r) => r.arrayBuffer())),
+  "application/pdf",
+)
+  .then((r) => console.log("Pipeline complete:", r))
+  .catch(console.error);
--- a/docs/snippets/wasm/advanced/error-recovery.ts
+++ b/docs/snippets/wasm/advanced/error-recovery.ts
@@ -0,0 +1,34 @@
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Runs `extractBytes` up to `maxRetries` times, backing off exponentially between attempts.
+ *
+ * @param bytes - Raw document bytes.
+ * @param mimeType - MIME type passed through to `extractBytes`.
+ * @param maxRetries - Total number of attempts (including the first); must be at least 1.
+ * @returns The result of the first successful extraction.
+ * @throws RangeError when `maxRetries` is below 1 or not a number.
+ * @throws Error when every attempt fails; the last underlying error is attached as `cause`.
+ */
+async function extractWithRetry(bytes: Uint8Array, mimeType: string, maxRetries: number = 3) {
+  // Without this guard the loop below never runs and the caller gets a confusing
+  // "failed after 0 attempts: undefined" error for work that was never tried.
+  if (!(maxRetries >= 1)) {
+    throw new RangeError(`maxRetries must be at least 1, received ${maxRetries}`);
+  }
+
+  await initWasm();
+
+  let lastError: Error | null = null;
+
+  for (let attempt = 1; attempt <= maxRetries; attempt++) {
+    try {
+      console.log(`Extraction attempt ${attempt}/${maxRetries}`);
+      const result = await extractBytes(bytes, mimeType);
+      console.log("Extraction successful");
+      return result;
+    } catch (error) {
+      // Normalise non-Error throwables so `.message` is always available.
+      lastError = error instanceof Error ? error : new Error(String(error));
+      console.warn(`Attempt ${attempt} failed:`, lastError.message);
+
+      if (attempt < maxRetries) {
+        // Exponential backoff: 200ms, 400ms, 800ms, ...
+        const delay = 2 ** attempt * 100;
+        console.log(`Retrying in ${delay}ms...`);
+        await new Promise((resolve) => setTimeout(resolve, delay));
+      }
+    }
+  }
+
+  // Keep the underlying failure reachable for callers that need its stack or type.
+  throw Object.assign(
+    new Error(`Extraction failed after ${maxRetries} attempts: ${lastError?.message}`),
+    { cause: lastError },
+  );
+}
+
+extractWithRetry(
+  new Uint8Array(await fetch("doc.pdf").then((r) => r.arrayBuffer())),
+  "application/pdf",
+)
+  .then((r) => console.log("Final result:", r))
+  .catch(console.error);
--- a/docs/snippets/wasm/advanced/keyword_extraction_config.md
+++ b/docs/snippets/wasm/advanced/keyword_extraction_config.md
@@ -0,0 +1,105 @@
+```typescript title="WASM - Keyword Extraction Setup"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Note: Keyword extraction requires the 'keywords' feature,
+// which may not be available in all WASM builds.
+// This example shows the configuration structure.
+
+const config = {
+  // Extraction configuration
+  outputFormat: "markdown",
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Keyword extraction would be performed on the extracted text
+// using external libraries or post-processing
+console.log(`Extracted text: ${result.content.substring(0, 100)}...`);
+
+// Example post-processing to extract keywords
+// (requires external keyword extraction library)
+const keywords = new Set<string>();
+const words = result.content
+  .toLowerCase()
+  .split(/\s+/)
+  .filter((w) => w.length > 4); // Simple heuristic: words > 4 chars
+
+words.forEach((word) => {
+  keywords.add(word);
+});
+
+console.log(`Extracted keywords: ${Array.from(keywords).slice(0, 10).join(", ")}`);
+```
+
+```typescript title="WASM - Keyword Filtering"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  outputFormat: "markdown",
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Common stopwords to exclude
+const stopwords = new Set([
+  "the",
+  "a",
+  "an",
+  "and",
+  "or",
+  "but",
+  "in",
+  "on",
+  "at",
+  "to",
+  "for",
+  "of",
+  "with",
+  "from",
+  "by",
+  "is",
+  "are",
+  "was",
+  "were",
+  "be",
+  "been",
+  "have",
+  "has",
+  "had",
+  "do",
+  "does",
+  "did",
+  "will",
+  "would",
+  "could",
+  "should",
+]);
+
+// Extract and filter keywords
+const text = result.content.toLowerCase();
+const words = text.split(/\s+/);
+const keywordCounts = new Map<string, number>();
+
+// Strip punctuation but keep letters, marks and digits from any script. ASCII-only
+// `\w` would turn "café" into "caf" and erase non-Latin words entirely.
+const nonWordChars = /[^\p{L}\p{M}\p{N}_]/gu;
+for (const word of words) {
+  const cleaned = word.replace(nonWordChars, "");
+  if (cleaned.length > 4 && !stopwords.has(cleaned)) {
+    keywordCounts.set(cleaned, (keywordCounts.get(cleaned) ?? 0) + 1);
+  }
+}
+
+// Get top keywords by frequency
+const topKeywords = Array.from(keywordCounts.entries())
+  .sort((a, b) => b[1] - a[1])
+  .slice(0, 10)
+  .map(([word, count]) => `${word} (${count})`);
+
+console.log(`Top keywords: ${topKeywords.join(", ")}`);
+```
+
+<!-- snippet:syntax-only --> - Native keyword extraction requires the `keywords` feature which may not be compiled into WASM builds.
--- a/docs/snippets/wasm/advanced/keyword_extraction_example.md
+++ b/docs/snippets/wasm/advanced/keyword_extraction_example.md
@@ -0,0 +1,110 @@
+```typescript title="WASM - Extract and Score Keywords"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  outputFormat: "markdown",
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// TF-IDF style keyword extraction (simplified)
+interface Keyword {
+  term: string;
+  frequency: number;
+  uniquePositions: number[];
+  score: number;
+}
+
+const text = result.content.toLowerCase();
+const words = text.split(/[\s\n\t]+/);
+const tokenMap = new Map<string, number[]>();
+
+// Record word positions. Punctuation is stripped with Unicode-aware classes so that
+// accented and non-Latin words survive (ASCII-only `\w` would drop those characters).
+words.forEach((word, idx) => {
+  const cleaned = word.replace(/[^\p{L}\p{M}\p{N}_]/gu, "");
+  if (cleaned.length > 3) {
+    const positions = tokenMap.get(cleaned);
+    if (positions) {
+      positions.push(idx);
+    } else {
+      tokenMap.set(cleaned, [idx]);
+    }
+  }
+});
+
+// Calculate keyword scores
+const keywords: Keyword[] = Array.from(tokenMap.entries()).map(([term, positions]) => ({
+  term,
+  frequency: positions.length,
+  uniquePositions: positions,
+  score: positions.length * Math.log(words.length / positions.length),
+}));
+
+// Sort by score (TF-IDF approximation)
+keywords.sort((a, b) => b.score - a.score);
+
+// Top 15 keywords
+const topKeywords = keywords.slice(0, 15);
+console.log("Top Keywords:");
+topKeywords.forEach((kw) => {
+  console.log(`  ${kw.term}: ${kw.frequency} occurrences (score: ${kw.score.toFixed(2)})`);
+});
+```
+
+```typescript title="WASM - Keyword Context Window"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  outputFormat: "markdown",
+  chunking: {
+    maxChars: 1000,
+    chunkOverlap: 200,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+interface KeywordContext {
+  keyword: string;
+  contexts: string[];
+}
+
+// Find keyword occurrences with surrounding context
+/**
+ * Returns a snippet of surrounding text for every case-insensitive occurrence of `keyword`.
+ *
+ * @param text - Text to search.
+ * @param keyword - Literal term to find; regex metacharacters are matched verbatim.
+ * @param contextWindow - Number of characters to keep on each side of a match.
+ * @returns One snippet per occurrence, in document order; empty when `keyword` is empty.
+ */
+function extractKeywordContexts(
+  text: string,
+  keyword: string,
+  contextWindow: number = 50,
+): string[] {
+  // An empty pattern matches at every position without advancing `lastIndex`,
+  // which would make the loop below spin forever.
+  if (keyword.length === 0) return [];
+
+  // Escape metacharacters so terms such as "c++" or "node.js" are searched literally.
+  const literal = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+  const regex = new RegExp(literal, "gi");
+  const contexts: string[] = [];
+
+  for (let match = regex.exec(text); match !== null; match = regex.exec(text)) {
+    const start = Math.max(0, match.index - contextWindow);
+    // Measure the window from the end of the text that actually matched.
+    const end = Math.min(text.length, match.index + match[0].length + contextWindow);
+    contexts.push(text.substring(start, end));
+  }
+
+  return contexts;
+}
+
+// Extract context for top keywords
+const topKeywords = ["document", "analysis", "results"];
+const keywordContexts: KeywordContext[] = topKeywords.map((kw) => ({
+  keyword: kw,
+  contexts: extractKeywordContexts(result.content, kw, 40),
+}));
+
+keywordContexts.forEach((kc) => {
+  console.log(`\n"${kc.keyword}" appears ${kc.contexts.length} times:`);
+  kc.contexts.slice(0, 2).forEach((ctx, idx) => {
+    console.log(`  [${idx + 1}] ...${ctx}...`);
+  });
+});
+```
+
+<!-- snippet:syntax-only --> - Keyword extraction without native YAKE/RAKE requires manual text processing.
--- a/docs/snippets/wasm/advanced/language_detection_config.md
+++ b/docs/snippets/wasm/advanced/language_detection_config.md
@@ -0,0 +1,84 @@
+```typescript title="WASM - Basic Language Detection"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.75,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+console.log(`Primary language: ${result.metadata?.language}`);
+console.log(`Language confidence: ${result.metadata?.languageConfidence}`);
+console.log(`Detected languages: ${result.detectedLanguages?.join(", ")}`);
+```
+
+```typescript title="WASM - Multi-Language Detection"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.6,
+    detectMultiple: true, // Enable detection of multiple languages
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+console.log(`Primary language: ${result.metadata?.language}`);
+console.log(`Confidence score: ${result.metadata?.languageConfidence?.toFixed(3)}`);
+console.log(`All detected languages: ${result.detectedLanguages?.join(", ")}`);
+
+// Use detected language for downstream processing
+if (result.detectedLanguages && result.detectedLanguages.length > 1) {
+  console.log("Document contains multiple languages - enable multilingual NLP processing");
+}
+```
+
+```typescript title="WASM - Language-Specific Extraction"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Detect a single primary language and run quality processing in the same extraction pass.
+const config = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.8,
+    detectMultiple: false,
+  },
+  // `enableQualityProcessing` is a top-level extraction option, not part of a nested
+  // `quality` object.
+  enableQualityProcessing: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/plain", config);
+
+// Process result based on detected language
+const language = result.metadata?.language;
+console.log(`Processing document in language: ${language}`);
+
+// Example: Apply language-specific rules
+const languageConfig: Record<string, { cleanWhitespace: boolean; normalizeText: boolean }> = {
+  en: { cleanWhitespace: true, normalizeText: true },
+  zh: { cleanWhitespace: false, normalizeText: true }, // Chinese: preserve whitespace patterns
+  ja: { cleanWhitespace: false, normalizeText: false }, // Japanese: preserve as-is
+  ar: { cleanWhitespace: true, normalizeText: true }, // Arabic
+};
+
+const langConfig = languageConfig[language as string] || {
+  cleanWhitespace: true,
+  normalizeText: true,
+};
+console.log(`Language config: ${JSON.stringify(langConfig)}`);
+```
--- a/docs/snippets/wasm/advanced/language_detection_multilingual.md
+++ b/docs/snippets/wasm/advanced/language_detection_multilingual.md
@@ -0,0 +1,171 @@
+```typescript title="WASM - Detect and Process Multilingual Content"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.7,
+    detectMultiple: true,
+  },
+  chunking: {
+    maxChars: 800,
+    chunkOverlap: 200,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+interface MultilingualChunk {
+  index: number;
+  text: string;
+  language: string;
+  confidence: number;
+}
+
+// Detect language for each chunk
+const multilingualChunks: MultilingualChunk[] =
+  result.chunks?.map((chunk, idx) => {
+    // Simple language detection based on character ranges
+    const text = chunk.content;
+    let detectedLang = result.metadata?.language || "en";
+    let confidence = result.metadata?.languageConfidence || 0.5;
+
+    // Check for specific character patterns
+    if (/[一-鿿]/.test(text)) detectedLang = "zh"; // Chinese
+    if (/[぀-ゟ゠-ヿ]/.test(text)) detectedLang = "ja"; // Japanese
+    if (/[가-힯]/.test(text)) detectedLang = "ko"; // Korean
+    if (/[؀-ۿ]/.test(text)) detectedLang = "ar"; // Arabic
+    if (/[Ѐ-ӿ]/.test(text)) detectedLang = "ru"; // Russian
+
+    return {
+      index: idx,
+      text: text.substring(0, 50),
+      language: detectedLang,
+      confidence: confidence,
+    };
+  }) || [];
+
+// Group chunks by language
+const chunksByLanguage = new Map<string, MultilingualChunk[]>();
+multilingualChunks.forEach((chunk) => {
+  if (!chunksByLanguage.has(chunk.language)) {
+    chunksByLanguage.set(chunk.language, []);
+  }
+  chunksByLanguage.get(chunk.language)!.push(chunk);
+});
+
+console.log("Chunks by detected language:");
+chunksByLanguage.forEach((chunks, lang) => {
+  console.log(`  ${lang}: ${chunks.length} chunks`);
+  chunks.slice(0, 2).forEach((c) => {
+    console.log(`    Chunk ${c.index}: "${c.text}..."`);
+  });
+});
+```
+
+```typescript title="WASM - Language-Specific Text Processing"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  languageDetection: {
+    enabled: true,
+    detectMultiple: true,
+  },
+  outputFormat: "markdown",
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/plain", config);
+
+// Language-specific text normalization
+interface LanguageProcessor {
+  normalize: (text: string) => string;
+  splitSentences: (text: string) => string[];
+}
+
+const processors: Record<string, LanguageProcessor> = {
+  en: {
+    normalize: (t) => t.replace(/\s+/g, " ").trim(),
+    splitSentences: (t) => t.split(/[.!?]+/).filter((s) => s.length > 0),
+  },
+  zh: {
+    normalize: (t) => t.replace(/\s+/g, "").trim(), // CJK: no word spacing
+    splitSentences: (t) => t.split(/[。！？]+/).filter((s) => s.length > 0),
+  },
+  ja: {
+    normalize: (t) => t.replace(/\s+/g, "").trim(),
+    splitSentences: (t) => t.split(/[。！？]+/).filter((s) => s.length > 0),
+  },
+  ar: {
+    normalize: (t) => t.replace(/\s+/g, " ").trim(),
+    splitSentences: (t) => t.split(/[.!?،؟]+/).filter((s) => s.length > 0),
+  },
+};
+
+const language = result.metadata?.language || "en";
+const processor = processors[language] || processors["en"];
+
+const normalized = processor.normalize(result.content);
+const sentences = processor.splitSentences(result.content);
+
+console.log(`Language: ${language}`);
+console.log(`Normalized length: ${normalized.length}`);
+console.log(`Detected sentences: ${sentences.length}`);
+sentences.slice(0, 3).forEach((sent, idx) => {
+  console.log(`  [${idx + 1}] ${sent.substring(0, 60)}...`);
+});
+```
+
+```typescript title="WASM - Multilingual Chunking Strategy"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+interface LanguageChunkConfig {
+  maxChars: number;
+  overlap: number;
+}
+
+// Different chunking strategies per language
+const chunkConfigByLanguage: Record<string, LanguageChunkConfig> = {
+  en: { maxChars: 512, overlap: 128 }, // English: word-based chunking
+  zh: { maxChars: 256, overlap: 64 }, // Chinese: smaller chunks due to character density
+  ja: { maxChars: 300, overlap: 75 }, // Japanese: medium chunks
+  ar: { maxChars: 400, overlap: 100 }, // Arabic: larger chunks for context
+  default: { maxChars: 512, overlap: 128 },
+};
+
+// Detect language first
+const languageDetectConfig = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.8,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const langResult = await extractBytes(bytes, "text/plain", languageDetectConfig);
+const detectedLang = langResult.metadata?.language || "en";
+
+// Re-extract with language-specific chunking
+const chunkConfig = chunkConfigByLanguage[detectedLang] || chunkConfigByLanguage["default"];
+const finalConfig = {
+  languageDetection: {
+    enabled: true,
+  },
+  chunking: {
+    maxChars: chunkConfig.maxChars,
+    chunkOverlap: chunkConfig.overlap,
+  },
+};
+
+const finalResult = await extractBytes(bytes, "text/plain", finalConfig);
+console.log(`Language: ${detectedLang}`);
+console.log(`Chunking strategy: maxChars=${chunkConfig.maxChars}, overlap=${chunkConfig.overlap}`);
+console.log(`Generated ${finalResult.chunks?.length} chunks`);
+```
--- a/docs/snippets/wasm/advanced/memory-management.ts
+++ b/docs/snippets/wasm/advanced/memory-management.ts
@@ -0,0 +1,37 @@
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Extracts a list of files in fixed-size batches so only a few documents are in flight at once.
+ *
+ * @param files - Files to extract; each file's `type` is used as its MIME type.
+ * @returns Extraction results in the same order as `files`.
+ */
+async function processLargeDocumentSet(files: File[]) {
+  await initWasm();
+
+  const BATCH_SIZE = 5;
+  const results: Awaited<ReturnType<typeof extractBytes>>[] = [];
+
+  // `gc` exists only when the host exposes it (e.g. Node started with --expose-gc).
+  // Looking it up on `globalThis` avoids the ReferenceError that the Node-only
+  // `global` identifier raises in browsers.
+  const collectGarbage = (globalThis as { gc?: () => void }).gc;
+
+  for (let i = 0; i < files.length; i += BATCH_SIZE) {
+    const batch = files.slice(i, i + BATCH_SIZE);
+
+    console.log(`Processing batch ${Math.floor(i / BATCH_SIZE) + 1}`);
+
+    // Files within a batch are extracted concurrently; batches run one after another.
+    const batchResults = await Promise.all(
+      batch.map(async (file) => {
+        const arrayBuffer = await file.arrayBuffer();
+        const bytes = new Uint8Array(arrayBuffer);
+        return extractBytes(bytes, file.type);
+      }),
+    );
+
+    results.push(...batchResults);
+
+    if (typeof collectGarbage === "function") {
+      console.log("Running garbage collection");
+      collectGarbage();
+    }
+
+    // Pause 100ms before starting the next batch.
+    await new Promise((resolve) => setTimeout(resolve, 100));
+  }
+
+  return results;
+}
+
+processLargeDocumentSet([])
+  .then(() => console.log("Done"))
+  .catch(console.error);
--- a/docs/snippets/wasm/advanced/parallel-extraction.ts
+++ b/docs/snippets/wasm/advanced/parallel-extraction.ts
@@ -0,0 +1,43 @@
+import { detectRuntime, extractBytes, hasWorkers, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Extracts several documents concurrently.
+ *
+ * Uses the Web Worker path when running in a browser that reports worker support;
+ * otherwise every document is extracted in the current context via `Promise.all`.
+ * `mimeTypes[i]` is the MIME type of `documents[i]`.
+ */
+async function extractInParallel(documents: Uint8Array[], mimeTypes: string[]) {
+  await initWasm();
+
+  const _detectedRuntime = detectRuntime();
+  const workersAvailable = hasWorkers();
+  const inBrowser = isBrowser();
+
+  if (!inBrowser || !workersAvailable) {
+    const jobs = documents.map((bytes, index) => extractBytes(bytes, mimeTypes[index]));
+    return Promise.all(jobs);
+  }
+
+  return extractWithWebWorkers(documents, mimeTypes);
+}
+
+/**
+ * Distributes documents round-robin over a pool of Web Workers and collects their replies.
+ *
+ * Each worker handles one document at a time: its next document is posted only after the
+ * previous one has been answered, so a reply cannot be attributed to the wrong document.
+ * All workers are terminated once every extraction has finished or one has failed.
+ *
+ * @param documents - Raw bytes of each document.
+ * @param mimeTypes - MIME type for the document at the same index.
+ * @returns Worker replies in the same order as `documents`.
+ */
+function extractWithWebWorkers(documents: Uint8Array[], mimeTypes: string[]): Promise<unknown[]> {
+  if (documents.length === 0) {
+    return Promise.resolve([]);
+  }
+
+  // Never spawn more workers than there are documents to hand out.
+  const workerCount = Math.max(1, Math.min(navigator.hardwareConcurrency ?? 2, documents.length));
+  const workers = Array.from({ length: workerCount }, () => new Worker("extraction-worker.js"));
+
+  const runOnWorker = (worker: Worker, index: number): Promise<unknown> =>
+    new Promise((resolve, reject) => {
+      // Handlers are installed per request; safe because each worker is used serially.
+      worker.onmessage = (e) => resolve(e.data);
+      worker.onerror = reject;
+      worker.postMessage({ bytes: documents[index], mimeType: mimeTypes[index] });
+    });
+
+  // queues[i] settles when worker i has finished everything assigned to it so far.
+  const queues: Promise<unknown>[] = workers.map(() => Promise.resolve());
+
+  const pending = documents.map((_, index) => {
+    const slot = index % workers.length;
+    const task = queues[slot].then(() => runOnWorker(workers[slot], index));
+    // Keep the queue moving even if this document fails; `task` itself still rejects.
+    queues[slot] = task.catch(() => undefined);
+    return task;
+  });
+
+  return Promise.all(pending).finally(() => {
+    for (const worker of workers) worker.terminate();
+  });
+}
+
+/** Reports whether a global `window` binding is defined. */
+function isBrowser() {
+  const windowKind = typeof window;
+  return windowKind !== "undefined";
+}
+
+extractInParallel([new Uint8Array([1, 2, 3])], ["application/pdf"])
+  .then((results) => console.log(results))
+  .catch(console.error);
--- a/docs/snippets/wasm/advanced/quality_processing_config.md
+++ b/docs/snippets/wasm/advanced/quality_processing_config.md
@@ -0,0 +1,103 @@
+```typescript title="WASM - Enable Quality Processing"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  enableQualityProcessing: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+console.log(`Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
+console.log(`Content: ${result.content.substring(0, 100)}...`);
+
+// Quality score indicates text extraction quality (0.0-1.0).
+// Compare against null/undefined explicitly: a truthiness check would skip the
+// warning for a score of exactly 0, the lowest quality there is.
+if (result.qualityScore != null && result.qualityScore < 0.5) {
+  console.warn("Low quality extraction detected - consider OCR or alternative processing");
+}
+```
+
+```typescript title="WASM - Quality Monitoring"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+interface ExtractionQuality {
+  contentLength: number;
+  qualityScore: number | null;
+  assessedAs: string;
+}
+
+const config = {
+  enableQualityProcessing: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Use explicit null checks so a score of exactly 0 is reported as "low", not "unknown".
+const qualityScore = result.qualityScore ?? null;
+const quality: ExtractionQuality = {
+  contentLength: result.content.length,
+  qualityScore,
+  // Thresholds: above 0.8 is "high", above 0.5 is "medium", anything else is "low".
+  assessedAs:
+    qualityScore === null
+      ? "unknown"
+      : qualityScore > 0.8
+        ? "high"
+        : qualityScore > 0.5
+          ? "medium"
+          : "low",
+};
+
+console.log("Extraction Quality Report:");
+console.log(`  Content size: ${quality.contentLength} bytes`);
+console.log(`  Quality score: ${quality.qualityScore?.toFixed(3) || "N/A"}`);
+console.log(`  Assessment: ${quality.assessedAs}`);
+
+if (quality.assessedAs === "low") {
+  console.log(
+    "  Recommendation: Review raw text for encoding issues or consider alternative extraction",
+  );
+}
+```
+
+```typescript title="WASM - Quality with OCR Fallback"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+/**
+ * Extracts a document with quality processing enabled and labels the result by its score.
+ *
+ * @param bytes - Raw document bytes.
+ * @param mimeType - MIME type of the document.
+ * @returns The extracted content, the quality score (`null` when none was reported), and
+ *   `"degraded-text-extraction"` when the score is below 0.5, otherwise `"text-extraction"`.
+ */
+async function extractWithQualityCheck(
+  bytes: Uint8Array,
+  mimeType: string,
+): Promise<{ content: string; quality: number | null; method: string }> {
+  const config = {
+    enableQualityProcessing: true,
+  };
+
+  const result = await extractBytes(bytes, mimeType, config);
+
+  // Keep "no score" distinct from a genuine score of 0 so that a missing score is
+  // not reported as a degraded extraction.
+  const quality = result.qualityScore ?? null;
+  const isLowQuality = quality !== null && quality < 0.5;
+
+  // If quality is low, consider text extraction failed or use OCR
+  if (isLowQuality) {
+    console.warn("Low quality text extraction - alternative processing recommended");
+  }
+
+  return {
+    content: result.content,
+    quality,
+    method: isLowQuality ? "degraded-text-extraction" : "text-extraction",
+  };
+}
+
+const bytes = new Uint8Array(buffer);
+const extracted = await extractWithQualityCheck(bytes, "application/pdf");
+
+console.log(`Extraction method: ${extracted.method}`);
+console.log(`Quality score: ${extracted.quality?.toFixed(3)}`);
+console.log(`Content preview: ${extracted.content.substring(0, 80)}...`);
+```
--- a/docs/snippets/wasm/advanced/quality_processing_example.md
+++ b/docs/snippets/wasm/advanced/quality_processing_example.md
@@ -0,0 +1,161 @@
+```typescript title="WASM - Assess Text Quality"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+/** Heuristic statistics describing a block of extracted text. */
+interface TextQualityMetrics {
+  contentLength: number;
+  lineCount: number;
+  averageLineLength: number;
+  emptyLineRatio: number;
+  specialCharRatio: number;
+  estimatedLanguages: string[];
+}
+
+/**
+ * Computes simple line, character and script statistics for extracted text.
+ *
+ * @param content - Text to analyse.
+ * @returns Line and character statistics; every count and ratio is 0 for empty input.
+ */
+function assessTextQuality(content: string): TextQualityMetrics {
+  // Split on single line breaks: splitting on runs of newlines would swallow the
+  // blank lines that `emptyLineRatio` is meant to count.
+  const lines = content.length > 0 ? content.split(/\r?\n/) : [];
+  const nonEmptyLines = lines.filter((l) => l.trim().length > 0);
+  const totalChars = content.length;
+  // Letters, marks and digits from any script count as ordinary text; only symbols
+  // outside the listed punctuation are treated as special characters.
+  const specialChars = (content.match(/[^\p{L}\p{M}\p{N}_\s.,:;!?]/gu) ?? []).length;
+
+  // Rough detection by Unicode block; a script hints at a language but does not prove it.
+  const detectedLangs: string[] = [];
+  if (/[a-zA-Z]/.test(content)) detectedLangs.push("en");
+  if (/[\u4e00-\u9fff]/.test(content)) detectedLangs.push("zh");
+  if (/[\u3040-\u309f\u30a0-\u30ff]/.test(content)) detectedLangs.push("ja");
+  if (/[\uac00-\ud7af]/.test(content)) detectedLangs.push("ko");
+
+  const totalLineLength = nonEmptyLines.reduce((sum, l) => sum + l.length, 0);
+
+  // Every division is guarded so empty input yields 0 instead of NaN.
+  return {
+    contentLength: totalChars,
+    lineCount: lines.length,
+    averageLineLength: nonEmptyLines.length > 0 ? totalLineLength / nonEmptyLines.length : 0,
+    emptyLineRatio: lines.length > 0 ? (lines.length - nonEmptyLines.length) / lines.length : 0,
+    specialCharRatio: totalChars > 0 ? specialChars / totalChars : 0,
+    estimatedLanguages: detectedLangs,
+  };
+}
+
+const config = {
+  enableQualityProcessing: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+const metrics = assessTextQuality(result.content);
+
+console.log("Text Quality Assessment:");
+console.log(`  Length: ${metrics.contentLength} characters`);
+console.log(
+  `  Lines: ${metrics.lineCount} total, avg ${metrics.averageLineLength.toFixed(1)} chars/line`,
+);
+console.log(`  Empty lines: ${(metrics.emptyLineRatio * 100).toFixed(1)}%`);
+console.log(`  Special chars: ${(metrics.specialCharRatio * 100).toFixed(2)}%`);
+console.log(`  Languages: ${metrics.estimatedLanguages.join(", ") || "unknown"}`);
+console.log(`  Kreuzberg quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
+```
+
+```typescript title="WASM - Quality-Based Content Filtering"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  enableQualityProcessing: true,
+  chunking: {
+    maxChars: 1000,
+    chunkOverlap: 200,
+    trim: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+interface QualityFilteredChunk {
+  index: number;
+  content: string;
+  quality: number;
+  kept: boolean;
+}
+
+// Filter chunks based on quality heuristics
+const qualityThreshold = 0.3; // Min ratio of visible/non-whitespace content
+const filteredChunks: QualityFilteredChunk[] =
+  result.chunks?.map((chunk, idx) => {
+    const nonWhitespaceRatio = chunk.content.replace(/\s/g, "").length / chunk.content.length;
+    const hasNumbers = /\d/.test(chunk.content);
+    const hasPunctuation = /[.!?,;:]/g.test(chunk.content);
+
+    // Quality score based on content characteristics
+    const contentQuality =
+      (nonWhitespaceRatio + (hasNumbers ? 0.2 : 0) + (hasPunctuation ? 0.1 : 0)) / 2;
+    const kept = contentQuality >= qualityThreshold;
+
+    return {
+      index: idx,
+      content: chunk.content.substring(0, 50),
+      quality: contentQuality,
+      kept,
+    };
+  }) || [];
+
+const keptChunks = filteredChunks.filter((c) => c.kept);
+console.log(`Quality-filtered chunks: ${keptChunks.length}/${filteredChunks.length}`);
+
+keptChunks.slice(0, 3).forEach((c) => {
+  console.log(`  Chunk ${c.index}: quality=${c.quality.toFixed(2)}, "${c.content}..."`);
+});
+```
+
+```typescript title="WASM - Content Encoding Validation"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  enableQualityProcessing: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Validate text encoding quality
+/** Summary of replacement-character damage found in extracted text. */
+interface EncodingValidation {
+  hasInvalidChars: boolean;
+  invalidCharCount: number;
+  replacementRatio: number;
+  estimatedEncoding: string;
+}
+
+/**
+ * Counts Unicode replacement characters (U+FFFD), which decoders emit in place of bytes
+ * they could not decode, and reports how much of the text they make up.
+ *
+ * @param content - Extracted text to inspect.
+ * @returns Replacement-character statistics plus a coarse ASCII/UTF-8 guess.
+ */
+function validateEncoding(content: string): EncodingValidation {
+  // Check for replacement characters (U+FFFD). Written as an escape so the pattern
+  // survives tools that would mangle a literal U+FFFD in the source file.
+  const replacementChars = (content.match(/\uFFFD/g) ?? []).length;
+  const hasInvalidChars = replacementChars > 0;
+  const replacementRatio = hasInvalidChars ? replacementChars / content.length : 0;
+
+  // Any character outside the 7-bit range means the text cannot be plain ASCII.
+  const estimatedEncoding = /[^\x00-\x7F]/.test(content) ? "UTF-8" : "ASCII";
+
+  return {
+    hasInvalidChars,
+    invalidCharCount: replacementChars,
+    replacementRatio,
+    estimatedEncoding,
+  };
+}
+
+const validation = validateEncoding(result.content);
+
+console.log("Content Encoding Validation:");
+console.log(`  Estimated encoding: ${validation.estimatedEncoding}`);
+console.log(`  Invalid characters: ${validation.invalidCharCount}`);
+console.log(`  Replacement ratio: ${(validation.replacementRatio * 100).toFixed(4)}%`);
+console.log(
+  `  Status: ${validation.hasInvalidChars ? "DEGRADED - encoding issues detected" : "OK"}`,
+);
+console.log(`  Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
+```
--- a/docs/snippets/wasm/advanced/streaming-extraction.ts
+++ b/docs/snippets/wasm/advanced/streaming-extraction.ts
@@ -0,0 +1,39 @@
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Downloads a document chunk by chunk, logging progress, then extracts it.
+ *
+ * The received chunks are copied into one contiguous buffer before
+ * `extractBytes` is called.
+ *
+ * @param url - Location of the document to download.
+ * @param mimeType - MIME type passed to `extractBytes`; defaults to PDF.
+ * @returns The extraction result for the downloaded document.
+ * @throws Error when the response status is not OK or the response has no body.
+ */
+async function extractStreamingDocument(url: string, mimeType = "application/pdf") {
+  await initWasm();
+
+  const response = await fetch(url);
+  if (!response.ok) throw new Error(`HTTP ${response.status}`);
+
+  const reader = response.body?.getReader();
+  if (!reader) throw new Error("No response body");
+
+  const chunks: Uint8Array[] = [];
+  let totalSize = 0;
+
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+
+    chunks.push(value);
+    totalSize += value.length;
+    console.log(`Received ${totalSize} bytes`);
+  }
+
+  // Stitch the chunks together now that the total size is known.
+  const fullBuffer = new Uint8Array(totalSize);
+  let offset = 0;
+  for (const chunk of chunks) {
+    fullBuffer.set(chunk, offset);
+    offset += chunk.length;
+  }
+
+  console.log("Document fully received, extracting...");
+  return extractBytes(fullBuffer, mimeType);
+}
+
+extractStreamingDocument("https://example.com/document.pdf")
+  .then((r) => console.log(r))
+  .catch(console.error);
--- a/docs/snippets/wasm/advanced/token_reduction_config.md
+++ b/docs/snippets/wasm/advanced/token_reduction_config.md
@@ -0,0 +1,75 @@
+```typescript title="WASM - Token Reduction Configuration"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  tokenReduction: {
+    mode: "aggressive",
+    preserveImportantWords: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// `result` was extracted with the tokenReduction config above, so this is the reduced text.
+console.log(`Reduced content: ${result.content.length} characters`);
+console.log(`Preview: ${result.content.substring(0, 100)}...`);
+
+// Token reduction modes:
+// - "aggressive": maximum reduction
+// - "balanced": moderate reduction
+// - "conservative": minimal reduction
+```
+
+```typescript title="WASM - Token Reduction Modes"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+interface TokenReductionResult {
+  mode: string;
+  originalSize: number;
+  reducedSize: number;
+  reductionRatio: number;
+  preview: string;
+}
+
+/**
+ * Extracts the same document once per token-reduction mode and reports how
+ * much shorter each output is than an extraction without token reduction.
+ *
+ * @param bytes - Raw document bytes; extracted as "application/pdf".
+ * @returns One entry per mode, in the order conservative, balanced, aggressive.
+ */
+async function compareTokenReductionModes(bytes: Uint8Array): Promise<TokenReductionResult[]> {
+  const modes = ["conservative", "balanced", "aggressive"];
+
+  // Baseline: the same document with no token-reduction config, so every mode is
+  // compared against the unreduced text and in the same unit (characters).
+  const baseline = await extractBytes(bytes, "application/pdf", undefined);
+  const originalSize = baseline.content.length;
+
+  const results: TokenReductionResult[] = [];
+
+  for (const mode of modes) {
+    const config = {
+      tokenReduction: {
+        mode,
+        preserveImportantWords: true,
+      },
+    };
+
+    const result = await extractBytes(bytes, "application/pdf", config);
+    const reducedSize = result.content.length;
+
+    results.push({
+      mode,
+      originalSize,
+      reducedSize,
+      // An empty baseline has nothing to reduce; report 0 instead of NaN.
+      reductionRatio: originalSize > 0 ? 1 - reducedSize / originalSize : 0,
+      preview: result.content.substring(0, 80),
+    });
+  }
+
+  return results;
+}
+
+const bytes = new Uint8Array(buffer);
+const modeComparison = await compareTokenReductionModes(bytes);
+
+console.log("Token Reduction Mode Comparison:");
+modeComparison.forEach((r) => {
+  console.log(`  ${r.mode}:`);
+  console.log(`    Original: ${r.originalSize} chars`);
+  console.log(`    Reduction: ${(r.reductionRatio * 100).toFixed(1)}%`);
+});
+```
--- a/docs/snippets/wasm/advanced/token_reduction_example.md
+++ b/docs/snippets/wasm/advanced/token_reduction_example.md
@@ -0,0 +1,149 @@
+```typescript title="WASM - Token Counting and Cost Estimation"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  tokenReduction: {
+    mode: "balanced",
+    preserveImportantWords: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Simple token counting (approximation: 1 token ≈ 4 chars)
+function estimateTokenCount(text: string): number {
+  return Math.ceil(text.length / 4);
+}
+
+// LLM pricing (example: GPT-4 Turbo)
+interface PricingEstimate {
+  tokenCount: number;
+  inputCost: number;
+  outputCostEstimate: number;
+  totalEstimate: number;
+}
+
+const tokenCount = estimateTokenCount(result.content);
+const inputPricePerToken = 0.00001; // $0.01/1K tokens
+const outputPricePerToken = 0.00003; // $0.03/1K tokens
+
+const costEstimate: PricingEstimate = {
+  tokenCount,
+  inputCost: tokenCount * inputPricePerToken,
+  outputCostEstimate: tokenCount * outputPricePerToken * 0.5, // Assume output is ~50% of input
+  totalEstimate: tokenCount * inputPricePerToken + tokenCount * outputPricePerToken * 0.5,
+};
+
+console.log("Token and Cost Analysis:");
+console.log(`  Estimated tokens: ${costEstimate.tokenCount}`);
+console.log(`  Input cost: $${costEstimate.inputCost.toFixed(6)}`);
+console.log(`  Output cost (est.): $${costEstimate.outputCostEstimate.toFixed(6)}`);
+console.log(`  Total cost (est.): $${costEstimate.totalEstimate.toFixed(6)}`);
+```
+
+```typescript title="WASM - Token Reduction for Context Windows"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+interface ContextWindowFit {
+  contentLength: number;
+  estimatedTokens: number;
+  fitsInWindow: boolean;
+  utilization: number;
+}
+
+/**
+ * Estimates whether `content` fits into a model context window.
+ *
+ * Tokens are approximated as one token per four characters, rounded up.
+ *
+ * @param content - Text to measure.
+ * @param contextWindowSize - Window capacity in tokens (default 4096).
+ * @returns Character length, estimated tokens, whether it fits, and the fraction
+ *   of the window used (`Infinity` when the window size is not positive).
+ */
+function checkContextWindowFit(
+  content: string,
+  contextWindowSize: number = 4096,
+): ContextWindowFit {
+  const estimatedTokens = Math.ceil(content.length / 4);
+  const hasWindow = contextWindowSize > 0;
+  // Content that exactly fills the window still fits, hence `<=`.
+  const fitsInWindow = hasWindow && estimatedTokens <= contextWindowSize;
+  // Avoid 0/0 = NaN for a zero-sized window.
+  const utilization = hasWindow ? estimatedTokens / contextWindowSize : Number.POSITIVE_INFINITY;
+
+  return {
+    contentLength: content.length,
+    estimatedTokens,
+    fitsInWindow,
+    utilization,
+  };
+}
+
+const config = {
+  tokenReduction: {
+    mode: "aggressive", // Use aggressive mode for large documents
+    preserveImportantWords: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+const contextFit = checkContextWindowFit(result.content, 4096);
+
+console.log("Context Window Analysis:");
+console.log(`  Content: ${contextFit.contentLength} characters`);
+console.log(`  Tokens (est.): ${contextFit.estimatedTokens}`);
+console.log(`  Fits in 4K context: ${contextFit.fitsInWindow ? "YES" : "NO"}`);
+console.log(`  Utilization: ${(contextFit.utilization * 100).toFixed(1)}%`);
+
+if (!contextFit.fitsInWindow) {
+  console.log("  Note: Consider chunking or more aggressive token reduction");
+}
+```
+
+```typescript title="WASM - Selective Token Preservation"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  tokenReduction: {
+    mode: "balanced",
+    preserveImportantWords: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Extract important terms manually
+interface ImportantTerm {
+  term: string;
+  frequency: number;
+  importance: number;
+}
+
+/**
+ * Ranks the longer, frequently repeated words of `content`.
+ *
+ * Words are lower-cased, stripped of punctuation, and counted only when longer
+ * than five characters. The score is `ln(frequency) * length / 10`.
+ *
+ * @param content - Text to analyse.
+ * @param threshold - Minimum number of occurrences for a word to be reported.
+ * @returns At most 20 terms, highest importance first.
+ */
+function extractImportantTerms(content: string, threshold: number = 3): ImportantTerm[] {
+  const frequencyMap = new Map<string, number>();
+
+  for (const word of content.toLowerCase().split(/\s+/)) {
+    // Keep letters, combining marks and digits from any script. `\w` is ASCII-only
+    // and would cut accented or non-Latin words into fragments.
+    const cleaned = word.replace(/[^\p{L}\p{M}\p{N}_]/gu, "");
+    if (cleaned.length > 5) {
+      // Only consider longer words
+      frequencyMap.set(cleaned, (frequencyMap.get(cleaned) ?? 0) + 1);
+    }
+  }
+
+  return Array.from(frequencyMap.entries())
+    .filter(([, freq]) => freq >= threshold)
+    .map(([term, freq]) => ({
+      term,
+      frequency: freq,
+      importance: Math.log(freq) * (term.length / 10),
+    }))
+    .sort((a, b) => b.importance - a.importance)
+    .slice(0, 20);
+}
+
+const importantTerms = extractImportantTerms(result.content);
+
+console.log("Important Terms (likely preserved by token reduction):");
+importantTerms.forEach((t) => {
+  console.log(`  "${t.term}": ${t.frequency} occurrences (importance: ${t.importance.toFixed(2)})`);
+});
+```
--- a/docs/snippets/wasm/advanced/worker-extraction.ts
+++ b/docs/snippets/wasm/advanced/worker-extraction.ts
@@ -0,0 +1,57 @@
+/** A queued or in-flight extraction request together with its promise callbacks. */
+interface ExtractionTask {
+  bytes: Uint8Array;
+  mimeType: string;
+  resolve: (value: unknown) => void;
+  reject: (error: unknown) => void;
+}
+
+/**
+ * Dispatches extraction requests to a fixed pool of `Worker` instances.
+ *
+ * Each worker is given one task at a time; further requests wait in a FIFO
+ * queue until a worker becomes idle. The task assigned to a worker is
+ * remembered so that the worker's reply settles the promise of that task.
+ *
+ * NOTE(review): assumes "extraction-worker.js" posts exactly one message (or
+ * raises one error) per request — confirm against the worker script.
+ */
+class ExtractionWorker {
+  private workerPool: Worker[] = [];
+  private taskQueue: ExtractionTask[] = [];
+  /** Workers that currently have no task assigned. */
+  private idleWorkers: Worker[] = [];
+  /** The task each busy worker is processing. */
+  private inFlight = new Map<Worker, ExtractionTask>();
+  private terminated = false;
+
+  /**
+   * @param workerCount - Number of workers to start. Defaults to
+   *   `navigator.hardwareConcurrency` when available, otherwise 2.
+   */
+  constructor(workerCount?: number) {
+    const count =
+      workerCount ?? (typeof navigator !== "undefined" ? (navigator.hardwareConcurrency ?? 2) : 2);
+    for (let i = 0; i < count; i++) {
+      const worker = new Worker("extraction-worker.js");
+      worker.onmessage = (e) => this.handleWorkerMessage(worker, e.data);
+      worker.onerror = (e) => this.handleWorkerError(worker, e);
+      this.workerPool.push(worker);
+      this.idleWorkers.push(worker);
+    }
+  }
+
+  /**
+   * Queues an extraction and resolves with whatever the worker posts back.
+   * Rejects if the worker reports an error or the pool has been terminated.
+   */
+  async extract(bytes: Uint8Array, mimeType: string) {
+    return new Promise((resolve, reject) => {
+      if (this.terminated) {
+        reject(new Error("ExtractionWorker has been terminated"));
+        return;
+      }
+      this.taskQueue.push({ bytes, mimeType, resolve, reject });
+      this.processTasks();
+    });
+  }
+
+  /** Hands queued tasks to idle workers until either list runs out. */
+  private processTasks() {
+    for (;;) {
+      const worker = this.idleWorkers[0];
+      const task = this.taskQueue[0];
+      // Leave the task queued when no worker is free; it is picked up on the next reply.
+      if (!worker || !task) return;
+
+      this.idleWorkers.shift();
+      this.taskQueue.shift();
+      this.inFlight.set(worker, task);
+      worker.postMessage({ bytes: task.bytes, mimeType: task.mimeType });
+    }
+  }
+
+  /** Marks `worker` idle again and settles the task it was running, if any. */
+  private settle(worker: Worker, finish: (task: ExtractionTask) => void) {
+    const task = this.inFlight.get(worker);
+    // Ignore stray events from a worker that has nothing assigned.
+    if (!task) return;
+
+    this.inFlight.delete(worker);
+    this.idleWorkers.push(worker);
+    finish(task);
+    this.processTasks();
+  }
+
+  private handleWorkerMessage(worker: Worker, result: unknown) {
+    this.settle(worker, (task) => task.resolve(result));
+  }
+
+  private handleWorkerError(worker: Worker, error: unknown) {
+    this.settle(worker, (task) => task.reject(error));
+  }
+
+  /** Stops all workers and rejects every task that has not completed. */
+  terminate() {
+    this.terminated = true;
+    this.workerPool.forEach((w) => w.terminate());
+
+    const reason = new Error("ExtractionWorker terminated before the task completed");
+    for (const task of [...this.inFlight.values(), ...this.taskQueue]) {
+      task.reject(reason);
+    }
+    this.inFlight.clear();
+    this.taskQueue = [];
+    this.idleWorkers = [];
+  }
+}
--- a/docs/snippets/wasm/api/batch_extract_bytes_sync.md
+++ b/docs/snippets/wasm/api/batch_extract_bytes_sync.md
@@ -0,0 +1,20 @@
+```typescript title="WASM"
+// WASM has no batch helper; await extractBytes for each input (in parallel via Promise.all).
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const urls = ["document1.pdf", "document2.pdf"];
+
+// Download and extract every document in parallel.
+const results = await Promise.all(
+  urls.map(async (url) => {
+    const resp = await fetch(url);
+    // Fail fast instead of passing an error page's bytes to the extractor.
+    if (!resp.ok) throw new Error(`HTTP ${resp.status} while fetching ${url}`);
+    const bytes = new Uint8Array(await resp.arrayBuffer());
+    return extractBytes(bytes, "application/pdf", undefined);
+  }),
+);
+
+results.forEach((result, i) => {
+  console.log(`Document ${i + 1}: ${result.content.length} characters`);
+});
+```
--- a/docs/snippets/wasm/api/batch_extract_files_sync.md
+++ b/docs/snippets/wasm/api/batch_extract_files_sync.md
@@ -0,0 +1,20 @@
+```typescript title="WASM"
+// WASM has no batch helper; await extractBytes for each file (in parallel via Promise.all).
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const input = document.getElementById("files") as HTMLInputElement;
+const files = Array.from(input.files ?? []);
+
+const results = await Promise.all(
+  files.map(async (file) => {
+    const bytes = new Uint8Array(await file.arrayBuffer());
+    return extractBytes(bytes, file.type || "application/pdf", undefined);
+  }),
+);
+
+results.forEach((result, i) => {
+  console.log(`File ${i + 1}: ${result.content.length} characters`);
+});
+```
--- a/docs/snippets/wasm/api/client_chunk_text.md
+++ b/docs/snippets/wasm/api/client_chunk_text.md
@@ -0,0 +1,28 @@
+```typescript title="WASM"
+// HTTP client approach for chunking text via the REST API
+// Useful in browsers where WASM extraction is called server-side
+
+const text = "This is a long document that needs to be split into semantic chunks.";
+
+const response = await fetch("http://localhost:8000/chunk", {
+  method: "POST",
+  headers: { "Content-Type": "application/json" },
+  body: JSON.stringify({
+    text,
+    chunker_type: "text",
+    config: {
+      chunking: {
+        strategy: "semantic",
+        max_chunk_size: 512,
+        overlap: 50,
+      },
+    },
+  }),
+});
+
+// Surface HTTP failures explicitly; an error response carries no `chunks` and
+// would otherwise be reported as "Created 0 chunks".
+if (!response.ok) {
+  throw new Error(`Chunk request failed: HTTP ${response.status}`);
+}
+
+const result = await response.json();
+console.log(`Created ${result.chunks?.length ?? 0} chunks`);
+result.chunks?.forEach((chunk) => {
+  console.log(`Chunk: ${chunk.content.substring(0, 50)}...`);
+});
+```
--- a/docs/snippets/wasm/api/client_extract_single_file.md
+++ b/docs/snippets/wasm/api/client_extract_single_file.md
@@ -0,0 +1,22 @@
+```typescript title="WASM"
+// HTTP client approach for file extraction via multipart upload
+// Use this when uploading files from a browser form
+
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+
+if (file) {
+  // Send the file as multipart form data together with its MIME type.
+  const formData = new FormData();
+  formData.append("file", file);
+  formData.append("mime_type", file.type || "application/octet-stream");
+
+  const response = await fetch("http://localhost:8000/extract/file", {
+    method: "POST",
+    body: formData,
+  });
+
+  // An error response has no `content`; stop here rather than fail on `.length` below.
+  if (!response.ok) {
+    throw new Error(`Extraction request failed: HTTP ${response.status}`);
+  }
+
+  const result = await response.json();
+  console.log(`Extracted ${result.content.length} characters`);
+  console.log(`Title: ${result.metadata?.title ?? "Unknown"}`);
+}
+```
--- a/docs/snippets/wasm/api/combining_all_features.md
+++ b/docs/snippets/wasm/api/combining_all_features.md
@@ -0,0 +1,69 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+
+if (file) {
+  const bytes = new Uint8Array(await file.arrayBuffer());
+
+  // Build a comprehensive extraction config
+  // NOTE(review): the keys below are snake_case, while other WASM snippets in these
+  // docs pass camelCase keys (e.g. `enableQualityProcessing`,
+  // `includeDocumentStructure`, `maxChars`, `chunkOverlap`) — confirm which spelling
+  // the binding accepts.
+  const config = {
+    use_cache: true,
+    enable_quality_processing: true,
+    output_format: "markdown",
+    include_document_structure: true,
+
+    // Chunking configuration
+    // NOTE(review): other snippets size chunks with `maxChars` / `chunkOverlap` — confirm
+    // that `max_chunk_size` / `overlap` are recognised here.
+    chunking: {
+      strategy: "semantic",
+      max_chunk_size: 1024,
+      overlap: 100,
+    },
+
+    // Image extraction configuration
+    images: {
+      extract_images: true,
+      extract_base64: false,
+      extract_raw_bytes: false,
+    },
+
+    // OCR configuration
+    // NOTE(review): other WASM snippets use backend "tesseract-wasm" and a single
+    // `language` string — confirm "tesseract" / `languages` are valid here.
+    ocr: {
+      backend: "tesseract",
+      languages: ["eng"],
+      enabled: true,
+    },
+
+    // HTML-specific extraction options
+    // NOTE(review): a plain string, whereas `pdf_options` below is an object — confirm
+    // the expected shape.
+    html_options: "article, main, .content",
+
+    // PDF-specific options
+    pdf_options: {
+      ocr_strategy: "auto",
+      preserve_images: true,
+    },
+
+    // Security limits
+    security_limits: {
+      max_archive_size: 524288000, // 500 * 1024 * 1024 bytes
+      max_file_count: 10000,
+      max_compression_ratio: 100,
+    },
+  };
+
+  try {
+    const result = await extractBytes(bytes, file.type || "application/octet-stream", config);
+    console.log(`Content: ${result.content.substring(0, 100)}...`);
+    console.log(`Language: ${result.metadata?.language ?? "Unknown"}`);
+    console.log(`Chunks: ${result.chunks?.length ?? 0}`);
+    if (result.images?.length) {
+      console.log(`Images: ${result.images.length}`);
+    }
+  } catch (err) {
+    console.error("Extraction failed:", err);
+  }
+}
+```
--- a/docs/snippets/wasm/api/error_handling.md
+++ b/docs/snippets/wasm/api/error_handling.md
@@ -0,0 +1,19 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+
+if (file) {
+  try {
+    const bytes = new Uint8Array(await file.arrayBuffer());
+    const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
+    console.log(`Extracted: ${result.content.length} characters`);
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    console.error("Extraction failed:", message);
+  }
+}
+```
--- a/docs/snippets/wasm/api/error_handling_extract.md
+++ b/docs/snippets/wasm/api/error_handling_extract.md
@@ -0,0 +1,40 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Note: WASM has no native batch API; use Promise.all with per-item error handling
+const files = document.getElementById("files") as HTMLInputElement;
+const fileList = files.files || [];
+
+// Extract multiple files concurrently (simulated batch)
+// Extract every selected file concurrently. Each promise resolves to a tagged
+// outcome instead of rejecting, so one bad file cannot fail the whole Promise.all.
+const extractionPromises = Array.from(fileList).map(async (file) => {
+  try {
+    const bytes = new Uint8Array(await file.arrayBuffer());
+    const result = await extractBytes(bytes, file.type || "application/octet-stream", undefined);
+    // `as const` keeps `success` a literal type, so checking it narrows to this shape.
+    return { file: file.name, success: true as const, result };
+  } catch (err) {
+    return {
+      file: file.name,
+      success: false as const,
+      error: err instanceof Error ? err.message : String(err),
+    };
+  }
+});
+
+const results = await Promise.all(extractionPromises);
+
+// Process results with per-item error handling
+results.forEach((item) => {
+  if (item.success) {
+    console.log(`✓ ${item.file}: ${item.result.content.length} characters`);
+  } else {
+    console.error(`✗ ${item.file}: ${item.error}`);
+  }
+});
+
+// Summary
+const succeeded = results.filter((r) => r.success).length;
+const failed = results.filter((r) => !r.success).length;
+console.log(`Extracted ${succeeded}/${results.length} files (${failed} errors)`);
+```
--- a/docs/snippets/wasm/api/error_handling_wasm.md
+++ b/docs/snippets/wasm/api/error_handling_wasm.md
@@ -0,0 +1,17 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const response = await fetch("document.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+try {
+  const result = await extractBytes(data, "application/pdf", undefined);
+  console.log(`Success: ${result.content.length} characters`);
+} catch (error) {
+  // A thrown value is not guaranteed to be an Error instance; fall back to
+  // String() so such failures are still reported instead of silently dropped.
+  const message = error instanceof Error ? error.message : String(error);
+  console.error("Extraction error:", message);
+}
+```
--- a/docs/snippets/wasm/api/extract_bytes_async.md
+++ b/docs/snippets/wasm/api/extract_bytes_async.md
@@ -0,0 +1,12 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const response = await fetch("document.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+const result = await extractBytes(data, "application/pdf", undefined);
+console.log(`Extracted: ${result.content.length} characters`);
+console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
+```
--- a/docs/snippets/wasm/api/extract_bytes_sync.md
+++ b/docs/snippets/wasm/api/extract_bytes_sync.md
@@ -0,0 +1,12 @@
+```typescript title="WASM"
+// WASM exposes only async extractBytes; await it from any async context.
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const response = await fetch("document.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+const result = await extractBytes(data, "application/pdf", undefined);
+console.log(result.content);
+```
--- a/docs/snippets/wasm/api/extract_file_async.md
+++ b/docs/snippets/wasm/api/extract_file_async.md
@@ -0,0 +1,14 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+if (file) {
+  const bytes = new Uint8Array(await file.arrayBuffer());
+  const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
+  console.log(`Content length: ${result.content.length} characters`);
+  console.log(`Tables: ${result.tables?.length ?? 0}`);
+}
+```
--- a/docs/snippets/wasm/api/extract_file_sync.md
+++ b/docs/snippets/wasm/api/extract_file_sync.md
@@ -0,0 +1,15 @@
+```typescript title="WASM"
+// WASM exposes only async extraction. Read the file as bytes and call extractBytes.
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+if (file) {
+  const bytes = new Uint8Array(await file.arrayBuffer());
+  const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
+  console.log(result.content);
+  console.log(`Tables: ${result.tables?.length ?? 0}`);
+}
+```
--- a/docs/snippets/wasm/api/tables.md
+++ b/docs/snippets/wasm/api/tables.md
@@ -0,0 +1,21 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+
+if (file) {
+  const bytes = new Uint8Array(await file.arrayBuffer());
+  const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
+
+  result.tables?.forEach((table) => {
+    console.log(`Table with ${table.cells?.length ?? 0} rows`);
+    if (table.markdown) {
+      console.log(table.markdown);
+    }
+    table.cells?.forEach((row) => console.log(row.join(" | ")));
+  });
+}
+```
--- a/docs/snippets/wasm/cache/ocr-cache.ts
+++ b/docs/snippets/wasm/cache/ocr-cache.ts
@@ -0,0 +1,30 @@
+import { extractBytes, initWasm, TesseractWasmBackend } from "@kreuzberg/wasm";
+
+/**
+ * Runs OCR on two images back to back and times each call with
+ * `console.time`, after initialising a `TesseractWasmBackend`.
+ */
+async function demonstrateOcrCaching() {
+  // Fetches an image and returns its bytes.
+  const loadImage = async (path: string) => {
+    const response = await fetch(path);
+    return new Uint8Array(await response.arrayBuffer());
+  };
+
+  await initWasm();
+
+  const backend = new TesseractWasmBackend();
+  await backend.initialize();
+  console.log("Tesseract WASM backend loaded - models cached");
+
+  const firstPage = await loadImage("page1.png");
+  console.time("First OCR (with model load)");
+  await extractBytes(firstPage, "image/png", {
+    ocr: { backend: "tesseract-wasm", language: "eng" },
+  });
+  console.timeEnd("First OCR (with model load)");
+
+  console.log("Model cached in memory");
+
+  const secondPage = await loadImage("page2.png");
+  console.time("Second OCR (model cached)");
+  await extractBytes(secondPage, "image/png", {
+    ocr: { backend: "tesseract-wasm", language: "eng" },
+  });
+  console.timeEnd("Second OCR (model cached)");
+}
+
+demonstrateOcrCaching().catch(console.error);
--- a/docs/snippets/wasm/cache/result-caching.ts
+++ b/docs/snippets/wasm/cache/result-caching.ts
@@ -0,0 +1,57 @@
+import type { ExtractionResult } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * In-memory cache of extraction results keyed by the SHA-256 of the file contents.
+ */
+class ExtractionCache {
+  private cache = new Map<string, ExtractionResult>();
+  // WeakMap: memoised hashes must not keep File objects alive once callers drop them.
+  private fileHashes = new WeakMap<File, string>();
+
+  /** Returns the hex SHA-256 digest of the file contents, memoised per File object. */
+  async getHash(file: File): Promise<string> {
+    const known = this.fileHashes.get(file);
+    if (known !== undefined) {
+      return known;
+    }
+
+    return this.hashBuffer(file, await file.arrayBuffer());
+  }
+
+  /**
+   * Returns the cached result for files with identical contents, extracting
+   * (and caching) on a miss.
+   */
+  async extract(file: File): Promise<ExtractionResult> {
+    // Read the file at most once: the same buffer feeds both hashing and extraction.
+    let buffer: ArrayBuffer | undefined;
+    let hash = this.fileHashes.get(file);
+    if (hash === undefined) {
+      buffer = await file.arrayBuffer();
+      hash = await this.hashBuffer(file, buffer);
+    }
+
+    const cached = this.cache.get(hash);
+    if (cached !== undefined) {
+      console.log("Cache hit for", file.name);
+      return cached;
+    }
+
+    console.log("Cache miss for", file.name);
+    const bytes = new Uint8Array(buffer ?? (await file.arrayBuffer()));
+    // Files without a reported type fall back to a generic MIME type.
+    const result = await extractBytes(bytes, file.type || "application/octet-stream");
+
+    this.cache.set(hash, result);
+    return result;
+  }
+
+  /** Drops all cached results and memoised hashes. */
+  clear() {
+    this.cache.clear();
+    // WeakMap has no clear(); replace it instead.
+    this.fileHashes = new WeakMap<File, string>();
+  }
+
+  /** Number of cached extraction results. */
+  getSize() {
+    return this.cache.size;
+  }
+
+  /** Hashes `buffer` with SHA-256, memoises the hex digest for `file`, and returns it. */
+  private async hashBuffer(file: File, buffer: ArrayBuffer): Promise<string> {
+    const digest = await crypto.subtle.digest("SHA-256", buffer);
+    const hashStr = Array.from(new Uint8Array(digest))
+      .map((b) => b.toString(16).padStart(2, "0"))
+      .join("");
+
+    this.fileHashes.set(file, hashStr);
+    return hashStr;
+  }
+}
+
+/** Extracts one (empty) file through an `ExtractionCache` and logs the cache size. */
+async function demonstrateCaching() {
+  await initWasm();
+
+  const extractionCache = new ExtractionCache();
+  const sample = new File([], "test.pdf");
+
+  await extractionCache.extract(sample);
+  console.log("Cache size:", extractionCache.getSize());
+}
+
+demonstrateCaching().catch(console.error);
--- a/docs/snippets/wasm/cache/session-storage.ts
+++ b/docs/snippets/wasm/cache/session-storage.ts
@@ -0,0 +1,43 @@
+import type { ExtractionResult } from "@kreuzberg/wasm";
+import { extractFromFile, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Extracts `file`, reusing a JSON copy of the result from `sessionStorage`
+ * when one exists for the same file.
+ *
+ * @param file - File to extract.
+ * @returns The cached or freshly extracted result.
+ */
+async function _cacheResultInSessionStorage(file: File): Promise<ExtractionResult> {
+  await initWasm();
+
+  // Name and size alone collide for an edited file of the same length, so the
+  // modification time is part of the key as well.
+  const cacheKey = `extraction_${file.name}_${file.size}_${file.lastModified}`;
+
+  const cached = sessionStorage.getItem(cacheKey);
+  if (cached !== null) {
+    try {
+      const parsed = JSON.parse(cached) as ExtractionResult;
+      console.log("Loading from session storage");
+      return parsed;
+    } catch {
+      // Unparseable entry: drop it and fall through to a fresh extraction.
+      sessionStorage.removeItem(cacheKey);
+    }
+  }
+
+  console.log("Extracting and caching result");
+  const result = await extractFromFile(file);
+
+  // Caching is best-effort: a storage failure must not fail the extraction.
+  try {
+    sessionStorage.setItem(cacheKey, JSON.stringify(result));
+  } catch (error) {
+    if (error instanceof Error && error.name === "QuotaExceededError") {
+      console.warn("Session storage full, skipping cache");
+    } else {
+      console.warn("Could not cache extraction result:", error);
+    }
+  }
+
+  return result;
+}
+
+/** Removes every `sessionStorage` entry whose key starts with "extraction_" and logs the count. */
+async function clearExtractionCache() {
+  // Snapshot the matching keys first, then delete them.
+  const staleKeys = Object.keys(sessionStorage).filter((name) => name.startsWith("extraction_"));
+
+  staleKeys.forEach((name) => sessionStorage.removeItem(name));
+
+  console.log(`Cleared ${staleKeys.length} cached results`);
+}
+
+clearExtractionCache();
--- a/docs/snippets/wasm/config/advanced_config.md
+++ b/docs/snippets/wasm/config/advanced_config.md
@@ -0,0 +1,35 @@
+```typescript title="WASM"
+import { extractFromFile, initWasm } from "@kreuzberg/wasm";
+
+await initWasm();
+
+// All keys are camelCase, matching the nested `ocr` / `chunking` options below.
+const config = {
+  ocr: {
+    backend: "tesseract-wasm",
+    language: "eng",
+  },
+  chunking: {
+    maxChars: 1000,
+    chunkOverlap: 100,
+  },
+  // NOTE(review): replaces `enable_language_detection` — confirm the key and shape
+  // against the ExtractionConfig type; `result.detectedLanguages` depends on it.
+  languageDetection: {
+    enabled: true,
+  },
+  // Replaces `enable_quality`, which matched neither `enableQualityProcessing` nor
+  // `enable_quality_processing` as used by the other configuration snippets.
+  enableQualityProcessing: true,
+};
+
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+
+if (file) {
+  const result = await extractFromFile(file, file.type, config);
+
+  if (result.chunks) {
+    for (const chunk of result.chunks) {
+      console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
+    }
+  }
+
+  if (result.detectedLanguages) {
+    console.log(`Languages: ${result.detectedLanguages.join(", ")}`);
+  }
+}
+```
--- a/docs/snippets/wasm/config/basic-config.ts
+++ b/docs/snippets/wasm/config/basic-config.ts
@@ -0,0 +1,29 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Extracts "document.pdf" with OCR, image extraction and chunking configured,
+ * then logs the length of the extracted content.
+ */
+async function extractWithConfig() {
+  await initWasm();
+
+  const response = await fetch("document.pdf");
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  const ocr = { backend: "tesseract-wasm", language: "eng" };
+  const images = { extractImages: true, targetDpi: 200 };
+  const chunking = { maxChars: 1000, chunkOverlap: 100 };
+  const config: ExtractionConfig = { ocr, images, chunking };
+
+  const { content } = await extractBytes(bytes, "application/pdf", config);
+  console.log("Extraction complete");
+  console.log("Content length:", content.length);
+}
+
+extractWithConfig().catch(console.error);
--- a/docs/snippets/wasm/config/chunking-config.ts
+++ b/docs/snippets/wasm/config/chunking-config.ts
@@ -0,0 +1,66 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Extracts "book.pdf" with chunking enabled and logs the character range and
+ * a preview of the first three chunks.
+ */
+async function extractWithChunking() {
+  await initWasm();
+
+  const bytes = new Uint8Array(await fetch("book.pdf").then((r) => r.arrayBuffer()));
+
+  const config: ExtractionConfig = {
+    chunking: {
+      maxChars: 800,
+      chunkOverlap: 150,
+      splitOnNewlines: true,
+      splitOnSentences: true,
+    },
+  };
+
+  const result = await extractBytes(bytes, "application/pdf", config);
+
+  if (result.chunks) {
+    console.log(`Total chunks: ${result.chunks.length}`);
+
+    // Preview only the first three chunks.
+    result.chunks.slice(0, 3).forEach((chunk, i) => {
+      console.log(`\nChunk ${i}:`);
+      console.log(`Chars: ${chunk.metadata.charStart}-${chunk.metadata.charEnd}`);
+      console.log(`Content: ${chunk.content.substring(0, 100)}...`);
+    });
+  }
+}
+
+/**
+ * Extracts "document.md" with the markdown chunker and `prependHeadingContext`
+ * enabled, then logs a preview of the first three chunks.
+ */
+async function extractWithPrependHeadingContext() {
+  await initWasm();
+
+  const response = await fetch("document.md");
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  const chunking = {
+    chunkerType: "markdown",
+    maxChars: 800,
+    prependHeadingContext: true,
+  };
+  const config: ExtractionConfig = { chunking };
+
+  const { chunks } = await extractBytes(bytes, "text/markdown", config);
+  if (!chunks) return;
+
+  console.log(`Total chunks: ${chunks.length}`);
+
+  // Each chunk's content is prefixed with its heading breadcrumb
+  for (const [i, chunk] of chunks.slice(0, 3).entries()) {
+    console.log(`\nChunk ${i}: ${chunk.content.substring(0, 100)}...`);
+  }
+}
+
+extractWithChunking().catch(console.error);
+extractWithPrependHeadingContext().catch(console.error);
--- a/docs/snippets/wasm/config/chunking_config.md
+++ b/docs/snippets/wasm/config/chunking_config.md
@@ -0,0 +1,71 @@
+```typescript title="WASM"
+import { initWasm, extractBytes } from "@kreuzberg/wasm";
+
+await initWasm();
+
+const config = {
+  chunking: {
+    maxChars: 1000,
+    chunkOverlap: 100,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+result.chunks?.forEach((chunk, idx) => {
+  console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
+  // Metadata is read in camelCase, like `headingContext` / `firstPage` in the other
+  // chunking snippets. NOTE(review): confirm `tokenCount` against the ChunkMetadata type.
+  console.log(`Tokens: ${chunk.metadata?.tokenCount ?? "n/a"}`);
+});
+```
+
+```typescript title="WASM - Markdown with Heading Context"
+import { initWasm, extractBytes } from "@kreuzberg/wasm";
+
+await initWasm();
+
+const config = {
+  chunking: {
+    chunkerType: "markdown",
+    maxChars: 2000,
+    // Note: Token-based sizing is not available in WASM builds.
+    // Use character-based sizing instead.
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/markdown", config);
+
+result.chunks?.forEach((chunk, idx) => {
+  console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
+
+  if (chunk.metadata?.headingContext?.headings) {
+    console.log("Headings:");
+    chunk.metadata.headingContext.headings.forEach((h) => {
+      console.log(`  Level ${h.level}: ${h.text}`);
+    });
+  }
+});
+```
+
+```typescript title="WASM - Prepend Heading Context"
+import { initWasm, extractBytes } from "@kreuzberg/wasm";
+
+await initWasm();
+
+const config = {
+  chunking: {
+    chunkerType: "markdown",
+    maxChars: 2000,
+    prependHeadingContext: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/markdown", config);
+
+result.chunks?.forEach((chunk, idx) => {
+  // Each chunk's content is prefixed with its heading breadcrumb
+  console.log(`Chunk ${idx}: ${chunk.content.substring(0, 80)}...`);
+});
+```
--- a/docs/snippets/wasm/config/combined-config.ts
+++ b/docs/snippets/wasm/config/combined-config.ts
@@ -0,0 +1,34 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Extracts "complex.pdf" with OCR (German), chunking and image extraction
+ * configured, then logs a summary of what the result contains.
+ */
+async function extractWithFullConfig() {
+  await initWasm();
+
+  const bytes = new Uint8Array(await fetch("complex.pdf").then((r) => r.arrayBuffer()));
+
+  const config: ExtractionConfig = {
+    ocr: {
+      backend: "tesseract-wasm",
+      language: "deu",
+    },
+    chunking: {
+      maxChars: 1000,
+      chunkOverlap: 200,
+    },
+    images: {
+      extractImages: true,
+      targetDpi: 200,
+    },
+  };
+
+  const result = await extractBytes(bytes, "application/pdf", config);
+
+  console.log("=== Extraction Results ===");
+  console.log(`Content: ${result.content.length} chars`);
+  console.log(`Chunks: ${result.chunks?.length ?? 0}`);
+  console.log(`Images: ${result.images?.length ?? 0}`);
+  // Guarded like chunks/images: the other snippets treat `tables` as possibly absent.
+  console.log(`Tables: ${result.tables?.length ?? 0}`);
+  // Avoid printing the literal text "undefined" when no languages were detected.
+  console.log(`Languages: ${result.detectedLanguages?.join(", ") ?? "none detected"}`);
+}
+
+extractWithFullConfig().catch(console.error);
--- a/docs/snippets/wasm/config/conditional-config.ts
+++ b/docs/snippets/wasm/config/conditional-config.ts
@@ -0,0 +1,34 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Picks extraction options from `fileSize` (bytes) and extracts "document.pdf":
+ * chunking above 10 MiB, image extraction below 1 MiB, and an OCR `enabled`
+ * flag that is true only below 50 MiB.
+ */
+async function extractWithConditionalConfig(fileSize: number) {
+  await initWasm();
+
+  const mib = 1024 * 1024;
+  const settings: ExtractionConfig = {};
+
+  // Chunk only when the input is larger than 10 MiB.
+  if (fileSize > 10 * mib) {
+    settings.chunking = { maxChars: 500, chunkOverlap: 50 };
+  }
+
+  // Extract images only when the input is smaller than 1 MiB.
+  if (fileSize < mib) {
+    settings.images = { extractImages: true, targetDpi: 300 };
+  }
+
+  // The OCR flag is false for inputs of 50 MiB or more.
+  settings.ocr = { enabled: fileSize < 50 * mib };
+
+  const response = await fetch("document.pdf");
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  return extractBytes(bytes, "application/pdf", settings);
+}
+
+extractWithConditionalConfig(5 * 1024 * 1024).then(() => console.log("Done")).catch(console.error);
--- a/docs/snippets/wasm/config/config_basic.md
+++ b/docs/snippets/wasm/config/config_basic.md
@@ -0,0 +1,19 @@
+```typescript title="WASM"
+import { initWasm, extractBytes } from "@kreuzberg/wasm";
+
+await initWasm();
+
+const config = {
+  ocr: {
+    backend: "tesseract-wasm",
+    language: "eng",
+  },
+  images: {
+    extractImages: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+console.log(result.content);
+```
--- a/docs/snippets/wasm/config/config_discover.md
+++ b/docs/snippets/wasm/config/config_discover.md
@@ -0,0 +1,18 @@
+```typescript title="WASM"
+import { initWasm, extractBytes } from "@kreuzberg/wasm";
+
+await initWasm();
+
+// Option keys are written in camelCase, matching the other @kreuzberg/wasm
+// configuration snippets (e.g. `extractImages`, `maxChars`).
+const config = {
+  useCache: true,
+  enableQualityProcessing: true,
+  ocr: { backend: "tesseract-wasm", language: "eng" },
+};
+
+// `buffer` is expected to be supplied by the surrounding page or script.
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+console.log(result.content);
+```
--- a/docs/snippets/wasm/config/config_ocr.md
+++ b/docs/snippets/wasm/config/config_ocr.md
@@ -0,0 +1,18 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("scanned.pdf").then((r) => r.arrayBuffer()));
+
+const config = {
+  ocr: {
+    backend: "tesseract",
+    language: "eng",
+  },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Content length: ${result.content.length}`);
+console.log(`Tables detected: ${result.tables?.length || 0}`);
+```
--- a/docs/snippets/wasm/config/config_programmatic.md
+++ b/docs/snippets/wasm/config/config_programmatic.md
@@ -0,0 +1,26 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+const config = {
+  use_cache: true,
+  ocr: {
+    backend: "tesseract",
+    language: "eng+deu",
+    tesseract_config: {
+      psm: 6,
+    },
+  },
+  chunking: {
+    max_characters: 1000,
+    overlap: 200,
+  },
+  enable_quality_processing: true,
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Content length: ${result.content.length}`);
+```
--- a/docs/snippets/wasm/config/document_structure_config.md
+++ b/docs/snippets/wasm/config/document_structure_config.md
@@ -0,0 +1,15 @@
+```typescript title="Document Structure Config (WASM)"
+import { extractBytes } from "kreuzberg-wasm";
+
+const config = { includeDocumentStructure: true };
+
+// extractBytes returns a promise; without `await`, `result` would be the
+// pending promise and `result.document` would always be undefined.
+const result = await extractBytes(fileBuffer, "application/pdf", config);
+
+if (result.document) {
+  for (const node of result.document.nodes) {
+    console.log(`[${node.content.nodeType}]`);
+  }
+}
+```
--- a/docs/snippets/wasm/config/element_based_output.md
+++ b/docs/snippets/wasm/config/element_based_output.md
@@ -0,0 +1,35 @@
+```typescript title="Element-Based Output (WASM)"
+import { extractFileSync, ExtractionConfig } from "kreuzberg-wasm";
+
+// Configure element-based output
+const config: ExtractionConfig = {
+  outputFormat: "element_based",
+};
+
+// Extract document
+const result = extractFileSync(fileBuffer, "application/pdf", config);
+
+// Access elements
+for (const element of result.elements) {
+  console.log(`Type: ${element.elementType}`);
+  console.log(`Text: ${element.text.slice(0, 100)}`);
+
+  if (element.metadata.pageNumber) {
+    console.log(`Page: ${element.metadata.pageNumber}`);
+  }
+
+  if (element.metadata.coordinates) {
+    const coords = element.metadata.coordinates;
+    console.log(`Coords: (${coords.left}, ${coords.top}) - (${coords.right}, ${coords.bottom})`);
+  }
+
+  console.log("---");
+}
+
+// Filter by element type
+const titles = result.elements.filter((e) => e.elementType === "title");
+for (const title of titles) {
+  const level = title.metadata.additional?.level || "unknown";
+  console.log(`[${level}] ${title.text}`);
+}
+```
--- a/docs/snippets/wasm/config/hierarchy_config.ts
+++ b/docs/snippets/wasm/config/hierarchy_config.ts
@@ -0,0 +1,105 @@
+import {
+  type ExtractionConfig,
+  type HierarchyConfig,
+  Kreuzberg,
+  type PdfConfig,
+} from "kreuzberg-wasm";
+
+// Example 1: Basic hierarchy extraction
+// Enabled with default kClusters=6 for standard H1-H6 heading hierarchy.
+// Extract bounding box information for spatial layout awareness.
+const hierarchyConfigBasic: HierarchyConfig = {
+  enabled: true,
+  kClusters: 6, // Default: creates 6 font size clusters (H1-H6 structure)
+  includeBbox: true, // Include bounding box coordinates
+  ocrCoverageThreshold: undefined, // No OCR coverage threshold
+};
+
+const pdfConfigBasic: PdfConfig = {
+  hierarchy: hierarchyConfigBasic,
+};
+
+const extractionConfigBasic: ExtractionConfig = {
+  pdfOptions: pdfConfigBasic,
+};
+
+// const kreuzberg = new Kreuzberg(extractionConfigBasic);
+// const result = await kreuzberg.extractFile("document.pdf");
+
+// Example 2: Custom kClusters for minimal structure
+// Use 3 clusters for simpler hierarchy with minimal structure.
+// Useful when you only need major section divisions (Main, Subsection, Detail).
+const hierarchyConfigMinimal: HierarchyConfig = {
+  enabled: true,
+  kClusters: 3, // Minimal clustering: just 3 levels
+  includeBbox: true,
+  ocrCoverageThreshold: undefined,
+};
+
+const pdfConfigMinimal: PdfConfig = {
+  hierarchy: hierarchyConfigMinimal,
+};
+
+const _extractionConfigMinimal: ExtractionConfig = {
+  pdfOptions: pdfConfigMinimal,
+};
+
+// const result = await kreuzberg.extractFile("document.pdf");
+
+// Example 3: With OCR coverage threshold
+// Trigger OCR if less than 50% of text has font data.
+// Useful for documents with mixed digital and scanned content.
+const hierarchyConfigOcr: HierarchyConfig = {
+  enabled: true,
+  kClusters: 6,
+  includeBbox: true,
+  ocrCoverageThreshold: 0.5, // Trigger OCR if text coverage < 50%
+};
+
+const pdfConfigOcr: PdfConfig = {
+  hierarchy: hierarchyConfigOcr,
+};
+
+const _extractionConfigOcr: ExtractionConfig = {
+  pdfOptions: pdfConfigOcr,
+};
+
+// const result = await kreuzberg.extractFile("document.pdf");
+
+// Integration with Kreuzberg WASM instance
+/** Runs the basic hierarchy configuration against "document.pdf" and logs the outcome. */
+async function _extractWithHierarchy(): Promise<void> {
+  const client = new Kreuzberg(extractionConfigBasic);
+
+  try {
+    // Extract from file (requires file input or fetch)
+    const extracted = await client.extractFile("document.pdf");
+    console.log("Extraction complete:", extracted);
+  } catch (failure) {
+    console.error("Extraction failed:", failure);
+  }
+}
+
+// Field descriptions:
+//
+// enabled: boolean (default: true)
+//   - Enable or disable hierarchy extraction
+//   - When false, hierarchy structure is not analyzed
+//
+// kClusters: number (default: 6, valid: 1-7)
+//   - Number of font size clusters for hierarchy levels
+//   - 6 provides H1-H6 heading levels with body text
+//   - Higher values create more fine-grained hierarchy
+//   - Lower values create simpler structure
+//
+// includeBbox: boolean (default: true)
+//   - Include bounding box coordinates in hierarchy blocks
+//   - Required for spatial layout awareness and document structure
+//   - Set to false only if space optimization is critical
+//
+// ocrCoverageThreshold: number | undefined (default: undefined)
+//   - Range: 0.0 to 1.0
+//   - Triggers OCR when text block coverage falls below this fraction
+//   - Example: 0.5 means "run OCR if less than 50% of page has text data"
+//   - undefined means no OCR coverage-based triggering
+//
--- a/docs/snippets/wasm/config/html_output.md
+++ b/docs/snippets/wasm/config/html_output.md
@@ -0,0 +1,17 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+const config = {
+  output_format: "html",
+  html_output: {
+    theme: "github",
+  },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(result.content); // HTML with kb-* classes
+```
--- a/docs/snippets/wasm/config/image-config.ts
+++ b/docs/snippets/wasm/config/image-config.ts
@@ -0,0 +1,31 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/** Extracts "document.pdf" with image extraction enabled and logs one line per returned image. */
+async function extractImagesWithConfig() {
+  await initWasm();
+
+  const bytes = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+  const config: ExtractionConfig = {
+    images: {
+      extractImages: true,
+      targetDpi: 300,
+      maxDimension: 2048,
+      preserveAspectRatio: true,
+    },
+  };
+
+  const result = await extractBytes(bytes, "application/pdf", config);
+
+  if (result.images) {
+    console.log(`Extracted ${result.images.length} images`);
+    // The last field printed is `image.description`, so it is labelled as a
+    // description rather than as a DPI value.
+    for (const { width, height, format, description } of result.images) {
+      console.log(`Image: ${width}x${height}, Format: ${format}, Description: ${description}`);
+    }
+  }
+}
+
+extractImagesWithConfig().catch(console.error);
--- a/docs/snippets/wasm/config/keyword_config.ts
+++ b/docs/snippets/wasm/config/keyword_config.ts
@@ -0,0 +1,65 @@
+import { type ExtractionConfig, extractFile } from "@kreuzberg/wasm";
+
+// Example 1: Basic YAKE configuration
+// Uses YAKE algorithm with default parameters and English stopword filtering
+async function basicYake(): Promise<void> {
+  const yakeDefaults: ExtractionConfig = {
+    keywords: {
+      algorithm: "yake",
+      maxKeywords: 10,
+      minScore: 0.0,
+      ngramRange: [1, 3],
+      language: "en",
+      yakeParams: null,
+      rakeParams: null,
+    },
+  };
+  // Both algorithm-specific parameter objects are left null in this example.
+  const { keywords } = await extractFile("document.pdf", null, yakeDefaults);
+  console.log("Keywords:", keywords);
+}
+
+// Example 2: Advanced YAKE with custom parameters
+// Fine-tunes YAKE with custom window size for co-occurrence analysis
+async function _advancedYake(): Promise<void> {
+  const tunedYake: ExtractionConfig = {
+    keywords: {
+      algorithm: "yake",
+      maxKeywords: 15,
+      minScore: 0.1,
+      ngramRange: [1, 2],
+      language: "en",
+      // The only YAKE-specific override set in this example.
+      yakeParams: { windowSize: 1 },
+      rakeParams: null,
+    },
+  };
+
+  // Only the keywords are read from the extraction result.
+  const { keywords } = await extractFile("document.pdf", null, tunedYake);
+  console.log("Keywords:", keywords);
+}
+
+// Example 3: RAKE configuration
+// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
+async function _rakeConfig(): Promise<void> {
+  const rakeOptions: ExtractionConfig = {
+    keywords: {
+      algorithm: "rake",
+      maxKeywords: 10,
+      minScore: 5.0,
+      ngramRange: [1, 3],
+      language: "en",
+      yakeParams: null,
+      // RAKE-specific phrase constraints; the YAKE parameters stay null.
+      rakeParams: { minWordLength: 1, maxWordsPerPhrase: 3 },
+    },
+  };
+
+  // Only the keywords are read from the extraction result; the second
+  // argument to extractFile is passed as null.
+  const { keywords } = await extractFile("document.pdf", null, rakeOptions);
+  console.log("Keywords:", keywords);
+}
+
+basicYake().catch(console.error); // report failures instead of leaving an unhandled rejection
--- a/docs/snippets/wasm/config/keyword_extraction_config.md
+++ b/docs/snippets/wasm/config/keyword_extraction_config.md
@@ -0,0 +1,20 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+const config = {
+  keywords: {
+    algorithm: "yake",
+    max_keywords: 10,
+    min_score: 0.1,
+    ngram_range: [1, 3],
+    language: "en",
+  },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Keywords: ${JSON.stringify(result.keywords)}`);
+```
--- a/docs/snippets/wasm/config/language_detection_config.md
+++ b/docs/snippets/wasm/config/language_detection_config.md
@@ -0,0 +1,19 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+const config = {
+  language_detection: {
+    enabled: true,
+    min_confidence: 0.8,
+    detect_multiple: true,
+  },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Detected language: ${result.language}`);
+console.log(`Confidence: ${result.language_confidence}`);
+```
--- a/docs/snippets/wasm/config/ocr-config.ts
+++ b/docs/snippets/wasm/config/ocr-config.ts
@@ -0,0 +1,26 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/** Extracts "scanned.pdf" with an OCR config and logs the text plus any detected languages. */
+async function extractWithOcr() {
+  await initWasm();
+
+  const response = await fetch("scanned.pdf");
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  // OCR settings: the "tesseract-wasm" backend with language "eng".
+  const ocrConfig: ExtractionConfig = {
+    ocr: { backend: "tesseract-wasm", language: "eng" },
+  };
+
+  const { content, detectedLanguages } = await extractBytes(bytes, "application/pdf", ocrConfig);
+
+  console.log("Extracted text from scanned document:");
+  console.log(content);
+
+  if (detectedLanguages) {
+    console.log("Detected languages:", detectedLanguages);
+  }
+}
+
+extractWithOcr().catch(console.error);
--- a/docs/snippets/wasm/config/ocr_dpi_config.md
+++ b/docs/snippets/wasm/config/ocr_dpi_config.md
@@ -0,0 +1,21 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+const config = {
+  images: {
+    extract_images: true,
+    target_dpi: 300,
+    max_image_dimension: 4096,
+    auto_adjust_dpi: true,
+    min_dpi: 150,
+    max_dpi: 600,
+  },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Extracted images: ${result.images?.length || 0}`);
+```
--- a/docs/snippets/wasm/config/pdf_config.md
+++ b/docs/snippets/wasm/config/pdf_config.md
@@ -0,0 +1,20 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("encrypted.pdf").then((r) => r.arrayBuffer()));
+
+const config = {
+  pdf_options: {
+    extract_images: true,
+    passwords: ["password123"],
+    extract_metadata: true,
+    hierarchy: {},
+  },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Title: ${result.metadata?.title}`);
+console.log(`Authors: ${result.metadata?.authors}`);
+```
--- a/docs/snippets/wasm/config/pdf_hierarchy_config.md
+++ b/docs/snippets/wasm/config/pdf_hierarchy_config.md
@@ -0,0 +1,22 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+const config = {
+  pdf_options: {
+    hierarchy: {
+      enabled: true,
+      detection_threshold: 0.75,
+      ocr_coverage_threshold: 0.8,
+      min_level: 1,
+      max_level: 5,
+    },
+  },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Hierarchy levels: ${result.hierarchy?.length || 0}`);
+```
--- a/docs/snippets/wasm/config/postprocessor_config.md
+++ b/docs/snippets/wasm/config/postprocessor_config.md
@@ -0,0 +1,17 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+const config = {
+  postprocessor: {
+    enabled: true,
+    enabled_processors: ["whitespace_normalizer", "unicode_normalizer"],
+  },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Processed content: ${result.content}`);
+```
--- a/docs/snippets/wasm/config/quality_processing_config.md
+++ b/docs/snippets/wasm/config/quality_processing_config.md
@@ -0,0 +1,16 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+const config = {
+  enable_quality_processing: true,
+  use_cache: true,
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Quality score: ${result.quality_score}`);
+console.log(`Processing time: ${result.processing_time}`);
+```
--- a/docs/snippets/wasm/config/tesseract_config.md
+++ b/docs/snippets/wasm/config/tesseract_config.md
@@ -0,0 +1,21 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("scanned.pdf").then((r) => r.arrayBuffer()));
+
+const config = {
+  ocr: {
+    backend: "tesseract",
+    language: "eng+deu",
+    tesseract_config: {
+      psm: 6,
+      oem: 3,
+    },
+  },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`OCR text: ${result.content}`);
+```
--- a/docs/snippets/wasm/config/token_reduction_config.md
+++ b/docs/snippets/wasm/config/token_reduction_config.md
@@ -0,0 +1,18 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
+
+const config = {
+  token_reduction: {
+    mode: "moderate",
+    preserve_important_words: true,
+  },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(`Original tokens: ${result.token_count}`);
+console.log(`Reduced content: ${result.content}`);
+```
--- a/docs/snippets/wasm/getting-started/async-extraction.ts
+++ b/docs/snippets/wasm/getting-started/async-extraction.ts
@@ -0,0 +1,26 @@
+import { extractBytes, getWasmCapabilities, initWasm } from "@kreuzberg/wasm";
+
+/** Extracts every entry of `files` concurrently, pairing it with `mimeTypes` by index. */
+async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
+  if (!getWasmCapabilities().hasWasm) {
+    throw new Error("WebAssembly not supported");
+  }
+
+  await initWasm();
+
+  // Start every extraction first, then wait for all of them together.
+  const pending = files.map((bytes, index) => extractBytes(bytes, mimeTypes[index]));
+  const extracted = await Promise.all(pending);
+
+  return extracted.map(({ content, metadata }) => ({
+    content,
+    pageCount: metadata?.pageCount,
+  }));
+}
+
+const fileBytes = [new Uint8Array([1, 2, 3])];
+const mimes = ["application/pdf"];
+
+extractDocuments(fileBytes, mimes)
+  .then((results) => console.log(results))
+  .catch(console.error);
--- a/docs/snippets/wasm/getting-started/basic-extract.ts
+++ b/docs/snippets/wasm/getting-started/basic-extract.ts
@@ -0,0 +1,17 @@
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/** Fetches "document.pdf", extracts it, and logs the content, MIME type and metadata. */
+async function main() {
+  await initWasm();
+
+  const response = await fetch("document.pdf");
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  const { content, mimeType, metadata } = await extractBytes(bytes, "application/pdf");
+  console.log("Extracted content:");
+  console.log(content);
+  console.log("MIME type:", mimeType);
+  console.log("Metadata:", metadata);
+}
+
+main().catch(console.error);
--- a/docs/snippets/wasm/getting-started/basic_usage.md
+++ b/docs/snippets/wasm/getting-started/basic_usage.md
@@ -0,0 +1,12 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const response = await fetch("document.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+const result = await extractBytes(data, "application/pdf", undefined);
+console.log(result.content);
+console.log(`MIME Type: ${result.mime_type}`);
+```
--- a/docs/snippets/wasm/getting-started/batch-processing.ts
+++ b/docs/snippets/wasm/getting-started/batch-processing.ts
@@ -0,0 +1,33 @@
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+interface DocumentJob {
+  name: string;
+  bytes: Uint8Array;
+  mimeType: string;
+}
+
+/**
+ * Extracts `documents` using `concurrency` workers that pull from a shared queue
+ * and returns content keyed by document name; failures are logged and omitted.
+ */
+async function _processBatch(documents: DocumentJob[], concurrency: number = 3) {
+  await initWasm();
+
+  const contentByName: Record<string, string> = {};
+  const pending = [...documents];
+
+  // One worker: keep taking the next job until the queue yields nothing.
+  const drain = async () => {
+    for (let job = pending.shift(); job; job = pending.shift()) {
+      try {
+        const extracted = await extractBytes(job.bytes, job.mimeType);
+        contentByName[job.name] = extracted.content;
+      } catch (error) {
+        console.error(`Failed to process ${job.name}:`, error);
+      }
+    }
+  };
+
+  await Promise.all(Array(concurrency).fill(null).map(drain));
+  return contentByName;
+}
--- a/docs/snippets/wasm/getting-started/batch_extract_files_sync.md
+++ b/docs/snippets/wasm/getting-started/batch_extract_files_sync.md
@@ -0,0 +1,14 @@
+```typescript title="WASM"
+import { extractFromFile, initWasm } from "@kreuzberg/wasm";
+
+await initWasm();
+
+const fileInputs = document.getElementById("files") as HTMLInputElement;
+const files = Array.from(fileInputs.files || []);
+
+const results = await Promise.all(files.map((file) => extractFromFile(file)));
+
+results.forEach((result, i) => {
+  console.log(`File ${i + 1}: ${result.content.length} characters`);
+});
+```
--- a/docs/snippets/wasm/getting-started/browser-file-input.ts
+++ b/docs/snippets/wasm/getting-started/browser-file-input.ts
@@ -0,0 +1,29 @@
+import { extractFromFile, initWasm } from "@kreuzberg/wasm";
+
+/** Initializes WASM and extracts whichever file the user picks in the #file-input element. */
+async function setupFileInput() {
+  await initWasm();
+  const picker = document.getElementById("file-input") as HTMLInputElement;
+
+  picker.addEventListener("change", async (event) => {
+    const selected = (event.target as HTMLInputElement).files?.[0];
+    if (!selected) return;
+
+    try {
+      const extraction = await extractFromFile(selected);
+      console.log("Extracted text:", extraction.content);
+      displayResults(extraction);
+    } catch (failure) {
+      console.error("Extraction failed:", failure);
+    }
+  });
+}
+
+/** Shows the first 500 characters of `result.content` in #output, if that element exists. */
+function displayResults(result: { content: string }) {
+  const output = document.getElementById("output");
+  if (!output) return;
+  output.textContent = `${result.content.substring(0, 500)}...`;
+}
+
+setupFileInput().catch(console.error);
--- a/docs/snippets/wasm/getting-started/extract_bytes_sync.md
+++ b/docs/snippets/wasm/getting-started/extract_bytes_sync.md
@@ -0,0 +1,12 @@
+```typescript title="WASM"
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+await initWasm();
+
+const response = await fetch("document.pdf");
+const buffer = await response.arrayBuffer();
+const data = new Uint8Array(buffer);
+
+const result = await extractBytes(data, "application/pdf");
+console.log(result.content);
+```
--- a/docs/snippets/wasm/getting-started/extract_file.md
+++ b/docs/snippets/wasm/getting-started/extract_file.md
@@ -0,0 +1,10 @@
+```typescript title="WASM"
+import init, { extractFile } from "kreuzberg-wasm";
+
+await init();
+
+const result = await extractFile("document.pdf", undefined, undefined);
+console.log(`Extracted content: ${result.content}`);
+console.log(`Tables found: ${result.tables?.length ?? 0}`);
+console.log(`Format: ${result.metadata?.format ?? "unknown"}`);
+```
--- a/docs/snippets/wasm/getting-started/extract_file_async.md
+++ b/docs/snippets/wasm/getting-started/extract_file_async.md
@@ -0,0 +1,17 @@
+```typescript title="WASM"
+import { extractFromFile, initWasm } from "@kreuzberg/wasm";
+
+await initWasm();
+
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+
+if (file) {
+  const result = await extractFromFile(file);
+  const content = result.content;
+  const tableCount = result.tables.length;
+
+  console.log(`Content length: ${content.length} characters`);
+  console.log(`Tables: ${tableCount}`);
+}
+```
--- a/docs/snippets/wasm/getting-started/extract_file_sync.md
+++ b/docs/snippets/wasm/getting-started/extract_file_sync.md
@@ -0,0 +1,15 @@
+```typescript title="WASM"
+import { extractFromFile, initWasm } from "@kreuzberg/wasm";
+
+await initWasm();
+
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+
+if (file) {
+  const result = await extractFromFile(file);
+  console.log(result.content);
+  console.log(`Tables: ${result.tables.length}`);
+  console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
+}
+```
--- a/docs/snippets/wasm/getting-started/extract_with_ocr.md
+++ b/docs/snippets/wasm/getting-started/extract_with_ocr.md
@@ -0,0 +1,20 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const response = await fetch("scanned.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+const config = {
+  force_ocr: true,
+  ocr: {
+    backend: "tesseract",
+    language: "eng",
+  },
+};
+
+const result = await extractBytes(data, "application/pdf", config);
+console.log(result.content);
+console.log(`Detected languages: ${result.detected_languages?.join(", ") ?? "unknown"}`);
+```
--- a/docs/snippets/wasm/getting-started/hello_world.md
+++ b/docs/snippets/wasm/getting-started/hello_world.md
@@ -0,0 +1,9 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const data = new Uint8Array([0x25, 0x50, 0x44, 0x46]); // PDF magic bytes
+const result = await extractBytes(data, "application/pdf", undefined);
+console.log(result.content);
+```
--- a/docs/snippets/wasm/getting-started/initialization.ts
+++ b/docs/snippets/wasm/getting-started/initialization.ts
@@ -0,0 +1,25 @@
+import { getVersion, getWasmCapabilities, initWasm, isInitialized } from "@kreuzberg/wasm";
+
+/** Initializes the WASM module if needed and logs the version and capability flags. */
+async function initializeKreuzberg() {
+  const capabilities = getWasmCapabilities();
+  if (!capabilities.hasWasm) {
+    console.error("WebAssembly not supported");
+    return;
+  }
+
+  try {
+    // initWasm is skipped when isInitialized() already reports true.
+    if (!isInitialized()) {
+      await initWasm();
+    }
+
+    console.log(`Kreuzberg ${getVersion()} initialized successfully`);
+    console.log("Workers available:", capabilities.hasWorkers);
+    console.log("SharedArrayBuffer available:", capabilities.hasSharedArrayBuffer);
+  } catch (error) {
+    console.error("Initialization failed:", error);
+  }
+}
+
+initializeKreuzberg().catch(console.error); // the capability probe runs outside the function's try
--- a/docs/snippets/wasm/getting-started/install_verify.md
+++ b/docs/snippets/wasm/getting-started/install_verify.md
@@ -0,0 +1,6 @@
+```typescript title="WASM"
+import init from "kreuzberg-wasm";
+
+await init();
+console.log("Kreuzberg WASM loaded successfully");
+```
--- a/docs/snippets/wasm/getting-started/read_content.md
+++ b/docs/snippets/wasm/getting-started/read_content.md
@@ -0,0 +1,26 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+// Initialize the WASM module, then fetch and extract the document.
+await init();
+
+const response = await fetch("document.pdf");
+const data = new Uint8Array(await response.arrayBuffer());
+
+const result = await extractBytes(data, "application/pdf", undefined);
+
+console.log(`Content: ${result.content}`);
+console.log(`Success: true`);
+console.log(`Content length: ${result.content.length} characters`);
+
+if (result.tables && result.tables.length > 0) {
+  result.tables.forEach((table, i) => {
+    console.log(`Table ${i}: ${table.rows?.length ?? 0} rows`);
+  });
+}
+
+// Chunk text is read from `content`, the field the other chunk snippets use.
+result.chunks?.forEach((chunk, i) => {
+  console.log(`Chunk ${i}: ${chunk.content?.length ?? 0} characters`);
+});
+```
--- a/docs/snippets/wasm/getting-started/runtime-detection.ts
+++ b/docs/snippets/wasm/getting-started/runtime-detection.ts
@@ -0,0 +1,32 @@
+import {
+  detectRuntime,
+  getWasmCapabilities,
+  initWasm,
+  isBrowser,
+  isBun,
+  isDeno,
+  isNode,
+} from "@kreuzberg/wasm";
+
+/** Logs the detected runtime and capability flags, then initializes the WASM module. */
+async function setupForRuntime() {
+  const runtime = detectRuntime();
+  const capabilities = getWasmCapabilities();
+
+  console.log(`Running in ${runtime} environment`);
+  console.log(`Workers: ${capabilities.hasWorkers}`);
+  console.log(`SharedArrayBuffer: ${capabilities.hasSharedArrayBuffer}`);
+
+  const runtimeMessages = [
+    [isBrowser, "Browser features available"],
+    [isNode, "Node.js features available"],
+    [isDeno, "Deno features available"],
+    [isBun, "Bun features available"],
+  ] as const;
+  const matched = runtimeMessages.find(([probe]) => probe());
+  if (matched) console.log(matched[1]);
+
+  await initWasm();
+}
+
+setupForRuntime().catch(console.error);
--- a/docs/snippets/wasm/llm/structured_extraction.md
+++ b/docs/snippets/wasm/llm/structured_extraction.md
@@ -0,0 +1,29 @@
+```typescript title="WASM"
+import init, { extractFile } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  structuredExtraction: {
+    schema: {
+      type: "object",
+      properties: {
+        title: { type: "string" },
+        authors: { type: "array", items: { type: "string" } },
+        date: { type: "string" },
+      },
+      required: ["title", "authors", "date"],
+      additionalProperties: false,
+    },
+    llm: {
+      model: "openai/gpt-4o-mini",
+    },
+    strict: true,
+  },
+};
+
+const result = await extractFile("paper.pdf", undefined, config);
+console.log(result.structuredOutput);
+```
+
+<!-- snippet:syntax-only --> Requires network access to the configured LLM provider and a valid API key in the host environment. The WASM crate accepts `structuredExtraction` configuration; the LLM call is dispatched through liter-llm's `wasm-http` transport.
--- a/docs/snippets/wasm/mcp/mcp_custom_client.md
+++ b/docs/snippets/wasm/mcp/mcp_custom_client.md
@@ -0,0 +1,40 @@
+```typescript title="WASM"
+// The WASM crate has no MCP client. To integrate with an MCP server,
+// drive the kreuzberg CLI from a Node.js host that uses kreuzberg-wasm
+// for in-process extraction.
+import { spawn } from "node:child_process";
+import * as readline from "node:readline";
+
+const mcpProcess = spawn("kreuzberg", ["mcp"]);
+
+const rl = readline.createInterface({
+  input: mcpProcess.stdout,
+  output: mcpProcess.stdin,
+  terminal: false,
+});
+
+// MCP messages are JSON-RPC 2.0, so a request carries `jsonrpc` and an `id`.
+// NOTE(review): a full client performs the `initialize` handshake first; omitted here.
+const request = {
+  jsonrpc: "2.0",
+  id: 1,
+  method: "tools/call",
+  params: {
+    name: "extract_file",
+    arguments: { path: "document.pdf", async: true },
+  },
+};
+
+mcpProcess.stdin.write(`${JSON.stringify(request)}\n`);
+
+rl.on("line", (line) => {
+  console.log(JSON.parse(line));
+  mcpProcess.kill();
+});
+
+mcpProcess.on("error", (err) => {
+  console.error("Failed to start MCP process:", err);
+});
+```
+
+<!-- snippet:syntax-only --> MCP transport is not exported by the WASM crate; this snippet drives the MCP CLI from the same Node host that loads kreuzberg-wasm.
--- a/docs/snippets/wasm/mcp/mcp_server_start.md
+++ b/docs/snippets/wasm/mcp/mcp_server_start.md
@@ -0,0 +1,22 @@
+```typescript title="WASM"
+// MCP server is provided by the kreuzberg CLI (Rust binary). The WASM build
+// targets browser/Node.js extraction and does not embed a server process.
+// Spawn the CLI from a Node.js host that consumes the WASM module separately.
+import { spawn } from "node:child_process";
+
+const mcpProcess = spawn("kreuzberg", ["mcp"]);
+
+mcpProcess.stdout.on("data", (data) => {
+  console.log(`MCP Server: ${data}`);
+});
+
+mcpProcess.stderr.on("data", (data) => {
+  console.error(`MCP Error: ${data}`);
+});
+
+mcpProcess.on("error", (err) => {
+  console.error(`Failed to start MCP server: ${err.message}`);
+});
+```
+
+<!-- snippet:syntax-only --> The MCP server is a CLI feature; the WASM crate does not export an MCP server entry point. This snippet shows how a Node host that uses kreuzberg-wasm for extraction can also drive the standalone MCP CLI.
--- a/docs/snippets/wasm/metadata/extract-metadata.ts
+++ b/docs/snippets/wasm/metadata/extract-metadata.ts
@@ -0,0 +1,25 @@
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Fetches `document.pdf`, extracts it with `extractBytes`, and prints a fixed
+ * set of metadata fields to the console.
+ *
+ * @returns The `metadata` object of the extraction result.
+ */
+async function getDocumentMetadata() {
+  await initWasm();
+
+  const response = await fetch("document.pdf");
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  const { metadata } = await extractBytes(bytes, "application/pdf");
+
+  console.log("Document Metadata:");
+
+  // Label/value pairs, printed in this order.
+  const rows: Array<[string, unknown]> = [
+    ["Title:", metadata.title],
+    ["Author:", metadata.author],
+    ["Creator:", metadata.creator],
+    ["Subject:", metadata.subject],
+    ["Keywords:", metadata.keywords],
+    ["Pages:", metadata.pageCount],
+    ["Created:", metadata.createdAt],
+    ["Modified:", metadata.modifiedAt],
+  ];
+  for (const [label, value] of rows) {
+    console.log(label, value);
+  }
+
+  return metadata;
+}
+
+getDocumentMetadata().catch(console.error);
--- a/docs/snippets/wasm/metadata/filter-metadata.ts
+++ b/docs/snippets/wasm/metadata/filter-metadata.ts
@@ -0,0 +1,35 @@
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/** Condensed per-file record produced by `filterAndSummarizeMetadata`. */
+interface DocumentSummary {
+  /** The entry from the input list that this summary describes. */
+  fileName: string;
+  /** Copied from `result.metadata.title`. */
+  title: string | undefined;
+  /** Copied from `result.metadata.author`. */
+  author: string | undefined;
+  /** Copied from `result.metadata.pageCount`. */
+  pageCount: number | undefined;
+  /** First entry of `result.detectedLanguages`, or `"unknown"` when there is none. */
+  language: string;
+}
+
+/**
+ * Extracts each file in turn and reduces its result to a `DocumentSummary`.
+ *
+ * @param files - URLs (or relative paths) handed to `fetch`, one per document.
+ * @returns One summary per input file, in input order.
+ * @throws Error when a file cannot be fetched (non-2xx HTTP status); errors
+ *   raised by `initWasm` or `extractBytes` propagate unchanged.
+ */
+async function filterAndSummarizeMetadata(files: string[]): Promise<DocumentSummary[]> {
+  await initWasm();
+
+  const summaries: DocumentSummary[] = [];
+
+  for (const fileName of files) {
+    const response = await fetch(fileName);
+    // fetch() only rejects on network failure. Without this check the body of
+    // a 404/500 error page would be handed to the PDF extractor.
+    if (!response.ok) {
+      throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
+    }
+    const bytes = new Uint8Array(await response.arrayBuffer());
+
+    const { metadata, detectedLanguages } = await extractBytes(bytes, "application/pdf");
+
+    summaries.push({
+      fileName,
+      title: metadata.title,
+      author: metadata.author,
+      pageCount: metadata.pageCount,
+      // Fall back to a sentinel when no language was detected.
+      language: detectedLanguages?.[0] ?? "unknown",
+    });
+  }
+
+  return summaries;
+}
+
+filterAndSummarizeMetadata(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
+  .then((summaries) => console.table(summaries))
+  .catch(console.error);
--- a/docs/snippets/wasm/metadata/image-metadata.ts
+++ b/docs/snippets/wasm/metadata/image-metadata.ts
@@ -0,0 +1,34 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Fetches `document.pdf`, extracts it with image extraction switched on, and
+ * logs a descriptor for every image in the result.
+ */
+async function extractImageMetadata() {
+  await initWasm();
+
+  const response = await fetch("document.pdf");
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  const config: ExtractionConfig = {
+    images: {
+      extractImages: true,
+      targetDpi: 150,
+    },
+  };
+
+  const { images } = await extractBytes(bytes, "application/pdf", config);
+
+  // Nothing to report when the result carries no image list.
+  if (!images) {
+    return;
+  }
+
+  for (const [index, image] of images.entries()) {
+    const { format, width, height, pageNumber, colorspace, bitsPerComponent, isMask, data } = image;
+    console.log(`Image ${index}:`, {
+      format,
+      width,
+      height,
+      pageNumber,
+      colorspace,
+      bitsPerComponent,
+      isMask,
+      dataSize: data.byteLength,
+    });
+  }
+}
+
+extractImageMetadata().catch(console.error);
--- a/docs/snippets/wasm/metadata/language_detection.md
+++ b/docs/snippets/wasm/metadata/language_detection.md
@@ -0,0 +1,26 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+import { LanguageDetectionConfig, ExtractionConfig } from "kreuzberg-wasm";
+
+await init();
+
+const fileBuffer = new Uint8Array(/* your file bytes */);
+const mimeType = "text/plain";
+
+// Build the language-detection settings first, then attach them to the
+// extraction config.
+const languageDetection = new LanguageDetectionConfig({
+  enable_detection: true,
+  target_languages: ["en", "de", "fr"],
+});
+const config = new ExtractionConfig({ language_detection: languageDetection });
+
+const result = await extractBytes(fileBuffer, mimeType, config);
+
+const detected = result.detected_languages;
+if (detected) {
+  console.log("Detected languages:", detected);
+
+  // One line per detected language.
+  detected.forEach((code) => {
+    console.log(`Language: ${code}`);
+  });
+}
+```
--- a/docs/snippets/wasm/metadata/language_detection_multilingual.md
+++ b/docs/snippets/wasm/metadata/language_detection_multilingual.md
@@ -0,0 +1,35 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+import { LanguageDetectionConfig, ExtractionConfig } from "kreuzberg-wasm";
+
+await init();
+
+const fileBuffer = new Uint8Array(/* your file bytes */);
+const mimeType = "text/plain";
+
+// Build the language-detection settings first, then attach them to the
+// extraction config.
+const languageDetection = new LanguageDetectionConfig({
+  enable_detection: true,
+  target_languages: ["en", "de", "fr", "es", "it", "ja", "zh"],
+  confidence_threshold: 0.5,
+});
+const config = new ExtractionConfig({ language_detection: languageDetection });
+
+const result = await extractBytes(fileBuffer, mimeType, config);
+const languages = result.detected_languages;
+
+if (!languages || languages.length === 0) {
+  console.log("No languages detected");
+} else {
+  console.log("Document languages:", languages.join(", "));
+
+  // Report each detected language on its own line
+  for (const lang of languages) {
+    console.log(`Language detected: ${lang}`);
+  }
+
+  // The document metadata may carry its own language field
+  const metadataLanguage = result.metadata?.language;
+  if (metadataLanguage) {
+    console.log(`Primary metadata language: ${metadataLanguage}`);
+  }
+}
+```
--- a/docs/snippets/wasm/metadata/metadata-with-chunks.ts
+++ b/docs/snippets/wasm/metadata/metadata-with-chunks.ts
@@ -0,0 +1,33 @@
+import type { ExtractionConfig } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Fetches `document.pdf`, extracts it with chunking enabled, and logs the
+ * document metadata followed by the positional metadata of every chunk.
+ */
+async function extractWithChunkMetadata() {
+  await initWasm();
+
+  const response = await fetch("document.pdf");
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  const config: ExtractionConfig = {
+    chunking: {
+      maxChars: 500,
+      chunkOverlap: 50,
+    },
+  };
+
+  const result = await extractBytes(bytes, "application/pdf", config);
+
+  console.log("Document Metadata:", result.metadata);
+
+  // An absent chunk list is treated as "no chunks".
+  for (const { metadata } of result.chunks ?? []) {
+    const { charStart, charEnd, chunkIndex, totalChunks, tokenCount } = metadata;
+    console.log("Chunk Metadata:", {
+      charStart,
+      charEnd,
+      index: chunkIndex,
+      total: totalChunks,
+      tokens: tokenCount,
+    });
+  }
+}
+
+extractWithChunkMetadata().catch(console.error);
--- a/docs/snippets/wasm/metadata/metadata.md
+++ b/docs/snippets/wasm/metadata/metadata.md
@@ -0,0 +1,85 @@
+```typescript title="WASM"
+import { extractFromFile, initWasm } from "@kreuzberg/wasm";
+
+await initWasm();
+
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+
+if (file) {
+  const { metadata } = await extractFromFile(file);
+  console.log(`Metadata: ${JSON.stringify(metadata)}`);
+
+  // Common fields are read directly from the metadata object
+  if (metadata.title) {
+    console.log(`Title: ${metadata.title}`);
+  }
+
+  // Format-specific metadata: the `html` entry is only inspected when present
+  const htmlMeta = metadata.html;
+  if (htmlMeta) {
+    console.log(`HTML Title: ${htmlMeta.title}`);
+    console.log(`Description: ${htmlMeta.description}`);
+
+    // Keywords are joined into one comma-separated line
+    if (htmlMeta.keywords?.length) {
+      console.log(`Keywords: ${htmlMeta.keywords.join(", ")}`);
+    }
+
+    // Canonical URL
+    if (htmlMeta.canonical_url) {
+      console.log(`Canonical URL: ${htmlMeta.canonical_url}`);
+    }
+
+    // Open Graph fields are looked up by key
+    const openGraph = htmlMeta.open_graph;
+    if (openGraph) {
+      if (openGraph["title"]) {
+        console.log(`OG Title: ${openGraph["title"]}`);
+      }
+      if (openGraph["image"]) {
+        console.log(`OG Image: ${openGraph["image"]}`);
+      }
+    }
+
+    // Twitter Card fields are looked up by key
+    const twitterCardType = htmlMeta.twitter_card?.["card"];
+    if (twitterCardType) {
+      console.log(`Twitter Card Type: ${twitterCardType}`);
+    }
+
+    // Headers: print the text of each one
+    if (htmlMeta.headers?.length) {
+      const headerTexts = htmlMeta.headers.map((header: any) => header.text);
+      console.log(`Headers: ${headerTexts.join(", ")}`);
+    }
+
+    // Links: one line per link
+    if (htmlMeta.links?.length) {
+      htmlMeta.links.forEach((anchor: any) => {
+        console.log(`Link: ${anchor.href} (${anchor.text})`);
+      });
+    }
+
+    // Images: one line per image source
+    if (htmlMeta.images?.length) {
+      htmlMeta.images.forEach((img: any) => {
+        console.log(`Image: ${img.src}`);
+      });
+    }
+
+    // Structured data: only the item count is reported
+    if (htmlMeta.structured_data?.length) {
+      console.log(`Structured data items: ${htmlMeta.structured_data.length}`);
+    }
+  }
+
+  // PDF-specific fields are at the top level of metadata
+  if (metadata.pageCount) {
+    console.log(`Pages: ${metadata.pageCount}`);
+  }
+  if (metadata.authors?.length) {
+    console.log(`Authors: ${metadata.authors.join(", ")}`);
+  }
+}
+```
--- a/docs/snippets/wasm/metadata/page_boundaries.md
+++ b/docs/snippets/wasm/metadata/page_boundaries.md
@@ -0,0 +1,36 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+import { PageConfig, ExtractionConfig } from "kreuzberg-wasm";
+
+await init();
+
+const fileBuffer = new Uint8Array(/* your file bytes */);
+const mimeType = "application/pdf";
+
+const config = new ExtractionConfig({
+  pages: new PageConfig({
+    extract_pages: true,
+  }),
+});
+
+const result = await extractBytes(fileBuffer, mimeType, config);
+
+if (result.metadata && result.metadata.pages) {
+  const pageStructure = result.metadata.pages;
+  console.log(`Total pages: ${pageStructure.total_count}`);
+
+  if (pageStructure.boundaries) {
+    // `byte_start` / `byte_end` are byte offsets (assumed to index the UTF-8
+    // encoding of `content`), whereas JavaScript string methods index UTF-16
+    // code units. Slicing the string directly drifts as soon as the text
+    // contains a non-ASCII character, so slice the encoded bytes instead.
+    const contentBytes = new TextEncoder().encode(result.content);
+    const decoder = new TextDecoder();
+    const PREVIEW_CHARS = 100;
+
+    // Iterate through page boundaries to map content to pages
+    pageStructure.boundaries.forEach((boundary) => {
+      // Decode the whole page first so the preview never cuts a multi-byte
+      // sequence in half, then trim it for display.
+      const pageText = decoder
+        .decode(contentBytes.subarray(boundary.byte_start, boundary.byte_end))
+        .slice(0, PREVIEW_CHARS);
+
+      console.log(`Page ${boundary.page_number}:`);
+      console.log(`  Byte range: ${boundary.byte_start}-${boundary.byte_end}`);
+      console.log(`  Preview: ${pageText}...`);
+    });
+  }
+}
+```
--- a/docs/snippets/wasm/metadata/page_tracking_basic.md
+++ b/docs/snippets/wasm/metadata/page_tracking_basic.md
@@ -0,0 +1,38 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+import { PageConfig, ExtractionConfig } from "kreuzberg-wasm";
+
+await init();
+
+const fileBuffer = new Uint8Array(/* your file bytes */);
+const mimeType = "application/pdf";
+
+const config = new ExtractionConfig({
+  pages: new PageConfig({
+    extract_pages: true,
+  }),
+});
+
+const result = await extractBytes(fileBuffer, mimeType, config);
+
+if (result.pages) {
+  console.log(`Total pages extracted: ${result.pages.length}`);
+
+  result.pages.forEach((page) => {
+    console.log(`Page ${page.pageNumber}:`);
+    console.log(`  Content length: ${page.content.length} chars`);
+    console.log(`  Tables: ${page.tables.length}`);
+    console.log(`  Images: ${page.images.length}`);
+
+    // Check if page is blank
+    if (page.isBlank) {
+      console.log("  This page is blank");
+    }
+
+    // Access page hierarchy if available
+    if (page.hierarchy) {
+      // Interpolating an object into a template literal prints
+      // "[object Object]"; serialize objects and pass primitives through.
+      const hierarchy =
+        typeof page.hierarchy === "object" ? JSON.stringify(page.hierarchy) : page.hierarchy;
+      console.log(`  Hierarchy level: ${hierarchy}`);
+    }
+  });
+}
+```
--- a/docs/snippets/wasm/metadata/tables.md
+++ b/docs/snippets/wasm/metadata/tables.md
@@ -0,0 +1,43 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+import { ExtractionConfig } from "kreuzberg-wasm";
+
+await init();
+
+const fileBuffer = new Uint8Array(/* your file bytes */);
+const mimeType = "application/pdf";
+
+const config = new ExtractionConfig({});
+
+const result = await extractBytes(fileBuffer, mimeType, config);
+
+if (result.tables && result.tables.length > 0) {
+  console.log(`Found ${result.tables.length} tables`);
+
+  result.tables.forEach((table, index) => {
+    console.log(`\nTable ${index + 1}:`);
+    console.log(`  Page: ${table.pageNumber}`);
+    console.log(`  Markdown representation:`);
+    console.log(table.markdown);
+
+    // Access cell data
+    const cells = table.cells;
+    if (cells) {
+      console.log(`  Total cells: ${Object.keys(cells).length}`);
+
+      // Iterate through cells (structure depends on how cells are serialized)
+      for (const rowKey of Object.keys(cells)) {
+        const row = cells[rowKey];
+        console.log(`  Row ${rowKey}: ${JSON.stringify(row)}`);
+      }
+    }
+
+    // Access bounding box if available
+    if (table.boundingBox) {
+      // Serialize explicitly: interpolating an object into a template literal
+      // would print "[object Object]" instead of the coordinates.
+      console.log(`  Bounding box: ${JSON.stringify(table.boundingBox)}`);
+    }
+  });
+} else {
+  console.log("No tables found in document");
+}
+```
--- a/docs/snippets/wasm/ocr/cloud_ocr_backend.md
+++ b/docs/snippets/wasm/ocr/cloud_ocr_backend.md
@@ -0,0 +1,22 @@
+<!-- snippet:syntax-only -->
+
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Cloud OCR backends are not supported in WASM directly.
+// WASM runs in a sandboxed environment without direct network access.
+// To use cloud OCR services, implement a wrapper on your server
+// or use a cloud platform with built-in OCR integration.
+
+// Illustrative config only: it is declared here to show the shape of the
+// `ocr` section and is never passed to `extractBytes` in this snippet.
+const cloudOcrConfig = {
+  ocr: {
+    backend: "custom", // Custom backends must be registered via native runtime
+    language: "eng",
+  },
+};
+
+// This example shows the configuration structure.
+// In production, route cloud OCR requests through your backend service.
+```
--- a/docs/snippets/wasm/ocr/enable-ocr.ts
+++ b/docs/snippets/wasm/ocr/enable-ocr.ts
@@ -0,0 +1,27 @@
+import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Enables OCR, then extracts the text of `scanned-page.png` and prints it.
+ * Returns early, after logging the error, when OCR cannot be enabled.
+ */
+async function extractWithOcr() {
+  await initWasm();
+
+  try {
+    await enableOcr();
+  } catch (error) {
+    console.error("Failed to enable OCR:", error);
+    return;
+  }
+  console.log("OCR enabled successfully");
+
+  const response = await fetch("scanned-page.png");
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  const { content } = await extractBytes(bytes, "image/png", {
+    ocr: {
+      backend: "tesseract-wasm",
+      language: "eng",
+    },
+  });
+
+  console.log("Extracted text:");
+  console.log(content);
+}
+
+extractWithOcr().catch(console.error);
--- a/docs/snippets/wasm/ocr/image_extraction.md
+++ b/docs/snippets/wasm/ocr/image_extraction.md
@@ -0,0 +1,15 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// `Response.arrayBuffer()` yields a raw ArrayBuffer; wrap it in a Uint8Array
+// view, which is what the other WASM snippets hand to `extractBytes`.
+const imageData = new Uint8Array(await fetch("document.pdf").then((res) => res.arrayBuffer()));
+
+const result = await extractBytes(imageData, "application/pdf", {
+  images: {
+    extract_images: true,
+  },
+});
+
+console.log(result.images);
+```
--- a/docs/snippets/wasm/ocr/image_preprocessing.md
+++ b/docs/snippets/wasm/ocr/image_preprocessing.md
@@ -0,0 +1,17 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// `Response.arrayBuffer()` yields a raw ArrayBuffer; wrap it in a Uint8Array
+// view, which is what the other WASM snippets hand to `extractBytes`.
+const documentData = new Uint8Array(await fetch("document.pdf").then((res) => res.arrayBuffer()));
+
+const result = await extractBytes(documentData, "application/pdf", {
+  images: {
+    extract_images: true,
+    target_dpi: 300,
+    max_image_dimension: 2000,
+  },
+});
+
+console.log(result.content);
+```
--- a/docs/snippets/wasm/ocr/multi-language-ocr.ts
+++ b/docs/snippets/wasm/ocr/multi-language-ocr.ts
@@ -0,0 +1,29 @@
+import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Runs OCR over a fixed list of images, one after another, using a different
+ * OCR language for each, and prints the extracted text of every image.
+ */
+async function extractMultilingualDocument() {
+  await initWasm();
+  await enableOcr();
+
+  // Image to fetch, paired with the OCR language requested for it.
+  const documents = [
+    { name: "english.png", lang: "eng" },
+    { name: "german.png", lang: "deu" },
+    { name: "spanish.png", lang: "spa" },
+  ];
+
+  for (const { name, lang } of documents) {
+    const response = await fetch(name);
+    const bytes = new Uint8Array(await response.arrayBuffer());
+
+    const { content } = await extractBytes(bytes, "image/png", {
+      ocr: {
+        backend: "tesseract-wasm",
+        language: lang,
+      },
+    });
+
+    console.log(`${name} (${lang}):`);
+    console.log(content);
+    console.log("---");
+  }
+}
+
+extractMultilingualDocument().catch(console.error);
--- a/docs/snippets/wasm/ocr/ocr-backend-registration.ts
+++ b/docs/snippets/wasm/ocr/ocr-backend-registration.ts
@@ -0,0 +1,30 @@
+import {
+  initWasm,
+  listOcrBackends,
+  registerOcrBackend,
+  TesseractWasmBackend,
+  unregisterOcrBackend,
+} from "@kreuzberg/wasm";
+
+/**
+ * Walks through the backend registry lifecycle: initialize and register a
+ * `TesseractWasmBackend`, list the registered backends, unregister it again,
+ * and list once more.
+ */
+async function manageOcrBackends() {
+  await initWasm();
+
+  const tesseract = new TesseractWasmBackend();
+  await tesseract.initialize();
+  registerOcrBackend(tesseract);
+
+  // Name used below to look the backend up and to unregister it.
+  const backendName = "tesseract-wasm";
+
+  const registered = listOcrBackends();
+  console.log("Available OCR backends:", registered);
+
+  if (registered.includes(backendName)) {
+    console.log("Tesseract WASM backend is registered");
+  }
+
+  unregisterOcrBackend(backendName);
+
+  console.log("Backends after unregister:", listOcrBackends());
+}
+
+manageOcrBackends().catch(console.error);
--- a/docs/snippets/wasm/ocr/ocr-error-handling.ts
+++ b/docs/snippets/wasm/ocr/ocr-error-handling.ts
@@ -0,0 +1,37 @@
+import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Extracts `document.png`, handling each failure point separately:
+ * WASM initialization, OCR enablement, download, and extraction.
+ *
+ * OCR is optional. When `enableOcr` fails with an error mentioning "browser",
+ * extraction continues without an OCR config; any other `enableOcr` error is
+ * rethrown to the caller.
+ */
+async function extractWithErrorHandling() {
+  try {
+    await initWasm();
+  } catch (error) {
+    console.error("Failed to initialize WASM:", error);
+    return;
+  }
+
+  // Tracks whether OCR was actually enabled, so the extraction below does not
+  // request a backend that the warning just said is unavailable.
+  let ocrEnabled = false;
+  try {
+    await enableOcr();
+    ocrEnabled = true;
+  } catch (error) {
+    if (error instanceof Error && error.message.includes("browser")) {
+      console.warn("OCR not available in this environment, proceeding without OCR");
+    } else {
+      throw error;
+    }
+  }
+
+  try {
+    const response = await fetch("document.png");
+    // fetch() only rejects on network failure; surface HTTP errors explicitly
+    // instead of feeding an error page to the extractor.
+    if (!response.ok) {
+      throw new Error(`Failed to fetch document.png: ${response.status} ${response.statusText}`);
+    }
+    const bytes = new Uint8Array(await response.arrayBuffer());
+
+    const result = await extractBytes(
+      bytes,
+      "image/png",
+      ocrEnabled
+        ? {
+            ocr: {
+              backend: "tesseract-wasm",
+              language: "eng",
+            },
+          }
+        : undefined,
+    );
+
+    console.log("Extraction successful:", result.content.length, "chars");
+  } catch (error) {
+    console.error("Extraction failed:", error);
+  }
+}
+
+extractWithErrorHandling().catch(console.error);
--- a/docs/snippets/wasm/ocr/ocr_easyocr.md
+++ b/docs/snippets/wasm/ocr/ocr_easyocr.md
@@ -0,0 +1,20 @@
+<!-- snippet:syntax-only -->
+
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// EasyOCR backend is not supported in WASM.
+// EasyOCR requires PyTorch and Python runtime, which are unavailable in browser/WASM.
+// Use the Tesseract-WASM backend instead, or route requests through a backend service.
+
+// Illustrative config only: it is declared here to show the shape of the
+// `ocr` section and is never passed to `extractBytes` in this snippet.
+const easyOcrConfig = {
+  ocr: {
+    backend: "easyocr", // Not supported in WASM
+    language: "en",
+  },
+};
+
+// This example shows the configuration structure for reference only.
+```
--- a/docs/snippets/wasm/ocr/ocr_elements.md
+++ b/docs/snippets/wasm/ocr/ocr_elements.md
@@ -0,0 +1,24 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// `Response.arrayBuffer()` yields a raw ArrayBuffer; wrap it in a Uint8Array
+// view, which is what the other WASM snippets hand to `extractBytes`.
+const documentData = new Uint8Array(await fetch("scanned.pdf").then((res) => res.arrayBuffer()));
+
+const result = await extractBytes(documentData, "application/pdf", {
+  ocr: {
+    backend: "tesseract",
+    language: "eng",
+    element_config: {
+      include_elements: true,
+    },
+  },
+});
+
+// Print the text and confidence of each OCR element, when any were returned.
+if (result.ocr_elements) {
+  for (const element of result.ocr_elements) {
+    console.log("Text:", element.text);
+    console.log("Confidence:", element.confidence);
+  }
+}
+```
--- a/docs/snippets/wasm/ocr/ocr_extraction.md
+++ b/docs/snippets/wasm/ocr/ocr_extraction.md
@@ -0,0 +1,34 @@
+```typescript title="WASM (Browser)"
+import { enableOcr, extractFromFile, initWasm } from "@kreuzberg/wasm";
+
+// Initialize the WASM module, then enable OCR before extracting.
+await initWasm();
+await enableOcr();
+
+// Take the first file chosen in the element with id "file".
+const fileInput = document.getElementById("file") as HTMLInputElement;
+const file = fileInput.files?.[0];
+
+// Nothing is extracted when no file has been selected.
+if (file) {
+  // The browser-reported `file.type` is forwarded as the MIME type.
+  const result = await extractFromFile(file, file.type, {
+    ocr: {
+      backend: "kreuzberg-tesseract",
+      language: "eng",
+    },
+  });
+  console.log(result.content);
+}
+```
+
+```typescript title="WASM (Node.js / Deno / Bun)"
+import { enableOcr, extractFile, initWasm } from "@kreuzberg/wasm";
+
+// Initialize the WASM module, then enable OCR before extracting.
+await initWasm();
+await enableOcr(); // Uses native kreuzberg-tesseract backend
+
+// Extract straight from a path on disk, with an explicit MIME type.
+const result = await extractFile("./scanned_document.png", "image/png", {
+  ocr: {
+    backend: "kreuzberg-tesseract",
+    language: "eng",
+  },
+});
+console.log(result.content);
+```
--- a/docs/snippets/wasm/ocr/ocr_force_all_pages.md
+++ b/docs/snippets/wasm/ocr/ocr_force_all_pages.md
@@ -0,0 +1,17 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// `Response.arrayBuffer()` yields a raw ArrayBuffer; wrap it in a Uint8Array
+// view, which is what the other WASM snippets hand to `extractBytes`.
+const documentData = new Uint8Array(await fetch("document.pdf").then((res) => res.arrayBuffer()));
+
+const result = await extractBytes(documentData, "application/pdf", {
+  force_ocr: true,
+  ocr: {
+    backend: "tesseract",
+    language: "eng",
+  },
+});
+
+console.log(result.content);
+```
--- a/docs/snippets/wasm/ocr/ocr_multi_language.md
+++ b/docs/snippets/wasm/ocr/ocr_multi_language.md
@@ -0,0 +1,16 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// `Response.arrayBuffer()` yields a raw ArrayBuffer; wrap it in a Uint8Array
+// view, which is what the other WASM snippets hand to `extractBytes`.
+const multilingualData = new Uint8Array(
+  await fetch("multilingual.pdf").then((res) => res.arrayBuffer()),
+);
+
+const result = await extractBytes(multilingualData, "application/pdf", {
+  ocr: {
+    backend: "tesseract",
+    language: "eng+deu+fra",
+  },
+});
+
+console.log(result.content);
+```
--- a/Show More
+++ b/Show More