Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/wasm/advanced/chunk_page_mapping.md
+++ b/docs/snippets/wasm/advanced/chunk_page_mapping.md
@@ -0,0 +1,44 @@
+```typescript title="WASM"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  chunking: {
+    maxChars: 1500,
+    chunkOverlap: 300,
+  },
+  includeDocumentStructure: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Map chunks back to pages for source attribution
+const chunkPageMap = new Map<number, number[]>();
+
+result.chunks?.forEach((chunk, chunkIndex) => {
+  const firstPage = chunk.metadata?.firstPage;
+  const lastPage = chunk.metadata?.lastPage;
+
+  if (firstPage !== undefined && lastPage !== undefined) {
+    for (let page = firstPage; page <= lastPage; page++) {
+      if (!chunkPageMap.has(page)) {
+        chunkPageMap.set(page, []);
+      }
+      chunkPageMap.get(page)!.push(chunkIndex);
+    }
+  }
+});
+
+// Use the mapping for source attribution
+chunkPageMap.forEach((chunkIndices, pageNum) => {
+  console.log(`Page ${pageNum}: Chunks ${chunkIndices.join(", ")}`);
+  chunkIndices.forEach((idx) => {
+    const chunk = result.chunks![idx];
+    console.log(`  Content: "${chunk.content.substring(0, 60)}..."`);
+  });
+});
+```
+
+<!-- snippet:syntax-only --> - Requires document structure parsing during extraction.
--- a/docs/snippets/wasm/advanced/chunking_config.md
+++ b/docs/snippets/wasm/advanced/chunking_config.md
@@ -0,0 +1,70 @@
+```typescript title="WASM - Fixed-Size Chunks"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  chunking: {
+    maxChars: 2000,
+    chunkOverlap: 400,
+    trim: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+result.chunks?.forEach((chunk, idx) => {
+  console.log(`Chunk ${chunk.metadata?.chunkIndex}/${chunk.metadata?.totalChunks}`);
+  console.log(`  Position: ${chunk.metadata?.byteStart}-${chunk.metadata?.byteEnd}`);
+  console.log(`  Content: "${chunk.content.substring(0, 50)}..."`);
+});
+```
+
+```typescript title="WASM - Markdown-Aware Chunking"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  chunking: {
+    chunkerType: "markdown",
+    maxChars: 1500,
+    prependHeadingContext: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/markdown", config);
+
+result.chunks?.forEach((chunk) => {
+  // Content already includes heading context prepended
+  console.log(chunk.content.substring(0, 80));
+  console.log(
+    `  Heading path: ${chunk.metadata?.headingContext?.headings?.map((h) => `${"#".repeat(h.level)} ${h.text}`).join(" > ")}`,
+  );
+});
+```
+
+```typescript title="WASM - Semantic Chunking with Topic Threshold"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  chunking: {
+    chunkerType: "semantic",
+    maxChars: 1000,
+    topicThreshold: 0.5, // Boundary detection at 50% topic change
+    chunkOverlap: 100,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/markdown", config);
+
+console.log(`Generated ${result.chunks?.length} semantic chunks`);
+result.chunks?.forEach((chunk) => {
+  console.log(`Chunk ${chunk.metadata?.chunkIndex}: ${chunk.content.length} chars`);
+});
+```
--- a/docs/snippets/wasm/advanced/chunking_rag.md
+++ b/docs/snippets/wasm/advanced/chunking_rag.md
@@ -0,0 +1,80 @@
+```typescript title="WASM - Chunking for RAG Pipeline"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  chunking: {
+    maxChars: 512, // Smaller chunks for vector DB efficiency
+    chunkOverlap: 100,
+    trim: true,
+  },
+  includeDocumentStructure: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Prepare chunks for vector database ingestion
+const ragChunks =
+  result.chunks?.map((chunk, idx) => ({
+    id: `${result.metadata?.filename || "doc"}_chunk_${idx}`,
+    text: chunk.content,
+    metadata: {
+      source: result.metadata?.filename,
+      chunk_index: chunk.metadata?.chunkIndex,
+      total_chunks: chunk.metadata?.totalChunks,
+      first_page: chunk.metadata?.firstPage,
+      last_page: chunk.metadata?.lastPage,
+      byte_position: `${chunk.metadata?.byteStart}-${chunk.metadata?.byteEnd}`,
+    },
+    // Embedding would be added by vector DB embedding model
+  })) || [];
+
+console.log(`Prepared ${ragChunks.length} chunks for RAG ingestion`);
+ragChunks.slice(0, 3).forEach((c) => {
+  console.log(`Chunk ${c.id}: ${c.text.substring(0, 50)}...`);
+});
+
+// Example: Send to vector database (e.g., Pinecone, Weaviate, Milvus)
+// const vectorResults = await vectorDb.upsert(await Promise.all(ragChunks.map(async (c) => ({
+//   id: c.id,
+//   values: await embedModel.embed(c.text),
+//   metadata: c.metadata,
+// }))));
+```
+
+```typescript title="WASM - RAG with Retrieval Context"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  chunking: {
+    maxChars: 768,
+    chunkOverlap: 200,
+    prependHeadingContext: true, // For markdown/docs
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/markdown", config);
+
+// Build retrieval augmented context
+interface RagDocument {
+  id: string;
+  query_text: string;
+  context: string;
+  page: number | null;
+}
+
+const ragDocs: RagDocument[] =
+  result.chunks?.map((chunk, idx) => ({
+    id: `chunk_${idx}`,
+    query_text: chunk.content, // Text to embed and search
+    context: chunk.content, // Full context (includes heading)
+    page: chunk.metadata?.firstPage ?? null,
+  })) || [];
+
+console.log(`Built RAG documents: ${ragDocs.length}`);
+```
--- a/docs/snippets/wasm/advanced/custom-pipeline.ts
+++ b/docs/snippets/wasm/advanced/custom-pipeline.ts
@@ -0,0 +1,50 @@
+import type { ExtractionResult } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+interface ProcessingStep {
+  name: string;
+  process: (result: ExtractionResult) => Promise<ExtractionResult>;
+}
+
+async function createExtractionPipeline(
+  steps: ProcessingStep[],
+  bytes: Uint8Array,
+  mimeType: string,
+) {
+  await initWasm();
+
+  let result = await extractBytes(bytes, mimeType);
+
+  for (const step of steps) {
+    console.log(`Executing step: ${step.name}`);
+    result = await step.process(result);
+  }
+
+  return result;
+}
+
+const pipeline: ProcessingStep[] = [
+  {
+    name: "Text Normalization",
+    process: async (result) => ({
+      ...result,
+      content: result.content.replace(/\s+/g, " ").trim(),
+    }),
+  },
+  {
+    name: "Language Detection",
+    process: async (result) => result,
+  },
+  {
+    name: "Chunking",
+    process: async (result) => result,
+  },
+];
+
+createExtractionPipeline(
+  pipeline,
+  new Uint8Array(await fetch("doc.pdf").then((r) => r.arrayBuffer())),
+  "application/pdf",
+)
+  .then((r) => console.log("Pipeline complete:", r))
+  .catch(console.error);
--- a/docs/snippets/wasm/advanced/error-recovery.ts
+++ b/docs/snippets/wasm/advanced/error-recovery.ts
@@ -0,0 +1,34 @@
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+async function extractWithRetry(bytes: Uint8Array, mimeType: string, maxRetries: number = 3) {
+  await initWasm();
+
+  let lastError: Error | null = null;
+
+  for (let attempt = 1; attempt <= maxRetries; attempt++) {
+    try {
+      console.log(`Extraction attempt ${attempt}/${maxRetries}`);
+      const result = await extractBytes(bytes, mimeType);
+      console.log("Extraction successful");
+      return result;
+    } catch (error) {
+      lastError = error instanceof Error ? error : new Error(String(error));
+      console.warn(`Attempt ${attempt} failed:`, lastError.message);
+
+      if (attempt < maxRetries) {
+        const delay = 2 ** attempt * 100;
+        console.log(`Retrying in ${delay}ms...`);
+        await new Promise((resolve) => setTimeout(resolve, delay));
+      }
+    }
+  }
+
+  throw new Error(`Extraction failed after ${maxRetries} attempts: ${lastError?.message}`);
+}
+
+extractWithRetry(
+  new Uint8Array(await fetch("doc.pdf").then((r) => r.arrayBuffer())),
+  "application/pdf",
+)
+  .then((r) => console.log("Final result:", r))
+  .catch(console.error);
--- a/docs/snippets/wasm/advanced/keyword_extraction_config.md
+++ b/docs/snippets/wasm/advanced/keyword_extraction_config.md
@@ -0,0 +1,105 @@
+```typescript title="WASM - Keyword Extraction Setup"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Note: Keyword extraction requires the 'keywords' feature,
+// which may not be available in all WASM builds.
+// This example shows the configuration structure.
+
+const config = {
+  // Extraction configuration
+  outputFormat: "markdown",
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Keyword extraction would be performed on the extracted text
+// using external libraries or post-processing
+console.log(`Extracted text: ${result.content.substring(0, 100)}...`);
+
+// Example post-processing to extract keywords
+// (requires external keyword extraction library)
+const keywords = new Set<string>();
+const words = result.content
+  .toLowerCase()
+  .split(/\s+/)
+  .filter((w) => w.length > 4); // Simple heuristic: words > 4 chars
+
+words.forEach((word) => {
+  keywords.add(word);
+});
+
+console.log(`Extracted keywords: ${Array.from(keywords).slice(0, 10).join(", ")}`);
+```
+
+```typescript title="WASM - Keyword Filtering"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  outputFormat: "markdown",
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Common stopwords to exclude
+const stopwords = new Set([
+  "the",
+  "a",
+  "an",
+  "and",
+  "or",
+  "but",
+  "in",
+  "on",
+  "at",
+  "to",
+  "for",
+  "of",
+  "with",
+  "from",
+  "by",
+  "is",
+  "are",
+  "was",
+  "were",
+  "be",
+  "been",
+  "have",
+  "has",
+  "had",
+  "do",
+  "does",
+  "did",
+  "will",
+  "would",
+  "could",
+  "should",
+]);
+
+// Extract and filter keywords
+const text = result.content.toLowerCase();
+const words = text.split(/\s+/);
+const keywordCounts = new Map<string, number>();
+
+words.forEach((word) => {
+  const cleaned = word.replace(/[^\w]/g, "");
+  if (cleaned.length > 4 && !stopwords.has(cleaned)) {
+    keywordCounts.set(cleaned, (keywordCounts.get(cleaned) || 0) + 1);
+  }
+});
+
+// Get top keywords by frequency
+const topKeywords = Array.from(keywordCounts.entries())
+  .sort((a, b) => b[1] - a[1])
+  .slice(0, 10)
+  .map(([word, count]) => `${word} (${count})`);
+
+console.log(`Top keywords: ${topKeywords.join(", ")}`);
+```
+
+<!-- snippet:syntax-only --> - Native keyword extraction requires the `keywords` feature which may not be compiled into WASM builds.
--- a/docs/snippets/wasm/advanced/keyword_extraction_example.md
+++ b/docs/snippets/wasm/advanced/keyword_extraction_example.md
@@ -0,0 +1,110 @@
+```typescript title="WASM - Extract and Score Keywords"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  outputFormat: "markdown",
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// TF-IDF style keyword extraction (simplified)
+interface Keyword {
+  term: string;
+  frequency: number;
+  uniquePositions: number[];
+  score: number;
+}
+
+const text = result.content.toLowerCase();
+const words = text.split(/[\s\n\t]+/);
+const tokenMap = new Map<string, number[]>();
+
+// Record word positions
+words.forEach((word, idx) => {
+  const cleaned = word.replace(/[^\w]/g, "");
+  if (cleaned.length > 3) {
+    if (!tokenMap.has(cleaned)) {
+      tokenMap.set(cleaned, []);
+    }
+    tokenMap.get(cleaned)!.push(idx);
+  }
+});
+
+// Calculate keyword scores
+const keywords: Keyword[] = Array.from(tokenMap.entries()).map(([term, positions]) => ({
+  term,
+  frequency: positions.length,
+  uniquePositions: positions,
+  score: positions.length * Math.log(words.length / positions.length),
+}));
+
+// Sort by score (TF-IDF approximation)
+keywords.sort((a, b) => b.score - a.score);
+
+// Top 15 keywords
+const topKeywords = keywords.slice(0, 15);
+console.log("Top Keywords:");
+topKeywords.forEach((kw) => {
+  console.log(`  ${kw.term}: ${kw.frequency} occurrences (score: ${kw.score.toFixed(2)})`);
+});
+```
+
+```typescript title="WASM - Keyword Context Window"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  outputFormat: "markdown",
+  chunking: {
+    maxChars: 1000,
+    chunkOverlap: 200,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+interface KeywordContext {
+  keyword: string;
+  contexts: string[];
+}
+
+// Return each case-insensitive occurrence of `keyword` with up to `contextWindow` chars per side
+function extractKeywordContexts(
+  text: string,
+  keyword: string,
+  contextWindow: number = 50,
+): string[] {
+  const contexts: string[] = [];
+  // An empty pattern matches at every index without advancing, so the scan would never end
+  if (keyword.length === 0) return contexts;
+  // Escape regex metacharacters so keywords such as "c++" or "a.b" are matched literally
+  const regex = new RegExp(keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "gi");
+  for (let match = regex.exec(text); match !== null; match = regex.exec(text)) {
+    const start = Math.max(0, match.index - contextWindow);
+    const end = Math.min(text.length, match.index + match[0].length + contextWindow);
+    contexts.push(text.substring(start, end));
+  }
+  return contexts;
+}
+
+// Extract context for top keywords
+const topKeywords = ["document", "analysis", "results"];
+const keywordContexts: KeywordContext[] = topKeywords.map((kw) => ({
+  keyword: kw,
+  contexts: extractKeywordContexts(result.content, kw, 40),
+}));
+
+keywordContexts.forEach((kc) => {
+  console.log(`\n"${kc.keyword}" appears ${kc.contexts.length} times:`);
+  kc.contexts.slice(0, 2).forEach((ctx, idx) => {
+    console.log(`  [${idx + 1}] ...${ctx}...`);
+  });
+});
+```
+
+<!-- snippet:syntax-only --> - Keyword extraction without native YAKE/RAKE requires manual text processing.
--- a/docs/snippets/wasm/advanced/language_detection_config.md
+++ b/docs/snippets/wasm/advanced/language_detection_config.md
@@ -0,0 +1,84 @@
+```typescript title="WASM - Basic Language Detection"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.75,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+console.log(`Primary language: ${result.metadata?.language}`);
+console.log(`Language confidence: ${result.metadata?.languageConfidence}`);
+console.log(`Detected languages: ${result.detectedLanguages?.join(", ")}`);
+```
+
+```typescript title="WASM - Multi-Language Detection"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.6,
+    detectMultiple: true, // Enable detection of multiple languages
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+console.log(`Primary language: ${result.metadata?.language}`);
+console.log(`Confidence score: ${result.metadata?.languageConfidence?.toFixed(3)}`);
+console.log(`All detected languages: ${result.detectedLanguages?.join(", ")}`);
+
+// Use detected language for downstream processing
+if (result.detectedLanguages && result.detectedLanguages.length > 1) {
+  console.log("Document contains multiple languages - enable multilingual NLP processing");
+}
+```
+
+```typescript title="WASM - Language-Specific Extraction"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Detect a single primary language and enable quality processing for the extraction
+const config = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.8,
+    detectMultiple: false,
+  },
+  // Quality processing is switched on with a top-level flag, as in the quality-processing
+  // snippets; it is not nested under a `quality` object
+  enableQualityProcessing: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/plain", config);
+
+// Process result based on detected language
+const language = result.metadata?.language;
+console.log(`Processing document in language: ${language}`);
+
+// Example: Apply language-specific rules
+const languageConfig: Record<string, { cleanWhitespace: boolean; normalizeText: boolean }> = {
+  en: { cleanWhitespace: true, normalizeText: true },
+  zh: { cleanWhitespace: false, normalizeText: true }, // Chinese: preserve whitespace patterns
+  ja: { cleanWhitespace: false, normalizeText: false }, // Japanese: preserve as-is
+  ar: { cleanWhitespace: true, normalizeText: true }, // Arabic
+};
+
+const langConfig = languageConfig[language as string] || {
+  cleanWhitespace: true,
+  normalizeText: true,
+};
+console.log(`Language config: ${JSON.stringify(langConfig)}`);
+```
--- a/docs/snippets/wasm/advanced/language_detection_multilingual.md
+++ b/docs/snippets/wasm/advanced/language_detection_multilingual.md
@@ -0,0 +1,171 @@
+```typescript title="WASM - Detect and Process Multilingual Content"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.7,
+    detectMultiple: true,
+  },
+  chunking: {
+    maxChars: 800,
+    chunkOverlap: 200,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+interface MultilingualChunk {
+  index: number;
+  text: string;
+  language: string;
+  confidence: number;
+}
+
+// Detect language for each chunk
+const multilingualChunks: MultilingualChunk[] =
+  result.chunks?.map((chunk, idx) => {
+    // Simple language detection based on character ranges
+    const text = chunk.content;
+    let detectedLang = result.metadata?.language || "en";
+    let confidence = result.metadata?.languageConfidence || 0.5;
+
+    // Check for specific character patterns
+    if (/[一-鿿]/.test(text)) detectedLang = "zh"; // Chinese
+    if (/[぀-ゟ゠-ヿ]/.test(text)) detectedLang = "ja"; // Japanese
+    if (/[가-힯]/.test(text)) detectedLang = "ko"; // Korean
+    if (/[؀-ۿ]/.test(text)) detectedLang = "ar"; // Arabic
+    if (/[Ѐ-ӿ]/.test(text)) detectedLang = "ru"; // Russian
+
+    return {
+      index: idx,
+      text: text.substring(0, 50),
+      language: detectedLang,
+      confidence: confidence,
+    };
+  }) || [];
+
+// Group chunks by language
+const chunksByLanguage = new Map<string, MultilingualChunk[]>();
+multilingualChunks.forEach((chunk) => {
+  if (!chunksByLanguage.has(chunk.language)) {
+    chunksByLanguage.set(chunk.language, []);
+  }
+  chunksByLanguage.get(chunk.language)!.push(chunk);
+});
+
+console.log("Chunks by detected language:");
+chunksByLanguage.forEach((chunks, lang) => {
+  console.log(`  ${lang}: ${chunks.length} chunks`);
+  chunks.slice(0, 2).forEach((c) => {
+    console.log(`    Chunk ${c.index}: "${c.text}..."`);
+  });
+});
+```
+
+```typescript title="WASM - Language-Specific Text Processing"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  languageDetection: {
+    enabled: true,
+    detectMultiple: true,
+  },
+  outputFormat: "markdown",
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "text/plain", config);
+
+// Language-specific text normalization
+interface LanguageProcessor {
+  normalize: (text: string) => string;
+  splitSentences: (text: string) => string[];
+}
+
+const processors: Record<string, LanguageProcessor> = {
+  en: {
+    normalize: (t) => t.replace(/\s+/g, " ").trim(),
+    splitSentences: (t) => t.split(/[.!?]+/).filter((s) => s.length > 0),
+  },
+  zh: {
+    normalize: (t) => t.replace(/\s+/g, "").trim(), // CJK: no word spacing
+    splitSentences: (t) => t.split(/[。！？]+/).filter((s) => s.length > 0),
+  },
+  ja: {
+    normalize: (t) => t.replace(/\s+/g, "").trim(),
+    splitSentences: (t) => t.split(/[。！？]+/).filter((s) => s.length > 0),
+  },
+  ar: {
+    normalize: (t) => t.replace(/\s+/g, " ").trim(),
+    splitSentences: (t) => t.split(/[.!?،؟]+/).filter((s) => s.length > 0),
+  },
+};
+
+const language = result.metadata?.language || "en";
+const processor = processors[language] || processors["en"];
+
+const normalized = processor.normalize(result.content);
+const sentences = processor.splitSentences(result.content);
+
+console.log(`Language: ${language}`);
+console.log(`Normalized length: ${normalized.length}`);
+console.log(`Detected sentences: ${sentences.length}`);
+sentences.slice(0, 3).forEach((sent, idx) => {
+  console.log(`  [${idx + 1}] ${sent.substring(0, 60)}...`);
+});
+```
+
+```typescript title="WASM - Multilingual Chunking Strategy"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+interface LanguageChunkConfig {
+  maxChars: number;
+  overlap: number;
+}
+
+// Different chunking strategies per language
+const chunkConfigByLanguage: Record<string, LanguageChunkConfig> = {
+  en: { maxChars: 512, overlap: 128 }, // English: word-based chunking
+  zh: { maxChars: 256, overlap: 64 }, // Chinese: smaller chunks due to character density
+  ja: { maxChars: 300, overlap: 75 }, // Japanese: medium chunks
+  ar: { maxChars: 400, overlap: 100 }, // Arabic: larger chunks for context
+  default: { maxChars: 512, overlap: 128 },
+};
+
+// Detect language first
+const languageDetectConfig = {
+  languageDetection: {
+    enabled: true,
+    minConfidence: 0.8,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const langResult = await extractBytes(bytes, "text/plain", languageDetectConfig);
+const detectedLang = langResult.metadata?.language || "en";
+
+// Re-extract with language-specific chunking
+const chunkConfig = chunkConfigByLanguage[detectedLang] || chunkConfigByLanguage["default"];
+const finalConfig = {
+  languageDetection: {
+    enabled: true,
+  },
+  chunking: {
+    maxChars: chunkConfig.maxChars,
+    chunkOverlap: chunkConfig.overlap,
+  },
+};
+
+const finalResult = await extractBytes(bytes, "text/plain", finalConfig);
+console.log(`Language: ${detectedLang}`);
+console.log(`Chunking strategy: maxChars=${chunkConfig.maxChars}, overlap=${chunkConfig.overlap}`);
+console.log(`Generated ${finalResult.chunks?.length} chunks`);
+```
--- a/docs/snippets/wasm/advanced/memory-management.ts
+++ b/docs/snippets/wasm/advanced/memory-management.ts
@@ -0,0 +1,37 @@
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+// Extract files BATCH_SIZE at a time, with an optional GC and a short pause between batches
+async function processLargeDocumentSet(files: File[]) {
+  await initWasm();
+
+  const BATCH_SIZE = 5;
+  const results: any[] = [];
+
+  for (let i = 0; i < files.length; i += BATCH_SIZE) {
+    const batch = files.slice(i, i + BATCH_SIZE);
+    console.log(`Processing batch ${Math.floor(i / BATCH_SIZE) + 1}`);
+
+    const batchResults = await Promise.all(
+      batch.map(async (file) =>
+        extractBytes(new Uint8Array(await file.arrayBuffer()), file.type),
+      ),
+    );
+
+    results.push(...batchResults);
+
+    // `global` is Node-only and throws a ReferenceError in browsers; `globalThis` works in both
+    const gc = (globalThis as { gc?: () => void }).gc;
+    if (gc) {
+      console.log("Running garbage collection");
+      gc();
+    }
+
+    await new Promise((resolve) => setTimeout(resolve, 100));
+  }
+
+  return results;
+}
+
+processLargeDocumentSet([])
+  .then(() => console.log("Done"))
+  .catch(console.error);
--- a/docs/snippets/wasm/advanced/parallel-extraction.ts
+++ b/docs/snippets/wasm/advanced/parallel-extraction.ts
@@ -0,0 +1,43 @@
+import { detectRuntime, extractBytes, hasWorkers, initWasm } from "@kreuzberg/wasm";
+
+async function extractInParallel(documents: Uint8Array[], mimeTypes: string[]) {
+  await initWasm();
+
+  const _runtime = detectRuntime();
+  const canUseWorkers = hasWorkers();
+
+  if (isBrowser() && canUseWorkers) {
+    return extractWithWebWorkers(documents, mimeTypes);
+  }
+
+  return Promise.all(documents.map((bytes, index) => extractBytes(bytes, mimeTypes[index])));
+}
+
+// Spread documents round-robin over a worker pool; resolves with the replies in input order
+function extractWithWebWorkers(documents: Uint8Array[], mimeTypes: string[]) {
+  const workerCount = Math.min(navigator.hardwareConcurrency ?? 2, documents.length);
+  const workers = Array.from({ length: workerCount }, () => new Worker("extraction-worker.js"));
+  // One promise chain per worker: the next document is posted only after the previous job
+  // settles, so a pending job's onmessage/onerror handlers are never overwritten by a later job
+  const tails: Promise<unknown>[] = workers.map(() => Promise.resolve());
+  const jobs = documents.map((bytes, index) => {
+    const slot = index % workers.length;
+    const run = () =>
+      new Promise((resolve, reject) => {
+        workers[slot].onmessage = (e) => resolve(e.data);
+        workers[slot].onerror = reject;
+        workers[slot].postMessage({ bytes, mimeType: mimeTypes[index] });
+      });
+    return (tails[slot] = tails[slot].then(run, run));
+  });
+  // Release the workers once every job has succeeded or one has failed
+  return Promise.all(jobs).finally(() => workers.forEach((w) => w.terminate()));
+}
+
+function isBrowser() {
+  return typeof window !== "undefined";
+}
+
+extractInParallel([new Uint8Array([1, 2, 3])], ["application/pdf"])
+  .then((results) => console.log(results))
+  .catch(console.error);
--- a/docs/snippets/wasm/advanced/quality_processing_config.md
+++ b/docs/snippets/wasm/advanced/quality_processing_config.md
@@ -0,0 +1,103 @@
+```typescript title="WASM - Enable Quality Processing"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  enableQualityProcessing: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+console.log(`Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
+console.log(`Content: ${result.content.substring(0, 100)}...`);
+
+// Quality score is 0.0-1.0; test its type rather than truthiness so a score of 0 still warns
+if (typeof result.qualityScore === "number" && result.qualityScore < 0.5) {
+  console.warn("Low quality extraction detected - consider OCR or alternative processing");
+}
+```
+
+```typescript title="WASM - Quality Monitoring"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+interface ExtractionQuality {
+  contentLength: number;
+  qualityScore: number | null;
+  assessedAs: string;
+}
+
+const config = {
+  enableQualityProcessing: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+const quality: ExtractionQuality = {
+  contentLength: result.content.length,
+  // `??` and the typeof check keep a real score of 0 ("low") distinct from a missing score
+  qualityScore: result.qualityScore ?? null,
+  assessedAs:
+    typeof result.qualityScore !== "number"
+      ? "unknown"
+      : result.qualityScore > 0.8
+        ? "high"
+        : result.qualityScore > 0.5 ? "medium" : "low",
+};
+
+console.log("Extraction Quality Report:");
+console.log(`  Content size: ${quality.contentLength} bytes`);
+console.log(`  Quality score: ${quality.qualityScore?.toFixed(3) || "N/A"}`);
+console.log(`  Assessment: ${quality.assessedAs}`);
+
+if (quality.assessedAs === "low") {
+  console.log(
+    "  Recommendation: Review raw text for encoding issues or consider alternative extraction",
+  );
+}
+```
+
+```typescript title="WASM - Quality with OCR Fallback"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+async function extractWithQualityCheck(
+  bytes: Uint8Array,
+  mimeType: string,
+): Promise<{ content: string; quality: number | null; method: string }> {
+  const config = {
+    enableQualityProcessing: true,
+  };
+
+  const result = await extractBytes(bytes, mimeType, config);
+  const qualityScore = result.qualityScore || 0;
+
+  // If quality is low, consider text extraction failed or use OCR
+  if (qualityScore < 0.5) {
+    console.warn("Low quality text extraction - alternative processing recommended");
+    return {
+      content: result.content,
+      quality: qualityScore,
+      method: "degraded-text-extraction",
+    };
+  }
+
+  return {
+    content: result.content,
+    quality: qualityScore,
+    method: "text-extraction",
+  };
+}
+
+const bytes = new Uint8Array(buffer);
+const extracted = await extractWithQualityCheck(bytes, "application/pdf");
+
+console.log(`Extraction method: ${extracted.method}`);
+console.log(`Quality score: ${extracted.quality?.toFixed(3)}`);
+console.log(`Content preview: ${extracted.content.substring(0, 80)}...`);
+```
--- a/docs/snippets/wasm/advanced/quality_processing_example.md
+++ b/docs/snippets/wasm/advanced/quality_processing_example.md
@@ -0,0 +1,161 @@
+```typescript title="WASM - Assess Text Quality"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+interface TextQualityMetrics {
+  contentLength: number;
+  lineCount: number;
+  averageLineLength: number;
+  emptyLineRatio: number;
+  specialCharRatio: number;
+  estimatedLanguages: string[];
+}
+
+// Compute size, line and character metrics for a piece of extracted text
+function assessTextQuality(content: string): TextQualityMetrics {
+  // Split on every line break: /\n+/ would swallow blank lines before they can be counted
+  const lines = content.split(/\r?\n/);
+  const nonEmptyLines = lines.filter((l) => l.trim().length > 0);
+  const totalChars = content.length;
+  const specialChars = (content.match(/[^\w\s.,:;!?\n]/g) || []).length;
+
+  // Simple language detection by character patterns
+  const detectedLangs: string[] = [];
+  if (/[a-zA-Z]/.test(content)) detectedLangs.push("en");
+  if (/[一-鿿]/.test(content)) detectedLangs.push("zh");
+  if (/[぀-ゟ゠-ヿ]/.test(content)) detectedLangs.push("ja");
+  if (/[가-힯]/.test(content)) detectedLangs.push("ko");
+
+  return {
+    contentLength: totalChars,
+    lineCount: lines.length,
+    averageLineLength:
+      nonEmptyLines.reduce((sum, l) => sum + l.length, 0) / Math.max(nonEmptyLines.length, 1),
+    emptyLineRatio: (lines.length - nonEmptyLines.length) / lines.length,
+    specialCharRatio: totalChars > 0 ? specialChars / totalChars : 0, // avoid 0 / 0 = NaN
+    estimatedLanguages: detectedLangs,
+  };
+}
+
+const config = {
+  enableQualityProcessing: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+const metrics = assessTextQuality(result.content);
+
+console.log("Text Quality Assessment:");
+console.log(`  Length: ${metrics.contentLength} characters`);
+console.log(
+  `  Lines: ${metrics.lineCount} total, avg ${metrics.averageLineLength.toFixed(1)} chars/line`,
+);
+console.log(`  Empty lines: ${(metrics.emptyLineRatio * 100).toFixed(1)}%`);
+console.log(`  Special chars: ${(metrics.specialCharRatio * 100).toFixed(2)}%`);
+console.log(`  Languages: ${metrics.estimatedLanguages.join(", ") || "unknown"}`);
+console.log(`  Kreuzberg quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
+```
+
+```typescript title="WASM - Quality-Based Content Filtering"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  enableQualityProcessing: true,
+  chunking: {
+    maxChars: 1000,
+    chunkOverlap: 200,
+    trim: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+interface QualityFilteredChunk {
+  index: number;
+  content: string;
+  quality: number;
+  kept: boolean;
+}
+
+// Filter chunks based on quality heuristics
+const qualityThreshold = 0.3; // Min ratio of visible/non-whitespace content
+const filteredChunks: QualityFilteredChunk[] =
+  result.chunks?.map((chunk, idx) => {
+    const nonWhitespaceRatio = chunk.content.replace(/\s/g, "").length / chunk.content.length;
+    const hasNumbers = /\d/.test(chunk.content);
+    const hasPunctuation = /[.!?,;:]/g.test(chunk.content);
+
+    // Quality score based on content characteristics
+    const contentQuality =
+      (nonWhitespaceRatio + (hasNumbers ? 0.2 : 0) + (hasPunctuation ? 0.1 : 0)) / 2;
+    const kept = contentQuality >= qualityThreshold;
+
+    return {
+      index: idx,
+      content: chunk.content.substring(0, 50),
+      quality: contentQuality,
+      kept,
+    };
+  }) || [];
+
+const keptChunks = filteredChunks.filter((c) => c.kept);
+console.log(`Quality-filtered chunks: ${keptChunks.length}/${filteredChunks.length}`);
+
+keptChunks.slice(0, 3).forEach((c) => {
+  console.log(`  Chunk ${c.index}: quality=${c.quality.toFixed(2)}, "${c.content}..."`);
+});
+```
+
+```typescript title="WASM - Content Encoding Validation"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  enableQualityProcessing: true,
+};
+
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// Validate text encoding quality
+interface EncodingValidation {
+  hasInvalidChars: boolean;
+  invalidCharCount: number;
+  replacementRatio: number;
+  estimatedEncoding: string;
+}
+
+function validateEncoding(content: string): EncodingValidation {
+  // Count U+FFFD replacement characters; the escape keeps the pattern intact in any file encoding
+  const replacementChars = (content.match(/\uFFFD/g) || []).length;
+  const hasInvalidChars = replacementChars > 0;
+  const replacementRatio = hasInvalidChars ? replacementChars / content.length : 0;
+
+  // Any character outside 7-bit ASCII is reported as "UTF-8"; otherwise "ASCII"
+  const estimatedEncoding = /[^\x00-\x7F]/.test(content) ? "UTF-8" : "ASCII";
+
+  return {
+    hasInvalidChars,
+    invalidCharCount: replacementChars,
+    replacementRatio,
+    estimatedEncoding,
+  };
+}
+
+const validation = validateEncoding(result.content);
+
+console.log("Content Encoding Validation:");
+console.log(`  Estimated encoding: ${validation.estimatedEncoding}`);
+console.log(`  Invalid characters: ${validation.invalidCharCount}`);
+console.log(`  Replacement ratio: ${(validation.replacementRatio * 100).toFixed(4)}%`);
+console.log(
+  `  Status: ${validation.hasInvalidChars ? "DEGRADED - encoding issues detected" : "OK"}`,
+);
+console.log(`  Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
+```
--- a/docs/snippets/wasm/advanced/streaming-extraction.ts
+++ b/docs/snippets/wasm/advanced/streaming-extraction.ts
@@ -0,0 +1,39 @@
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Download a document and extract it once the download has completed.
+ *
+ * The response body is read chunk by chunk so progress can be logged, but
+ * every chunk is buffered: extraction only starts after the whole payload
+ * is in memory.
+ *
+ * @param url - Location of the document to fetch.
+ * @param mimeType - MIME type passed to `extractBytes`. Defaults to
+ *   "application/pdf", the value that was previously hard-coded.
+ * @returns The value resolved by `extractBytes`.
+ * @throws Error when the HTTP status is not OK or the response has no body.
+ */
+async function extractStreamingDocument(url: string, mimeType = "application/pdf") {
+  await initWasm();
+
+  const response = await fetch(url);
+  if (!response.ok) throw new Error(`HTTP ${response.status}`);
+
+  const reader = response.body?.getReader();
+  if (!reader) throw new Error("No response body");
+
+  const chunks: Uint8Array[] = [];
+  let totalSize = 0;
+
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+
+    chunks.push(value);
+    totalSize += value.length;
+    console.log(`Received ${totalSize} bytes`);
+  }
+
+  // Concatenate the received chunks into one contiguous buffer.
+  const fullBuffer = new Uint8Array(totalSize);
+  let offset = 0;
+  for (const chunk of chunks) {
+    fullBuffer.set(chunk, offset);
+    offset += chunk.length;
+  }
+
+  console.log("Document fully received, extracting...");
+  const result = await extractBytes(fullBuffer, mimeType);
+
+  return result;
+}
+
+// Example run: log the extraction result, or the failure if anything rejects.
+extractStreamingDocument("https://example.com/document.pdf")
+  .then((extracted) => {
+    console.log(extracted);
+  })
+  .catch((failure) => {
+    console.error(failure);
+  });
--- a/docs/snippets/wasm/advanced/token_reduction_config.md
+++ b/docs/snippets/wasm/advanced/token_reduction_config.md
@@ -0,0 +1,75 @@
+```typescript title="WASM - Token Reduction Configuration"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  tokenReduction: {
+    mode: "aggressive",
+    preserveImportantWords: true,
+  },
+};
+
+// `buffer` is not declared in this snippet; the surrounding code must
+// provide the raw document data.
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+// This extraction ran with token reduction configured, so (presumably) the
+// text below is the reduced output rather than the original document text.
+console.log(`Reduced content: ${result.content.length} characters`);
+console.log(`Preview: ${result.content.substring(0, 100)}...`);
+
+// Token reduction modes:
+// - "aggressive": maximum reduction
+// - "balanced": moderate reduction
+// - "conservative": minimal reduction
+// NOTE(review): confirm these mode names against the kreuzberg-wasm config reference.
+```
+
+```typescript title="WASM - Token Reduction Modes"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+/** Size comparison for one token-reduction mode. */
+interface TokenReductionResult {
+  /** Mode name passed as `tokenReduction.mode`. */
+  mode: string;
+  /** Characters in the baseline extraction (no token reduction configured). */
+  originalSize: number;
+  /** Characters in the extraction produced with `mode`. */
+  reducedSize: number;
+  /** Fraction of characters removed relative to the baseline; 0 when the baseline is empty. */
+  reductionRatio: number;
+  /** First 80 characters of the reduced text. */
+  preview: string;
+}
+
+/**
+ * Extract the same document once per token-reduction mode and report how
+ * much shorter each result is than a baseline extraction.
+ *
+ * Both sizes are character counts, so the ratio compares like with like.
+ *
+ * @param bytes - Raw document data; treated as a PDF.
+ * @returns One entry per mode, in the order conservative, balanced, aggressive.
+ */
+async function compareTokenReductionModes(bytes: Uint8Array): Promise<TokenReductionResult[]> {
+  const modes = ["conservative", "balanced", "aggressive"];
+
+  // Baseline with an empty config, i.e. no tokenReduction section.
+  // NOTE(review): assumes token reduction is off by default — confirm.
+  const baseline = await extractBytes(bytes, "application/pdf", {});
+  const originalSize = baseline.content.length;
+
+  const results: TokenReductionResult[] = [];
+
+  for (const mode of modes) {
+    const config = {
+      tokenReduction: {
+        mode,
+        preserveImportantWords: true,
+      },
+    };
+
+    const result = await extractBytes(bytes, "application/pdf", config);
+    const reducedSize = result.content.length;
+
+    results.push({
+      mode,
+      originalSize,
+      reducedSize,
+      // An empty baseline would otherwise produce NaN.
+      reductionRatio: originalSize > 0 ? 1 - reducedSize / originalSize : 0,
+      preview: result.content.substring(0, 80),
+    });
+  }
+
+  return results;
+}
+
+// Compare the modes on the caller-supplied document and print one entry per mode.
+const bytes = new Uint8Array(buffer);
+const modeComparison = await compareTokenReductionModes(bytes);
+
+console.log("Token Reduction Mode Comparison:");
+for (const entry of modeComparison) {
+  console.log(`  ${entry.mode}:`);
+  console.log(`    Original: ${entry.originalSize} chars`);
+  console.log(`    Reduction: ${(entry.reductionRatio * 100).toFixed(1)}%`);
+}
+```
--- a/docs/snippets/wasm/advanced/token_reduction_example.md
+++ b/docs/snippets/wasm/advanced/token_reduction_example.md
@@ -0,0 +1,149 @@
+```typescript title="WASM - Token Counting and Cost Estimation"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  tokenReduction: {
+    mode: "balanced", // NOTE(review): accepted mode names are not visible here — confirm against the kreuzberg-wasm config reference
+    preserveImportantWords: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer); // `buffer` is not declared in this snippet; the surrounding code must provide the document data
+const result = await extractBytes(bytes, "application/pdf", config); // the input is declared to be a PDF
+
+// Rough token estimate based on the rule of thumb that one token is about
+// four characters; partial tokens are rounded up.
+function estimateTokenCount(text: string): number {
+  const charsPerToken = 4;
+  const charCount = text.length;
+  return Math.ceil(charCount / charsPerToken);
+}
+
+// Cost breakdown for sending the extracted text to an LLM
+// (example pricing: GPT-4 Turbo).
+interface PricingEstimate {
+  tokenCount: number;
+  inputCost: number;
+  outputCostEstimate: number;
+  totalEstimate: number;
+}
+
+const inputPricePerToken = 0.00001; // $0.01/1K tokens
+const outputPricePerToken = 0.00003; // $0.03/1K tokens
+const tokenCount = estimateTokenCount(result.content);
+
+// The output is assumed to be roughly half as long as the input.
+const estimatedInputCost = tokenCount * inputPricePerToken;
+const estimatedOutputCost = tokenCount * outputPricePerToken * 0.5;
+
+const costEstimate: PricingEstimate = {
+  tokenCount,
+  inputCost: estimatedInputCost,
+  outputCostEstimate: estimatedOutputCost,
+  totalEstimate: estimatedInputCost + estimatedOutputCost,
+};
+
+// Prints one indented dollar amount with six decimal places.
+const printCost = (label: string, amount: number): void => {
+  console.log(`  ${label}: $${amount.toFixed(6)}`);
+};
+
+console.log("Token and Cost Analysis:");
+console.log(`  Estimated tokens: ${costEstimate.tokenCount}`);
+printCost("Input cost", costEstimate.inputCost);
+printCost("Output cost (est.)", costEstimate.outputCostEstimate);
+printCost("Total cost (est.)", costEstimate.totalEstimate);
+```
+
+```typescript title="WASM - Token Reduction for Context Windows"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+/** Result of comparing a text's estimated token count with a context window. */
+interface ContextWindowFit {
+  /** Length of the text in characters (UTF-16 code units). */
+  contentLength: number;
+  /** Estimated tokens, at four characters per token, rounded up. */
+  estimatedTokens: number;
+  /** True when the estimate does not exceed the window size. */
+  fitsInWindow: boolean;
+  /** Estimated tokens divided by the window size (1 means exactly full). */
+  utilization: number;
+}
+
+/**
+ * Estimate whether `content` fits into a context window.
+ *
+ * @param content - Text to measure.
+ * @param contextWindowSize - Window capacity in tokens; must be positive.
+ * @returns Token estimate, fit flag and utilization.
+ * @throws RangeError when `contextWindowSize` is not a positive number.
+ */
+function checkContextWindowFit(
+  content: string,
+  contextWindowSize: number = 4096,
+): ContextWindowFit {
+  // Written as a negated comparison so NaN is rejected too. A non-positive
+  // size would otherwise produce an Infinity or NaN utilization.
+  if (!(contextWindowSize > 0)) {
+    throw new RangeError(`contextWindowSize must be positive, got ${contextWindowSize}`);
+  }
+
+  const estimatedTokens = Math.ceil(content.length / 4);
+  // Content that exactly fills the window (utilization 1) still fits.
+  const fitsInWindow = estimatedTokens <= contextWindowSize;
+  const utilization = estimatedTokens / contextWindowSize;
+
+  return {
+    contentLength: content.length,
+    estimatedTokens,
+    fitsInWindow,
+    utilization,
+  };
+}
+
+// Use aggressive mode for large documents
+const config = {
+  tokenReduction: { mode: "aggressive", preserveImportantWords: true },
+};
+
+// `buffer` is expected to be provided by the surrounding code.
+const bytes = new Uint8Array(buffer);
+const result = await extractBytes(bytes, "application/pdf", config);
+
+const contextFit = checkContextWindowFit(result.content, 4096);
+
+// Prints one indented "label: value" line of the analysis.
+const printFitField = (label: string, value: string | number): void => {
+  console.log(`  ${label}: ${value}`);
+};
+
+console.log("Context Window Analysis:");
+printFitField("Content", `${contextFit.contentLength} characters`);
+printFitField("Tokens (est.)", contextFit.estimatedTokens);
+printFitField("Fits in 4K context", contextFit.fitsInWindow ? "YES" : "NO");
+printFitField("Utilization", `${(contextFit.utilization * 100).toFixed(1)}%`);
+
+if (contextFit.fitsInWindow === false) {
+  console.log("  Note: Consider chunking or more aggressive token reduction");
+}
+```
+
+```typescript title="WASM - Selective Token Preservation"
+import init, { extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const config = {
+  tokenReduction: {
+    mode: "balanced", // NOTE(review): accepted mode names are not visible here — confirm against the kreuzberg-wasm config reference
+    preserveImportantWords: true,
+  },
+};
+
+const bytes = new Uint8Array(buffer); // `buffer` is not declared in this snippet; the surrounding code must provide the document data
+const result = await extractBytes(bytes, "application/pdf", config); // the input is declared to be a PDF
+
+// Extract important terms manually
+/** A frequently occurring long word together with its heuristic score. */
+interface ImportantTerm {
+  /** Lower-cased word with punctuation removed. */
+  term: string;
+  /** Number of occurrences in the text. */
+  frequency: number;
+  /** ln(frequency) * (term length / 10); larger means more prominent. */
+  importance: number;
+}
+
+/**
+ * Rank the long words of a text by frequency and length.
+ *
+ * Words are lower-cased, stripped of punctuation, and counted only when
+ * the cleaned form is longer than five characters.
+ *
+ * @param content - Text to analyse.
+ * @param threshold - Minimum number of occurrences for a word to be kept.
+ * @returns Up to 20 terms, highest importance first.
+ */
+function extractImportantTerms(content: string, threshold: number = 3): ImportantTerm[] {
+  const frequencyMap = new Map<string, number>();
+
+  for (const word of content.toLowerCase().split(/\s+/)) {
+    // Keep letters, combining marks and digits from every script. The
+    // ASCII-only \w would delete accented and non-Latin letters.
+    const cleaned = word.replace(/[^\p{L}\p{M}\p{N}_]/gu, "");
+    if (cleaned.length > 5) {
+      // Only consider longer words
+      frequencyMap.set(cleaned, (frequencyMap.get(cleaned) ?? 0) + 1);
+    }
+  }
+
+  return Array.from(frequencyMap.entries())
+    .filter(([, freq]) => freq >= threshold)
+    .map(([term, freq]) => ({
+      term,
+      frequency: freq,
+      importance: Math.log(freq) * (term.length / 10),
+    }))
+    .sort((a, b) => b.importance - a.importance)
+    .slice(0, 20);
+}
+
+// Rank the terms of the extracted text and list them, most important first.
+const importantTerms = extractImportantTerms(result.content);
+
+console.log("Important Terms (likely preserved by token reduction):");
+for (const entry of importantTerms) {
+  console.log(
+    `  "${entry.term}": ${entry.frequency} occurrences (importance: ${entry.importance.toFixed(2)})`,
+  );
+}
+```
--- a/docs/snippets/wasm/advanced/worker-extraction.ts
+++ b/docs/snippets/wasm/advanced/worker-extraction.ts
@@ -0,0 +1,57 @@
+/** A request that is waiting for, or currently assigned to, a worker. */
+interface ExtractionTask {
+  bytes: Uint8Array;
+  mimeType: string;
+  resolve: (value: unknown) => void;
+  reject: (error: unknown) => void;
+}
+
+/**
+ * Fixed-size pool of Web Workers running "extraction-worker.js".
+ *
+ * Each worker handles one request at a time. A request stays attached to
+ * its worker until that worker answers, so every result settles the
+ * promise of the request that produced it.
+ *
+ * NOTE(review): assumes the worker script posts exactly one message per
+ * request it receives — confirm against extraction-worker.js.
+ */
+class ExtractionWorker {
+  /** Every worker owned by the pool. */
+  private readonly workerPool: Worker[] = [];
+  /** Workers that currently have no request assigned. */
+  private readonly idleWorkers: Worker[] = [];
+  /** Request currently being processed by each busy worker. */
+  private readonly inFlight = new Map<Worker, ExtractionTask>();
+  /** Requests waiting for a worker, oldest first. */
+  private readonly taskQueue: ExtractionTask[] = [];
+  private terminated = false;
+
+  /**
+   * @param workerCount - Number of workers to start. Defaults to
+   *   `navigator.hardwareConcurrency`, or 2 when that is unavailable.
+   */
+  constructor(workerCount?: number) {
+    const count =
+      workerCount ?? (typeof navigator !== "undefined" ? (navigator.hardwareConcurrency ?? 2) : 2);
+    for (let i = 0; i < count; i++) {
+      const worker = new Worker("extraction-worker.js");
+      worker.onmessage = (e) => this.handleWorkerMessage(worker, e.data);
+      worker.onerror = (e) => this.handleWorkerError(worker, e);
+      this.workerPool.push(worker);
+      this.idleWorkers.push(worker);
+    }
+  }
+
+  /**
+   * Queue a document for extraction.
+   *
+   * @param bytes - Raw document data.
+   * @param mimeType - MIME type forwarded to the worker.
+   * @returns A promise settled with the message data posted by the worker;
+   *   it rejects on a worker error or when the pool is terminated.
+   */
+  async extract(bytes: Uint8Array, mimeType: string) {
+    return new Promise((resolve, reject) => {
+      if (this.terminated) {
+        reject(new Error("ExtractionWorker has been terminated"));
+        return;
+      }
+      this.taskQueue.push({ bytes, mimeType, resolve, reject });
+      this.processTasks();
+    });
+  }
+
+  /** Hand queued requests to idle workers until either list runs out. */
+  private processTasks() {
+    while (this.taskQueue.length > 0 && this.idleWorkers.length > 0) {
+      const worker = this.idleWorkers.shift();
+      const task = this.taskQueue.shift();
+      if (!worker || !task) return;
+
+      // Remember the assignment so the reply can be routed to this request.
+      this.inFlight.set(worker, task);
+      try {
+        worker.postMessage({ bytes: task.bytes, mimeType: task.mimeType });
+      } catch (error: unknown) {
+        // The message could not be sent: free the worker and fail the request.
+        this.release(worker);
+        task.reject(error);
+      }
+    }
+  }
+
+  /** Detach and return the request assigned to `worker`, marking the worker idle. */
+  private release(worker: Worker): ExtractionTask | undefined {
+    const task = this.inFlight.get(worker);
+    // Ignore events from a worker that has no request assigned.
+    if (!task) return undefined;
+    this.inFlight.delete(worker);
+    this.idleWorkers.push(worker);
+    return task;
+  }
+
+  private handleWorkerMessage(worker: Worker, result: unknown) {
+    const task = this.release(worker);
+    if (!task) return;
+    task.resolve(result);
+    this.processTasks();
+  }
+
+  private handleWorkerError(worker: Worker, error: unknown) {
+    const task = this.release(worker);
+    if (!task) return;
+    task.reject(error);
+    this.processTasks();
+  }
+
+  /** Stop all workers and reject every request that has not completed. */
+  terminate() {
+    this.terminated = true;
+    this.workerPool.forEach((w) => w.terminate());
+
+    const abandoned = [...this.inFlight.values(), ...this.taskQueue];
+    this.inFlight.clear();
+    this.taskQueue.length = 0;
+    this.idleWorkers.length = 0;
+
+    const reason = new Error("ExtractionWorker terminated before the task completed");
+    abandoned.forEach((task) => task.reject(reason));
+  }
+}