Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/wasm/plugins/clear_plugins.md
+++ b/docs/snippets/wasm/plugins/clear_plugins.md
@@ -0,0 +1,23 @@
+# Clear All Registered Plugins
+
+Clear all registered OCR backends, post-processors, or validators from the global registry.
+
+```typescript title="WASM"
+import init, { clearOcrBackends, clearPostProcessors, clearValidators } from "kreuzberg-wasm";
+
+await init();
+
+// Clear all OCR backends
+clearOcrBackends();
+console.log("OCR backends cleared");
+
+// Clear all post-processors
+clearPostProcessors();
+console.log("Post-processors cleared");
+
+// Clear all validators
+clearValidators();
+console.log("Validators cleared");
+```
+
+Use when you need to reset the plugin registries to their initial state or remove all custom plugins.
--- a/docs/snippets/wasm/plugins/embedding_backend.md
+++ b/docs/snippets/wasm/plugins/embedding_backend.md
@@ -0,0 +1,34 @@
+# Register Custom Embedding Backend
+
+Register a custom embedding backend that provides vector embeddings for text.
+
+```typescript title="WASM"
+import init, { registerEmbeddingBackend } from "kreuzberg-wasm";
+
+await init();
+
+// Define a custom embedding backend
+const customEmbedding = {
+  // Dimensionality of every vector returned by `embed`.
+  dimensions: () => 384,
+  // Returns one deterministic dummy vector per input text, in input order.
+  embed: (texts) => {
+    return texts.map((text) => {
+      // `charCodeAt(0)` is NaN for an empty string, which would turn every
+      // component of the vector into NaN; fall back to a seed of 0 instead.
+      const seed = text.length > 0 ? text.charCodeAt(0) : 0;
+      // Generate a dummy 384-dimensional vector
+      return Array.from({ length: 384 }, (_, i) => Math.sin((seed + i) / 384));
+    });
+  },
+};
+
+try {
+  registerEmbeddingBackend(customEmbedding);
+  console.log("Custom embedding backend registered");
+} catch (error) {
+  console.error("Failed to register embedding backend:", error);
+}
+```
+
+The embedding backend must implement:
+
+- `dimensions()`: Returns the dimensionality of the embeddings
+- `embed(texts: string[])`: Computes vector embeddings for the given texts
--- a/docs/snippets/wasm/plugins/extractor_registration.md
+++ b/docs/snippets/wasm/plugins/extractor_registration.md
@@ -0,0 +1,67 @@
+# Document Extractor Registration
+
+Register a custom document extractor plugin in WASM that implements the required interface.
+
+```typescript title="WASM"
+import init, {
+  registerDocumentExtractor,
+  unregisterDocumentExtractor,
+  listDocumentExtractors,
+  extractBytes,
+} from "kreuzberg-wasm";
+
+await init();
+
+// Define a custom extractor as a plain JS object with required methods
+const customExtractor = {
+  // Required: extract document bytes
+  // Takes: (bytes: Uint8Array, mimeType: string, config: object) -> Promise<string>
+  // The resolved string is a JSON-serialized object ({ text, page_count, language }).
+  extractBytes: async (bytes, mimeType, config) => {
+    if (mimeType !== "application/x-custom") {
+      throw new Error("Unsupported MIME type");
+    }
+    // Custom extraction logic: decode the raw bytes as text
+    const text = new TextDecoder().decode(bytes);
+    // Only the first 100 characters of the decoded text are kept in this example
+    return JSON.stringify({
+      text: `Extracted: ${text.slice(0, 100)}`,
+      page_count: 1,
+      language: "en",
+    });
+  },
+
+  // Required: list supported MIME types as JSON array
+  supportedMimeTypes: () => {
+    return JSON.stringify(["application/x-custom"]);
+  },
+
+  // Optional: plugin version string
+  version: () => "1.0.0",
+};
+
+// Register the custom extractor
+try {
+  registerDocumentExtractor(customExtractor);
+  console.log("Extractor registered successfully");
+} catch (error) {
+  console.error("Failed to register extractor:", error);
+}
+
+// List all extractors (includes your custom one)
+const extractors = listDocumentExtractors();
+console.log("Available extractors:", extractors);
+
+// Use the custom extractor via normal extraction
+const customBytes = new Uint8Array([0x00, 0x01, 0x02]);
+const result = await extractBytes(customBytes, "application/x-custom", {});
+console.log("Extraction result:", result);
+
+// Unregister when done
+try {
+  unregisterDocumentExtractor("wasm_bridge");
+  console.log("Extractor unregistered");
+} catch (error) {
+  console.error("Failed to unregister:", error);
+}
+```
+
+The extractor object must implement `extractBytes` and `supportedMimeTypes` methods. Optional methods: `initialize()`, `shutdown()`, and `version()` for lifecycle management.
--- a/docs/snippets/wasm/plugins/list_plugins.md
+++ b/docs/snippets/wasm/plugins/list_plugins.md
@@ -0,0 +1,39 @@
+# List Registered Plugins
+
+List all registered plugins of each type: OCR backends, post-processors, validators, and document extractors.
+
+```typescript title="WASM"
+import init, {
+  listDocumentExtractors,
+  listOcrBackends,
+  listPostProcessors,
+  listValidators,
+} from "kreuzberg-wasm";
+
+await init();
+
+// List all document extractors
+const extractors = listDocumentExtractors();
+console.log("Document extractors:", extractors);
+
+// List all OCR backends
+const ocrBackends = listOcrBackends();
+console.log("OCR backends:", ocrBackends);
+
+// List all post-processors
+const processors = listPostProcessors();
+console.log("Post-processors:", processors);
+
+// List all validators
+const validators = listValidators();
+console.log("Validators:", validators);
+
+// Count registered plugins
+console.log(`Total plugins registered:
+  Extractors: ${extractors.length}
+  OCR backends: ${ocrBackends.length}
+  Post-processors: ${processors.length}
+  Validators: ${validators.length}`);
+```
+
+Use this to verify which plugins are available before extraction or to debug plugin registration issues.
--- a/docs/snippets/wasm/plugins/min_length_validator.md
+++ b/docs/snippets/wasm/plugins/min_length_validator.md
@@ -0,0 +1,51 @@
+# Minimum Length Text Validator
+
+Register a validator that rejects extraction results whose text is shorter than a minimum length threshold.
+
+```typescript title="WASM"
+import init, { registerValidator, extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+const MIN_LENGTH = 10;
+
+// Define a minimum length validator
+const minLengthValidator = {
+  // Accepts a result only when its text has at least MIN_LENGTH characters;
+  // a missing text counts as length 0.
+  validate: (extractionResult) => {
+    const length = extractionResult.text?.length || 0;
+    const longEnough = length >= MIN_LENGTH;
+
+    return longEnough
+      ? { valid: true, error: null }
+      : { valid: false, error: `Text too short: ${length} < ${MIN_LENGTH}` };
+  },
+};
+
+try {
+  registerValidator(minLengthValidator);
+  console.log(`Min length validator registered (threshold: ${MIN_LENGTH})`);
+} catch (error) {
+  console.error("Failed to register validator:", error);
+}
+
+// Now extract with validation enabled
+const pdfBytes = new Uint8Array([
+  /* PDF content */
+]);
+const config = {
+  ocr: null,
+  chunking: null,
+};
+
+const result = await extractBytes(pdfBytes, "application/pdf", config);
+console.log("Validated result:", result);
+```
+
+This validator ensures extracted text meets minimum quality standards by checking length.
--- a/docs/snippets/wasm/plugins/ocr-backend-custom.ts
+++ b/docs/snippets/wasm/plugins/ocr-backend-custom.ts
@@ -0,0 +1,57 @@
+import {
+  initWasm,
+  listOcrBackends,
+  registerOcrBackend,
+  unregisterOcrBackend,
+} from "@kreuzberg/wasm";
+
+/**
+ * Minimal OCR backend used to demonstrate registration.
+ * `recognize` resolves with a fixed placeholder string instead of running OCR.
+ */
+class CustomOcrBackend {
+  private name = "custom-ocr";
+  private enabled = true;
+
+  /** Logs that the backend is being initialized. */
+  async initialize(): Promise<void> {
+    console.log("Initializing custom OCR backend");
+  }
+
+  /** Logs the language and input size, then resolves with a placeholder result. */
+  async recognize(imageData: Uint8Array, language: string): Promise<string> {
+    const byteCount = imageData.byteLength;
+    console.log(`Recognizing text in ${language} from ${byteCount} bytes`);
+    return "Placeholder OCR result";
+  }
+
+  /** Name of this backend. */
+  getName(): string {
+    return this.name;
+  }
+
+  /** Whether the backend is currently enabled. */
+  isEnabled(): boolean {
+    return this.enabled;
+  }
+
+  /** Enables or disables the backend. */
+  setEnabled(enabled: boolean): void {
+    this.enabled = enabled;
+  }
+
+  /** Logs that the backend is being cleaned up. */
+  async cleanup(): Promise<void> {
+    console.log("Cleaning up custom OCR backend");
+  }
+}
+
+/**
+ * Walks through the initialize, register, list, unregister and cleanup
+ * steps for a `CustomOcrBackend`, logging the backend list before and after
+ * unregistration.
+ */
+async function demonstrateCustomBackend() {
+  await initWasm();
+
+  const customBackend = new CustomOcrBackend();
+  await customBackend.initialize();
+  registerOcrBackend(customBackend);
+
+  console.log("Registered backends:", listOcrBackends());
+
+  unregisterOcrBackend("custom-ocr");
+
+  console.log("Backends after unregister:", listOcrBackends());
+
+  await customBackend.cleanup();
+}
+
+demonstrateCustomBackend().catch(console.error);
--- a/docs/snippets/wasm/plugins/pdf_metadata_extractor.md
+++ b/docs/snippets/wasm/plugins/pdf_metadata_extractor.md
@@ -0,0 +1,49 @@
+# PDF Metadata Post-Processor
+
+Register a post-processor that extracts and enriches extraction results with PDF metadata.
+
+```typescript title="WASM"
+import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Define a PDF metadata extractor post-processor
+const pdfMetadataProcessor = {
+  // Processing stage declared by this plugin.
+  processingStage: () => "post-extraction",
+  // Returns a copy of the extraction result with extra metadata fields;
+  // the input object itself is not modified.
+  process: (extractionResult) => {
+    const text = extractionResult.text || "";
+    // Splitting "" (or text with leading/trailing whitespace) on /\s+/ yields
+    // empty tokens, so drop them to avoid counting phantom words.
+    const wordCount = text.split(/\s+/).filter((word) => word.length > 0).length;
+
+    // Enrich extraction with metadata
+    return {
+      ...extractionResult,
+      metadata: {
+        ...extractionResult.metadata,
+        processorName: "pdf-metadata",
+        processedAt: new Date().toISOString(),
+        wordCount,
+      },
+    };
+  },
+};
+
+try {
+  registerPostProcessor(pdfMetadataProcessor);
+  console.log("PDF metadata post-processor registered");
+} catch (error) {
+  console.error("Failed to register post-processor:", error);
+}
+
+// Extract with post-processing
+const pdfBytes = new Uint8Array([
+  /* PDF content */
+]);
+const config = {
+  ocr: null,
+  chunking: null,
+};
+
+const result = await extractBytes(pdfBytes, "application/pdf", config);
+console.log("Enriched metadata:", result.metadata);
+```
+
+The post-processor runs after extraction to enrich or transform the results.
--- a/docs/snippets/wasm/plugins/pdf_only_processor.md
+++ b/docs/snippets/wasm/plugins/pdf_only_processor.md
@@ -0,0 +1,71 @@
+# PDF-Only Post-Processor
+
+Register a post-processor that only transforms PDF documents and passes all other documents through unchanged.
+
+```typescript title="WASM"
+import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Define a PDF-only post-processor
+const pdfOnlyProcessor = {
+  processingStage: () => "post-extraction",
+  // Non-PDF results are returned untouched; PDF results get extra metadata
+  // and normalized text.
+  process: (extractionResult) => {
+    const meta = extractionResult.metadata;
+
+    // A result counts as PDF when either the MIME type or the source name says so
+    const looksLikePdf = meta?.mimeType === "application/pdf" || meta?.source?.endsWith(".pdf");
+    if (!looksLikePdf) {
+      return extractionResult;
+    }
+
+    // Collapse runs of three or more newlines into one blank line, then trim
+    const normalizedText = (extractionResult.text || "").replace(/\n{3,}/g, "\n\n").trim();
+
+    return {
+      ...extractionResult,
+      metadata: {
+        ...meta,
+        pdfProcessed: true,
+        pageCount: meta?.pageCount || 1,
+      },
+      text: normalizedText,
+    };
+  },
+};
+
+try {
+  registerPostProcessor(pdfOnlyProcessor);
+  console.log("PDF-only post-processor registered");
+} catch (error) {
+  console.error("Failed to register post-processor:", error);
+}
+
+// Test with various documents
+const testDocs = [
+  {
+    bytes: new Uint8Array([
+      /* PDF */
+    ]),
+    type: "application/pdf",
+  },
+  {
+    bytes: new Uint8Array([
+      /* HTML */
+    ]),
+    type: "text/html",
+  },
+];
+
+for (const doc of testDocs) {
+  const result = await extractBytes(doc.bytes, doc.type, {});
+  console.log(`${doc.type}: PDF-specific processing applied:`, result.metadata?.pdfProcessed);
+}
+```
+
+This processor applies PDF-specific transformations only to PDF documents.
--- a/docs/snippets/wasm/plugins/plugin-lifecycle.ts
+++ b/docs/snippets/wasm/plugins/plugin-lifecycle.ts
@@ -0,0 +1,66 @@
+import { initWasm, TesseractWasmBackend } from "@kreuzberg/wasm";
+
+/**
+ * Keeps plugins in a name-keyed map and drives their optional
+ * `initialize` / `cleanup` hooks on registration and removal.
+ */
+class PluginManager {
+  private plugins: Map<string, any> = new Map();
+
+  /** Runs the plugin's `initialize` hook (when present) and stores it under `name`. */
+  async registerPlugin(name: string, plugin: any): Promise<void> {
+    console.log(`Registering plugin: ${name}`);
+
+    if (plugin.initialize) {
+      await plugin.initialize();
+    }
+    this.plugins.set(name, plugin);
+
+    console.log(`Plugin ${name} registered successfully`);
+  }
+
+  /** Runs the plugin's `cleanup` hook (when present) and removes it; warns if unknown. */
+  async unregisterPlugin(name: string): Promise<void> {
+    const registered = this.plugins.get(name);
+    if (!registered) {
+      console.warn(`Plugin ${name} not found`);
+      return;
+    }
+
+    if (registered.cleanup) {
+      await registered.cleanup();
+    }
+    this.plugins.delete(name);
+
+    console.log(`Plugin ${name} unregistered`);
+  }
+
+  /** Names of all currently registered plugins. */
+  listPlugins(): string[] {
+    return [...this.plugins.keys()];
+  }
+
+  /** Unregisters and then re-registers the same plugin instance; warns if unknown. */
+  async reloadPlugin(name: string): Promise<void> {
+    const existing = this.plugins.get(name);
+    if (!existing) {
+      console.warn(`Plugin ${name} not found`);
+      return;
+    }
+
+    console.log(`Reloading plugin: ${name}`);
+    await this.unregisterPlugin(name);
+    await this.registerPlugin(name, existing);
+  }
+}
+
+async function demonstratePluginLifecycle() {
+  await initWasm();
+
+  const manager = new PluginManager();
+
+  const backend = new TesseractWasmBackend();
+  await manager.registerPlugin("tesseract", backend);
+
+  console.log("Active plugins:", manager.listPlugins());
+
+  await manager.reloadPlugin("tesseract");
+
+  await manager.unregisterPlugin("tesseract");
+
+  console.log("Active plugins:", manager.listPlugins());
+}
+
+demonstratePluginLifecycle().catch(console.error);
--- a/docs/snippets/wasm/plugins/plugin-pipeline.ts
+++ b/docs/snippets/wasm/plugins/plugin-pipeline.ts
@@ -0,0 +1,55 @@
+import type { ExtractionResult } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+interface Plugin {
+  name: string;
+  execute: (result: ExtractionResult) => Promise<ExtractionResult>;
+}
+
+/** Strips NUL characters, collapses whitespace runs to one space, and trims the content. */
+class TextCleanerPlugin implements Plugin {
+  name = "text-cleaner";
+
+  async execute(result: ExtractionResult): Promise<ExtractionResult> {
+    const withoutNul = result.content.replace(/\x00/g, "");
+    const singleSpaced = withoutNul.replace(/\s+/g, " ");
+
+    return { ...result, content: singleSpaced.trim() };
+  }
+}
+
+/** Adds a processing timestamp and the content length to the result metadata. */
+class MetadataEnricherPlugin implements Plugin {
+  name = "metadata-enricher";
+
+  async execute(result: ExtractionResult): Promise<ExtractionResult> {
+    const enrichedMetadata = {
+      ...result.metadata,
+      processedAt: new Date().toISOString(),
+      contentLength: result.content.length,
+    };
+
+    return { ...result, metadata: enrichedMetadata };
+  }
+}
+
+/**
+ * Extracts the document and then feeds the result through each plugin in
+ * array order, awaiting every plugin before starting the next.
+ *
+ * @param bytes - Raw document bytes.
+ * @param mimeType - MIME type passed to `extractBytes`.
+ * @param plugins - Plugins to run, in order.
+ * @returns The result produced by the last plugin (or the raw extraction if none).
+ */
+async function executePipeline(
+  bytes: Uint8Array,
+  mimeType: string,
+  plugins: Plugin[],
+): Promise<ExtractionResult> {
+  await initWasm();
+
+  let current = await extractBytes(bytes, mimeType);
+
+  for (const step of plugins) {
+    console.log(`Executing plugin: ${step.name}`);
+    current = await step.execute(current);
+  }
+
+  return current;
+}
+
+const pipeline = [new TextCleanerPlugin(), new MetadataEnricherPlugin()];
+
+executePipeline(new Uint8Array([1, 2, 3]), "application/pdf", pipeline)
+  .then((r) => console.log("Pipeline complete", r))
+  .catch(console.error);
--- a/docs/snippets/wasm/plugins/plugin_extractor.md
+++ b/docs/snippets/wasm/plugins/plugin_extractor.md
@@ -0,0 +1,53 @@
+# Custom Plugin Usage Pattern
+
+Demonstrate the pattern for using registered plugins during document extraction.
+
+```typescript title="WASM"
+import init, { extractBytes, registerPostProcessor } from "kreuzberg-wasm";
+
+await init();
+
+// Register a custom post-processor
+const customProcessor = {
+  processingStage: () => "post-extraction",
+  process: (result) => {
+    console.log("Post-processor: enriching extraction result");
+    return {
+      ...result,
+      metadata: {
+        ...result.metadata,
+        enriched: true,
+        processorApplied: "customProcessor",
+      },
+    };
+  },
+};
+
+registerPostProcessor(customProcessor);
+
+// Extract document with registered plugin
+async function extractWithPlugins(fileBytes, mimeType) {
+  const config = {
+    ocr: null,
+    chunking: null,
+    enableQualityProcessing: false,
+  };
+
+  // Extraction automatically applies registered post-processors
+  const result = await extractBytes(fileBytes, mimeType, config);
+
+  console.log("Extraction complete");
+  console.log("Plugins applied:", result.metadata?.enriched);
+
+  return result;
+}
+
+// Usage
+const pdfBytes = new Uint8Array([
+  /* PDF content */
+]);
+const result = await extractWithPlugins(pdfBytes, "application/pdf");
+console.log("Final result:", result);
+```
+
+The extraction pipeline automatically applies all registered plugins in the correct order.
--- a/docs/snippets/wasm/plugins/plugin_logging.md
+++ b/docs/snippets/wasm/plugins/plugin_logging.md
@@ -0,0 +1,70 @@
+# Plugin Logging and Debugging
+
+Log plugin registration and execution for debugging purposes.
+
+```typescript title="WASM"
+import init, {
+  registerPostProcessor,
+  registerValidator,
+  registerOcrBackend,
+  listPostProcessors,
+  listValidators,
+  listOcrBackends,
+} from "kreuzberg-wasm";
+
+await init();
+
+// Track plugin registrations
+// Names of the plugins registered by this snippet, grouped by plugin type.
+// The arrays are typed explicitly: a bare `[]` inside an object literal is
+// inferred as `never[]` under strict mode, which rejects `push("name")`.
+const pluginLog = {
+  processors: [] as string[],
+  validators: [] as string[],
+  ocrBackends: [] as string[],
+};
+
+// Register a logging post-processor
+const loggingProcessor = {
+  processingStage: () => "post-extraction",
+  process: (result) => {
+    console.log("[POST-PROCESSOR] Processing extraction result", {
+      textLength: result.text?.length,
+      hasMetadata: !!result.metadata,
+    });
+    return result;
+  },
+};
+
+registerPostProcessor(loggingProcessor);
+pluginLog.processors.push("loggingProcessor");
+
+// Register a logging validator
+const loggingValidator = {
+  validate: (result) => {
+    console.log("[VALIDATOR] Validating extraction result", {
+      textLength: result.text?.length,
+      isValid: true,
+    });
+    return { valid: true, error: null };
+  },
+};
+
+registerValidator(loggingValidator);
+pluginLog.validators.push("loggingValidator");
+
+// Log registered plugins
+// Logs the registered post-processors, validators and OCR backends together
+// with their combined count.
+function logPluginStatus() {
+  const postProcessors = listPostProcessors();
+  const validators = listValidators();
+  const ocrBackends = listOcrBackends();
+  const total = postProcessors.length + validators.length + ocrBackends.length;
+
+  console.log("Plugin Registration Status:", { postProcessors, validators, ocrBackends, total });
+}
+
+logPluginStatus();
+```
+
+Use this pattern to monitor and debug plugin lifecycle and execution.
--- a/docs/snippets/wasm/plugins/plugin_testing.md
+++ b/docs/snippets/wasm/plugins/plugin_testing.md
@@ -0,0 +1,74 @@
+# Plugin Testing Pattern
+
+Test custom plugins to verify they implement required interfaces correctly.
+
+```typescript title="WASM"
+import init, { registerValidator, registerPostProcessor } from "kreuzberg-wasm";
+
+await init();
+
+// Test fixture: sample extraction result
+const sampleResult = {
+  text: "Sample extracted text from document",
+  metadata: {
+    mimeType: "application/pdf",
+    source: "test.pdf",
+    pageCount: 1,
+  },
+};
+
+// Test post-processor registration
+function testPostProcessorRegistration() {
+  const processor = {
+    processingStage: () => "post-extraction",
+    process: (result) => result,
+  };
+
+  try {
+    registerPostProcessor(processor);
+    console.log("✓ Post-processor registered successfully");
+  } catch (error) {
+    console.error("✗ Post-processor registration failed:", error);
+  }
+}
+
+// Test validator registration
+function testValidatorRegistration() {
+  const validator = {
+    validate: (result) => ({
+      valid: !!result.text,
+      error: result.text ? null : "No text extracted",
+    }),
+  };
+
+  try {
+    registerValidator(validator);
+    console.log("✓ Validator registered successfully");
+  } catch (error) {
+    console.error("✗ Validator registration failed:", error);
+  }
+}
+
+// Test required methods validation
+function testInterfaceValidation() {
+  // Missing required method should fail
+  const invalidProcessor = {
+    // Missing processingStage() method
+    process: (result) => result,
+  };
+
+  try {
+    registerPostProcessor(invalidProcessor);
+    console.error("✗ Should have rejected processor with missing methods");
+  } catch (error) {
+    console.log("✓ Correctly rejected invalid processor:", error);
+  }
+}
+
+// Run tests
+testPostProcessorRegistration();
+testValidatorRegistration();
+testInterfaceValidation();
+```
+
+Validate plugin implementations before deploying to production.
--- a/docs/snippets/wasm/plugins/plugin_validator.md
+++ b/docs/snippets/wasm/plugins/plugin_validator.md
@@ -0,0 +1,69 @@
+# Register Custom Validator Plugin
+
+Register a custom validator that checks extraction results for quality or correctness.
+
+```typescript title="WASM"
+import init, { registerValidator, extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Define a custom validator
+const customValidator = {
+  // Runs three checks in order (empty text, repeated characters, whitespace
+  // share) and returns { valid, error } for the first one that fails.
+  validate: (extractionResult) => {
+    const text = extractionResult.text || "";
+
+    // Check for minimum content
+    if (text.length === 0) {
+      return {
+        valid: false,
+        error: "No text extracted from document",
+      };
+    }
+
+    // Check for suspicious patterns: any character repeated 6+ times in a row
+    const hasRepeatingChars = /(.)\1{5,}/.test(text);
+    if (hasRepeatingChars) {
+      return {
+        valid: false,
+        error: "Text contains excessive repeating characters (possible OCR error)",
+      };
+    }
+
+    // Check if text is mostly whitespace. Count whitespace everywhere in the
+    // text; `trim()` alone would only notice leading/trailing whitespace.
+    const nonWhitespaceLength = text.replace(/\s/g, "").length;
+    if (nonWhitespaceLength < text.length * 0.5) {
+      return {
+        valid: false,
+        error: "Text is mostly whitespace",
+      };
+    }
+
+    return {
+      valid: true,
+      error: null,
+    };
+  },
+};
+
+try {
+  registerValidator(customValidator);
+  console.log("Custom validator registered");
+} catch (error) {
+  console.error("Failed to register validator:", error);
+}
+
+// Extract and validate
+// Extracts the document, runs `customValidator` on the result, logs the
+// outcome, and returns the extraction result either way.
+async function extractAndValidate(fileBytes, mimeType) {
+  const result = await extractBytes(fileBytes, mimeType, {});
+
+  const { valid, error } = customValidator.validate(result);
+  if (valid) {
+    console.log("✓ Extraction passed validation");
+  } else {
+    console.warn("Validation failed:", error);
+  }
+
+  return result;
+}
+```
+
+Validators run after extraction to ensure results meet quality standards.
--- a/docs/snippets/wasm/plugins/post-processor-custom.ts
+++ b/docs/snippets/wasm/plugins/post-processor-custom.ts
@@ -0,0 +1,37 @@
+import type { ExtractionResult } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+/**
+ * Example processor that prefixes every non-empty line of the content with
+ * "# " and collapses runs of blank lines into a single blank line.
+ */
+class MarkdownFormatter {
+  async process(result: ExtractionResult): Promise<ExtractionResult> {
+    const headingPerLine = result.content.replace(/^(.+)$/gm, "# $1");
+    const collapsed = headingPerLine.replace(/\n\n+/g, "\n\n");
+
+    return { ...result, content: collapsed };
+  }
+
+  /** Name of this processor. */
+  getName(): string {
+    return "markdown-formatter";
+  }
+
+  /** Version of this processor. */
+  getVersion(): string {
+    return "1.0.0";
+  }
+}
+
+/**
+ * Fetches `document.pdf`, extracts it, and runs the result through
+ * `MarkdownFormatter`.
+ *
+ * @returns The formatted extraction result.
+ * @throws Error when the document cannot be fetched (non-2xx response).
+ */
+async function demonstrateCustomProcessor() {
+  await initWasm();
+
+  const processor = new MarkdownFormatter();
+
+  // `fetch` resolves for HTTP error statuses too, so check `ok` before
+  // handing an error page to the extractor as if it were a PDF.
+  const response = await fetch("document.pdf");
+  if (!response.ok) {
+    throw new Error(`Failed to fetch document.pdf: ${response.status} ${response.statusText}`);
+  }
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  let result = await extractBytes(bytes, "application/pdf");
+
+  result = await processor.process(result);
+  console.log("Formatted result:", result.content);
+
+  return result;
+}
+
+demonstrateCustomProcessor().catch(console.error);
--- a/docs/snippets/wasm/plugins/quality_score_validator.md
+++ b/docs/snippets/wasm/plugins/quality_score_validator.md
@@ -0,0 +1,76 @@
+# Quality Score Validator
+
+Register a validator that computes and checks a quality score for extracted text.
+
+```typescript title="WASM"
+import init, { registerValidator, extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Define a quality score validator
+const qualityScoreValidator = {
+  // Starts from a score of 100, subtracts a penalty per detected issue, and
+  // rejects results scoring below 60.
+  // Returns { valid, error, metadata: { qualityScore, issues } }.
+  validate: (extractionResult) => {
+    const text = extractionResult.text || "";
+    const metadata = extractionResult.metadata || {};
+
+    let score = 100;
+    const issues = [];
+
+    if (text.length === 0) {
+      // Penalize empty text
+      score -= 50;
+      issues.push("No text extracted");
+    } else {
+      // The ratio checks divide by text.length, so they only run for
+      // non-empty text (0 / 0 would otherwise produce NaN).
+
+      // Penalize if mostly whitespace
+      const nonWhitespace = text.replace(/\s/g, "").length;
+      const whitespaceRatio = 1 - nonWhitespace / text.length;
+      if (whitespaceRatio > 0.5) {
+        score -= 20;
+        issues.push("High whitespace ratio");
+      }
+
+      // Penalize unusual character distributions (share of non-ASCII characters)
+      const unicodeRatio = (text.match(/[^\x00-\x7F]/g) || []).length / text.length;
+      if (unicodeRatio > 0.3) {
+        score -= 10;
+        issues.push("High Unicode character ratio");
+      }
+    }
+
+    // Check confidence if available. A reported confidence of 0 is a valid
+    // (and very low) value, so test the type rather than truthiness.
+    if (typeof metadata.confidence === "number" && metadata.confidence < 0.5) {
+      score -= 15;
+      issues.push("Low confidence score");
+    }
+
+    const QUALITY_THRESHOLD = 60;
+    const isValid = score >= QUALITY_THRESHOLD;
+
+    return {
+      valid: isValid,
+      error: isValid ? null : `Quality score ${score} < ${QUALITY_THRESHOLD}: ${issues.join(", ")}`,
+      metadata: {
+        qualityScore: score,
+        issues: issues,
+      },
+    };
+  },
+};
+
+try {
+  registerValidator(qualityScoreValidator);
+  console.log("Quality score validator registered");
+} catch (error) {
+  console.error("Failed to register validator:", error);
+}
+
+// Extract with quality assessment
+const pdfBytes = new Uint8Array([
+  /* PDF content */
+]);
+const result = await extractBytes(pdfBytes, "application/pdf", {});
+const validation = qualityScoreValidator.validate(result);
+console.log("Quality assessment:", validation.metadata);
+```
+
+This validator assigns a quality score based on multiple text characteristics.
--- a/docs/snippets/wasm/plugins/stateful_plugin.md
+++ b/docs/snippets/wasm/plugins/stateful_plugin.md
@@ -0,0 +1,79 @@
+# Stateful Post-Processor Plugin
+
+Create a stateful post-processor that maintains state across multiple extraction calls.
+
+```typescript title="WASM"
+import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Create a stateful post-processor using a closure
+// Builds a post-processor whose counters live in a closure, so every call to
+// `process` on the returned object updates the same running totals.
+function createStatefulProcessor() {
+  const tracker = {
+    extractionCount: 0,
+    totalChars: 0,
+    lastResult: null,
+  };
+
+  const process = (extractionResult) => {
+    // Update the running totals first so the enriched metadata includes this call
+    tracker.extractionCount += 1;
+    tracker.totalChars += extractionResult.text?.length || 0;
+    tracker.lastResult = extractionResult;
+
+    const { extractionCount, totalChars } = tracker;
+
+    // Copy of the result with the statistics attached
+    const enriched = {
+      ...extractionResult,
+      metadata: {
+        ...extractionResult.metadata,
+        extractionIndex: extractionCount,
+        cumulativeChars: totalChars,
+        averageDocLength: Math.round(totalChars / extractionCount),
+      },
+    };
+
+    console.log(
+      `[Extraction ${extractionCount}] ${enriched.text?.length || 0} chars, cumulative: ${totalChars}`,
+    );
+
+    return enriched;
+  };
+
+  return {
+    processingStage: () => "post-extraction",
+    process,
+
+    // Optional: expose the live state object for inspection
+    getState: () => tracker,
+  };
+}
+
+// Register the stateful processor
+const statefulProcessor = createStatefulProcessor();
+registerPostProcessor(statefulProcessor);
+
+// Multiple extractions use the same state
+// Extracts three placeholder documents one after another and returns the
+// results in the same order.
+async function processMultipleDocs() {
+  const docs = [
+    new Uint8Array([
+      /* Doc 1 */
+    ]),
+    new Uint8Array([
+      /* Doc 2 */
+    ]),
+    new Uint8Array([
+      /* Doc 3 */
+    ]),
+  ];
+
+  const extracted = [];
+  for (const docBytes of docs) {
+    // Awaited inside the loop so the documents are processed sequentially
+    extracted.push(await extractBytes(docBytes, "application/pdf", {}));
+  }
+
+  return extracted;
+}
+
+await processMultipleDocs();
+```
+
+Stateful processors can track metrics across multiple extractions or maintain context.
--- a/docs/snippets/wasm/plugins/unregister_plugins.md
+++ b/docs/snippets/wasm/plugins/unregister_plugins.md
@@ -0,0 +1,77 @@
+# Unregister Plugins
+
+Remove registered plugins from the WASM runtime using individual unregister or bulk clear operations.
+
+```typescript title="WASM"
+import init, {
+  registerDocumentExtractor,
+  unregisterDocumentExtractor,
+  listDocumentExtractors,
+  clearDocumentExtractors,
+  registerOcrBackend,
+  unregisterOcrBackend,
+  listOcrBackends,
+  clearOcrBackends,
+  registerPostProcessor,
+  unregisterPostProcessor,
+  listPostProcessors,
+  clearPostProcessors,
+  registerRenderer,
+  unregisterRenderer,
+  listRenderers,
+  clearRenderers,
+  registerValidator,
+  unregisterValidator,
+  listValidators,
+  clearValidators,
+} from "kreuzberg-wasm";
+
+await init();
+
+// Example: register a custom document extractor
+const extractor = {
+  extractBytes: async (bytes, mimeType, config) => {
+    return JSON.stringify({ text: "test", page_count: 1 });
+  },
+  supportedMimeTypes: () => JSON.stringify(["application/x-test"]),
+};
+
+registerDocumentExtractor(extractor);
+console.log("Registered extractors:", listDocumentExtractors());
+
+// Individual unregistration by plugin name
+try {
+  unregisterDocumentExtractor("wasm_bridge");
+  console.log("Extractor unregistered");
+} catch (error) {
+  console.error("Unregister failed:", error);
+}
+
+// Clear all plugins of a type
+clearPostProcessors();
+console.log("After clearPostProcessors:", listPostProcessors());
+
+clearOcrBackends();
+console.log("After clearOcrBackends:", listOcrBackends());
+
+clearRenderers();
+console.log("After clearRenderers:", listRenderers());
+
+clearValidators();
+console.log("After clearValidators:", listValidators());
+
+// Selective re-registration: clear and register only desired plugins
+clearPostProcessors();
+const myProcessor = {
+  processingStage: () => "post-extraction",
+  process: (result) => result, // Pass-through
+};
+registerPostProcessor(myProcessor);
+console.log("After selective re-register:", listPostProcessors());
+
+// Unregister specific plugin by name
+unregisterPostProcessor("wasm_bridge");
+console.log("After selective unregister:", listPostProcessors());
+```
+
+Use `unregister*` to remove individual plugins by name, or `clear*` for bulk removal of all plugins of a type. All custom plugins are registered with the default name `"wasm_bridge"` managed by the bridge.
--- a/docs/snippets/wasm/plugins/validator-custom.ts
+++ b/docs/snippets/wasm/plugins/validator-custom.ts
@@ -0,0 +1,68 @@
+import type { ExtractionResult } from "@kreuzberg/wasm";
+import { extractBytes, initWasm } from "@kreuzberg/wasm";
+
+interface ValidationError {
+  field: string;
+  message: string;
+}
+
+/**
+ * Checks an extraction result for content length bounds and a MIME type,
+ * collecting every violation instead of stopping at the first one.
+ */
+class ContentValidator {
+  private minContentLength = 10;
+  private maxContentLength = 10000000;
+
+  /** Overrides the minimum accepted content length. */
+  setMinLength(length: number) {
+    this.minContentLength = length;
+  }
+
+  /**
+   * @param result - Extraction result to check.
+   * @returns One entry per failed check; an empty array when all checks pass.
+   */
+  validate(result: ExtractionResult): ValidationError[] {
+    const errors: ValidationError[] = [];
+    const contentLength = result.content.length;
+
+    if (contentLength < this.minContentLength) {
+      errors.push({
+        field: "content",
+        message: `Content length (${contentLength}) is below minimum (${this.minContentLength})`,
+      });
+    }
+
+    if (contentLength > this.maxContentLength) {
+      errors.push({
+        field: "content",
+        message: `Content length (${contentLength}) exceeds maximum (${this.maxContentLength})`,
+      });
+    }
+
+    if (!result.mimeType) {
+      errors.push({ field: "mimeType", message: "MIME type is required" });
+    }
+
+    return errors;
+  }
+
+  /** Name of this validator. */
+  getName(): string {
+    return "content-validator";
+  }
+}
+
+/**
+ * Fetches `document.pdf`, extracts it, and logs the outcome of running
+ * `ContentValidator` (minimum length 100) on the result.
+ *
+ * @throws Error when the document cannot be fetched (non-2xx response).
+ */
+async function demonstrateValidator() {
+  await initWasm();
+
+  const validator = new ContentValidator();
+  validator.setMinLength(100);
+
+  // `fetch` resolves for HTTP error statuses too, so check `ok` before
+  // handing an error page to the extractor as if it were a PDF.
+  const response = await fetch("document.pdf");
+  if (!response.ok) {
+    throw new Error(`Failed to fetch document.pdf: ${response.status} ${response.statusText}`);
+  }
+  const bytes = new Uint8Array(await response.arrayBuffer());
+
+  const result = await extractBytes(bytes, "application/pdf");
+  const errors = validator.validate(result);
+
+  if (errors.length > 0) {
+    console.log("Validation errors:");
+    errors.forEach((e) => console.log(`  ${e.field}: ${e.message}`));
+  } else {
+    console.log("Content validation passed");
+  }
+}
+
+demonstrateValidator().catch(console.error);
--- a/docs/snippets/wasm/plugins/word_count_processor.md
+++ b/docs/snippets/wasm/plugins/word_count_processor.md
@@ -0,0 +1,84 @@
+# Word Count Post-Processor
+
+Register a post-processor that computes word count and other text statistics.
+
+```typescript title="WASM"
+import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";
+
+await init();
+
+// Define a word count post-processor
+const wordCountProcessor = {
+  processingStage: () => "post-extraction",
+  // Returns a copy of the extraction result with a `statistics` object added
+  // to its metadata.
+  process: (extractionResult) => {
+    const text = extractionResult.text || "";
+    const isNonBlank = (piece) => piece.trim().length > 0;
+
+    // Token counts: words split on whitespace, lines on single newlines,
+    // paragraphs on blank lines, sentences on ., ! or ?
+    const wordCount = text
+      .trim()
+      .split(/\s+/)
+      .filter((w) => w.length > 0).length;
+    const lineCount = text.split(/\n/).filter(isNonBlank).length;
+    const paragraphCount = text.split(/\n{2,}/).filter(isNonBlank).length;
+    const sentenceCount = text.split(/[.!?]+/).filter(isNonBlank).length;
+
+    // Character counts with and without whitespace
+    const charCount = text.length;
+    const charsNoSpaces = text.replace(/\s/g, "").length;
+
+    const statistics = {
+      wordCount,
+      lineCount,
+      paragraphCount,
+      sentenceCount,
+      charCount,
+      charsNoSpaces,
+      averageWordLength: wordCount > 0 ? Math.round(charsNoSpaces / wordCount) : 0,
+      // NOTE(review): despite the name, this is words per non-empty line,
+      // not characters per line — confirm which one is intended.
+      averageLineLength: lineCount > 0 ? Math.round(wordCount / lineCount) : 0,
+      // Reading time assumes an average of 200 words per minute
+      readingTimeMinutes: Math.ceil(wordCount / 200),
+    };
+
+    return {
+      ...extractionResult,
+      metadata: {
+        ...extractionResult.metadata,
+        statistics,
+      },
+    };
+  },
+};
+
+try {
+  registerPostProcessor(wordCountProcessor);
+  console.log("Word count post-processor registered");
+} catch (error) {
+  console.error("Failed to register post-processor:", error);
+}
+
+// Extract with word counting
+// Extracts the document, logs the statistics found in the result metadata
+// (if any), and returns the extraction result.
+async function extractAndAnalyze(fileBytes, mimeType) {
+  const result = await extractBytes(fileBytes, mimeType, {});
+  const statistics = result.metadata?.statistics;
+
+  const summary = {
+    words: statistics?.wordCount,
+    lines: statistics?.lineCount,
+    paragraphs: statistics?.paragraphCount,
+    sentences: statistics?.sentenceCount,
+    readingTime: `${statistics?.readingTimeMinutes} min`,
+  };
+  console.log("Text Analysis:", summary);
+
+  return result;
+}
+
+const pdfBytes = new Uint8Array([
+  /* PDF content */
+]);
+await extractAndAnalyze(pdfBytes, "application/pdf");
+```
+
+This processor analyzes text and provides readability metrics.