Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,23 @@
# Clear All Registered Plugins
Clear all registered OCR backends, post-processors, or validators from the global registry.
```typescript title="WASM"
import init, { clearOcrBackends, clearPostProcessors, clearValidators } from "kreuzberg-wasm";
await init();
// Clear all OCR backends
clearOcrBackends();
console.log("OCR backends cleared");
// Clear all post-processors
clearPostProcessors();
console.log("Post-processors cleared");
// Clear all validators
clearValidators();
console.log("Validators cleared");
```
Use when you need to reset the plugin registries to their initial state or remove all custom plugins.

View File

@@ -0,0 +1,34 @@
# Register Custom Embedding Backend
Register a custom embedding backend that provides vector embeddings for text.
```typescript title="WASM"
import init, { registerEmbeddingBackend } from "kreuzberg-wasm";
await init();
// Define a custom embedding backend
// Single source of truth for the vector size, so `dimensions()` and `embed()` cannot drift apart.
const EMBEDDING_DIMENSIONS = 384;
const customEmbedding = {
  // Returns the length of every vector produced by `embed`.
  dimensions: () => EMBEDDING_DIMENSIONS,
  // Returns one dummy vector per input text, in the same order as `texts`.
  embed: (texts: string[]) =>
    texts.map((text) => {
      // An empty string has no first character: charCodeAt(0) would be NaN and
      // turn every component of the vector into NaN, so fall back to a seed of 0.
      const seed = text.length > 0 ? text.charCodeAt(0) : 0;
      // Generate a deterministic dummy vector derived from the first character
      return Array.from({ length: EMBEDDING_DIMENSIONS }, (_, i) =>
        Math.sin((seed + i) / EMBEDDING_DIMENSIONS),
      );
    }),
};
try {
registerEmbeddingBackend(customEmbedding);
console.log("Custom embedding backend registered");
} catch (error) {
console.error("Failed to register embedding backend:", error);
}
```
The embedding backend must implement:
- `dimensions()`: Returns the dimensionality of the embeddings
- `embed(texts: string[])`: Computes vector embeddings for the given texts

View File

@@ -0,0 +1,67 @@
# Document Extractor Registration
Register a custom document extractor plugin in WASM that implements the required interface.
```typescript title="WASM"
import init, {
registerDocumentExtractor,
unregisterDocumentExtractor,
listDocumentExtractors,
extractBytes,
} from "kreuzberg-wasm";
await init();
// Define a custom extractor as a plain JS object with required methods
const customExtractor = {
  // Required: extract document bytes.
  // Takes (bytes: Uint8Array, mimeType: string, config: object) and resolves to a
  // JSON-encoded string of the shape { text, page_count, language }.
  async extractBytes(bytes, mimeType, config) {
    // Guard: this example only handles its own custom MIME type.
    if (mimeType !== "application/x-custom") {
      throw new Error("Unsupported MIME type");
    }
    // Custom extraction logic: decode the bytes and keep the first 100 characters
    const decoded = new TextDecoder().decode(bytes);
    const payload = {
      text: `Extracted: ${decoded.slice(0, 100)}`,
      page_count: 1,
      language: "en",
    };
    return JSON.stringify(payload);
  },
  // Required: supported MIME types, serialized as a JSON array string
  supportedMimeTypes() {
    return JSON.stringify(["application/x-custom"]);
  },
  // Optional: plugin version string
  version() {
    return "1.0.0";
  },
};
// Register the custom extractor
try {
registerDocumentExtractor(customExtractor);
console.log("Extractor registered successfully");
} catch (error) {
console.error("Failed to register extractor:", error);
}
// List all extractors (includes your custom one)
const extractors = listDocumentExtractors();
console.log("Available extractors:", extractors);
// Use the custom extractor via normal extraction
const customBytes = new Uint8Array([0x00, 0x01, 0x02]);
const result = await extractBytes(customBytes, "application/x-custom", {});
console.log("Extraction result:", result);
// Unregister when done
try {
unregisterDocumentExtractor("wasm_bridge");
console.log("Extractor unregistered");
} catch (error) {
console.error("Failed to unregister:", error);
}
```
The extractor object must implement `extractBytes` and `supportedMimeTypes` methods. Optional methods: `initialize()`, `shutdown()`, and `version()` for lifecycle management.

View File

@@ -0,0 +1,39 @@
# List Registered Plugins
List all registered plugins of each type: OCR backends, post-processors, validators, and document extractors.
```typescript title="WASM"
import init, {
listDocumentExtractors,
listOcrBackends,
listPostProcessors,
listValidators,
} from "kreuzberg-wasm";
await init();
// List all document extractors
const extractors = listDocumentExtractors();
console.log("Document extractors:", extractors);
// List all OCR backends
const ocrBackends = listOcrBackends();
console.log("OCR backends:", ocrBackends);
// List all post-processors
const processors = listPostProcessors();
console.log("Post-processors:", processors);
// List all validators
const validators = listValidators();
console.log("Validators:", validators);
// Count registered plugins
console.log(`Total plugins registered:
Extractors: ${extractors.length}
OCR backends: ${ocrBackends.length}
Post-processors: ${processors.length}
Validators: ${validators.length}`);
```
Use this to verify which plugins are available before extraction or to debug plugin registration issues.

View File

@@ -0,0 +1,51 @@
# Minimum Length Text Validator
Register a validator that filters out extraction results with text below a minimum length threshold.
```typescript title="WASM"
import init, { registerValidator, extractBytes } from "kreuzberg-wasm";
await init();
// Shortest text length (in UTF-16 code units) that the validator accepts.
const MIN_LENGTH = 10;
// Validator that reports a failure when the extracted text is shorter than MIN_LENGTH.
const minLengthValidator = {
  validate(extractionResult) {
    // A missing `text` field is treated as zero-length text.
    const length = extractionResult.text?.length || 0;
    const tooShort = length < MIN_LENGTH;
    return tooShort
      ? { valid: false, error: `Text too short: ${length} < ${MIN_LENGTH}` }
      : { valid: true, error: null };
  },
};
try {
registerValidator(minLengthValidator);
console.log(`Min length validator registered (threshold: ${MIN_LENGTH})`);
} catch (error) {
console.error("Failed to register validator:", error);
}
// Now extract with validation enabled
const pdfBytes = new Uint8Array([
/* PDF content */
]);
const config = {
ocr: null,
chunking: null,
};
const result = await extractBytes(pdfBytes, "application/pdf", config);
console.log("Validated result:", result);
```
This validator ensures extracted text meets minimum quality standards by checking length.

View File

@@ -0,0 +1,57 @@
import {
initWasm,
listOcrBackends,
registerOcrBackend,
unregisterOcrBackend,
} from "@kreuzberg/wasm";
/**
 * Minimal example OCR backend. It performs no real recognition: every method
 * either logs to the console or reads/writes a private field.
 */
class CustomOcrBackend {
  private name = "custom-ocr";
  private enabled = true;

  /** Returns the backend's fixed name. */
  getName(): string {
    return this.name;
  }

  /** Reports whether the backend is currently flagged as enabled. */
  isEnabled(): boolean {
    return this.enabled;
  }

  /** Sets the enabled flag. */
  setEnabled(enabled: boolean) {
    this.enabled = enabled;
  }

  /** Lifecycle hook; this example only logs a message. */
  async initialize(): Promise<void> {
    console.log("Initializing custom OCR backend");
  }

  /**
   * Logs the request and resolves to a fixed placeholder string.
   *
   * @param imageData - Raw image bytes; only the byte length is read.
   * @param language - Language label; only used in the log message.
   * @returns A constant placeholder result.
   */
  async recognize(imageData: Uint8Array, language: string): Promise<string> {
    const byteCount = imageData.byteLength;
    console.log(`Recognizing text in ${language} from ${byteCount} bytes`);
    return "Placeholder OCR result";
  }

  /** Lifecycle hook; this example only logs a message. */
  async cleanup(): Promise<void> {
    console.log("Cleaning up custom OCR backend");
  }
}
/**
 * Walks a custom OCR backend through its lifecycle: initialize, register,
 * list, unregister, list again, then clean up.
 *
 * Cleanup runs in a `finally` block so the backend is released even when one
 * of the registry calls throws; the error still propagates to the caller.
 */
async function demonstrateCustomBackend() {
  await initWasm();
  const backend = new CustomOcrBackend();
  await backend.initialize();
  try {
    registerOcrBackend(backend);
    console.log("Registered backends:", listOcrBackends());
    // Unregister under the backend's own name rather than repeating the literal.
    unregisterOcrBackend(backend.getName());
    console.log("Backends after unregister:", listOcrBackends());
  } finally {
    await backend.cleanup();
  }
}
demonstrateCustomBackend().catch(console.error);

View File

@@ -0,0 +1,49 @@
# PDF Metadata Post-Processor
Register a post-processor that extracts and enriches extraction results with PDF metadata.
```typescript title="WASM"
import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";
await init();
// Define a PDF metadata extractor post-processor
const pdfMetadataProcessor = {
  processingStage: () => "post-extraction",
  // Returns a copy of the extraction result whose metadata also carries the
  // processor name, a processing timestamp and a word count.
  process: (extractionResult) => {
    const text = extractionResult.text || "";
    // Count runs of non-whitespace. Splitting on /\s+/ would report 1 word for
    // empty text and count extra "words" for leading/trailing whitespace.
    const wordCount = (text.match(/\S+/g) ?? []).length;
    // Enrich extraction with metadata
    return {
      ...extractionResult,
      metadata: {
        ...extractionResult.metadata,
        processorName: "pdf-metadata",
        processedAt: new Date().toISOString(),
        wordCount,
      },
    };
  },
};
try {
registerPostProcessor(pdfMetadataProcessor);
console.log("PDF metadata post-processor registered");
} catch (error) {
console.error("Failed to register post-processor:", error);
}
// Extract with post-processing
const pdfBytes = new Uint8Array([
/* PDF content */
]);
const config = {
ocr: null,
chunking: null,
};
const result = await extractBytes(pdfBytes, "application/pdf", config);
console.log("Enriched metadata:", result.metadata);
```
The post-processor runs after extraction to enrich or transform the results.

View File

@@ -0,0 +1,71 @@
# PDF-Only Post-Processor
Register a post-processor that applies its transformations only to PDF documents and passes all other documents through unchanged.
```typescript title="WASM"
import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";
await init();
// Define a PDF-only post-processor
const pdfOnlyProcessor = {
  processingStage() {
    return "post-extraction";
  },
  // Returns non-PDF results untouched; for PDFs, returns a copy with extra
  // metadata and normalized text.
  process(extractionResult) {
    const meta = extractionResult.metadata;
    // A result counts as PDF when its MIME type says so or its source ends in ".pdf"
    const looksLikePdf = meta?.mimeType === "application/pdf" || meta?.source?.endsWith(".pdf");
    if (!looksLikePdf) {
      // Skip processing for non-PDF documents
      return extractionResult;
    }
    // Normalize text for PDFs: collapse runs of 3+ newlines to a blank line, then trim
    const normalizedText = (extractionResult.text || "").replace(/\n{3,}/g, "\n\n").trim();
    // Apply PDF-specific processing
    return {
      ...extractionResult,
      metadata: {
        ...meta,
        pdfProcessed: true,
        pageCount: meta?.pageCount || 1,
      },
      text: normalizedText,
    };
  },
};
try {
registerPostProcessor(pdfOnlyProcessor);
console.log("PDF-only post-processor registered");
} catch (error) {
console.error("Failed to register post-processor:", error);
}
// Test with various documents
const testDocs = [
{
bytes: new Uint8Array([
/* PDF */
]),
type: "application/pdf",
},
{
bytes: new Uint8Array([
/* HTML */
]),
type: "text/html",
},
];
for (const doc of testDocs) {
const result = await extractBytes(doc.bytes, doc.type, {});
console.log(`${doc.type}: PDF-specific processing applied:`, result.metadata?.pdfProcessed);
}
```
This processor applies PDF-specific transformations only to PDF documents.

View File

@@ -0,0 +1,66 @@
import { initWasm, TesseractWasmBackend } from "@kreuzberg/wasm";
/**
 * In-memory registry of named plugins that drives their optional
 * `initialize` / `cleanup` hooks on registration and removal.
 */
class PluginManager {
  private plugins: Map<string, any> = new Map();

  /** Returns the plugin stored under `name`, warning on the console when there is none. */
  private lookup(name: string): any {
    const found = this.plugins.get(name);
    if (!found) {
      console.warn(`Plugin ${name} not found`);
    }
    return found;
  }

  /** Names of all currently stored plugins, in insertion order. */
  listPlugins(): string[] {
    return [...this.plugins.keys()];
  }

  /**
   * Runs the plugin's `initialize` hook (if it has one) and then stores it
   * under `name`, replacing any previous entry with that name.
   */
  async registerPlugin(name: string, plugin: any): Promise<void> {
    console.log(`Registering plugin: ${name}`);
    if (plugin.initialize) {
      await plugin.initialize();
    }
    this.plugins.set(name, plugin);
    console.log(`Plugin ${name} registered successfully`);
  }

  /**
   * Runs the plugin's `cleanup` hook (if it has one) and then removes it.
   * Unknown names only produce a warning.
   */
  async unregisterPlugin(name: string): Promise<void> {
    const target = this.lookup(name);
    if (!target) {
      return;
    }
    if (target.cleanup) {
      await target.cleanup();
    }
    this.plugins.delete(name);
    console.log(`Plugin ${name} unregistered`);
  }

  /**
   * Unregisters and immediately re-registers the same plugin instance.
   * Unknown names only produce a warning.
   */
  async reloadPlugin(name: string): Promise<void> {
    const target = this.lookup(name);
    if (!target) {
      return;
    }
    console.log(`Reloading plugin: ${name}`);
    await this.unregisterPlugin(name);
    await this.registerPlugin(name, target);
  }
}
async function demonstratePluginLifecycle() {
await initWasm();
const manager = new PluginManager();
const backend = new TesseractWasmBackend();
await manager.registerPlugin("tesseract", backend);
console.log("Active plugins:", manager.listPlugins());
await manager.reloadPlugin("tesseract");
await manager.unregisterPlugin("tesseract");
console.log("Active plugins:", manager.listPlugins());
}
demonstratePluginLifecycle().catch(console.error);

View File

@@ -0,0 +1,55 @@
import type { ExtractionResult } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/** A single pipeline step that transforms an extraction result. */
interface Plugin {
  /** Label printed by the pipeline before the plugin runs. */
  name: string;
  /** Receives the current result and resolves to the result passed to the next step. */
  execute: (result: ExtractionResult) => Promise<ExtractionResult>;
}
/** Pipeline step that strips NUL characters and normalizes whitespace in `content`. */
class TextCleanerPlugin implements Plugin {
  name = "text-cleaner";

  /**
   * @param result - Extraction result whose `content` is cleaned.
   * @returns A copy of `result` with NULs removed, whitespace runs collapsed
   *   to single spaces, and both ends trimmed.
   */
  async execute(result: ExtractionResult): Promise<ExtractionResult> {
    const withoutNuls = result.content.replace(/\x00/g, "");
    const collapsed = withoutNuls.replace(/\s+/g, " ");
    return { ...result, content: collapsed.trim() };
  }
}
/** Pipeline step that stamps the result's metadata with a timestamp and the content length. */
class MetadataEnricherPlugin implements Plugin {
  name = "metadata-enricher";

  /**
   * @param result - Extraction result to annotate.
   * @returns A copy of `result` whose metadata additionally holds
   *   `processedAt` (ISO timestamp) and `contentLength`.
   */
  async execute(result: ExtractionResult): Promise<ExtractionResult> {
    const metadata = {
      ...result.metadata,
      processedAt: new Date().toISOString(),
      contentLength: result.content.length,
    };
    return { ...result, metadata };
  }
}
/**
 * Extracts a document and then runs it through each plugin in turn.
 *
 * @param bytes - Raw document bytes handed to `extractBytes`.
 * @param mimeType - MIME type handed to `extractBytes`.
 * @param plugins - Steps applied one after another, in array order; each step
 *   receives the previous step's output.
 * @returns The result produced by the last plugin (or the raw extraction when
 *   `plugins` is empty).
 */
async function executePipeline(
  bytes: Uint8Array,
  mimeType: string,
  plugins: Plugin[],
): Promise<ExtractionResult> {
  await initWasm();
  let current = await extractBytes(bytes, mimeType);
  for (const step of plugins) {
    console.log(`Executing plugin: ${step.name}`);
    const next = await step.execute(current);
    current = next;
  }
  return current;
}
const pipeline = [new TextCleanerPlugin(), new MetadataEnricherPlugin()];
executePipeline(new Uint8Array([1, 2, 3]), "application/pdf", pipeline)
.then((r) => console.log("Pipeline complete", r))
.catch(console.error);

View File

@@ -0,0 +1,53 @@
# Custom Plugin Usage Pattern
Demonstrate the pattern for using registered plugins during document extraction.
```typescript title="WASM"
import init, { extractBytes, registerPostProcessor } from "kreuzberg-wasm";
await init();
// Register a custom post-processor
const customProcessor = {
  processingStage() {
    return "post-extraction";
  },
  // Logs a message and returns a copy of the result with two extra metadata flags.
  process(result) {
    console.log("Post-processor: enriching extraction result");
    const metadata = {
      ...result.metadata,
      enriched: true,
      processorApplied: "customProcessor",
    };
    return { ...result, metadata };
  },
};
registerPostProcessor(customProcessor);
// Extract document with registered plugin
// Runs an extraction with OCR, chunking and quality processing switched off,
// logs the `enriched` metadata flag, and returns the extraction result.
async function extractWithPlugins(fileBytes, mimeType) {
  // Extraction automatically applies registered post-processors
  const extracted = await extractBytes(fileBytes, mimeType, {
    ocr: null,
    chunking: null,
    enableQualityProcessing: false,
  });
  console.log("Extraction complete");
  const enrichedFlag = extracted.metadata?.enriched;
  console.log("Plugins applied:", enrichedFlag);
  return extracted;
}
// Usage
const pdfBytes = new Uint8Array([
/* PDF content */
]);
const result = await extractWithPlugins(pdfBytes, "application/pdf");
console.log("Final result:", result);
```
The extraction pipeline automatically applies all registered plugins in the correct order.

View File

@@ -0,0 +1,70 @@
# Plugin Logging and Debugging
Log plugin registration and execution for debugging purposes.
```typescript title="WASM"
import init, {
registerPostProcessor,
registerValidator,
registerOcrBackend,
listPostProcessors,
listValidators,
listOcrBackends,
} from "kreuzberg-wasm";
await init();
// Track plugin registrations
const pluginLog = {
processors: [],
validators: [],
ocrBackends: [],
};
// Register a logging post-processor
const loggingProcessor = {
  processingStage() {
    return "post-extraction";
  },
  // Logs a short summary of the result and returns it unchanged.
  process(result) {
    const summary = {
      textLength: result.text?.length,
      hasMetadata: Boolean(result.metadata),
    };
    console.log("[POST-PROCESSOR] Processing extraction result", summary);
    return result;
  },
};
registerPostProcessor(loggingProcessor);
pluginLog.processors.push("loggingProcessor");
// Register a logging validator
const loggingValidator = {
  // Logs the text length and always reports the result as valid.
  validate(result) {
    const summary = { textLength: result.text?.length, isValid: true };
    console.log("[VALIDATOR] Validating extraction result", summary);
    return { valid: true, error: null };
  },
};
registerValidator(loggingValidator);
pluginLog.validators.push("loggingValidator");
// Log registered plugins
// Prints the registered post-processors, validators and OCR backends together
// with their combined count.
function logPluginStatus() {
  const postProcessors = listPostProcessors();
  const validators = listValidators();
  const ocrBackends = listOcrBackends();
  const total = postProcessors.length + validators.length + ocrBackends.length;
  console.log("Plugin Registration Status:", { postProcessors, validators, ocrBackends, total });
}
logPluginStatus();
```
Use this pattern to monitor and debug plugin lifecycle and execution.

View File

@@ -0,0 +1,74 @@
# Plugin Testing Pattern
Test custom plugins to verify they implement required interfaces correctly.
```typescript title="WASM"
import init, { registerValidator, registerPostProcessor } from "kreuzberg-wasm";
await init();
// Test fixture: sample extraction result
const sampleResult = {
text: "Sample extracted text from document",
metadata: {
mimeType: "application/pdf",
source: "test.pdf",
pageCount: 1,
},
};
// Test post-processor registration
// Registers a pass-through post-processor and logs whether registration succeeded.
function testPostProcessorRegistration() {
  const passThrough = {
    processingStage() {
      return "post-extraction";
    },
    process(result) {
      return result;
    },
  };
  try {
    registerPostProcessor(passThrough);
    console.log("✓ Post-processor registered successfully");
  } catch (error) {
    console.error("✗ Post-processor registration failed:", error);
  }
}
// Test validator registration
// Registers a validator that requires non-empty text and logs whether registration succeeded.
function testValidatorRegistration() {
  const requiresText = {
    validate(result) {
      const hasText = !!result.text;
      return {
        valid: hasText,
        error: hasText ? null : "No text extracted",
      };
    },
  };
  try {
    registerValidator(requiresText);
    console.log("✓ Validator registered successfully");
  } catch (error) {
    console.error("✗ Validator registration failed:", error);
  }
}
// Test required methods validation
// Tries to register a processor that lacks processingStage(); the test passes
// when registration throws and fails when it is accepted.
function testInterfaceValidation() {
  // Deliberately incomplete: only `process` is defined
  const incompleteProcessor = {
    process(result) {
      return result;
    },
  };
  try {
    registerPostProcessor(incompleteProcessor);
    console.error("✗ Should have rejected processor with missing methods");
  } catch (error) {
    console.log("✓ Correctly rejected invalid processor:", error);
  }
}
// Run tests
testPostProcessorRegistration();
testValidatorRegistration();
testInterfaceValidation();
```
Validate plugin implementations before deploying to production.

View File

@@ -0,0 +1,69 @@
# Register Custom Validator Plugin
Register a custom validator that checks extraction results for quality or correctness.
```typescript title="WASM"
import init, { registerValidator, extractBytes } from "kreuzberg-wasm";
await init();
// Define a custom validator
const customValidator = {
  // Runs the checks below in order and reports the first one that fails.
  validate(extractionResult) {
    const content = extractionResult.text || "";
    const checks = [
      {
        // Check for minimum content
        fails: () => content.length === 0,
        error: "No text extracted from document",
      },
      {
        // Check for suspicious patterns: any character repeated 6+ times in a row
        fails: () => /(.)\1{5,}/.test(content),
        error: "Text contains excessive repeating characters (possible OCR error)",
      },
      {
        // Check if trimming removes more than half of the text
        fails: () => content.trim().length < content.length * 0.5,
        error: "Text is mostly whitespace",
      },
    ];
    const firstFailure = checks.find((check) => check.fails());
    if (firstFailure) {
      return { valid: false, error: firstFailure.error };
    }
    return { valid: true, error: null };
  },
};
try {
registerValidator(customValidator);
console.log("Custom validator registered");
} catch (error) {
console.error("Failed to register validator:", error);
}
// Extract and validate
// Extracts the document, runs customValidator on the result by hand, logs the
// verdict, and returns the extraction result either way.
async function extractAndValidate(fileBytes, mimeType) {
  const extraction = await extractBytes(fileBytes, mimeType, {});
  const { valid, error } = customValidator.validate(extraction);
  if (valid) {
    console.log("✓ Extraction passed validation");
  } else {
    console.warn("Validation failed:", error);
  }
  return extraction;
}
```
Validators run after extraction to ensure results meet quality standards.

View File

@@ -0,0 +1,37 @@
import type { ExtractionResult } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/** Example processor that rewrites `content` with Markdown heading markers. */
class MarkdownFormatter {
  /** Identifier of this processor. */
  getName(): string {
    return "markdown-formatter";
  }

  /** Version string of this processor. */
  getVersion(): string {
    return "1.0.0";
  }

  /**
   * Prefixes every non-empty line of `content` with "# " and collapses runs of
   * two or more newlines into exactly two.
   *
   * @param result - Extraction result to format.
   * @returns A copy of `result` with the rewritten `content`.
   */
  async process(result: ExtractionResult): Promise<ExtractionResult> {
    const withHeadings = result.content.replace(/^(.+)$/gm, "# $1");
    const content = withHeadings.replace(/\n\n+/g, "\n\n");
    return { ...result, content };
  }
}
/**
 * Fetches `document.pdf`, extracts it, and runs the result through
 * `MarkdownFormatter`.
 *
 * @returns The formatted extraction result.
 * @throws Error when the HTTP request for the document does not succeed, so
 *   that an error page is never handed to the extractor as if it were a PDF.
 */
async function demonstrateCustomProcessor() {
  await initWasm();
  const processor = new MarkdownFormatter();
  const response = await fetch("document.pdf");
  // fetch() only rejects on network failure; HTTP errors must be checked explicitly.
  if (!response.ok) {
    throw new Error(`Failed to fetch document.pdf: ${response.status} ${response.statusText}`);
  }
  const bytes = new Uint8Array(await response.arrayBuffer());
  const extracted = await extractBytes(bytes, "application/pdf");
  const formatted = await processor.process(extracted);
  console.log("Formatted result:", formatted.content);
  return formatted;
}
demonstrateCustomProcessor().catch(console.error);

View File

@@ -0,0 +1,76 @@
# Quality Score Validator
Register a validator that computes and checks a quality score for extracted text.
```typescript title="WASM"
import init, { registerValidator, extractBytes } from "kreuzberg-wasm";
await init();
// Define a quality score validator
const qualityScoreValidator = {
  // Starts from a score of 100, subtracts a penalty for each detected issue,
  // and reports the result as valid when the score stays at or above the threshold.
  // Returns { valid, error, metadata: { qualityScore, issues } }.
  validate: (extractionResult) => {
    const QUALITY_THRESHOLD = 60;
    const text = extractionResult.text || "";
    const metadata = extractionResult.metadata || {};
    let score = 100;
    const issues: string[] = [];
    if (text.length === 0) {
      // Penalize empty text
      score -= 50;
      issues.push("No text extracted");
    } else {
      // The ratio checks divide by text.length, so they only run for non-empty
      // text; on empty text they would evaluate to NaN.
      // Penalize if mostly whitespace
      const nonWhitespace = text.replace(/\s/g, "").length;
      const whitespaceRatio = 1 - nonWhitespace / text.length;
      if (whitespaceRatio > 0.5) {
        score -= 20;
        issues.push("High whitespace ratio");
      }
      // Penalize unusual character distributions (share of non-ASCII characters)
      const unicodeRatio = (text.match(/[^\x00-\x7F]/g) || []).length / text.length;
      if (unicodeRatio > 0.3) {
        score -= 10;
        issues.push("High Unicode character ratio");
      }
    }
    // Check confidence if available. A typeof check (rather than truthiness)
    // makes sure a confidence of exactly 0 is still penalized.
    if (typeof metadata.confidence === "number" && metadata.confidence < 0.5) {
      score -= 15;
      issues.push("Low confidence score");
    }
    const isValid = score >= QUALITY_THRESHOLD;
    return {
      valid: isValid,
      error: isValid ? null : `Quality score ${score} < ${QUALITY_THRESHOLD}: ${issues.join(", ")}`,
      metadata: {
        qualityScore: score,
        issues: issues,
      },
    };
  },
};
try {
registerValidator(qualityScoreValidator);
console.log("Quality score validator registered");
} catch (error) {
console.error("Failed to register validator:", error);
}
// Extract with quality assessment
const pdfBytes = new Uint8Array([
/* PDF content */
]);
const result = await extractBytes(pdfBytes, "application/pdf", {});
const validation = qualityScoreValidator.validate(result);
console.log("Quality assessment:", validation.metadata);
```
This validator assigns a quality score based on multiple text characteristics.

View File

@@ -0,0 +1,79 @@
# Stateful Post-Processor Plugin
Create a stateful post-processor that maintains state across multiple extraction calls.
```typescript title="WASM"
import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";
await init();
// Create a stateful post-processor using a closure
// Builds a post-processor whose counters live in a closure, so every call to
// `process` on the returned object updates the same running totals.
function createStatefulProcessor() {
  const state = {
    extractionCount: 0,
    totalChars: 0,
    lastResult: null,
  };
  return {
    processingStage() {
      return "post-extraction";
    },
    process(extractionResult) {
      // Update state
      state.extractionCount += 1;
      state.totalChars += extractionResult.text?.length || 0;
      state.lastResult = extractionResult;
      const { extractionCount, totalChars } = state;
      // Enrich result with statistics
      const enriched = {
        ...extractionResult,
        metadata: {
          ...extractionResult.metadata,
          extractionIndex: extractionCount,
          cumulativeChars: totalChars,
          averageDocLength: Math.round(totalChars / extractionCount),
        },
      };
      const currentLength = enriched.text?.length || 0;
      console.log(`[Extraction ${extractionCount}] ${currentLength} chars, cumulative: ${totalChars}`);
      return enriched;
    },
    // Optional: expose state for inspection (returns the live object, not a copy)
    getState() {
      return state;
    },
  };
}
// Register the stateful processor
const statefulProcessor = createStatefulProcessor();
registerPostProcessor(statefulProcessor);
// Multiple extractions use the same state
// Extracts three placeholder documents one at a time, in order, and returns
// their results as an array.
async function processMultipleDocs() {
  const documents = [
    new Uint8Array([
      /* Doc 1 */
    ]),
    new Uint8Array([
      /* Doc 2 */
    ]),
    new Uint8Array([
      /* Doc 3 */
    ]),
  ];
  const collected = [];
  for (const docBytes of documents) {
    collected.push(await extractBytes(docBytes, "application/pdf", {}));
  }
  return collected;
}
await processMultipleDocs();
```
Stateful processors can track metrics across multiple extractions or maintain context.

View File

@@ -0,0 +1,77 @@
# Unregister Plugins
Remove registered plugins from the WASM runtime using individual unregister or bulk clear operations.
```typescript title="WASM"
import init, {
registerDocumentExtractor,
unregisterDocumentExtractor,
listDocumentExtractors,
clearDocumentExtractors,
registerOcrBackend,
unregisterOcrBackend,
listOcrBackends,
clearOcrBackends,
registerPostProcessor,
unregisterPostProcessor,
listPostProcessors,
clearPostProcessors,
registerRenderer,
unregisterRenderer,
listRenderers,
clearRenderers,
registerValidator,
unregisterValidator,
listValidators,
clearValidators,
} from "kreuzberg-wasm";
await init();
// Example: register a custom document extractor
const extractor = {
  // Ignores its arguments and resolves to a fixed JSON-encoded result.
  async extractBytes(bytes, mimeType, config) {
    const payload = { text: "test", page_count: 1 };
    return JSON.stringify(payload);
  },
  // Supported MIME types, serialized as a JSON array string.
  supportedMimeTypes() {
    return JSON.stringify(["application/x-test"]);
  },
};
registerDocumentExtractor(extractor);
console.log("Registered extractors:", listDocumentExtractors());
// Individual unregistration by plugin name
try {
unregisterDocumentExtractor("wasm_bridge");
console.log("Extractor unregistered");
} catch (error) {
console.error("Unregister failed:", error);
}
// Clear all plugins of a type
clearPostProcessors();
console.log("After clearPostProcessors:", listPostProcessors());
clearOcrBackends();
console.log("After clearOcrBackends:", listOcrBackends());
clearRenderers();
console.log("After clearRenderers:", listRenderers());
clearValidators();
console.log("After clearValidators:", listValidators());
// Selective re-registration: clear and register only desired plugins
clearPostProcessors();
const myProcessor = {
  processingStage() {
    return "post-extraction";
  },
  // Pass-through: hands the result back untouched
  process(result) {
    return result;
  },
};
registerPostProcessor(myProcessor);
console.log("After selective re-register:", listPostProcessors());
// Unregister specific plugin by name
unregisterPostProcessor("wasm_bridge");
console.log("After selective unregister:", listPostProcessors());
```
Use `unregister*` to remove individual plugins by name, or `clear*` for bulk removal of all plugins of a type. All custom plugins are registered with the default name `"wasm_bridge"` managed by the bridge.

View File

@@ -0,0 +1,68 @@
import type { ExtractionResult } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/** One problem found while validating an extraction result. */
interface ValidationError {
  /** Name of the result field the problem refers to. */
  field: string;
  /** Human-readable description of the problem. */
  message: string;
}
/**
 * Checks an extraction result's content length against configurable bounds
 * and requires a MIME type to be present.
 */
class ContentValidator {
  private minContentLength = 10;
  private maxContentLength = 10_000_000;

  /** Identifier of this validator. */
  getName(): string {
    return "content-validator";
  }

  /** Overrides the minimum accepted content length. */
  setMinLength(length: number) {
    this.minContentLength = length;
  }

  /**
   * @param result - Extraction result to inspect.
   * @returns Every problem found, in check order; an empty array means the
   *   result passed.
   */
  validate(result: ExtractionResult): ValidationError[] {
    const problems: ValidationError[] = [];
    const contentLength = result.content.length;
    if (contentLength < this.minContentLength) {
      problems.push({
        field: "content",
        message: `Content length (${contentLength}) is below minimum (${this.minContentLength})`,
      });
    }
    if (contentLength > this.maxContentLength) {
      problems.push({
        field: "content",
        message: `Content length (${contentLength}) exceeds maximum (${this.maxContentLength})`,
      });
    }
    if (!result.mimeType) {
      problems.push({ field: "mimeType", message: "MIME type is required" });
    }
    return problems;
  }
}
/**
 * Fetches `document.pdf`, extracts it, and prints the outcome of running
 * `ContentValidator` (with a minimum length of 100) on the result.
 *
 * @throws Error when the HTTP request for the document does not succeed, so
 *   that an error page is never handed to the extractor as if it were a PDF.
 */
async function demonstrateValidator() {
  await initWasm();
  const validator = new ContentValidator();
  validator.setMinLength(100);
  const response = await fetch("document.pdf");
  // fetch() only rejects on network failure; HTTP errors must be checked explicitly.
  if (!response.ok) {
    throw new Error(`Failed to fetch document.pdf: ${response.status} ${response.statusText}`);
  }
  const bytes = new Uint8Array(await response.arrayBuffer());
  const result = await extractBytes(bytes, "application/pdf");
  const errors = validator.validate(result);
  if (errors.length > 0) {
    console.log("Validation errors:");
    for (const { field, message } of errors) {
      console.log(` ${field}: ${message}`);
    }
  } else {
    console.log("Content validation passed");
  }
}
demonstrateValidator().catch(console.error);

View File

@@ -0,0 +1,84 @@
# Word Count Post-Processor
Register a post-processor that computes word count and other text statistics.
```typescript title="WASM"
import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";
await init();
// Define a word count post-processor
const wordCountProcessor = {
  processingStage() {
    return "post-extraction";
  },
  // Returns a copy of the result with a `statistics` object added to its metadata.
  process(extractionResult) {
    const text = extractionResult.text || "";
    // Keeps only the pieces that contain something other than whitespace
    const nonBlank = (pieces) => pieces.filter((piece) => piece.trim().length > 0);
    // Rounded quotient that falls back to 0 when there is nothing to divide by
    const roundedAverage = (total, count) => (count > 0 ? Math.round(total / count) : 0);

    // Compute statistics
    const wordCount = text.trim().split(/\s+/).filter(Boolean).length;
    const lineCount = nonBlank(text.split(/\n/)).length;
    const paragraphCount = nonBlank(text.split(/\n{2,}/)).length;
    const sentenceCount = nonBlank(text.split(/[.!?]+/)).length;
    // Compute character statistics
    const charCount = text.length;
    const charsNoSpaces = text.replace(/\s/g, "").length;

    const statistics = {
      wordCount,
      lineCount,
      paragraphCount,
      sentenceCount,
      charCount,
      charsNoSpaces,
      averageWordLength: roundedAverage(charsNoSpaces, wordCount),
      // Note: measured in words per non-blank line, not characters
      averageLineLength: roundedAverage(wordCount, lineCount),
      // Calculate reading time (average 200 words per minute)
      readingTimeMinutes: Math.ceil(wordCount / 200),
    };
    // Enrich metadata with text statistics
    return {
      ...extractionResult,
      metadata: { ...extractionResult.metadata, statistics },
    };
  },
};
try {
registerPostProcessor(wordCountProcessor);
console.log("Word count post-processor registered");
} catch (error) {
console.error("Failed to register post-processor:", error);
}
// Extract with word counting
// Extracts the document, prints the statistics found in its metadata (if any),
// and returns the extraction result.
async function extractAndAnalyze(fileBytes, mimeType) {
  const extraction = await extractBytes(fileBytes, mimeType, {});
  const stats = extraction.metadata?.statistics;
  const report = {
    words: stats?.wordCount,
    lines: stats?.lineCount,
    paragraphs: stats?.paragraphCount,
    sentences: stats?.sentenceCount,
    readingTime: `${stats?.readingTimeMinutes} min`,
  };
  console.log("Text Analysis:", report);
  return extraction;
}
const pdfBytes = new Uint8Array([
/* PDF content */
]);
await extractAndAnalyze(pdfBytes, "application/pdf");
```
This processor analyzes text and provides readability metrics.