This commit is contained in:
23
docs/snippets/wasm/plugins/clear_plugins.md
Normal file
23
docs/snippets/wasm/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,23 @@
|
||||
# Clear All Registered Plugins
|
||||
|
||||
Clear all registered OCR backends, post-processors, or validators from the global registry.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { clearOcrBackends, clearPostProcessors, clearValidators } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Clear all OCR backends
|
||||
clearOcrBackends();
|
||||
console.log("OCR backends cleared");
|
||||
|
||||
// Clear all post-processors
|
||||
clearPostProcessors();
|
||||
console.log("Post-processors cleared");
|
||||
|
||||
// Clear all validators
|
||||
clearValidators();
|
||||
console.log("Validators cleared");
|
||||
```
|
||||
|
||||
Use when you need to reset the plugin registries to their initial state or remove all custom plugins.
|
||||
34
docs/snippets/wasm/plugins/embedding_backend.md
Normal file
34
docs/snippets/wasm/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,34 @@
|
||||
# Register Custom Embedding Backend
|
||||
|
||||
Register a custom embedding backend that provides vector embeddings for text.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { registerEmbeddingBackend } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Define a custom embedding backend
|
||||
/** Dimensionality reported by `dimensions()` and used for every vector `embed()` returns. */
const EMBEDDING_DIMENSIONS = 384;

/**
 * Example embedding backend that produces deterministic dummy vectors.
 *
 * Each vector is derived from the first UTF-16 code unit of the text, so this
 * is only a placeholder for a real embedding model.
 */
const customEmbedding = {
  /** @returns The length of every vector produced by `embed`. */
  dimensions: (): number => EMBEDDING_DIMENSIONS,

  /**
   * Computes one dummy vector per input text.
   *
   * @param texts - Texts to embed; may be empty and may contain empty strings.
   * @returns One array of `EMBEDDING_DIMENSIONS` finite numbers per text, in input order.
   */
  embed: (texts: string[]): number[][] =>
    texts.map((text) => {
      // charCodeAt(0) is NaN for an empty string, which would poison the whole
      // vector; fall back to a seed of 0 in that case.
      const seed = text.length > 0 ? text.charCodeAt(0) : 0;
      return Array.from({ length: EMBEDDING_DIMENSIONS }, (_, i) =>
        Math.sin((seed + i) / EMBEDDING_DIMENSIONS),
      );
    }),
};
|
||||
|
||||
try {
|
||||
registerEmbeddingBackend(customEmbedding);
|
||||
console.log("Custom embedding backend registered");
|
||||
} catch (error) {
|
||||
console.error("Failed to register embedding backend:", error);
|
||||
}
|
||||
```
|
||||
|
||||
The embedding backend must implement:
|
||||
|
||||
- `dimensions()`: Returns the dimensionality of the embeddings
|
||||
- `embed(texts: string[])`: Computes vector embeddings for the given texts
|
||||
67
docs/snippets/wasm/plugins/extractor_registration.md
Normal file
67
docs/snippets/wasm/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,67 @@
|
||||
# Document Extractor Registration
|
||||
|
||||
Register a custom document extractor plugin in WASM that implements the required interface.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, {
|
||||
registerDocumentExtractor,
|
||||
unregisterDocumentExtractor,
|
||||
listDocumentExtractors,
|
||||
extractBytes,
|
||||
} from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Define a custom extractor as a plain JS object with required methods
|
||||
const customExtractor = {
|
||||
// Required: extract document bytes
|
||||
// Takes: (bytes: Uint8Array, mimeType: string, config: object) -> Promise<string> (JSON-encoded {text: string, ...})
|
||||
extractBytes: async (bytes, mimeType, config) => {
|
||||
if (mimeType !== "application/x-custom") {
|
||||
throw new Error("Unsupported MIME type");
|
||||
}
|
||||
// Custom extraction logic
|
||||
const text = new TextDecoder().decode(bytes);
|
||||
return JSON.stringify({
|
||||
text: `Extracted: ${text.slice(0, 100)}`,
|
||||
page_count: 1,
|
||||
language: "en",
|
||||
});
|
||||
},
|
||||
|
||||
// Required: list supported MIME types as JSON array
|
||||
supportedMimeTypes: () => {
|
||||
return JSON.stringify(["application/x-custom"]);
|
||||
},
|
||||
|
||||
// Optional: plugin version string
|
||||
version: () => "1.0.0",
|
||||
};
|
||||
|
||||
// Register the custom extractor
|
||||
try {
|
||||
registerDocumentExtractor(customExtractor);
|
||||
console.log("Extractor registered successfully");
|
||||
} catch (error) {
|
||||
console.error("Failed to register extractor:", error);
|
||||
}
|
||||
|
||||
// List all extractors (includes your custom one)
|
||||
const extractors = listDocumentExtractors();
|
||||
console.log("Available extractors:", extractors);
|
||||
|
||||
// Use the custom extractor via normal extraction
|
||||
const customBytes = new Uint8Array([0x00, 0x01, 0x02]);
|
||||
const result = await extractBytes(customBytes, "application/x-custom", {});
|
||||
console.log("Extraction result:", result);
|
||||
|
||||
// Unregister when done
|
||||
try {
|
||||
unregisterDocumentExtractor("wasm_bridge");
|
||||
console.log("Extractor unregistered");
|
||||
} catch (error) {
|
||||
console.error("Failed to unregister:", error);
|
||||
}
|
||||
```
|
||||
|
||||
The extractor object must implement the `extractBytes` and `supportedMimeTypes` methods. Optional methods: `initialize()` and `shutdown()` for lifecycle management, and `version()` to report the plugin version.
|
||||
39
docs/snippets/wasm/plugins/list_plugins.md
Normal file
39
docs/snippets/wasm/plugins/list_plugins.md
Normal file
@@ -0,0 +1,39 @@
|
||||
# List Registered Plugins
|
||||
|
||||
List all registered plugins of each type: OCR backends, post-processors, validators, and document extractors.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, {
|
||||
listDocumentExtractors,
|
||||
listOcrBackends,
|
||||
listPostProcessors,
|
||||
listValidators,
|
||||
} from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// List all document extractors
|
||||
const extractors = listDocumentExtractors();
|
||||
console.log("Document extractors:", extractors);
|
||||
|
||||
// List all OCR backends
|
||||
const ocrBackends = listOcrBackends();
|
||||
console.log("OCR backends:", ocrBackends);
|
||||
|
||||
// List all post-processors
|
||||
const processors = listPostProcessors();
|
||||
console.log("Post-processors:", processors);
|
||||
|
||||
// List all validators
|
||||
const validators = listValidators();
|
||||
console.log("Validators:", validators);
|
||||
|
||||
// Count registered plugins
|
||||
console.log(`Total plugins registered:
|
||||
Extractors: ${extractors.length}
|
||||
OCR backends: ${ocrBackends.length}
|
||||
Post-processors: ${processors.length}
|
||||
Validators: ${validators.length}`);
|
||||
```
|
||||
|
||||
Use this to verify which plugins are available before extraction or to debug plugin registration issues.
|
||||
51
docs/snippets/wasm/plugins/min_length_validator.md
Normal file
51
docs/snippets/wasm/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,51 @@
|
||||
# Minimum Length Text Validator
|
||||
|
||||
Register a validator that rejects extraction results whose text is shorter than a minimum length threshold.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { registerValidator, extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
/** Minimum text length (in UTF-16 code units) an extraction result must have to pass. */
const MIN_LENGTH = 10;

// Define a minimum length validator
/**
 * Validator that rejects extraction results whose text is shorter than `MIN_LENGTH`.
 */
const minLengthValidator = {
  /**
   * Checks the length of the extracted text.
   *
   * @param extractionResult - Result to check; missing or null text counts as length 0.
   * @returns `{ valid: true, error: null }` when the text is long enough, otherwise
   *   `{ valid: false, error }` with a message stating the actual and required lengths.
   */
  validate: (extractionResult: { text?: string | null }): { valid: boolean; error: string | null } => {
    // `??` (not `||`) because only a missing value should trigger the default.
    const textLength = extractionResult.text?.length ?? 0;

    if (textLength < MIN_LENGTH) {
      return {
        valid: false,
        error: `Text too short: ${textLength} < ${MIN_LENGTH}`,
      };
    }

    return { valid: true, error: null };
  },
};
|
||||
|
||||
try {
|
||||
registerValidator(minLengthValidator);
|
||||
console.log(`Min length validator registered (threshold: ${MIN_LENGTH})`);
|
||||
} catch (error) {
|
||||
console.error("Failed to register validator:", error);
|
||||
}
|
||||
|
||||
// Now extract with validation enabled
|
||||
const pdfBytes = new Uint8Array([
|
||||
/* PDF content */
|
||||
]);
|
||||
const config = {
|
||||
ocr: null,
|
||||
chunking: null,
|
||||
};
|
||||
|
||||
const result = await extractBytes(pdfBytes, "application/pdf", config);
|
||||
console.log("Validated result:", result);
|
||||
```
|
||||
|
||||
This validator ensures extracted text meets minimum quality standards by checking length.
|
||||
57
docs/snippets/wasm/plugins/ocr-backend-custom.ts
Normal file
57
docs/snippets/wasm/plugins/ocr-backend-custom.ts
Normal file
@@ -0,0 +1,57 @@
|
||||
import {
|
||||
initWasm,
|
||||
listOcrBackends,
|
||||
registerOcrBackend,
|
||||
unregisterOcrBackend,
|
||||
} from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Minimal example OCR backend.
 *
 * It only logs its lifecycle calls and returns a fixed placeholder string from
 * `recognize`; replace the method bodies with a real OCR engine.
 */
class CustomOcrBackend {
  /** Backend identifier returned by `getName()`; never reassigned. */
  private readonly name = "custom-ocr";
  /** Whether the backend currently reports itself as enabled. */
  private enabled = true;

  /** Logs that the backend is being initialized. */
  async initialize(): Promise<void> {
    console.log("Initializing custom OCR backend");
  }

  /**
   * Placeholder recognition step.
   *
   * @param imageData - Raw image bytes; only the byte length is logged.
   * @param language - Language code; only logged.
   * @returns A fixed placeholder string.
   */
  async recognize(imageData: Uint8Array, language: string): Promise<string> {
    console.log(`Recognizing text in ${language} from ${imageData.byteLength} bytes`);
    return "Placeholder OCR result";
  }

  /** @returns The backend identifier (`"custom-ocr"`). */
  getName(): string {
    return this.name;
  }

  /** @returns The current enabled flag. */
  isEnabled(): boolean {
    return this.enabled;
  }

  /** @param enabled - New value for the enabled flag. */
  setEnabled(enabled: boolean): void {
    this.enabled = enabled;
  }

  /** Logs that the backend is being cleaned up. */
  async cleanup(): Promise<void> {
    console.log("Cleaning up custom OCR backend");
  }
}
|
||||
|
||||
/**
 * Walks through the custom OCR backend lifecycle: initialize, register, list,
 * unregister, list again, clean up.
 *
 * Unregistration and cleanup run in `finally` blocks so a failure in a middle
 * step does not leave the backend registered or un-cleaned.
 *
 * @returns A promise that resolves once the backend has been cleaned up;
 *   rejects with whatever error an intermediate step threw.
 */
async function demonstrateCustomBackend(): Promise<void> {
  await initWasm();

  const backend = new CustomOcrBackend();
  await backend.initialize();

  try {
    registerOcrBackend(backend);

    try {
      const backends = listOcrBackends();
      console.log("Registered backends:", backends);
    } finally {
      // Always undo the registration made above, even if listing failed.
      unregisterOcrBackend("custom-ocr");
    }

    const afterUnregister = listOcrBackends();
    console.log("Backends after unregister:", afterUnregister);
  } finally {
    // Pairs with backend.initialize() regardless of how the demo ended.
    await backend.cleanup();
  }
}
|
||||
|
||||
demonstrateCustomBackend().catch(console.error);
|
||||
49
docs/snippets/wasm/plugins/pdf_metadata_extractor.md
Normal file
49
docs/snippets/wasm/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,49 @@
|
||||
# PDF Metadata Post-Processor
|
||||
|
||||
Register a post-processor that extracts and enriches extraction results with PDF metadata.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Define a PDF metadata extractor post-processor
|
||||
/**
 * Post-processor that adds bookkeeping metadata (processor name, timestamp,
 * word count) to an extraction result.
 */
const pdfMetadataProcessor = {
  /** @returns The stage identifier this processor declares (`"post-extraction"`). */
  processingStage: () => "post-extraction",

  /**
   * Returns a copy of the result with extra metadata fields.
   *
   * @param extractionResult - Result to enrich; missing text is treated as empty.
   * @returns A new object with all original fields plus `metadata.processorName`,
   *   `metadata.processedAt` (ISO timestamp) and `metadata.wordCount`.
   */
  process: (extractionResult: { text?: string | null; metadata?: Record<string, unknown> }) => {
    // Trim first and special-case the empty string: "".split(/\s+/) yields [""],
    // which would otherwise report one word for an empty document, and
    // leading/trailing whitespace would add phantom empty tokens.
    const trimmed = (extractionResult.text ?? "").trim();
    const wordCount = trimmed === "" ? 0 : trimmed.split(/\s+/).length;

    // Enrich extraction with metadata
    return {
      ...extractionResult,
      metadata: {
        ...extractionResult.metadata,
        processorName: "pdf-metadata",
        processedAt: new Date().toISOString(),
        wordCount,
      },
    };
  },
};
|
||||
|
||||
try {
|
||||
registerPostProcessor(pdfMetadataProcessor);
|
||||
console.log("PDF metadata post-processor registered");
|
||||
} catch (error) {
|
||||
console.error("Failed to register post-processor:", error);
|
||||
}
|
||||
|
||||
// Extract with post-processing
|
||||
const pdfBytes = new Uint8Array([
|
||||
/* PDF content */
|
||||
]);
|
||||
const config = {
|
||||
ocr: null,
|
||||
chunking: null,
|
||||
};
|
||||
|
||||
const result = await extractBytes(pdfBytes, "application/pdf", config);
|
||||
console.log("Enriched metadata:", result.metadata);
|
||||
```
|
||||
|
||||
The post-processor runs after extraction to enrich or transform the results.
|
||||
71
docs/snippets/wasm/plugins/pdf_only_processor.md
Normal file
71
docs/snippets/wasm/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,71 @@
|
||||
# PDF-Only Post-Processor
|
||||
|
||||
Register a post-processor that only transforms PDF documents and passes all other documents through unchanged.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Define a PDF-only post-processor
|
||||
/**
 * Post-processor that normalizes PDF extraction results and returns every
 * other result untouched.
 */
const pdfOnlyProcessor = {
  /** @returns The stage identifier this processor declares (`"post-extraction"`). */
  processingStage: () => "post-extraction",

  /**
   * Applies PDF-specific normalization when the result looks like a PDF.
   *
   * A result counts as a PDF when `metadata.mimeType` is `"application/pdf"` or
   * `metadata.source` is a string ending in `.pdf` (case-insensitive).
   *
   * @param extractionResult - Result to inspect.
   * @returns The same object for non-PDF results; otherwise a copy with
   *   `metadata.pdfProcessed = true`, a `metadata.pageCount` default of 1, and
   *   text with runs of 3+ newlines collapsed to two and outer whitespace trimmed.
   */
  process: (extractionResult: { text?: string | null; metadata?: Record<string, unknown> }) => {
    const metadata = extractionResult.metadata;
    const source = metadata?.source;

    // Check if this is a PDF extraction. The typeof guard avoids a TypeError on
    // non-string sources, and lower-casing accepts extensions such as ".PDF".
    const isPdf =
      metadata?.mimeType === "application/pdf" ||
      (typeof source === "string" && source.toLowerCase().endsWith(".pdf"));

    if (!isPdf) {
      // Skip processing for non-PDF documents
      return extractionResult;
    }

    // Apply PDF-specific processing
    return {
      ...extractionResult,
      metadata: {
        ...metadata,
        pdfProcessed: true,
        pageCount: metadata?.pageCount || 1,
      },
      // Normalize text for PDFs: collapse excessive line breaks, then trim.
      text: (extractionResult.text || "").replace(/\n{3,}/g, "\n\n").trim(),
    };
  },
};
|
||||
|
||||
try {
|
||||
registerPostProcessor(pdfOnlyProcessor);
|
||||
console.log("PDF-only post-processor registered");
|
||||
} catch (error) {
|
||||
console.error("Failed to register post-processor:", error);
|
||||
}
|
||||
|
||||
// Test with various documents
|
||||
const testDocs = [
|
||||
{
|
||||
bytes: new Uint8Array([
|
||||
/* PDF */
|
||||
]),
|
||||
type: "application/pdf",
|
||||
},
|
||||
{
|
||||
bytes: new Uint8Array([
|
||||
/* HTML */
|
||||
]),
|
||||
type: "text/html",
|
||||
},
|
||||
];
|
||||
|
||||
for (const doc of testDocs) {
|
||||
const result = await extractBytes(doc.bytes, doc.type, {});
|
||||
console.log(`${doc.type}: PDF-specific processing applied:`, result.metadata?.pdfProcessed);
|
||||
}
|
||||
```
|
||||
|
||||
This processor applies PDF-specific transformations only to PDF documents.
|
||||
66
docs/snippets/wasm/plugins/plugin-lifecycle.ts
Normal file
66
docs/snippets/wasm/plugins/plugin-lifecycle.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
import { initWasm, TesseractWasmBackend } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * In-memory registry that drives the optional `initialize()` / `cleanup()`
 * hooks of plugins as they are registered, unregistered and reloaded.
 */
class PluginManager {
  // Plugins are duck-typed: any object is accepted and its optional
  // initialize/cleanup hooks are probed at runtime, hence `any`.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private plugins: Map<string, any> = new Map();

  /**
   * Initializes a plugin (if it has `initialize`) and stores it under `name`.
   *
   * If another plugin is already registered under `name`, it is unregistered
   * first so its `cleanup` hook runs instead of being silently dropped.
   *
   * @param name - Key to register the plugin under.
   * @param plugin - Plugin object; `initialize` and `cleanup` are optional.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  async registerPlugin(name: string, plugin: any): Promise<void> {
    console.log(`Registering plugin: ${name}`);

    if (this.plugins.has(name)) {
      await this.unregisterPlugin(name);
    }

    if (plugin.initialize) {
      await plugin.initialize();
    }

    this.plugins.set(name, plugin);
    console.log(`Plugin ${name} registered successfully`);
  }

  /**
   * Runs the plugin's `cleanup` hook (if any) and removes it from the registry.
   * Logs a warning and does nothing when `name` is not registered.
   *
   * @param name - Key of the plugin to remove.
   */
  async unregisterPlugin(name: string): Promise<void> {
    const plugin = this.plugins.get(name);
    if (!plugin) {
      console.warn(`Plugin ${name} not found`);
      return;
    }

    if (plugin.cleanup) {
      await plugin.cleanup();
    }

    this.plugins.delete(name);
    console.log(`Plugin ${name} unregistered`);
  }

  /** @returns The names of all registered plugins, in registration order. */
  listPlugins(): string[] {
    return Array.from(this.plugins.keys());
  }

  /**
   * Unregisters and re-registers the same plugin instance, which runs its
   * `cleanup` hook followed by its `initialize` hook. Logs a warning and does
   * nothing when `name` is not registered.
   *
   * @param name - Key of the plugin to reload.
   */
  async reloadPlugin(name: string): Promise<void> {
    const plugin = this.plugins.get(name);
    if (!plugin) {
      console.warn(`Plugin ${name} not found`);
      return;
    }

    console.log(`Reloading plugin: ${name}`);
    await this.unregisterPlugin(name);
    await this.registerPlugin(name, plugin);
  }
}
|
||||
|
||||
async function demonstratePluginLifecycle() {
|
||||
await initWasm();
|
||||
|
||||
const manager = new PluginManager();
|
||||
|
||||
const backend = new TesseractWasmBackend();
|
||||
await manager.registerPlugin("tesseract", backend);
|
||||
|
||||
console.log("Active plugins:", manager.listPlugins());
|
||||
|
||||
await manager.reloadPlugin("tesseract");
|
||||
|
||||
await manager.unregisterPlugin("tesseract");
|
||||
|
||||
console.log("Active plugins:", manager.listPlugins());
|
||||
}
|
||||
|
||||
demonstratePluginLifecycle().catch(console.error);
|
||||
55
docs/snippets/wasm/plugins/plugin-pipeline.ts
Normal file
55
docs/snippets/wasm/plugins/plugin-pipeline.ts
Normal file
@@ -0,0 +1,55 @@
|
||||
import type { ExtractionResult } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
interface Plugin {
|
||||
name: string;
|
||||
execute: (result: ExtractionResult) => Promise<ExtractionResult>;
|
||||
}
|
||||
|
||||
/**
 * Pipeline stage that strips NUL characters from the content, collapses every
 * run of whitespace into a single space, and trims the ends.
 */
class TextCleanerPlugin implements Plugin {
  name = "text-cleaner";

  /**
   * @param result - Extraction result whose `content` should be cleaned.
   * @returns A copy of `result` with the cleaned `content`.
   */
  async execute(result: ExtractionResult): Promise<ExtractionResult> {
    const withoutNuls = result.content.replace(/\x00/g, "");
    const singleSpaced = withoutNuls.replace(/\s+/g, " ");
    const content = singleSpaced.trim();

    return { ...result, content };
  }
}
|
||||
|
||||
/**
 * Pipeline stage that stamps the result's metadata with the processing time
 * and the length of the content.
 */
class MetadataEnricherPlugin implements Plugin {
  name = "metadata-enricher";

  /**
   * @param result - Extraction result to annotate.
   * @returns A copy of `result` whose `metadata` additionally carries
   *   `processedAt` (ISO timestamp) and `contentLength`.
   */
  async execute(result: ExtractionResult): Promise<ExtractionResult> {
    const metadata = {
      ...result.metadata,
      processedAt: new Date().toISOString(),
      contentLength: result.content.length,
    };

    return { ...result, metadata };
  }
}
|
||||
|
||||
/**
 * Extracts a document and then feeds the result through each plugin in turn.
 *
 * @param bytes - Raw document bytes passed to `extractBytes`.
 * @param mimeType - MIME type passed to `extractBytes`.
 * @param plugins - Stages to run sequentially; each receives the previous stage's output.
 * @returns The result produced by the last plugin (or the raw extraction when `plugins` is empty).
 */
async function executePipeline(
  bytes: Uint8Array,
  mimeType: string,
  plugins: Plugin[],
): Promise<ExtractionResult> {
  await initWasm();

  let current = await extractBytes(bytes, mimeType);

  // Stages must run one after another: each consumes the previous output.
  for (let index = 0; index < plugins.length; index += 1) {
    const stage = plugins[index];
    console.log(`Executing plugin: ${stage.name}`);
    current = await stage.execute(current);
  }

  return current;
}
|
||||
|
||||
const pipeline = [new TextCleanerPlugin(), new MetadataEnricherPlugin()];
|
||||
|
||||
executePipeline(new Uint8Array([1, 2, 3]), "application/pdf", pipeline)
|
||||
.then((r) => console.log("Pipeline complete", r))
|
||||
.catch(console.error);
|
||||
53
docs/snippets/wasm/plugins/plugin_extractor.md
Normal file
53
docs/snippets/wasm/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,53 @@
|
||||
# Custom Plugin Usage Pattern
|
||||
|
||||
Demonstrate the pattern for using registered plugins during document extraction.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes, registerPostProcessor } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Register a custom post-processor
|
||||
const customProcessor = {
|
||||
processingStage: () => "post-extraction",
|
||||
process: (result) => {
|
||||
console.log("Post-processor: enriching extraction result");
|
||||
return {
|
||||
...result,
|
||||
metadata: {
|
||||
...result.metadata,
|
||||
enriched: true,
|
||||
processorApplied: "customProcessor",
|
||||
},
|
||||
};
|
||||
},
|
||||
};
|
||||
|
||||
registerPostProcessor(customProcessor);
|
||||
|
||||
// Extract document with registered plugin
|
||||
async function extractWithPlugins(fileBytes, mimeType) {
|
||||
const config = {
|
||||
ocr: null,
|
||||
chunking: null,
|
||||
enableQualityProcessing: false,
|
||||
};
|
||||
|
||||
// Extraction automatically applies registered post-processors
|
||||
const result = await extractBytes(fileBytes, mimeType, config);
|
||||
|
||||
console.log("Extraction complete");
|
||||
console.log("Plugins applied:", result.metadata?.enriched);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Usage
|
||||
const pdfBytes = new Uint8Array([
|
||||
/* PDF content */
|
||||
]);
|
||||
const result = await extractWithPlugins(pdfBytes, "application/pdf");
|
||||
console.log("Final result:", result);
|
||||
```
|
||||
|
||||
The extraction pipeline automatically applies all registered plugins in the correct order.
|
||||
70
docs/snippets/wasm/plugins/plugin_logging.md
Normal file
70
docs/snippets/wasm/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,70 @@
|
||||
# Plugin Logging and Debugging
|
||||
|
||||
Log plugin registration and execution for debugging purposes.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, {
|
||||
registerPostProcessor,
|
||||
registerValidator,
|
||||
registerOcrBackend,
|
||||
listPostProcessors,
|
||||
listValidators,
|
||||
listOcrBackends,
|
||||
} from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Track plugin registrations
|
||||
/**
 * Names of the plugins this script has registered, grouped by plugin kind.
 *
 * The explicit annotation matters: without it the empty array literals are
 * inferred as `never[]` under strict mode and pushing a name does not type-check.
 */
const pluginLog: { processors: string[]; validators: string[]; ocrBackends: string[] } = {
  processors: [],
  validators: [],
  ocrBackends: [],
};
|
||||
|
||||
// Register a logging post-processor
|
||||
const loggingProcessor = {
|
||||
processingStage: () => "post-extraction",
|
||||
process: (result) => {
|
||||
console.log("[POST-PROCESSOR] Processing extraction result", {
|
||||
textLength: result.text?.length,
|
||||
hasMetadata: !!result.metadata,
|
||||
});
|
||||
return result;
|
||||
},
|
||||
};
|
||||
|
||||
registerPostProcessor(loggingProcessor);
|
||||
pluginLog.processors.push("loggingProcessor");
|
||||
|
||||
// Register a logging validator
|
||||
const loggingValidator = {
|
||||
validate: (result) => {
|
||||
console.log("[VALIDATOR] Validating extraction result", {
|
||||
textLength: result.text?.length,
|
||||
isValid: true,
|
||||
});
|
||||
return { valid: true, error: null };
|
||||
},
|
||||
};
|
||||
|
||||
registerValidator(loggingValidator);
|
||||
pluginLog.validators.push("loggingValidator");
|
||||
|
||||
// Log registered plugins
|
||||
function logPluginStatus() {
|
||||
const processors = listPostProcessors();
|
||||
const validators = listValidators();
|
||||
const backends = listOcrBackends();
|
||||
|
||||
console.log("Plugin Registration Status:", {
|
||||
postProcessors: processors,
|
||||
validators: validators,
|
||||
ocrBackends: backends,
|
||||
total: processors.length + validators.length + backends.length,
|
||||
});
|
||||
}
|
||||
|
||||
logPluginStatus();
|
||||
```
|
||||
|
||||
Use this pattern to monitor and debug plugin lifecycle and execution.
|
||||
74
docs/snippets/wasm/plugins/plugin_testing.md
Normal file
74
docs/snippets/wasm/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,74 @@
|
||||
# Plugin Testing Pattern
|
||||
|
||||
Test custom plugins to verify they implement required interfaces correctly.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { registerValidator, registerPostProcessor } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Test fixture: sample extraction result
|
||||
const sampleResult = {
|
||||
text: "Sample extracted text from document",
|
||||
metadata: {
|
||||
mimeType: "application/pdf",
|
||||
source: "test.pdf",
|
||||
pageCount: 1,
|
||||
},
|
||||
};
|
||||
|
||||
// Test post-processor registration
|
||||
function testPostProcessorRegistration() {
|
||||
const processor = {
|
||||
processingStage: () => "post-extraction",
|
||||
process: (result) => result,
|
||||
};
|
||||
|
||||
try {
|
||||
registerPostProcessor(processor);
|
||||
console.log("✓ Post-processor registered successfully");
|
||||
} catch (error) {
|
||||
console.error("✗ Post-processor registration failed:", error);
|
||||
}
|
||||
}
|
||||
|
||||
// Test validator registration
|
||||
function testValidatorRegistration() {
|
||||
const validator = {
|
||||
validate: (result) => ({
|
||||
valid: !!result.text,
|
||||
error: result.text ? null : "No text extracted",
|
||||
}),
|
||||
};
|
||||
|
||||
try {
|
||||
registerValidator(validator);
|
||||
console.log("✓ Validator registered successfully");
|
||||
} catch (error) {
|
||||
console.error("✗ Validator registration failed:", error);
|
||||
}
|
||||
}
|
||||
|
||||
// Test required methods validation
|
||||
function testInterfaceValidation() {
|
||||
// Missing required method should fail
|
||||
const invalidProcessor = {
|
||||
// Missing processingStage() method
|
||||
process: (result) => result,
|
||||
};
|
||||
|
||||
try {
|
||||
registerPostProcessor(invalidProcessor);
|
||||
console.error("✗ Should have rejected processor with missing methods");
|
||||
} catch (error) {
|
||||
console.log("✓ Correctly rejected invalid processor:", error);
|
||||
}
|
||||
}
|
||||
|
||||
// Run tests
|
||||
testPostProcessorRegistration();
|
||||
testValidatorRegistration();
|
||||
testInterfaceValidation();
|
||||
```
|
||||
|
||||
Validate plugin implementations before deploying to production.
|
||||
69
docs/snippets/wasm/plugins/plugin_validator.md
Normal file
69
docs/snippets/wasm/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,69 @@
|
||||
# Register Custom Validator Plugin
|
||||
|
||||
Register a custom validator that checks extraction results for quality or correctness.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { registerValidator, extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Define a custom validator
|
||||
/**
 * Example validator that rejects results that are empty, contain long runs of
 * one repeated character, or consist mostly of whitespace.
 */
const customValidator = {
  /**
   * Runs the quality checks in order and reports the first failure.
   *
   * @param extractionResult - Result to check; missing or null text counts as empty.
   * @returns `{ valid: true, error: null }` when every check passes, otherwise
   *   `{ valid: false, error }` describing the first failed check.
   */
  validate: (extractionResult: { text?: string | null }): { valid: boolean; error: string | null } => {
    const text = extractionResult.text || "";

    // Check for minimum content
    if (text.length === 0) {
      return { valid: false, error: "No text extracted from document" };
    }

    // Check for suspicious patterns: the same character six or more times in a row.
    const hasRepeatingChars = /(.)\1{5,}/.test(text);
    if (hasRepeatingChars) {
      return {
        valid: false,
        error: "Text contains excessive repeating characters (possible OCR error)",
      };
    }

    // Check if text is mostly whitespace. Count every whitespace character, not
    // just leading/trailing ones (which is all a trim()-based comparison sees).
    const visibleChars = text.replace(/\s/g, "").length;
    if (visibleChars < text.length * 0.5) {
      return { valid: false, error: "Text is mostly whitespace" };
    }

    return { valid: true, error: null };
  },
};
|
||||
|
||||
try {
|
||||
registerValidator(customValidator);
|
||||
console.log("Custom validator registered");
|
||||
} catch (error) {
|
||||
console.error("Failed to register validator:", error);
|
||||
}
|
||||
|
||||
// Extract and validate
|
||||
async function extractAndValidate(fileBytes, mimeType) {
|
||||
const result = await extractBytes(fileBytes, mimeType, {});
|
||||
|
||||
const validation = customValidator.validate(result);
|
||||
if (!validation.valid) {
|
||||
console.warn("Validation failed:", validation.error);
|
||||
} else {
|
||||
console.log("✓ Extraction passed validation");
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
```
|
||||
|
||||
Validators run after extraction to ensure results meet quality standards.
|
||||
37
docs/snippets/wasm/plugins/post-processor-custom.ts
Normal file
37
docs/snippets/wasm/plugins/post-processor-custom.ts
Normal file
@@ -0,0 +1,37 @@
|
||||
import type { ExtractionResult } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Example post-processor that prefixes every non-empty line of the content
 * with `# ` and collapses runs of blank lines into a single blank line.
 */
class MarkdownFormatter {
  /**
   * @param result - Extraction result whose `content` should be reformatted.
   * @returns A copy of `result` with the reformatted `content`.
   */
  async process(result: ExtractionResult): Promise<ExtractionResult> {
    const asHeadings = result.content.replace(/^(.+)$/gm, "# $1");
    const content = asHeadings.replace(/\n\n+/g, "\n\n");

    return { ...result, content };
  }

  /** @returns The processor's name. */
  getName(): string {
    return "markdown-formatter";
  }

  /** @returns The processor's version string. */
  getVersion(): string {
    return "1.0.0";
  }
}
|
||||
|
||||
/**
 * Fetches `document.pdf`, extracts it, and runs the result through
 * `MarkdownFormatter`.
 *
 * @returns The formatted extraction result.
 * @throws Error when the HTTP request for the document does not succeed, so an
 *   error page is never handed to the extractor as if it were a PDF.
 */
async function demonstrateCustomProcessor() {
  await initWasm();

  const processor = new MarkdownFormatter();

  const response = await fetch("document.pdf");
  if (!response.ok) {
    throw new Error(`Failed to fetch document.pdf: ${response.status} ${response.statusText}`);
  }
  const bytes = new Uint8Array(await response.arrayBuffer());

  const extracted = await extractBytes(bytes, "application/pdf");

  const result = await processor.process(extracted);
  console.log("Formatted result:", result.content);

  return result;
}
|
||||
|
||||
demonstrateCustomProcessor().catch(console.error);
|
||||
76
docs/snippets/wasm/plugins/quality_score_validator.md
Normal file
76
docs/snippets/wasm/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,76 @@
|
||||
# Quality Score Validator
|
||||
|
||||
Register a validator that computes and checks a quality score for extracted text.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { registerValidator, extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Define a quality score validator
|
||||
/**
 * Validator that starts from a score of 100, subtracts penalties for several
 * text-quality problems, and rejects results scoring below 60.
 */
const qualityScoreValidator = {
  /**
   * Scores an extraction result.
   *
   * Penalties: empty text (-50), more than 50% whitespace (-20), more than 30%
   * non-ASCII characters (-10), `metadata.confidence` below 0.5 (-15).
   *
   * @param extractionResult - Result to score; missing text counts as empty and
   *   a missing or non-numeric confidence is ignored.
   * @returns `valid` (score >= 60), `error` (null when valid, otherwise the score
   *   and the list of issues) and `metadata` with `qualityScore` and `issues`.
   */
  validate: (extractionResult: { text?: string | null; metadata?: Record<string, unknown> | null }) => {
    const text = extractionResult.text || "";
    const confidence = extractionResult.metadata?.confidence;

    let score = 100;
    const issues: string[] = [];

    if (text.length === 0) {
      // Penalize empty text. The ratio checks are skipped because they would
      // divide by zero and produce NaN.
      score -= 50;
      issues.push("No text extracted");
    } else {
      // Penalize if mostly whitespace
      const nonWhitespace = text.replace(/\s/g, "").length;
      const whitespaceRatio = 1 - nonWhitespace / text.length;
      if (whitespaceRatio > 0.5) {
        score -= 20;
        issues.push("High whitespace ratio");
      }

      // Penalize unusual character distributions
      const unicodeRatio = (text.match(/[^\x00-\x7F]/g) || []).length / text.length;
      if (unicodeRatio > 0.3) {
        score -= 10;
        issues.push("High Unicode character ratio");
      }
    }

    // Check confidence if available. A typeof check is used instead of
    // truthiness so that a confidence of exactly 0 is still penalized.
    if (typeof confidence === "number" && confidence < 0.5) {
      score -= 15;
      issues.push("Low confidence score");
    }

    const QUALITY_THRESHOLD = 60;
    const isValid = score >= QUALITY_THRESHOLD;

    return {
      valid: isValid,
      error: isValid ? null : `Quality score ${score} < ${QUALITY_THRESHOLD}: ${issues.join(", ")}`,
      metadata: {
        qualityScore: score,
        issues: issues,
      },
    };
  },
};
|
||||
|
||||
try {
|
||||
registerValidator(qualityScoreValidator);
|
||||
console.log("Quality score validator registered");
|
||||
} catch (error) {
|
||||
console.error("Failed to register validator:", error);
|
||||
}
|
||||
|
||||
// Extract with quality assessment
|
||||
const pdfBytes = new Uint8Array([
|
||||
/* PDF content */
|
||||
]);
|
||||
const result = await extractBytes(pdfBytes, "application/pdf", {});
|
||||
const validation = qualityScoreValidator.validate(result);
|
||||
console.log("Quality assessment:", validation.metadata);
|
||||
```
|
||||
|
||||
This validator assigns a quality score based on multiple text characteristics.
|
||||
79
docs/snippets/wasm/plugins/stateful_plugin.md
Normal file
79
docs/snippets/wasm/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,79 @@
|
||||
# Stateful Post-Processor Plugin
|
||||
|
||||
Create a stateful post-processor that maintains state across multiple extraction calls.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Create a stateful post-processor using a closure
|
||||
/**
 * Builds a post-processor whose closure keeps running statistics across calls.
 *
 * Every call to `createStatefulProcessor` creates an independent state object;
 * all `process` calls on one returned processor share that state.
 *
 * @returns An object with `processingStage`, `process` and `getState`.
 */
function createStatefulProcessor() {
  // Explicitly typed so `lastResult` is not inferred as the literal type `null`.
  const state: {
    extractionCount: number;
    totalChars: number;
    lastResult: { text?: string | null; metadata?: Record<string, unknown> } | null;
  } = {
    extractionCount: 0,
    totalChars: 0,
    lastResult: null,
  };

  return {
    /** @returns The stage identifier this processor declares (`"post-extraction"`). */
    processingStage: () => "post-extraction",

    /**
     * Updates the running statistics and returns an enriched copy of the result.
     *
     * @param extractionResult - Result to record; missing text counts as 0 characters.
     * @returns A copy of the result whose metadata additionally carries
     *   `extractionIndex`, `cumulativeChars` and `averageDocLength` (rounded).
     */
    process: (extractionResult: { text?: string | null; metadata?: Record<string, unknown> }) => {
      // Update state
      state.extractionCount++;
      state.totalChars += extractionResult.text?.length ?? 0;
      state.lastResult = extractionResult;

      // Enrich result with statistics
      const enriched = {
        ...extractionResult,
        metadata: {
          ...extractionResult.metadata,
          extractionIndex: state.extractionCount,
          cumulativeChars: state.totalChars,
          averageDocLength: Math.round(state.totalChars / state.extractionCount),
        },
      };

      console.log(
        `[Extraction ${state.extractionCount}] ${enriched.text?.length || 0} chars, cumulative: ${state.totalChars}`,
      );

      return enriched;
    },

    // Optional: expose state for inspection. This is the live state object,
    // typed read-only so callers can observe it but not modify it.
    getState: (): Readonly<typeof state> => state,
  };
}
|
||||
|
||||
// Register the stateful processor
|
||||
const statefulProcessor = createStatefulProcessor();
|
||||
registerPostProcessor(statefulProcessor);
|
||||
|
||||
// Multiple extractions use the same state
|
||||
/**
 * Extract three placeholder PDF documents one after another.
 *
 * The calls are awaited sequentially so that a registered stateful
 * post-processor sees the documents in a fixed order.
 *
 * @returns The extraction results, in the same order as the input documents.
 */
async function processMultipleDocs() {
  const mimeType = "application/pdf";
  const documents = [
    new Uint8Array([
      /* Doc 1 */
    ]),
    new Uint8Array([
      /* Doc 2 */
    ]),
    new Uint8Array([
      /* Doc 3 */
    ]),
  ];

  const extracted = [];
  for (let index = 0; index < documents.length; index += 1) {
    extracted.push(await extractBytes(documents[index], mimeType, {}));
  }

  return extracted;
}
|
||||
|
||||
await processMultipleDocs();
|
||||
```
|
||||
|
||||
Stateful processors can track metrics across multiple extractions or maintain context.
|
||||
77
docs/snippets/wasm/plugins/unregister_plugins.md
Normal file
77
docs/snippets/wasm/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,77 @@
|
||||
# Unregister Plugins
|
||||
|
||||
Remove registered plugins from the WASM runtime using individual unregister or bulk clear operations.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, {
|
||||
registerDocumentExtractor,
|
||||
unregisterDocumentExtractor,
|
||||
listDocumentExtractors,
|
||||
clearDocumentExtractors,
|
||||
registerOcrBackend,
|
||||
unregisterOcrBackend,
|
||||
listOcrBackends,
|
||||
clearOcrBackends,
|
||||
registerPostProcessor,
|
||||
unregisterPostProcessor,
|
||||
listPostProcessors,
|
||||
clearPostProcessors,
|
||||
registerRenderer,
|
||||
unregisterRenderer,
|
||||
listRenderers,
|
||||
clearRenderers,
|
||||
registerValidator,
|
||||
unregisterValidator,
|
||||
listValidators,
|
||||
clearValidators,
|
||||
} from "kreuzberg-wasm";
|
||||
|
||||
await init();

// Example: register a custom document extractor.
// Both callbacks hand their payload back as a JSON string.
const extractor = {
  extractBytes: async (bytes, mimeType, config) => {
    return JSON.stringify({ text: "test", page_count: 1 });
  },
  supportedMimeTypes: () => JSON.stringify(["application/x-test"]),
};

registerDocumentExtractor(extractor);
console.log("Registered extractors:", listDocumentExtractors());

// Individual unregistration by plugin name
try {
  unregisterDocumentExtractor("wasm_bridge");
  console.log("Extractor unregistered");
} catch (error) {
  console.error("Unregister failed:", error);
}

// Clear all plugins of a type
clearPostProcessors();
console.log("After clearPostProcessors:", listPostProcessors());

clearOcrBackends();
console.log("After clearOcrBackends:", listOcrBackends());

clearRenderers();
console.log("After clearRenderers:", listRenderers());

clearValidators();
console.log("After clearValidators:", listValidators());

// Selective re-registration: clear and register only desired plugins
clearPostProcessors();
const myProcessor = {
  processingStage: () => "post-extraction",
  process: (result) => result, // Pass-through
};
registerPostProcessor(myProcessor);
console.log("After selective re-register:", listPostProcessors());

// Unregister specific plugin by name.
// Guarded like the extractor example above so a failed unregister is reported
// instead of aborting the rest of the script.
try {
  unregisterPostProcessor("wasm_bridge");
} catch (error) {
  console.error("Unregister failed:", error);
}
console.log("After selective unregister:", listPostProcessors());
|
||||
```
|
||||
|
||||
Use `unregister*` to remove an individual plugin by name, or `clear*` to remove every plugin of a type at once. All custom plugins are registered under the default name `"wasm_bridge"`, which is managed by the bridge.
|
||||
68
docs/snippets/wasm/plugins/validator-custom.ts
Normal file
68
docs/snippets/wasm/plugins/validator-custom.ts
Normal file
@@ -0,0 +1,68 @@
|
||||
import type { ExtractionResult } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/** A single problem found while validating an extraction result. */
interface ValidationError {
  /** Name of the result field the problem relates to. */
  field: string;
  /** Human-readable description of the problem. */
  message: string;
}

/**
 * Validates an extraction result: its content length must lie within the
 * configured bounds and it must carry a MIME type.
 */
class ContentValidator {
  private minContentLength = 10;
  private readonly maxContentLength = 10000000;

  /**
   * Set the minimum accepted content length.
   *
   * @param length - Minimum number of characters the content must have.
   */
  setMinLength(length: number) {
    this.minContentLength = length;
  }

  /**
   * Check a result against the length bounds and the MIME-type requirement.
   *
   * @param result - Extraction result to inspect.
   * @returns Every problem found; an empty array means the result passed.
   */
  validate(result: ExtractionResult): ValidationError[] {
    const errors: ValidationError[] = [];

    // Missing content is treated as empty so it is reported as a validation
    // error rather than surfacing as a TypeError from `.length`.
    const contentLength = result.content?.length ?? 0;

    if (contentLength < this.minContentLength) {
      errors.push({
        field: "content",
        message: `Content length (${contentLength}) is below minimum (${this.minContentLength})`,
      });
    }

    if (contentLength > this.maxContentLength) {
      errors.push({
        field: "content",
        message: `Content length (${contentLength}) exceeds maximum (${this.maxContentLength})`,
      });
    }

    if (!result.mimeType) {
      errors.push({
        field: "mimeType",
        message: "MIME type is required",
      });
    }

    return errors;
  }

  /** @returns The identifier of this validator. */
  getName(): string {
    return "content-validator";
  }
}
|
||||
|
||||
/**
 * Initialise the WASM module, extract `document.pdf`, and print the outcome
 * of running a `ContentValidator` (minimum length 100) over the result.
 */
async function demonstrateValidator() {
  await initWasm();

  const contentValidator = new ContentValidator();
  contentValidator.setMinLength(100);

  const response = await fetch("document.pdf");
  const pdfBytes = new Uint8Array(await response.arrayBuffer());

  const extraction = await extractBytes(pdfBytes, "application/pdf");
  const problems = contentValidator.validate(extraction);

  // Nothing to report: say so and stop.
  if (problems.length === 0) {
    console.log("Content validation passed");
    return;
  }

  console.log("Validation errors:");
  for (const problem of problems) {
    console.log(` ${problem.field}: ${problem.message}`);
  }
}
|
||||
|
||||
demonstrateValidator().catch(console.error);
|
||||
84
docs/snippets/wasm/plugins/word_count_processor.md
Normal file
84
docs/snippets/wasm/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,84 @@
|
||||
# Word Count Post-Processor
|
||||
|
||||
Register a post-processor that computes word count and other text statistics.
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { registerPostProcessor, extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Define a word count post-processor
|
||||
/**
 * Post-processor that attaches text statistics to `metadata.statistics`.
 *
 * All other fields of the extraction result, and any existing metadata keys,
 * are passed through unchanged.
 */
const wordCountProcessor = {
  processingStage: () => "post-extraction",
  process: (extractionResult) => {
    // The typed validator example reads the extracted text from `content`;
    // `text` is kept as a fallback so results shaped the old way still work.
    const text = extractionResult.content || extractionResult.text || "";

    // Compute statistics
    const words = text
      .trim()
      .split(/\s+/)
      .filter((w) => w.length > 0);
    const lines = text.split(/\n/).filter((l) => l.trim().length > 0);
    const paragraphs = text.split(/\n{2,}/).filter((p) => p.trim().length > 0);
    // Sentences are approximated by splitting on runs of ".", "!" or "?".
    const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 0);

    // Calculate reading time (average 200 words per minute)
    const readingTimeMinutes = Math.ceil(words.length / 200);

    // Compute character statistics
    const chars = text.length;
    const charsNoSpaces = text.replace(/\s/g, "").length;

    // Enrich metadata with text statistics
    const enriched = {
      ...extractionResult,
      metadata: {
        ...extractionResult.metadata,
        statistics: {
          wordCount: words.length,
          lineCount: lines.length,
          paragraphCount: paragraphs.length,
          sentenceCount: sentences.length,
          charCount: chars,
          charsNoSpaces: charsNoSpaces,
          // Non-whitespace characters per word, rounded to an integer.
          averageWordLength: words.length > 0 ? Math.round(charsNoSpaces / words.length) : 0,
          // Measured in words per non-empty line, rounded to an integer.
          averageLineLength: lines.length > 0 ? Math.round(words.length / lines.length) : 0,
          readingTimeMinutes: readingTimeMinutes,
        },
      },
    };

    return enriched;
  },
};
|
||||
|
||||
try {
|
||||
registerPostProcessor(wordCountProcessor);
|
||||
console.log("Word count post-processor registered");
|
||||
} catch (error) {
|
||||
console.error("Failed to register post-processor:", error);
|
||||
}
|
||||
|
||||
// Extract with word counting
|
||||
/**
 * Extract a document and log the statistics found under
 * `metadata.statistics` on the result.
 *
 * @param fileBytes - Raw bytes of the document.
 * @param mimeType - MIME type passed through to `extractBytes`.
 * @returns The extraction result, unchanged.
 */
async function extractAndAnalyze(fileBytes, mimeType) {
  const extraction = await extractBytes(fileBytes, mimeType, {});

  // Every field is undefined when no statistics were attached.
  const { wordCount, lineCount, paragraphCount, sentenceCount, readingTimeMinutes } =
    extraction.metadata?.statistics ?? {};

  const summary = {
    words: wordCount,
    lines: lineCount,
    paragraphs: paragraphCount,
    sentences: sentenceCount,
    readingTime: `${readingTimeMinutes} min`,
  };
  console.log("Text Analysis:", summary);

  return extraction;
}
|
||||
|
||||
const pdfBytes = new Uint8Array([
|
||||
/* PDF content */
|
||||
]);
|
||||
await extractAndAnalyze(pdfBytes, "application/pdf");
|
||||
```
|
||||
|
||||
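This processor analyzes the extracted text and adds word, line, paragraph, sentence, and character counts, plus an estimated reading time, to the result metadata.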
|
||||
Reference in New Issue
Block a user