Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/packages/dart/lib/kreuzberg.dart
+++ b/packages/dart/lib/kreuzberg.dart
@@ -0,0 +1,3 @@
+// Generated by alef. Do not edit by hand.
+
+export 'src/kreuzberg.dart';
--- a/packages/dart/lib/src/LICENSE
+++ b/packages/dart/lib/src/LICENSE
@@ -0,0 +1,93 @@
+Elastic License 2.0 (ELv2)
+
+Copyright 2025-2026 Kreuzberg, Inc.
+
+Acceptance
+
+By using the software, you agree to all of the terms and conditions below.
+
+Copyright License
+
+The licensor grants you a non-exclusive, royalty-free, worldwide,
+non-sublicensable, non-transferable license to use, copy, distribute, make
+available, and prepare derivative works of the software, in each case subject to
+the limitations and conditions below.
+
+Limitations
+
+You may not provide the software to third parties as a hosted or managed
+service, where the service provides users with access to any substantial set of
+the features or functionality of the software.
+
+You may not move, change, disable, or circumvent the license key functionality
+in the software, and you may not remove or obscure any functionality in the
+software that is protected by the license key.
+
+You may not alter, remove, or obscure any licensing, copyright, or other notices
+of the licensor in the software. Any use of the licensor's trademarks is subject
+to applicable law.
+
+Patents
+
+The licensor grants you a license, under any patent claims the licensor can
+license, or becomes able to license, to make, have made, use, sell, offer for
+sale, import and have imported the software, in each case subject to the
+limitations and conditions in this license. This license does not cover any
+patent claims that you cause to be infringed by modifications or additions to the
+software. If you or your company make any written claim that the software
+infringes or contributes to infringement of any patent, your patent license for
+the software granted under these terms ends immediately. If your company makes
+such a claim, your patent license ends immediately for work on behalf of your
+company.
+
+Notices
+
+You must ensure that anyone who gets a copy of any part of the software from you
+also gets a copy of these terms.
+
+If you modify the software, you must include in any modified copies of the
+software prominent notices stating that you have modified the software.
+
+No Other Rights
+
+These terms do not imply any licenses other than those expressly granted in
+these terms.
+
+Termination
+
+If you use the software in violation of these terms, such use is not licensed,
+and your licenses will automatically terminate. If the licensor provides you with
+a notice of your violation, and you cease all violation of this license no later
+than 30 days after you receive that notice, your licenses will be reinstated
+retroactively. However, if you violate these terms after such reinstatement, any
+additional violation of these terms will cause your licenses to terminate
+automatically and permanently.
+
+No Liability
+
+As far as the law allows, the software comes as is, without any warranty or
+condition, and the licensor will not be liable to you for any damages arising out
+of these terms or the use or nature of the software, under any kind of legal
+claim.
+
+Definitions
+
+The licensor is the entity offering these terms, and the software is the
+software the licensor makes available under these terms, including any portion
+of it.
+
+you refers to the individual or entity agreeing to these terms.
+
+your company is any legal entity, sole proprietorship, or other kind of
+organization that you work for, plus all organizations that have control over,
+are under the control of, or are under common control with that organization.
+control means ownership of substantially all the assets of an entity, or the
+power to direct its management and policies by vote, contract, or otherwise.
+Control can be direct or indirect.
+
+your licenses are all the licenses granted to you for the software under these
+terms.
+
+use means anything you do with the software requiring one of your licenses.
+
+trademark means trademarks, service marks, and similar rights.
--- a/packages/dart/lib/src/kreuzberg.dart
+++ b/packages/dart/lib/src/kreuzberg.dart
@@ -0,0 +1,642 @@
+// Generated by alef. Do not edit by hand.
+
+import 'dart:typed_data';
+
+export 'kreuzberg_bridge_generated/lib.dart';
+export 'traits.dart';
+import 'kreuzberg_bridge_generated/lib.dart' as rust_bridge;
+// ignore: duplicate_import
+import 'kreuzberg_bridge_generated/lib.dart';
+
+class KreuzbergBridge {
+  /// Extract content from a byte array.
+  ///
+  /// This is the main entry point for in-memory extraction. It performs the following steps:
+  /// 1. Validate MIME type
+  /// 2. Handle legacy format conversion if needed
+  /// 3. Select appropriate extractor from registry
+  /// 4. Extract content
+  /// 5. Run post-processing pipeline
+  ///
+  /// # Arguments
+  ///
+  /// * `content` - The byte array to extract
+  /// * `mime_type` - MIME type of the content
+  /// * `config` - Extraction configuration
+  ///
+  /// # Returns
+  ///
+  /// An `ExtractionResult` containing the extracted content and metadata.
+  ///
+  /// # Errors
+  ///
+  /// Returns `KreuzbergError::Validation` if MIME type is invalid.
+  /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
+  ///
+  /// # Example
+  ///
+  /// ```rust,no_run
+  /// use kreuzberg::core::extractor::extract_bytes;
+  /// use kreuzberg::core::config::ExtractionConfig;
+  ///
+  /// let config = ExtractionConfig::default();
+  /// let bytes = b"Hello, world!";
+  /// let result = extract_bytes(bytes, "text/plain", &config).await?;
+  /// println!("Content: {}", result.content);
+  /// ```
+  /// throws anyhow::Error on failure
+  static Future<ExtractionResult> extractBytes(
+    Uint8List content,
+    String mimeType, [
+    ExtractionConfig? config,
+  ]) async {
+    // When the caller omits [config], fall back to this explicitly spelled-out
+    // default configuration.
+    final effectiveConfig = config ??
+        ExtractionConfig(
+          useCache: true,
+          enableQualityProcessing: true,
+          ocr: null,
+          forceOcr: false,
+          forceOcrPages: null,
+          disableOcr: false,
+          chunking: null,
+          contentFilter: null,
+          images: null,
+          pdfOptions: null,
+          tokenReduction: null,
+          languageDetection: null,
+          pages: null,
+          keywords: null,
+          postprocessor: null,
+          htmlOptions: null,
+          htmlOutput: null,
+          extractionTimeoutSecs: null,
+          maxConcurrentExtractions: null,
+          resultFormat: ResultFormat.unified,
+          securityLimits: null,
+          maxEmbeddedFileBytes: 0,
+          outputFormat: OutputFormat.plain(),
+          layout: null,
+          useLayoutForMarkdown: false,
+          includeDocumentStructure: false,
+          acceleration: null,
+          cacheNamespace: null,
+          cacheTtlSecs: null,
+          email: null,
+          concurrency: null,
+          maxArchiveDepth: 0,
+          treeSitter: null,
+          structuredExtraction: null,
+          cancelToken: null,
+        );
+
+    // Delegate to the generated bridge binding of the same name.
+    final extraction = await rust_bridge.extractBytes(
+      content: content,
+      mimeType: mimeType,
+      config: effectiveConfig,
+    );
+    return extraction;
+  }
+
+  /// Extract content from a file.
+  ///
+  /// This is the main entry point for file-based extraction. It performs the following steps:
+  /// 1. Check cache for existing result (if caching enabled)
+  /// 2. Detect or validate MIME type
+  /// 3. Select appropriate extractor from registry
+  /// 4. Extract content
+  /// 5. Run post-processing pipeline
+  /// 6. Store result in cache (if caching enabled)
+  ///
+  /// # Arguments
+  ///
+  /// * `path` - Path to the file to extract
+  /// * `mime_type` - Optional MIME type override. If None, will be auto-detected
+  /// * `config` - Extraction configuration
+  ///
+  /// # Returns
+  ///
+  /// An `ExtractionResult` containing the extracted content and metadata.
+  ///
+  /// # Errors
+  ///
+  /// Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
+  /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
+  ///
+  /// # Example
+  ///
+  /// ```rust,no_run
+  /// use kreuzberg::core::extractor::extract_file;
+  /// use kreuzberg::core::config::ExtractionConfig;
+  ///
+  /// let config = ExtractionConfig::default();
+  /// let result = extract_file("document.pdf", None, &config).await?;
+  /// println!("Content: {}", result.content);
+  /// ```
+  /// throws anyhow::Error on failure
+  static Future<ExtractionResult> extractFile(
+    String path,
+    String? mimeType, [
+    ExtractionConfig? config,
+  ]) async {
+    // When the caller omits [config], fall back to this explicitly spelled-out
+    // default configuration.
+    final effectiveConfig = config ??
+        ExtractionConfig(
+          useCache: true,
+          enableQualityProcessing: true,
+          ocr: null,
+          forceOcr: false,
+          forceOcrPages: null,
+          disableOcr: false,
+          chunking: null,
+          contentFilter: null,
+          images: null,
+          pdfOptions: null,
+          tokenReduction: null,
+          languageDetection: null,
+          pages: null,
+          keywords: null,
+          postprocessor: null,
+          htmlOptions: null,
+          htmlOutput: null,
+          extractionTimeoutSecs: null,
+          maxConcurrentExtractions: null,
+          resultFormat: ResultFormat.unified,
+          securityLimits: null,
+          maxEmbeddedFileBytes: 0,
+          outputFormat: OutputFormat.plain(),
+          layout: null,
+          useLayoutForMarkdown: false,
+          includeDocumentStructure: false,
+          acceleration: null,
+          cacheNamespace: null,
+          cacheTtlSecs: null,
+          email: null,
+          concurrency: null,
+          maxArchiveDepth: 0,
+          treeSitter: null,
+          structuredExtraction: null,
+          cancelToken: null,
+        );
+
+    // [mimeType] is forwarded as-is; a null value is passed through to the
+    // bridge unchanged.
+    final extraction = await rust_bridge.extractFile(
+      path: path,
+      mimeType: mimeType,
+      config: effectiveConfig,
+    );
+    return extraction;
+  }
+
+  /// Synchronous wrapper for `extract_file`.
+  ///
+  /// This is a convenience function that blocks the current thread until extraction completes.
+  /// For async code, use `extract_file` directly.
+  ///
+  /// Uses the global Tokio runtime for 100x+ performance improvement over creating
+  /// a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
+  ///
+  /// This function is only available with the `tokio-runtime` feature. For WASM targets,
+  /// use a truly synchronous extraction approach instead.
+  ///
+  /// # Example
+  ///
+  /// ```rust,no_run
+  /// use kreuzberg::core::extractor::extract_file_sync;
+  /// use kreuzberg::core::config::ExtractionConfig;
+  ///
+  /// let config = ExtractionConfig::default();
+  /// let result = extract_file_sync("document.pdf", None, &config)?;
+  /// println!("Content: {}", result.content);
+  /// ```
+  /// throws anyhow::Error on failure
+  static Future<ExtractionResult> extractFileSync(
+    String path,
+    String? mimeType, [
+    ExtractionConfig? config,
+  ]) async {
+    // NOTE(review): despite the `Sync` suffix this Dart wrapper is `async` and
+    // returns a Future; whether the native side blocks is not visible here.
+    //
+    // When the caller omits [config], fall back to this explicitly spelled-out
+    // default configuration.
+    final effectiveConfig = config ??
+        ExtractionConfig(
+          useCache: true,
+          enableQualityProcessing: true,
+          ocr: null,
+          forceOcr: false,
+          forceOcrPages: null,
+          disableOcr: false,
+          chunking: null,
+          contentFilter: null,
+          images: null,
+          pdfOptions: null,
+          tokenReduction: null,
+          languageDetection: null,
+          pages: null,
+          keywords: null,
+          postprocessor: null,
+          htmlOptions: null,
+          htmlOutput: null,
+          extractionTimeoutSecs: null,
+          maxConcurrentExtractions: null,
+          resultFormat: ResultFormat.unified,
+          securityLimits: null,
+          maxEmbeddedFileBytes: 0,
+          outputFormat: OutputFormat.plain(),
+          layout: null,
+          useLayoutForMarkdown: false,
+          includeDocumentStructure: false,
+          acceleration: null,
+          cacheNamespace: null,
+          cacheTtlSecs: null,
+          email: null,
+          concurrency: null,
+          maxArchiveDepth: 0,
+          treeSitter: null,
+          structuredExtraction: null,
+          cancelToken: null,
+        );
+
+    // Delegate to the generated bridge binding of the same name.
+    final extraction = await rust_bridge.extractFileSync(
+      path: path,
+      mimeType: mimeType,
+      config: effectiveConfig,
+    );
+    return extraction;
+  }
+
+  /// Synchronous wrapper for `extract_bytes`.
+  ///
+  /// Uses the global Tokio runtime for 100x+ performance improvement over creating
+  /// a new runtime per call.
+  ///
+  /// With the `tokio-runtime` feature, this blocks the current thread using the global
+  /// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
+  ///
+  /// # Example
+  ///
+  /// ```rust,no_run
+  /// use kreuzberg::core::extractor::extract_bytes_sync;
+  /// use kreuzberg::core::config::ExtractionConfig;
+  ///
+  /// let config = ExtractionConfig::default();
+  /// let bytes = b"Hello, world!";
+  /// let result = extract_bytes_sync(bytes, "text/plain", &config)?;
+  /// println!("Content: {}", result.content);
+  /// ```
+  /// throws anyhow::Error on failure
+  static Future<ExtractionResult> extractBytesSync(
+    Uint8List content,
+    String mimeType, [
+    ExtractionConfig? config,
+  ]) async {
+    // NOTE(review): despite the `Sync` suffix this Dart wrapper is `async` and
+    // returns a Future; whether the native side blocks is not visible here.
+    //
+    // Default configuration used when the caller omits [config].
+    // `extractionTimeoutSecs` is left unset (null), exactly as in every
+    // sibling extraction wrapper. It was previously 0 in this method only,
+    // which handed the bridge an explicit zero-second timeout value instead
+    // of "no timeout configured".
+    final effectiveConfig = config ??
+        ExtractionConfig(
+          useCache: true,
+          enableQualityProcessing: true,
+          ocr: null,
+          forceOcr: false,
+          forceOcrPages: null,
+          disableOcr: false,
+          chunking: null,
+          contentFilter: null,
+          images: null,
+          pdfOptions: null,
+          tokenReduction: null,
+          languageDetection: null,
+          pages: null,
+          keywords: null,
+          postprocessor: null,
+          htmlOptions: null,
+          htmlOutput: null,
+          extractionTimeoutSecs: null,
+          maxConcurrentExtractions: null,
+          resultFormat: ResultFormat.unified,
+          securityLimits: null,
+          maxEmbeddedFileBytes: 0,
+          outputFormat: OutputFormat.plain(),
+          layout: null,
+          useLayoutForMarkdown: false,
+          includeDocumentStructure: false,
+          acceleration: null,
+          cacheNamespace: null,
+          cacheTtlSecs: null,
+          email: null,
+          concurrency: null,
+          maxArchiveDepth: 0,
+          treeSitter: null,
+          structuredExtraction: null,
+          cancelToken: null,
+        );
+
+    // Delegate to the generated bridge binding of the same name.
+    final extraction = await rust_bridge.extractBytesSync(
+      content: content,
+      mimeType: mimeType,
+      config: effectiveConfig,
+    );
+    return extraction;
+  }
+
+  /// Synchronous wrapper for `batch_extract_files`.
+  ///
+  /// Uses the global Tokio runtime for optimal performance.
+  /// Only available with `tokio-runtime` (WASM has no filesystem).
+  ///
+  /// # Example
+  ///
+  /// ```rust,no_run
+  /// use kreuzberg::core::extractor::batch_extract_files_sync;
+  /// use kreuzberg::core::config::{ExtractionConfig, BatchFileItem, FileExtractionConfig};
+  ///
+  /// let config = ExtractionConfig::default();
+  /// let items = vec![
+  ///     BatchFileItem {
+  ///         path: "doc1.pdf".into(),
+  ///         config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
+  ///     },
+  ///     BatchFileItem { path: "doc2.pdf".into(), config: None },
+  /// ];
+  /// let results = batch_extract_files_sync(items, &config)?;
+  /// ```
+  /// throws anyhow::Error on failure
+  static Future<List<ExtractionResult>> batchExtractFilesSync(
+    List<BatchFileItem> items, [
+    ExtractionConfig? config,
+  ]) async {
+    // NOTE(review): despite the `Sync` suffix this Dart wrapper is `async` and
+    // returns a Future; whether the native side blocks is not visible here.
+    //
+    // When the caller omits [config], fall back to this explicitly spelled-out
+    // batch-level default configuration.
+    final effectiveConfig = config ??
+        ExtractionConfig(
+          useCache: true,
+          enableQualityProcessing: true,
+          ocr: null,
+          forceOcr: false,
+          forceOcrPages: null,
+          disableOcr: false,
+          chunking: null,
+          contentFilter: null,
+          images: null,
+          pdfOptions: null,
+          tokenReduction: null,
+          languageDetection: null,
+          pages: null,
+          keywords: null,
+          postprocessor: null,
+          htmlOptions: null,
+          htmlOutput: null,
+          extractionTimeoutSecs: null,
+          maxConcurrentExtractions: null,
+          resultFormat: ResultFormat.unified,
+          securityLimits: null,
+          maxEmbeddedFileBytes: 0,
+          outputFormat: OutputFormat.plain(),
+          layout: null,
+          useLayoutForMarkdown: false,
+          includeDocumentStructure: false,
+          acceleration: null,
+          cacheNamespace: null,
+          cacheTtlSecs: null,
+          email: null,
+          concurrency: null,
+          maxArchiveDepth: 0,
+          treeSitter: null,
+          structuredExtraction: null,
+          cancelToken: null,
+        );
+
+    // Delegate to the generated bridge binding of the same name.
+    final extractions = await rust_bridge.batchExtractFilesSync(
+      items: items,
+      config: effectiveConfig,
+    );
+    return extractions;
+  }
+
+  /// Synchronous wrapper for `batch_extract_bytes`.
+  ///
+  /// Uses the global Tokio runtime for optimal performance.
+  /// With the `tokio-runtime` feature, this blocks the current thread using the global
+  /// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
+  /// that iterates through items and calls `extract_bytes_sync()`.
+  ///
+  /// # Example
+  ///
+  /// ```rust,no_run
+  /// use kreuzberg::core::extractor::batch_extract_bytes_sync;
+  /// use kreuzberg::core::config::{ExtractionConfig, BatchBytesItem, FileExtractionConfig};
+  ///
+  /// let config = ExtractionConfig::default();
+  /// let items = vec![
+  ///     BatchBytesItem { content: b"content".to_vec(), mime_type: "text/plain".to_string(), config: None },
+  ///     BatchBytesItem {
+  ///         content: b"other".to_vec(),
+  ///         mime_type: "text/plain".to_string(),
+  ///         config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
+  ///     },
+  /// ];
+  /// let results = batch_extract_bytes_sync(items, &config)?;
+  /// ```
+  /// throws anyhow::Error on failure
+  static Future<List<ExtractionResult>> batchExtractBytesSync(
+    List<BatchBytesItem> items, [
+    ExtractionConfig? config,
+  ]) async {
+    // NOTE(review): despite the `Sync` suffix this Dart wrapper is `async` and
+    // returns a Future; whether the native side blocks is not visible here.
+    //
+    // When the caller omits [config], fall back to this explicitly spelled-out
+    // batch-level default configuration.
+    final effectiveConfig = config ??
+        ExtractionConfig(
+          useCache: true,
+          enableQualityProcessing: true,
+          ocr: null,
+          forceOcr: false,
+          forceOcrPages: null,
+          disableOcr: false,
+          chunking: null,
+          contentFilter: null,
+          images: null,
+          pdfOptions: null,
+          tokenReduction: null,
+          languageDetection: null,
+          pages: null,
+          keywords: null,
+          postprocessor: null,
+          htmlOptions: null,
+          htmlOutput: null,
+          extractionTimeoutSecs: null,
+          maxConcurrentExtractions: null,
+          resultFormat: ResultFormat.unified,
+          securityLimits: null,
+          maxEmbeddedFileBytes: 0,
+          outputFormat: OutputFormat.plain(),
+          layout: null,
+          useLayoutForMarkdown: false,
+          includeDocumentStructure: false,
+          acceleration: null,
+          cacheNamespace: null,
+          cacheTtlSecs: null,
+          email: null,
+          concurrency: null,
+          maxArchiveDepth: 0,
+          treeSitter: null,
+          structuredExtraction: null,
+          cancelToken: null,
+        );
+
+    // Delegate to the generated bridge binding of the same name.
+    final extractions = await rust_bridge.batchExtractBytesSync(
+      items: items,
+      config: effectiveConfig,
+    );
+    return extractions;
+  }
+
+  /// Extract content from multiple files concurrently.
+  ///
+  /// This function processes multiple files in parallel, automatically managing
+  /// concurrency to prevent resource exhaustion. The concurrency limit can be
+  /// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
+  /// to `(num_cpus * 1.5).ceil()`.
+  ///
+  /// Each file can optionally specify a [`FileExtractionConfig`] that overrides specific
+  /// fields from the batch-level `config`. Pass `None` for a file to use the batch defaults.
+  /// Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
+  /// taken from the batch-level `config`.
+  ///
+  /// # Arguments
+  ///
+  /// * `items` - Vector of `BatchFileItem` structs, each containing a path and optional
+  ///   per-file configuration overrides.
+  /// * `config` - Batch-level extraction configuration (provides defaults and batch settings)
+  ///
+  /// # Returns
+  ///
+  /// A vector of `ExtractionResult` in the same order as the input items.
+  ///
+  /// # Errors
+  ///
+  /// Individual file errors are captured in the result metadata. System errors
+  /// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
+  ///
+  /// # Examples
+  ///
+  /// Simple usage with no per-file overrides:
+  ///
+  /// ```rust,no_run
+  /// use kreuzberg::core::extractor::batch_extract_files;
+  /// use kreuzberg::core::config::{ExtractionConfig, BatchFileItem};
+  /// use std::path::PathBuf;
+  ///
+  /// let config = ExtractionConfig::default();
+  /// let items = vec![
+  ///     BatchFileItem { path: "doc1.pdf".into(), config: None },
+  ///     BatchFileItem { path: "doc2.pdf".into(), config: None },
+  /// ];
+  /// let results = batch_extract_files(items, &config).await?;
+  /// println!("Processed {} files", results.len());
+  /// ```
+  ///
+  /// Per-file configuration overrides:
+  ///
+  /// ```rust,no_run
+  /// use kreuzberg::core::extractor::batch_extract_files;
+  /// use kreuzberg::core::config::{ExtractionConfig, BatchFileItem, FileExtractionConfig};
+  /// use std::path::PathBuf;
+  ///
+  /// let config = ExtractionConfig::default();
+  /// let items = vec![
+  ///     BatchFileItem {
+  ///         path: "scan.pdf".into(),
+  ///         config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
+  ///     },
+  ///     BatchFileItem { path: "notes.txt".into(), config: None },
+  /// ];
+  /// let results = batch_extract_files(items, &config).await?;
+  /// ```
+  /// throws anyhow::Error on failure
+  static Future<List<ExtractionResult>> batchExtractFiles(
+    List<BatchFileItem> items, [
+    ExtractionConfig? config,
+  ]) async {
+    // When the caller omits [config], fall back to this explicitly spelled-out
+    // batch-level default configuration.
+    final effectiveConfig = config ??
+        ExtractionConfig(
+          useCache: true,
+          enableQualityProcessing: true,
+          ocr: null,
+          forceOcr: false,
+          forceOcrPages: null,
+          disableOcr: false,
+          chunking: null,
+          contentFilter: null,
+          images: null,
+          pdfOptions: null,
+          tokenReduction: null,
+          languageDetection: null,
+          pages: null,
+          keywords: null,
+          postprocessor: null,
+          htmlOptions: null,
+          htmlOutput: null,
+          extractionTimeoutSecs: null,
+          maxConcurrentExtractions: null,
+          resultFormat: ResultFormat.unified,
+          securityLimits: null,
+          maxEmbeddedFileBytes: 0,
+          outputFormat: OutputFormat.plain(),
+          layout: null,
+          useLayoutForMarkdown: false,
+          includeDocumentStructure: false,
+          acceleration: null,
+          cacheNamespace: null,
+          cacheTtlSecs: null,
+          email: null,
+          concurrency: null,
+          maxArchiveDepth: 0,
+          treeSitter: null,
+          structuredExtraction: null,
+          cancelToken: null,
+        );
+
+    // Delegate to the generated bridge binding of the same name.
+    final extractions = await rust_bridge.batchExtractFiles(
+      items: items,
+      config: effectiveConfig,
+    );
+    return extractions;
+  }
+
+  /// Extract content from multiple byte arrays concurrently.
+  ///
+  /// This function processes multiple byte arrays in parallel, automatically managing
+  /// concurrency to prevent resource exhaustion. The concurrency limit can be
+  /// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
+  /// to `(num_cpus * 1.5).ceil()`.
+  ///
+  /// Each item can optionally specify a [`FileExtractionConfig`] that overrides specific
+  /// fields from the batch-level `config`. Pass `None` as the config to use
+  /// the batch-level defaults for that item.
+  ///
+  /// # Arguments
+  ///
+  /// * `items` - Vector of `BatchBytesItem` structs, each containing content bytes,
+  ///   MIME type, and optional per-item configuration overrides.
+  /// * `config` - Batch-level extraction configuration
+  ///
+  /// # Returns
+  ///
+  /// A vector of `ExtractionResult` in the same order as the input items.
+  ///
+  /// # Examples
+  ///
+  /// Simple usage with no per-item overrides:
+  ///
+  /// ```rust,no_run
+  /// use kreuzberg::core::extractor::batch_extract_bytes;
+  /// use kreuzberg::core::config::{ExtractionConfig, BatchBytesItem};
+  ///
+  /// let config = ExtractionConfig::default();
+  /// let items = vec![
+  ///     BatchBytesItem { content: b"content 1".to_vec(), mime_type: "text/plain".to_string(), config: None },
+  ///     BatchBytesItem { content: b"content 2".to_vec(), mime_type: "text/plain".to_string(), config: None },
+  /// ];
+  /// let results = batch_extract_bytes(items, &config).await?;
+  /// println!("Processed {} items", results.len());
+  /// ```
+  ///
+  /// Per-item configuration overrides:
+  ///
+  /// ```rust,no_run
+  /// use kreuzberg::core::extractor::batch_extract_bytes;
+  /// use kreuzberg::core::config::{ExtractionConfig, BatchBytesItem, FileExtractionConfig};
+  ///
+  /// let config = ExtractionConfig::default();
+  /// let items = vec![
+  ///     BatchBytesItem { content: b"content".to_vec(), mime_type: "text/plain".to_string(), config: None },
+  ///     BatchBytesItem {
+  ///         content: b"<html>test</html>".to_vec(),
+  ///         mime_type: "text/html".to_string(),
+  ///         config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
+  ///     },
+  /// ];
+  /// let results = batch_extract_bytes(items, &config).await?;
+  /// ```
+  /// throws anyhow::Error on failure
+  static Future<List<ExtractionResult>> batchExtractBytes(
+    List<BatchBytesItem> items, [
+    ExtractionConfig? config,
+  ]) async {
+    // When the caller omits [config], fall back to this explicitly spelled-out
+    // batch-level default configuration.
+    final effectiveConfig = config ??
+        ExtractionConfig(
+          useCache: true,
+          enableQualityProcessing: true,
+          ocr: null,
+          forceOcr: false,
+          forceOcrPages: null,
+          disableOcr: false,
+          chunking: null,
+          contentFilter: null,
+          images: null,
+          pdfOptions: null,
+          tokenReduction: null,
+          languageDetection: null,
+          pages: null,
+          keywords: null,
+          postprocessor: null,
+          htmlOptions: null,
+          htmlOutput: null,
+          extractionTimeoutSecs: null,
+          maxConcurrentExtractions: null,
+          resultFormat: ResultFormat.unified,
+          securityLimits: null,
+          maxEmbeddedFileBytes: 0,
+          outputFormat: OutputFormat.plain(),
+          layout: null,
+          useLayoutForMarkdown: false,
+          includeDocumentStructure: false,
+          acceleration: null,
+          cacheNamespace: null,
+          cacheTtlSecs: null,
+          email: null,
+          concurrency: null,
+          maxArchiveDepth: 0,
+          treeSitter: null,
+          structuredExtraction: null,
+          cancelToken: null,
+        );
+
+    // Delegate to the generated bridge binding of the same name.
+    final extractions = await rust_bridge.batchExtractBytes(
+      items: items,
+      config: effectiveConfig,
+    );
+    return extractions;
+  }
+
+  /// Detect MIME type from raw file bytes.
+  ///
+  /// Uses magic byte signatures to detect file type from content.
+  /// Falls back to `infer` crate for comprehensive detection.
+  ///
+  /// For ZIP-based files, inspects contents to distinguish Office Open XML
+  /// formats (DOCX, XLSX, PPTX) from plain ZIP archives.
+  ///
+  /// # Arguments
+  ///
+  /// * `content` - Raw file bytes
+  ///
+  /// # Returns
+  ///
+  /// The detected MIME type string.
+  ///
+  /// # Errors
+  ///
+  /// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
+  /// throws anyhow::Error on failure
+  static Future<String> detectMimeTypeFromBytes(Uint8List content) async {
+    // Thin delegate: the bytes are handed unchanged to the generated bridge.
+    final detected = await rust_bridge.detectMimeTypeFromBytes(
+      content: content,
+    );
+    return detected;
+  }
+
+  /// Get file extensions for a given MIME type.
+  ///
+  /// Returns all known file extensions that map to the specified MIME type.
+  ///
+  /// # Arguments
+  ///
+  /// * `mime_type` - The MIME type to look up
+  ///
+  /// # Returns
+  ///
+  /// A vector of file extensions (without leading dot) for the MIME type.
+  ///
+  /// # Example
+  ///
+  /// ```
+  /// use kreuzberg::core::mime::get_extensions_for_mime;
+  ///
+  /// let extensions = get_extensions_for_mime("application/pdf").unwrap();
+  /// assert_eq!(extensions, vec!["pdf"]);
+  ///
+  /// let doc_extensions = get_extensions_for_mime("application/vnd.openxmlformats-officedocument.wordprocessingml.document").unwrap();
+  /// assert!(doc_extensions.contains(&"docx".to_string()));
+  /// ```
+  /// throws anyhow::Error on failure
+  static Future<List<String>> getExtensionsForMime(String mimeType) async {
+    // Thin delegate: the lookup itself happens behind the generated bridge.
+    final extensions = await rust_bridge.getExtensionsForMime(
+      mimeType: mimeType,
+    );
+    return extensions;
+  }
+
+  /// List the names of all registered embedding backends.
+  ///
+  /// Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
+  /// bindings.
+  /// throws anyhow::Error on failure
+  static Future<List<String>> listEmbeddingBackends() async {
+    // Thin delegate to the generated bridge binding of the same name.
+    final backendNames = await rust_bridge.listEmbeddingBackends();
+    return backendNames;
+  }
+
+  /// List names of all registered document extractors.
+  /// throws anyhow::Error on failure
+  static Future<List<String>> listDocumentExtractors() async {
+    // Thin delegate to the generated bridge binding of the same name.
+    final extractorNames = await rust_bridge.listDocumentExtractors();
+    return extractorNames;
+  }
+
+  /// List all registered OCR backends.
+  ///
+  /// Returns the names of all OCR backends currently registered in the global registry.
+  ///
+  /// # Returns
+  ///
+  /// A vector of OCR backend names.
+  ///
+  /// # Example
+  ///
+  /// ```rust
+  /// use kreuzberg::plugins::list_ocr_backends;
+  ///
+  /// let backends = list_ocr_backends()?;
+  /// for name in backends {
+  ///     println!("Registered OCR backend: {}", name);
+  /// }
+  /// ```
+  /// throws anyhow::Error on failure
+  static Future<List<String>> listOcrBackends() async {
+    // Thin delegate to the generated bridge binding of the same name.
+    final backendNames = await rust_bridge.listOcrBackends();
+    return backendNames;
+  }
+
+  /// List all registered post-processor names.
+  ///
+  /// Returns a vector of all post-processor names currently registered in the
+  /// global registry.
+  ///
+  /// # Returns
+  ///
+  /// - `Ok(Vec<String>)` - Vector of post-processor names
+  /// - `Err(...)` if the registry lock is poisoned
+  ///
+  /// # Example
+  ///
+  /// ```rust
+  /// use kreuzberg::plugins::list_post_processors;
+  ///
+  /// let processors = list_post_processors()?;
+  /// for name in processors {
+  ///     println!("Registered post-processor: {}", name);
+  /// }
+  /// ```
+  /// throws anyhow::Error on failure
+  static Future<List<String>> listPostProcessors() async {
+    // Thin delegate to the generated bridge binding of the same name.
+    final processorNames = await rust_bridge.listPostProcessors();
+    return processorNames;
+  }
+
+  /// List names of all registered renderers.
+  ///
+  /// # Errors
+  ///
+  /// Returns an error if the registry lock is poisoned.
+  /// throws anyhow::Error on failure
+  static Future<List<String>> listRenderers() async {
+    // Thin delegate to the generated bridge binding of the same name.
+    final rendererNames = await rust_bridge.listRenderers();
+    return rendererNames;
+  }
+
+  /// List names of all registered validators.
+  /// throws anyhow::Error on failure
+  static Future<List<String>> listValidators() async {
+    // Thin delegate to the generated bridge binding of the same name.
+    final validatorNames = await rust_bridge.listValidators();
+    return validatorNames;
+  }
+
+  /// Compare two extraction results and return a structured diff.
+  ///
+  /// The comparison is purely structural — no I/O, no side effects. All fields
+  /// of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
+  ///
+  /// # Arguments
+  ///
+  /// * `a` — the "before" extraction result
+  /// * `b` — the "after" extraction result
+  /// * `opts` — controls which sections are compared and optional truncation
+  ///
+  /// # Example
+  ///
+  /// ```rust,no_run
+  /// use kreuzberg::{ExtractionResult, diff::{compare, DiffOptions}};
+  ///
+  /// let mut a = ExtractionResult::default();
+  /// let mut b = ExtractionResult::default();
+  /// a.content = "Hello world".to_string();
+  /// b.content = "Hello Rust".to_string();
+  ///
+  /// let diff = compare(&a, &b, &DiffOptions::default());
+  /// assert_eq!(diff.content_diff.len(), 1);
+  /// ```
+  static Future<ExtractionDiff> compare(
+    ExtractionResult a,
+    ExtractionResult b,
+    DiffOptions opts,
+  ) async {
+    // All three arguments are forwarded unchanged; the diff is computed behind
+    // the generated bridge.
+    final diff = await rust_bridge.compare(a: a, b: b, opts: opts);
+    return diff;
+  }
+
+  /// Generate embeddings asynchronously for a list of text strings.
+  ///
+  /// This is the async counterpart to [`embed_texts`]. It offloads the blocking
+  /// ONNX inference work to a dedicated blocking thread pool via Tokio's
+  /// `spawn_blocking`, keeping the async executor free.
+  ///
+  /// Returns one embedding vector per input text in the same order.
+  ///
+  /// # Arguments
+  ///
+  /// * `texts` - Vec of strings to embed (owned, sent to blocking thread)
+  /// * `config` - Embedding configuration specifying model, batch size, and normalization
+  ///
+  /// # Errors
+  ///
+  /// - `KreuzbergError::MissingDependency` if ONNX Runtime is not installed
+  /// - `KreuzbergError::Embedding` if the preset name is unknown, model download fails,
+  ///   or the blocking inference task panics
+  ///
+  /// # Example
+  ///
+  /// ```rust,ignore
+  /// use kreuzberg::{embed_texts_async, EmbeddingConfig};
+  ///
+  /// let embeddings = embed_texts_async(
+  ///     vec!["Hello!".to_string()],
+  ///     &EmbeddingConfig::default(),
+  /// ).await?;
+  /// ```
+  /// throws anyhow::Error on failure
+  static Future<List<Float64List>> embedTextsAsync(
+    List<String> texts,
+    EmbeddingConfig config,
+  ) async {
+    // Thin delegate: texts and config are passed unchanged to the generated
+    // bridge binding of the same name.
+    final embeddings = await rust_bridge.embedTextsAsync(
+      texts: texts,
+      config: config,
+    );
+    return embeddings;
+  }
+
+  /// Render a single PDF page to PNG bytes.
+  ///
+  /// Returns raw PNG-encoded bytes for the specified page at the given DPI.
+  /// Uses pdf_oxide with tiny-skia for pure-Rust rendering.
+  ///
+  /// # Arguments
+  ///
+  /// * `pdf_bytes` - Raw PDF file bytes
+  /// * `page_index` - Zero-based page index
+  /// * `dpi` - Resolution in dots per inch (default: 150)
+  /// * `password` - Optional password for encrypted PDFs
+  ///
+  /// # Errors
+  ///
+  /// Returns `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated,
+  /// or rendered, or if `page_index` is out of range.
+  /// throws anyhow::Error on failure
+  static Future<Uint8List> renderPdfPageToPng(
+    Uint8List pdfBytes,
+    int pageIndex, {
+    int? dpi,
+    String? password,
+  }) async {
+    // [dpi] and [password] are forwarded as-is, including when they are null;
+    // no default is applied on the Dart side.
+    final pngBytes = await rust_bridge.renderPdfPageToPng(
+      pdfBytes: pdfBytes,
+      pageIndex: pageIndex,
+      dpi: dpi,
+      password: password,
+    );
+    return pngBytes;
+  }
+
+  /// Detect the MIME type of a file at the given path.
+  ///
+  /// Uses the file extension and optionally the file content to determine the MIME type.
+  /// Set `check_exists` to `true` to verify the file exists before detection.
+  /// throws anyhow::Error on failure
+  static Future<String> detectMimeType(String path, bool checkExists) async {
+    // Thin delegate: both arguments are handed unchanged to the generated
+    // bridge binding of the same name.
+    final detected = await rust_bridge.detectMimeType(
+      path: path,
+      checkExists: checkExists,
+    );
+    return detected;
+  }
+
+  /// Embed a list of texts using the configured embedding model.
+  ///
+  /// Returns a 2D vector where each inner vector is the embedding for the corresponding text.
+  /// throws anyhow::Error on failure
+  static Future<List<Float64List>> embedTexts(
+    List<String> texts,
+    EmbeddingConfig config,
+  ) async {
+    // Thin delegate: texts and config are passed unchanged to the generated
+    // bridge binding of the same name.
+    final embeddings = await rust_bridge.embedTexts(
+      texts: texts,
+      config: config,
+    );
+    return embeddings;
+  }
+
+  /// Get an embedding preset by [name].
+  ///
+  /// Returns `null` (`None` on the Rust side) if no preset with the given name exists.
+  /// The Rust side returns an owned clone so the value is safe to pass across FFI boundaries.
+  static Future<EmbeddingPreset?> getEmbeddingPreset(String name) async {
+    return await rust_bridge.getEmbeddingPreset(name: name);
+  }
+
+  /// List the names of all available embedding presets.
+  ///
+  /// The Rust side returns owned `String`s so the values are safe to pass across FFI boundaries.
+  static Future<List<String>> listEmbeddingPresets() async {
+    return await rust_bridge.listEmbeddingPresets();
+  }
+
+  /// Register [impl], a Dart implementation of [OcrBackend], with the plugin registry; completes when the bridge call finishes.
+  static Future<void> registerOcrBackend(OcrBackendDartImpl impl) async {
+    await rust_bridge.registerOcrBackend(impl: impl);
+  }
+  /// Unregister the previously-registered [OcrBackend] plugin called [name]; completes when the bridge call finishes.
+  static Future<void> unregisterOcrBackend(String name) async {
+    await rust_bridge.unregisterOcrBackend(name: name);
+  }
+  /// Clear all registered [OcrBackend] plugins from the registry; completes when the bridge call finishes.
+  static Future<void> clearOcrBackends() async {
+    await rust_bridge.clearOcrBackends();
+  }
+  /// Register [impl], a Dart implementation of [PostProcessor], with the plugin registry; completes when the bridge call finishes.
+  static Future<void> registerPostProcessor(PostProcessorDartImpl impl) async {
+    await rust_bridge.registerPostProcessor(impl: impl);
+  }
+  /// Unregister the previously-registered [PostProcessor] plugin called [name]; completes when the bridge call finishes.
+  static Future<void> unregisterPostProcessor(String name) async {
+    await rust_bridge.unregisterPostProcessor(name: name);
+  }
+  /// Clear all registered [PostProcessor] plugins from the registry; completes when the bridge call finishes.
+  static Future<void> clearPostProcessors() async {
+    await rust_bridge.clearPostProcessors();
+  }
+  /// Register [impl], a Dart implementation of [Validator], with the plugin registry; completes when the bridge call finishes.
+  static Future<void> registerValidator(ValidatorDartImpl impl) async {
+    await rust_bridge.registerValidator(impl: impl);
+  }
+  /// Unregister the previously-registered [Validator] plugin called [name]; completes when the bridge call finishes.
+  static Future<void> unregisterValidator(String name) async {
+    await rust_bridge.unregisterValidator(name: name);
+  }
+  /// Clear all registered [Validator] plugins from the registry; completes when the bridge call finishes.
+  static Future<void> clearValidators() async {
+    await rust_bridge.clearValidators();
+  }
+  /// Register [impl], a Dart implementation of [EmbeddingBackend], with the plugin registry; completes when the bridge call finishes.
+  static Future<void> registerEmbeddingBackend(EmbeddingBackendDartImpl impl) async {
+    await rust_bridge.registerEmbeddingBackend(impl: impl);
+  }
+  /// Unregister the previously-registered [EmbeddingBackend] plugin called [name]; completes when the bridge call finishes.
+  static Future<void> unregisterEmbeddingBackend(String name) async {
+    await rust_bridge.unregisterEmbeddingBackend(name: name);
+  }
+  /// Clear all registered [EmbeddingBackend] plugins from the registry; completes when the bridge call finishes.
+  static Future<void> clearEmbeddingBackends() async {
+    await rust_bridge.clearEmbeddingBackends();
+  }
+  /// Register [impl], a Dart implementation of [DocumentExtractor], with the plugin registry; completes when the bridge call finishes.
+  static Future<void> registerDocumentExtractor(DocumentExtractorDartImpl impl) async {
+    await rust_bridge.registerDocumentExtractor(impl: impl);
+  }
+  /// Unregister the previously-registered [DocumentExtractor] plugin called [name]; completes when the bridge call finishes.
+  static Future<void> unregisterDocumentExtractor(String name) async {
+    await rust_bridge.unregisterDocumentExtractor(name: name);
+  }
+  /// Clear all registered [DocumentExtractor] plugins from the registry; completes when the bridge call finishes.
+  static Future<void> clearDocumentExtractors() async {
+    await rust_bridge.clearDocumentExtractors();
+  }
+  /// Register [impl], a Dart implementation of [Renderer], with the plugin registry; completes when the bridge call finishes.
+  static Future<void> registerRenderer(RendererDartImpl impl) async {
+    await rust_bridge.registerRenderer(impl: impl);
+  }
+  /// Unregister the previously-registered [Renderer] plugin called [name]; completes when the bridge call finishes.
+  static Future<void> unregisterRenderer(String name) async {
+    await rust_bridge.unregisterRenderer(name: name);
+  }
+  /// Clear all registered [Renderer] plugins from the registry; completes when the bridge call finishes.
+  static Future<void> clearRenderers() async {
+    await rust_bridge.clearRenderers();
+  }
+}
--- a/packages/dart/lib/src/kreuzberg_bridge_generated/frb_generated.dart
+++ b/packages/dart/lib/src/kreuzberg_bridge_generated/frb_generated.dart
--- a/packages/dart/lib/src/kreuzberg_bridge_generated/frb_generated.io.dart
+++ b/packages/dart/lib/src/kreuzberg_bridge_generated/frb_generated.io.dart
--- a/packages/dart/lib/src/kreuzberg_bridge_generated/frb_generated.web.dart
+++ b/packages/dart/lib/src/kreuzberg_bridge_generated/frb_generated.web.dart
--- a/packages/dart/lib/src/kreuzberg_bridge_generated/lib.dart
+++ b/packages/dart/lib/src/kreuzberg_bridge_generated/lib.dart
--- a/packages/dart/lib/src/kreuzberg_bridge_generated/lib.freezed.dart
+++ b/packages/dart/lib/src/kreuzberg_bridge_generated/lib.freezed.dart
--- a/packages/dart/lib/src/traits.dart
+++ b/packages/dart/lib/src/traits.dart
@@ -0,0 +1,646 @@
+// Generated by alef. Do not edit by hand.
+
+import 'kreuzberg_bridge_generated/lib.dart';
+import 'dart:typed_data';
+
+/// Abstract class for the `OcrBackend` Rust trait.
+///
+/// Implement this class and register your implementation via:
+/// ```dart
+/// class MyOcrBackend implements OcrBackend {
+///   @override
+///   Future<ExtractionResult> processImage(...) async { ... }
+///   @override
+///   Future<ExtractionResult> processImageFile(...) async { ... }
+///   @override
+///   Future<bool> supportsLanguage(...) async { ... }
+///   @override
+///   Future<OcrBackendType> backendType(...) async { ... }
+///   @override
+///   Future<List<String>> supportedLanguages(...) async { ... }
+///   @override
+///   Future<bool> supportsTableDetection(...) async { ... }
+///   @override
+///   Future<bool> supportsDocumentProcessing(...) async { ... }
+///   @override
+///   Future<ExtractionResult> processDocument(...) async { ... }
+/// }
+///
+/// final impl = createOcrBackendDartImpl(
+///   processImage: (...) => myInstance.processImage(...),
+///   processImageFile: (...) => myInstance.processImageFile(...),
+///   supportsLanguage: (...) => myInstance.supportsLanguage(...),
+///   backendType: (...) => myInstance.backendType(...),
+///   supportedLanguages: (...) => myInstance.supportedLanguages(...),
+///   supportsTableDetection: (...) => myInstance.supportsTableDetection(...),
+///   supportsDocumentProcessing: (...) => myInstance.supportsDocumentProcessing(...),
+///   processDocument: (...) => myInstance.processDocument(...),
+/// );
+/// ```
+/// Every member is abstract: an implementation must define all of them, including those documented as optional.
+abstract class OcrBackend {
+  /// Process an image and extract text via OCR.
+  ///
+  /// # Arguments
+  ///
+  /// * [imageBytes] - Raw image data (JPEG, PNG, TIFF, etc.)
+  /// * [config] - OCR configuration (language, PSM mode, etc.)
+  ///
+  /// # Returns
+  ///
+  /// An [ExtractionResult] containing the extracted text and metadata.
+  ///
+  /// # Errors
+  ///
+  /// - `KreuzbergError::Ocr` - OCR processing failed
+  /// - `KreuzbergError::Validation` - Invalid image format or configuration
+  /// - `KreuzbergError::Io` - I/O errors (these always bubble up)
+  ///
+  /// # Reading `backend_options`
+  ///
+  /// Backends that support runtime tuning can read `config.backend_options` and
+  /// deserialize only the keys they care about. Unknown keys are silently ignored,
+  /// so multiple backends can coexist in a pipeline without key conflicts.
+  ///
+  /// ```rust
+  /// async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
+  ///     // Read backend-specific options; unknown keys are silently ignored.
+  ///     let fast_mode = config.backend_options
+  ///         .as_ref()
+  ///         .and_then(|v| v.get("mode"))
+  ///         .and_then(|v| v.as_str())
+  ///         .map(|s| s == "fast")
+  ///         .unwrap_or(false);
+  ///
+  ///     if image_bytes.is_empty() {
+  ///         return Err(kreuzberg::KreuzbergError::Validation {
+  ///             message: "Empty image data".to_string(),
+  ///             source: None,
+  ///         });
+  ///     }
+  ///
+  ///     let text = if fast_mode {
+  ///         "Fast OCR result".to_string()
+  ///     } else {
+  ///         format!("Extracted text in language: {}", config.language)
+  ///     };
+  ///
+  ///     Ok(ExtractionResult {
+  ///         content: text,
+  ///         mime_type: Cow::Borrowed("text/plain"),
+  ///         ..Default::default()
+  ///     })
+  /// }
+  /// ```
+  /// Fallible: the corresponding Rust trait method returns `anyhow::Error` on failure.
+  Future<ExtractionResult> processImage(Uint8List imageBytes, OcrConfig config);
+  /// Process a file and extract text via OCR.
+  ///
+  /// The Rust trait's default implementation reads the file and calls `process_image`;
+  /// this declaration is abstract, so a Dart implementation must define the method itself.
+  ///
+  /// # Arguments
+  ///
+  /// * [path] - Path to the image file
+  /// * [config] - OCR configuration
+  ///
+  /// # Errors
+  ///
+  /// Same as [processImage], plus file I/O errors.
+  /// Fallible: the corresponding Rust trait method returns `anyhow::Error` on failure.
+  Future<ExtractionResult> processImageFile(String path, OcrConfig config);
+  /// Check if this backend supports a given language code.
+  ///
+  /// # Arguments
+  ///
+  /// * [lang] - ISO 639-2/3 language code (e.g., "eng", "deu", "fra")
+  ///
+  /// # Returns
+  ///
+  /// `true` if the language is supported, `false` otherwise.
+  ///
+  /// # Example
+  ///
+  /// ```rust
+  /// fn supports_language(&self, lang: &str) -> bool {
+  ///     self.languages.contains(&lang.to_string())
+  /// }
+  /// ```
+  Future<bool> supportsLanguage(String lang);
+  /// Get the backend type identifier.
+  ///
+  /// # Returns
+  ///
+  /// The backend type enum value.
+  ///
+  /// # Example
+  ///
+  /// ```rust
+  /// fn backend_type(&self) -> OcrBackendType {
+  ///     OcrBackendType::Tesseract
+  /// }
+  /// ```
+  Future<OcrBackendType> backendType();
+  /// Optional in the Rust trait: Get a list of all supported languages.
+  ///
+  /// The Rust default is an empty list; this declaration is abstract, so return a list explicitly.
+  Future<List<String>> supportedLanguages();
+  /// Optional in the Rust trait: Check if the backend supports table detection.
+  ///
+  /// The Rust default is `false`; this declaration is abstract, so return a value explicitly.
+  Future<bool> supportsTableDetection();
+  /// Check if the backend supports direct document-level processing (e.g. for PDFs).
+  ///
+  /// The Rust default is `false`; this declaration is abstract, so return a value explicitly.
+  Future<bool> supportsDocumentProcessing();
+  /// Process a document file directly via OCR.
+  ///
+  /// Only called if [supportsDocumentProcessing] returns `true`.
+  ///
+  /// # Arguments
+  ///
+  /// * [path] - Path to the document file (e.g. .pdf)
+  /// * [config] - OCR configuration
+  /// Fallible: the corresponding Rust trait method returns `anyhow::Error` on failure.
+  Future<ExtractionResult> processDocument(String path, OcrConfig config);
+}
+
+/// Abstract class for the `PostProcessor` Rust trait.
+///
+/// Implement this class and register your implementation via:
+/// ```dart
+/// class MyPostProcessor implements PostProcessor {
+///   @override
+///   Future<void> process(...) async { ... }
+///   @override
+///   Future<ProcessingStage> processingStage(...) async { ... }
+///   @override
+///   Future<bool> shouldProcess(...) async { ... }
+///   @override
+///   Future<int> estimatedDurationMs(...) async { ... }
+///   @override
+///   Future<int> priority(...) async { ... }
+/// }
+///
+/// final impl = createPostProcessorDartImpl(
+///   process: (...) => myInstance.process(...),
+///   processingStage: (...) => myInstance.processingStage(...),
+///   shouldProcess: (...) => myInstance.shouldProcess(...),
+///   estimatedDurationMs: (...) => myInstance.estimatedDurationMs(...),
+///   priority: (...) => myInstance.priority(...),
+/// );
+/// ```
+/// Every member is abstract: an implementation must define all of them, including those documented as optional.
+abstract class PostProcessor {
+  /// Process an extraction result.
+  ///
+  /// Transform or enrich the extraction result. Can modify:
+  /// - `content` - The extracted text
+  /// - `metadata` - Add or update metadata fields
+  /// - `tables` - Modify or enhance table data
+  ///
+  /// # Arguments
+  ///
+  /// * [result] - The extraction result to process (a mutable reference in the Rust trait)
+  /// * [config] - Extraction configuration
+  ///
+  /// # Returns
+  ///
+  /// In Rust terms: `Ok(())` if processing succeeded, `Err(...)` for fatal failures.
+  ///
+  /// # Errors
+  ///
+  /// Return errors for fatal processing failures. Non-fatal errors should be
+  /// captured in metadata directly on the result.
+  ///
+  /// # Performance
+  ///
+  /// The Rust signature avoids cloning large extraction results by taking a
+  /// mutable reference. NOTE(review): this Dart method returns no value, so confirm
+  /// that changes made to [result] here are propagated back across the bridge.
+  ///
+  /// # Example - Language Detection
+  ///
+  /// ```rust
+  /// async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
+  ///     -> Result<()> {
+  ///     // Detect language (simplified - use real detection library in practice)
+  ///     let language = "en"; // Placeholder detection
+  ///
+  ///     // Add to metadata
+  ///     result.metadata.additional.insert("detected_language".to_string().into(), serde_json::json!(language));
+  ///
+  ///     Ok(())
+  /// }
+  /// ```
+  ///
+  /// # Example - Text Cleaning
+  ///
+  /// ```rust
+  /// async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
+  ///     -> Result<()> {
+  ///     // Remove excessive whitespace
+  ///     result.content = result
+  ///         .content
+  ///         .split_whitespace()
+  ///         .collect::<Vec<_>>()
+  ///         .join(" ");
+  ///
+  ///     Ok(())
+  /// }
+  /// ```
+  /// Fallible: the corresponding Rust trait method returns `anyhow::Error` on failure.
+  Future<void> process(ExtractionResult result, ExtractionConfig config);
+  /// Get the processing stage for this post-processor.
+  ///
+  /// Determines when this processor runs in the pipeline.
+  ///
+  /// # Returns
+  ///
+  /// The [ProcessingStage] (Early, Middle, or Late).
+  ///
+  /// # Example
+  ///
+  /// ```rust
+  /// fn processing_stage(&self) -> ProcessingStage {
+  ///     ProcessingStage::Early  // Run before other processors
+  /// }
+  /// ```
+  Future<ProcessingStage> processingStage();
+  /// Optional in the Rust trait: Check if this processor should run for a given result.
+  ///
+  /// Allows conditional processing based on MIME type, metadata, or content.
+  /// The Rust default is `true` (always run); this declaration is abstract, so return a value explicitly.
+  ///
+  /// # Arguments
+  ///
+  /// * [result] - The extraction result to check
+  /// * [config] - Extraction configuration
+  ///
+  /// # Returns
+  ///
+  /// `true` if the processor should run, `false` to skip.
+  ///
+  /// # Example
+  ///
+  /// ```rust
+  /// /// Only process PDF documents
+  /// fn should_process(&self, result: &ExtractionResult, config: &ExtractionConfig) -> bool {
+  ///     result.mime_type == "application/pdf"
+  /// }
+  /// ```
+  Future<bool> shouldProcess(ExtractionResult result, ExtractionConfig config);
+  /// Optional in the Rust trait: Estimate processing time in milliseconds.
+  ///
+  /// Used for logging and debugging. The Rust default is 0 (unknown); abstract here, so return a value explicitly.
+  ///
+  /// # Arguments
+  ///
+  /// * [result] - The extraction result to estimate for
+  ///
+  /// # Returns
+  ///
+  /// Estimated processing time in milliseconds.
+  Future<int> estimatedDurationMs(ExtractionResult result);
+  /// Execution priority within the processing stage.
+  ///
+  /// Higher values run first within the same [ProcessingStage]. The Rust default is 50 (abstract here).
+  /// Use 0-49 for fallback processors, 50 for normal processors, and 51-255
+  /// for high-priority processors that should run early in their stage.
+  Future<int> priority();
+}
+
+/// Abstract class for the `Validator` Rust trait.
+///
+/// Implement this class and register your implementation via:
+/// ```dart
+/// class MyValidator implements Validator {
+///   @override
+///   Future<void> validate(...) async { ... }
+///   @override
+///   Future<bool> shouldValidate(...) async { ... }
+///   @override
+///   Future<int> priority(...) async { ... }
+/// }
+///
+/// final impl = createValidatorDartImpl(
+///   validate: (...) => myInstance.validate(...),
+///   shouldValidate: (...) => myInstance.shouldValidate(...),
+///   priority: (...) => myInstance.priority(...),
+/// );
+/// ```
+/// Every member is abstract: an implementation must define all of them, including those documented as optional.
+abstract class Validator {
+  /// Validate an extraction result.
+  ///
+  /// Check the extraction result and succeed (`Ok(())` in Rust) if it is valid, or fail
+  /// with an error if validation fails.
+  ///
+  /// # Arguments
+  ///
+  /// * [result] - The extraction result to validate
+  /// * [config] - Extraction configuration
+  ///
+  /// # Returns
+  ///
+  /// - `Ok(())` if validation passes
+  /// - `Err(...)` if validation fails (extraction will fail)
+  ///
+  /// # Errors
+  ///
+  /// - `KreuzbergError::Validation` - Validation failed
+  /// - Any other error type appropriate for the failure
+  ///
+  /// # Example - Content Length Validation
+  ///
+  /// ```rust
+  /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
+  ///     -> Result<()> {
+  ///     let length = result.content.len();
+  ///
+  ///     if length < self.min {
+  ///         return Err(KreuzbergError::validation(format!(
+  ///             "Content too short: {} < {} characters",
+  ///             length, self.min
+  ///         )));
+  ///     }
+  ///
+  ///     if length > self.max {
+  ///         return Err(KreuzbergError::validation(format!(
+  ///             "Content too long: {} > {} characters",
+  ///             length, self.max
+  ///         )));
+  ///     }
+  ///
+  ///     Ok(())
+  /// }
+  /// ```
+  ///
+  /// # Example - Quality Score Validation
+  ///
+  /// ```rust
+  /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
+  ///     -> Result<()> {
+  ///     // Check if quality_score exists in metadata
+  ///     let score = result.metadata
+  ///         .additional
+  ///         .get("quality_score")
+  ///         .and_then(|v| v.as_f64())
+  ///         .unwrap_or(0.0);
+  ///
+  ///     if score < self.min_score {
+  ///         return Err(KreuzbergError::validation(format!(
+  ///             "Quality score too low: {} < {}",
+  ///             score, self.min_score
+  ///         )));
+  ///     }
+  ///
+  ///     Ok(())
+  /// }
+  /// ```
+  ///
+  /// # Example - Security Validation
+  ///
+  /// ```rust
+  /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
+  ///     -> Result<()> {
+  ///     // Check for blocked patterns
+  ///     for pattern in &self.blocked_patterns {
+  ///         if result.content.contains(pattern) {
+  ///             return Err(KreuzbergError::validation(format!(
+  ///                 "Content contains blocked pattern: {}",
+  ///                 pattern
+  ///             )));
+  ///         }
+  ///     }
+  ///
+  ///     Ok(())
+  /// }
+  /// ```
+  /// Fallible: the corresponding Rust trait method returns `anyhow::Error` on failure.
+  Future<void> validate(ExtractionResult result, ExtractionConfig config);
+  /// Optional in the Rust trait: Check if this validator should run for a given result.
+  ///
+  /// Allows conditional validation based on MIME type, metadata, or content.
+  /// The Rust default is `true` (always run); this declaration is abstract, so return a value explicitly.
+  ///
+  /// # Arguments
+  ///
+  /// * [result] - The extraction result to check
+  /// * [config] - Extraction configuration
+  ///
+  /// # Returns
+  ///
+  /// `true` if the validator should run, `false` to skip.
+  ///
+  /// # Example
+  ///
+  /// ```rust
+  /// /// Only validate PDF documents
+  /// fn should_validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> bool {
+  ///     result.mime_type == "application/pdf"
+  /// }
+  /// ```
+  Future<bool> shouldValidate(ExtractionResult result, ExtractionConfig config);
+  /// Optional in the Rust trait: Get the validation priority.
+  ///
+  /// Higher priority validators run first. Useful for ordering validation checks
+  /// (e.g., run cheap validations before expensive ones).
+  ///
+  /// Default priority is 50 in the Rust trait; this declaration is abstract, so return a value explicitly.
+  ///
+  /// # Returns
+  ///
+  /// Priority value (higher = runs earlier).
+  ///
+  /// # Example
+  ///
+  /// ```rust
+  /// /// Run this validator first (it's fast)
+  /// fn priority(&self) -> i32 {
+  ///     100
+  /// }
+  /// ```
+  Future<int> priority();
+}
+
+/// Abstract class for the `EmbeddingBackend` Rust trait.
+///
+/// Implement this class and register your implementation via:
+/// ```dart
+/// class MyEmbeddingBackend implements EmbeddingBackend {
+///   @override
+///   Future<int> dimensions(...) async { ... }
+///   @override
+///   Future<List<Float64List>> embed(...) async { ... }
+/// }
+///
+/// final impl = createEmbeddingBackendDartImpl(
+///   dimensions: (...) => myInstance.dimensions(...),
+///   embed: (...) => myInstance.embed(...),
+/// );
+/// ```
+/// Every member is abstract: an implementation must define all of them.
+abstract class EmbeddingBackend {
+  /// Embedding vector dimension. Must be `> 0` and must match the length of
+  /// every vector returned by [embed].
+  Future<int> dimensions();
+  /// Embed a batch of [texts], returning one vector per input in order.
+  ///
+  /// # Errors
+  ///
+  /// Implementations should return `Plugin` for
+  /// backend-specific failures. The dispatcher layers its own validation
+  /// (length, per-vector dimension) on top.
+  /// Fallible: the corresponding Rust trait method returns `anyhow::Error` on failure.
+  Future<List<Float64List>> embed(List<String> texts);
+}
+
+/// Abstract class for the `DocumentExtractor` Rust trait.
+///
+/// Implement this class and register your implementation via:
+/// ```dart
+/// class MyDocumentExtractor implements DocumentExtractor {
+///   @override
+///   Future<InternalDocumentBridge> extractBytes(...) async { ... }
+///   @override
+///   Future<InternalDocumentBridge> extractFile(...) async { ... }
+///   @override
+///   Future<List<String>> supportedMimeTypes(...) async { ... }
+///   @override
+///   Future<int> priority(...) async { ... }
+///   @override
+///   Future<bool> canHandle(...) async { ... }
+/// }
+///
+/// final impl = createDocumentExtractorDartImpl(
+///   extractBytes: (...) => myInstance.extractBytes(...),
+///   extractFile: (...) => myInstance.extractFile(...),
+///   supportedMimeTypes: (...) => myInstance.supportedMimeTypes(...),
+///   priority: (...) => myInstance.priority(...),
+///   canHandle: (...) => myInstance.canHandle(...),
+/// );
+/// ```
+/// Every member is abstract: an implementation must define all of them, including those documented as optional.
+abstract class DocumentExtractor {
+  /// Extract content from a byte array.
+  ///
+  /// This is the core extraction method that processes in-memory document data.
+  ///
+  /// # Arguments
+  ///
+  /// * [content] - Raw document bytes
+  /// * [mimeType] - MIME type of the document (already validated)
+  /// * [config] - Extraction configuration
+  ///
+  /// # Returns
+  ///
+  /// An [InternalDocumentBridge] (`InternalDocument` in the Rust trait) containing the extracted elements, metadata, and tables.
+  /// The pipeline will convert this into the public `ExtractionResult`.
+  ///
+  /// # Errors
+  ///
+  /// - `KreuzbergError::Parsing` - Document parsing failed
+  /// - `KreuzbergError::Validation` - Invalid document structure
+  /// - `KreuzbergError::Io` - I/O errors (these always bubble up)
+  /// - `KreuzbergError::MissingDependency` - Required dependency not available
+  /// Fallible: the corresponding Rust trait method returns `anyhow::Error` on failure.
+  Future<InternalDocumentBridge> extractBytes(Uint8List content, String mimeType, ExtractionConfig config);
+  /// Extract content from a file.
+  ///
+  /// The Rust trait's default implementation reads the file and calls `extract_bytes`;
+  /// this declaration is abstract, so a Dart implementation must define the method itself.
+  ///
+  /// # Arguments
+  ///
+  /// * [path] - Path to the document file
+  /// * [mimeType] - MIME type of the document (already validated)
+  /// * [config] - Extraction configuration
+  ///
+  /// # Returns
+  ///
+  /// An [InternalDocumentBridge] (`InternalDocument` in the Rust trait) containing the extracted elements, metadata, and tables.
+  ///
+  /// # Errors
+  ///
+  /// Same as [extractBytes], plus file I/O errors.
+  /// Fallible: the corresponding Rust trait method returns `anyhow::Error` on failure.
+  Future<InternalDocumentBridge> extractFile(String path, String mimeType, ExtractionConfig config);
+  /// Get the list of MIME types supported by this extractor.
+  ///
+  /// Can include exact MIME types and prefix patterns:
+  /// - Exact: `"application/pdf"`, `"text/plain"`
+  /// - Prefix: `"image/*"` (matches any image type)
+  ///
+  /// # Returns
+  ///
+  /// A list of MIME type strings (a slice in the Rust trait).
+  Future<List<String>> supportedMimeTypes();
+  /// Get the priority of this extractor.
+  ///
+  /// Higher priority extractors are preferred when multiple extractors
+  /// support the same MIME type.
+  ///
+  /// # Priority Guidelines
+  ///
+  /// - **0-25**: Fallback/low-quality extractors
+  /// - **26-49**: Alternative extractors
+  /// - **50**: Default priority (built-in extractors)
+  /// - **51-75**: Premium/enhanced extractors
+  /// - **76-100**: Specialized/high-priority extractors
+  ///
+  /// # Returns
+  ///
+  /// Priority value (Rust default: 50; abstract here, so return a value explicitly)
+  Future<int> priority();
+  /// Optional in the Rust trait: Check if this extractor can handle a specific file.
+  ///
+  /// Allows for more sophisticated detection beyond MIME types.
+  /// The Rust default is `true` (rely on MIME type matching); abstract here, so return a value explicitly.
+  ///
+  /// # Arguments
+  ///
+  /// * [path] - Path to the file to check
+  /// * [mimeType] - Detected MIME type
+  ///
+  /// # Returns
+  ///
+  /// `true` if the extractor can handle this file, `false` otherwise.
+  Future<bool> canHandle(String path, String mimeType);
+}
+
+/// Abstract class for the `Renderer` Rust trait.
+///
+/// Implement this class and register your implementation via:
+/// ```dart
+/// class MyRenderer implements Renderer {
+///   @override
+///   Future<String> render(...) async { ... }
+/// }
+///
+/// final impl = createRendererDartImpl(
+///   render: (...) => myInstance.render(...),
+/// );
+/// ```
+/// The single member is abstract: an implementation must define it.
+abstract class Renderer {
+  /// Render an `InternalDocument` (passed here as an [InternalDocumentBridge]) to the output format.
+  ///
+  /// # Arguments
+  ///
+  /// * [doc] - The internal document to render
+  ///
+  /// # Returns
+  ///
+  /// The rendered output as a string.
+  ///
+  /// # Errors
+  ///
+  /// Returns an error if rendering fails.
+  /// Fallible: the corresponding Rust trait method returns `anyhow::Error` on failure.
+  Future<String> render(InternalDocumentBridge doc);
+}
+
+
+/// Synchronous extractor trait stub with no members — used by e2e test plugin_api stubs.
+abstract class SyncExtractor {}