This commit is contained in:
3
packages/dart/lib/kreuzberg.dart
generated
Normal file
3
packages/dart/lib/kreuzberg.dart
generated
Normal file
@@ -0,0 +1,3 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
|
||||
export 'src/kreuzberg.dart';
|
||||
93
packages/dart/lib/src/LICENSE
generated
Normal file
93
packages/dart/lib/src/LICENSE
generated
Normal file
@@ -0,0 +1,93 @@
|
||||
Elastic License 2.0 (ELv2)
|
||||
|
||||
Copyright 2025-2026 Kreuzberg, Inc.
|
||||
|
||||
Acceptance
|
||||
|
||||
By using the software, you agree to all of the terms and conditions below.
|
||||
|
||||
Copyright License
|
||||
|
||||
The licensor grants you a non-exclusive, royalty-free, worldwide,
|
||||
non-sublicensable, non-transferable license to use, copy, distribute, make
|
||||
available, and prepare derivative works of the software, in each case subject to
|
||||
the limitations and conditions below.
|
||||
|
||||
Limitations
|
||||
|
||||
You may not provide the software to third parties as a hosted or managed
|
||||
service, where the service provides users with access to any substantial set of
|
||||
the features or functionality of the software.
|
||||
|
||||
You may not move, change, disable, or circumvent the license key functionality
|
||||
in the software, and you may not remove or obscure any functionality in the
|
||||
software that is protected by the license key.
|
||||
|
||||
You may not alter, remove, or obscure any licensing, copyright, or other notices
|
||||
of the licensor in the software. Any use of the licensor's trademarks is subject
|
||||
to applicable law.
|
||||
|
||||
Patents
|
||||
|
||||
The licensor grants you a license, under any patent claims the licensor can
|
||||
license, or becomes able to license, to make, have made, use, sell, offer for
|
||||
sale, import and have imported the software, in each case subject to the
|
||||
limitations and conditions in this license. This license does not cover any
|
||||
patent claims that you cause to be infringed by modifications or additions to the
|
||||
software. If you or your company make any written claim that the software
|
||||
infringes or contributes to infringement of any patent, your patent license for
|
||||
the software granted under these terms ends immediately. If your company makes
|
||||
such a claim, your patent license ends immediately for work on behalf of your
|
||||
company.
|
||||
|
||||
Notices
|
||||
|
||||
You must ensure that anyone who gets a copy of any part of the software from you
|
||||
also gets a copy of these terms.
|
||||
|
||||
If you modify the software, you must include in any modified copies of the
|
||||
software prominent notices stating that you have modified the software.
|
||||
|
||||
No Other Rights
|
||||
|
||||
These terms do not imply any licenses other than those expressly granted in
|
||||
these terms.
|
||||
|
||||
Termination
|
||||
|
||||
If you use the software in violation of these terms, such use is not licensed,
|
||||
and your licenses will automatically terminate. If the licensor provides you with
|
||||
a notice of your violation, and you cease all violation of this license no later
|
||||
than 30 days after you receive that notice, your licenses will be reinstated
|
||||
retroactively. However, if you violate these terms after such reinstatement, any
|
||||
additional violation of these terms will cause your licenses to terminate
|
||||
automatically and permanently.
|
||||
|
||||
No Liability
|
||||
|
||||
As far as the law allows, the software comes as is, without any warranty or
|
||||
condition, and the licensor will not be liable to you for any damages arising out
|
||||
of these terms or the use or nature of the software, under any kind of legal
|
||||
claim.
|
||||
|
||||
Definitions
|
||||
|
||||
The licensor is the entity offering these terms, and the software is the
|
||||
software the licensor makes available under these terms, including any portion
|
||||
of it.
|
||||
|
||||
you refers to the individual or entity agreeing to these terms.
|
||||
|
||||
your company is any legal entity, sole proprietorship, or other kind of
|
||||
organization that you work for, plus all organizations that have control over,
|
||||
are under the control of, or are under common control with that organization.
|
||||
control means ownership of substantially all the assets of an entity, or the
|
||||
power to direct its management and policies by vote, contract, or otherwise.
|
||||
Control can be direct or indirect.
|
||||
|
||||
your licenses are all the licenses granted to you for the software under these
|
||||
terms.
|
||||
|
||||
use means anything you do with the software requiring one of your licenses.
|
||||
|
||||
trademark means trademarks, service marks, and similar rights.
|
||||
642
packages/dart/lib/src/kreuzberg.dart
generated
Normal file
642
packages/dart/lib/src/kreuzberg.dart
generated
Normal file
@@ -0,0 +1,642 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
|
||||
import 'dart:typed_data';
|
||||
|
||||
export 'kreuzberg_bridge_generated/lib.dart';
|
||||
export 'traits.dart';
|
||||
import 'kreuzberg_bridge_generated/lib.dart' as rust_bridge;
|
||||
// ignore: duplicate_import
|
||||
import 'kreuzberg_bridge_generated/lib.dart';
|
||||
|
||||
class KreuzbergBridge {
|
||||
/// Extract content from a byte array.
|
||||
///
|
||||
/// This is the main entry point for in-memory extraction. It performs the following steps:
|
||||
/// 1. Validate MIME type
|
||||
/// 2. Handle legacy format conversion if needed
|
||||
/// 3. Select appropriate extractor from registry
|
||||
/// 4. Extract content
|
||||
/// 5. Run post-processing pipeline
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `content` - The byte array to extract
|
||||
/// * `mime_type` - MIME type of the content
|
||||
/// * `config` - Extraction configuration
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// An `ExtractionResult` containing the extracted content and metadata.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `KreuzbergError::Validation` if MIME type is invalid.
|
||||
/// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use kreuzberg::core::extractor::extract_bytes;
|
||||
/// use kreuzberg::core::config::ExtractionConfig;
|
||||
///
|
||||
/// let config = ExtractionConfig::default();
|
||||
/// let bytes = b"Hello, world!";
|
||||
/// let result = extract_bytes(bytes, "text/plain", &config).await?;
|
||||
/// println!("Content: {}", result.content);
|
||||
/// ```
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<ExtractionResult> extractBytes(Uint8List content, String mimeType, [ExtractionConfig? config]) async {
  // When the caller omits `config`, fall back to a fully spelled-out
  // configuration so every field handed to the bridge is explicit.
  final effectiveConfig = config ??
      ExtractionConfig(
        useCache: true,
        enableQualityProcessing: true,
        ocr: null,
        forceOcr: false,
        forceOcrPages: null,
        disableOcr: false,
        chunking: null,
        contentFilter: null,
        images: null,
        pdfOptions: null,
        tokenReduction: null,
        languageDetection: null,
        pages: null,
        keywords: null,
        postprocessor: null,
        htmlOptions: null,
        htmlOutput: null,
        extractionTimeoutSecs: null,
        maxConcurrentExtractions: null,
        resultFormat: ResultFormat.unified,
        securityLimits: null,
        maxEmbeddedFileBytes: 0,
        outputFormat: OutputFormat.plain(),
        layout: null,
        useLayoutForMarkdown: false,
        includeDocumentStructure: false,
        acceleration: null,
        cacheNamespace: null,
        cacheTtlSecs: null,
        email: null,
        concurrency: null,
        maxArchiveDepth: 0,
        treeSitter: null,
        structuredExtraction: null,
        cancelToken: null,
      );

  // Forward to the generated bridge binding of the same name.
  final extraction = await rust_bridge.extractBytes(
    content: content,
    mimeType: mimeType,
    config: effectiveConfig,
  );
  return extraction;
}
|
||||
|
||||
/// Extract content from a file.
|
||||
///
|
||||
/// This is the main entry point for file-based extraction. It performs the following steps:
|
||||
/// 1. Check cache for existing result (if caching enabled)
|
||||
/// 2. Detect or validate MIME type
|
||||
/// 3. Select appropriate extractor from registry
|
||||
/// 4. Extract content
|
||||
/// 5. Run post-processing pipeline
|
||||
/// 6. Store result in cache (if caching enabled)
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `path` - Path to the file to extract
|
||||
/// * `mime_type` - Optional MIME type override. If None, will be auto-detected
|
||||
/// * `config` - Extraction configuration
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// An `ExtractionResult` containing the extracted content and metadata.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
|
||||
/// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use kreuzberg::core::extractor::extract_file;
|
||||
/// use kreuzberg::core::config::ExtractionConfig;
|
||||
///
|
||||
/// let config = ExtractionConfig::default();
|
||||
/// let result = extract_file("document.pdf", None, &config).await?;
|
||||
/// println!("Content: {}", result.content);
|
||||
/// ```
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<ExtractionResult> extractFile(String path, String? mimeType, [ExtractionConfig? config]) async {
  // When the caller omits `config`, fall back to a fully spelled-out
  // configuration so every field handed to the bridge is explicit.
  final effectiveConfig = config ??
      ExtractionConfig(
        useCache: true,
        enableQualityProcessing: true,
        ocr: null,
        forceOcr: false,
        forceOcrPages: null,
        disableOcr: false,
        chunking: null,
        contentFilter: null,
        images: null,
        pdfOptions: null,
        tokenReduction: null,
        languageDetection: null,
        pages: null,
        keywords: null,
        postprocessor: null,
        htmlOptions: null,
        htmlOutput: null,
        extractionTimeoutSecs: null,
        maxConcurrentExtractions: null,
        resultFormat: ResultFormat.unified,
        securityLimits: null,
        maxEmbeddedFileBytes: 0,
        outputFormat: OutputFormat.plain(),
        layout: null,
        useLayoutForMarkdown: false,
        includeDocumentStructure: false,
        acceleration: null,
        cacheNamespace: null,
        cacheTtlSecs: null,
        email: null,
        concurrency: null,
        maxArchiveDepth: 0,
        treeSitter: null,
        structuredExtraction: null,
        cancelToken: null,
      );

  // `mimeType` is passed through unchanged, including when it is null.
  final extraction = await rust_bridge.extractFile(
    path: path,
    mimeType: mimeType,
    config: effectiveConfig,
  );
  return extraction;
}
|
||||
|
||||
/// Synchronous wrapper for `extract_file`.
|
||||
///
|
||||
/// This is a convenience function that blocks the current thread until extraction completes.
|
||||
/// For async code, use `extract_file` directly.
|
||||
///
|
||||
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
||||
/// a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
|
||||
///
|
||||
/// This function is only available with the `tokio-runtime` feature. For WASM targets,
|
||||
/// use a truly synchronous extraction approach instead.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use kreuzberg::core::extractor::extract_file_sync;
|
||||
/// use kreuzberg::core::config::ExtractionConfig;
|
||||
///
|
||||
/// let config = ExtractionConfig::default();
|
||||
/// let result = extract_file_sync("document.pdf", None, &config)?;
|
||||
/// println!("Content: {}", result.content);
|
||||
/// ```
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<ExtractionResult> extractFileSync(String path, String? mimeType, [ExtractionConfig? config]) async {
  // When the caller omits `config`, fall back to a fully spelled-out
  // configuration so every field handed to the bridge is explicit.
  final effectiveConfig = config ??
      ExtractionConfig(
        useCache: true,
        enableQualityProcessing: true,
        ocr: null,
        forceOcr: false,
        forceOcrPages: null,
        disableOcr: false,
        chunking: null,
        contentFilter: null,
        images: null,
        pdfOptions: null,
        tokenReduction: null,
        languageDetection: null,
        pages: null,
        keywords: null,
        postprocessor: null,
        htmlOptions: null,
        htmlOutput: null,
        extractionTimeoutSecs: null,
        maxConcurrentExtractions: null,
        resultFormat: ResultFormat.unified,
        securityLimits: null,
        maxEmbeddedFileBytes: 0,
        outputFormat: OutputFormat.plain(),
        layout: null,
        useLayoutForMarkdown: false,
        includeDocumentStructure: false,
        acceleration: null,
        cacheNamespace: null,
        cacheTtlSecs: null,
        email: null,
        concurrency: null,
        maxArchiveDepth: 0,
        treeSitter: null,
        structuredExtraction: null,
        cancelToken: null,
      );

  // Despite the "Sync" name this Dart wrapper is still awaited: it
  // forwards to the bridge binding `extractFileSync`.
  final extraction = await rust_bridge.extractFileSync(
    path: path,
    mimeType: mimeType,
    config: effectiveConfig,
  );
  return extraction;
}
|
||||
|
||||
/// Synchronous wrapper for `extract_bytes`.
|
||||
///
|
||||
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
||||
/// a new runtime per call.
|
||||
///
|
||||
/// With the `tokio-runtime` feature, this blocks the current thread using the global
|
||||
/// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use kreuzberg::core::extractor::extract_bytes_sync;
|
||||
/// use kreuzberg::core::config::ExtractionConfig;
|
||||
///
|
||||
/// let config = ExtractionConfig::default();
|
||||
/// let bytes = b"Hello, world!";
|
||||
/// let result = extract_bytes_sync(bytes, "text/plain", &config)?;
|
||||
/// println!("Content: {}", result.content);
|
||||
/// ```
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<ExtractionResult> extractBytesSync(Uint8List content, String mimeType, [ExtractionConfig? config]) async {
  // When the caller omits `config`, fall back to a fully spelled-out
  // configuration so every field handed to the bridge is explicit.
  //
  // `extractionTimeoutSecs` is deliberately `null` here. This wrapper's
  // fallback previously passed `0`, unlike every sibling wrapper in this
  // class, which all pass `null`. Presumably `0` would be read as a
  // zero-second timeout rather than "no timeout" -- confirm against the
  // Rust-side `ExtractionConfig::default()`.
  final effectiveConfig = config ??
      ExtractionConfig(
        useCache: true,
        enableQualityProcessing: true,
        ocr: null,
        forceOcr: false,
        forceOcrPages: null,
        disableOcr: false,
        chunking: null,
        contentFilter: null,
        images: null,
        pdfOptions: null,
        tokenReduction: null,
        languageDetection: null,
        pages: null,
        keywords: null,
        postprocessor: null,
        htmlOptions: null,
        htmlOutput: null,
        extractionTimeoutSecs: null,
        maxConcurrentExtractions: null,
        resultFormat: ResultFormat.unified,
        securityLimits: null,
        maxEmbeddedFileBytes: 0,
        outputFormat: OutputFormat.plain(),
        layout: null,
        useLayoutForMarkdown: false,
        includeDocumentStructure: false,
        acceleration: null,
        cacheNamespace: null,
        cacheTtlSecs: null,
        email: null,
        concurrency: null,
        maxArchiveDepth: 0,
        treeSitter: null,
        structuredExtraction: null,
        cancelToken: null,
      );

  // Despite the "Sync" name this Dart wrapper is still awaited: it
  // forwards to the bridge binding `extractBytesSync`.
  final extraction = await rust_bridge.extractBytesSync(
    content: content,
    mimeType: mimeType,
    config: effectiveConfig,
  );
  return extraction;
}
|
||||
|
||||
/// Synchronous wrapper for `batch_extract_files`.
|
||||
///
|
||||
/// Uses the global Tokio runtime for optimal performance.
|
||||
/// Only available with `tokio-runtime` (WASM has no filesystem).
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use kreuzberg::core::extractor::batch_extract_files_sync;
|
||||
/// use kreuzberg::core::config::{ExtractionConfig, BatchFileItem, FileExtractionConfig};
|
||||
///
|
||||
/// let config = ExtractionConfig::default();
|
||||
/// let items = vec![
|
||||
/// BatchFileItem {
|
||||
/// path: "doc1.pdf".into(),
|
||||
/// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
|
||||
/// },
|
||||
/// BatchFileItem { path: "doc2.pdf".into(), config: None },
|
||||
/// ];
|
||||
/// let results = batch_extract_files_sync(items, &config)?;
|
||||
/// ```
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<List<ExtractionResult>> batchExtractFilesSync(List<BatchFileItem> items, [ExtractionConfig? config]) async {
  // When the caller omits `config`, fall back to a fully spelled-out
  // batch-level configuration so every field is explicit.
  final batchConfig = config ??
      ExtractionConfig(
        useCache: true,
        enableQualityProcessing: true,
        ocr: null,
        forceOcr: false,
        forceOcrPages: null,
        disableOcr: false,
        chunking: null,
        contentFilter: null,
        images: null,
        pdfOptions: null,
        tokenReduction: null,
        languageDetection: null,
        pages: null,
        keywords: null,
        postprocessor: null,
        htmlOptions: null,
        htmlOutput: null,
        extractionTimeoutSecs: null,
        maxConcurrentExtractions: null,
        resultFormat: ResultFormat.unified,
        securityLimits: null,
        maxEmbeddedFileBytes: 0,
        outputFormat: OutputFormat.plain(),
        layout: null,
        useLayoutForMarkdown: false,
        includeDocumentStructure: false,
        acceleration: null,
        cacheNamespace: null,
        cacheTtlSecs: null,
        email: null,
        concurrency: null,
        maxArchiveDepth: 0,
        treeSitter: null,
        structuredExtraction: null,
        cancelToken: null,
      );

  // Forward to the generated bridge binding of the same name.
  final extractions = await rust_bridge.batchExtractFilesSync(
    items: items,
    config: batchConfig,
  );
  return extractions;
}
|
||||
|
||||
/// Synchronous wrapper for `batch_extract_bytes`.
|
||||
///
|
||||
/// Uses the global Tokio runtime for optimal performance.
|
||||
/// With the `tokio-runtime` feature, this blocks the current thread using the global
|
||||
/// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
|
||||
/// that iterates through items and calls `extract_bytes_sync()`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use kreuzberg::core::extractor::batch_extract_bytes_sync;
|
||||
/// use kreuzberg::core::config::{ExtractionConfig, BatchBytesItem, FileExtractionConfig};
|
||||
///
|
||||
/// let config = ExtractionConfig::default();
|
||||
/// let items = vec![
|
||||
/// BatchBytesItem { content: b"content".to_vec(), mime_type: "text/plain".to_string(), config: None },
|
||||
/// BatchBytesItem {
|
||||
/// content: b"other".to_vec(),
|
||||
/// mime_type: "text/plain".to_string(),
|
||||
/// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
|
||||
/// },
|
||||
/// ];
|
||||
/// let results = batch_extract_bytes_sync(items, &config)?;
|
||||
/// ```
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<List<ExtractionResult>> batchExtractBytesSync(List<BatchBytesItem> items, [ExtractionConfig? config]) async {
  // When the caller omits `config`, fall back to a fully spelled-out
  // batch-level configuration so every field is explicit.
  final batchConfig = config ??
      ExtractionConfig(
        useCache: true,
        enableQualityProcessing: true,
        ocr: null,
        forceOcr: false,
        forceOcrPages: null,
        disableOcr: false,
        chunking: null,
        contentFilter: null,
        images: null,
        pdfOptions: null,
        tokenReduction: null,
        languageDetection: null,
        pages: null,
        keywords: null,
        postprocessor: null,
        htmlOptions: null,
        htmlOutput: null,
        extractionTimeoutSecs: null,
        maxConcurrentExtractions: null,
        resultFormat: ResultFormat.unified,
        securityLimits: null,
        maxEmbeddedFileBytes: 0,
        outputFormat: OutputFormat.plain(),
        layout: null,
        useLayoutForMarkdown: false,
        includeDocumentStructure: false,
        acceleration: null,
        cacheNamespace: null,
        cacheTtlSecs: null,
        email: null,
        concurrency: null,
        maxArchiveDepth: 0,
        treeSitter: null,
        structuredExtraction: null,
        cancelToken: null,
      );

  // Forward to the generated bridge binding of the same name.
  final extractions = await rust_bridge.batchExtractBytesSync(
    items: items,
    config: batchConfig,
  );
  return extractions;
}
|
||||
|
||||
/// Extract content from multiple files concurrently.
|
||||
///
|
||||
/// This function processes multiple files in parallel, automatically managing
|
||||
/// concurrency to prevent resource exhaustion. The concurrency limit can be
|
||||
/// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
|
||||
/// to `(num_cpus * 1.5).ceil()`.
|
||||
///
|
||||
/// Each file can optionally specify a [`FileExtractionConfig`] that overrides specific
|
||||
/// fields from the batch-level `config`. Pass `None` for a file to use the batch defaults.
|
||||
/// Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
|
||||
/// taken from the batch-level `config`.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `items` - Vector of `BatchFileItem` structs, each containing a path and optional
|
||||
/// per-file configuration overrides.
|
||||
/// * `config` - Batch-level extraction configuration (provides defaults and batch settings)
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A vector of `ExtractionResult` in the same order as the input items.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Individual file errors are captured in the result metadata. System errors
|
||||
/// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// Simple usage with no per-file overrides:
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use kreuzberg::core::extractor::batch_extract_files;
|
||||
/// use kreuzberg::core::config::{ExtractionConfig, BatchFileItem};
|
||||
/// use std::path::PathBuf;
|
||||
///
|
||||
/// let config = ExtractionConfig::default();
|
||||
/// let items = vec![
|
||||
/// BatchFileItem { path: "doc1.pdf".into(), config: None },
|
||||
/// BatchFileItem { path: "doc2.pdf".into(), config: None },
|
||||
/// ];
|
||||
/// let results = batch_extract_files(items, &config).await?;
|
||||
/// println!("Processed {} files", results.len());
|
||||
/// ```
|
||||
///
|
||||
/// Per-file configuration overrides:
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use kreuzberg::core::extractor::batch_extract_files;
|
||||
/// use kreuzberg::core::config::{ExtractionConfig, BatchFileItem, FileExtractionConfig};
|
||||
/// use std::path::PathBuf;
|
||||
///
|
||||
/// let config = ExtractionConfig::default();
|
||||
/// let items = vec![
|
||||
/// BatchFileItem {
|
||||
/// path: "scan.pdf".into(),
|
||||
/// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
|
||||
/// },
|
||||
/// BatchFileItem { path: "notes.txt".into(), config: None },
|
||||
/// ];
|
||||
/// let results = batch_extract_files(items, &config).await?;
|
||||
/// ```
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<List<ExtractionResult>> batchExtractFiles(List<BatchFileItem> items, [ExtractionConfig? config]) async {
  // When the caller omits `config`, fall back to a fully spelled-out
  // batch-level configuration so every field is explicit.
  final batchConfig = config ??
      ExtractionConfig(
        useCache: true,
        enableQualityProcessing: true,
        ocr: null,
        forceOcr: false,
        forceOcrPages: null,
        disableOcr: false,
        chunking: null,
        contentFilter: null,
        images: null,
        pdfOptions: null,
        tokenReduction: null,
        languageDetection: null,
        pages: null,
        keywords: null,
        postprocessor: null,
        htmlOptions: null,
        htmlOutput: null,
        extractionTimeoutSecs: null,
        maxConcurrentExtractions: null,
        resultFormat: ResultFormat.unified,
        securityLimits: null,
        maxEmbeddedFileBytes: 0,
        outputFormat: OutputFormat.plain(),
        layout: null,
        useLayoutForMarkdown: false,
        includeDocumentStructure: false,
        acceleration: null,
        cacheNamespace: null,
        cacheTtlSecs: null,
        email: null,
        concurrency: null,
        maxArchiveDepth: 0,
        treeSitter: null,
        structuredExtraction: null,
        cancelToken: null,
      );

  // Forward to the generated bridge binding of the same name.
  final extractions = await rust_bridge.batchExtractFiles(
    items: items,
    config: batchConfig,
  );
  return extractions;
}
|
||||
|
||||
/// Extract content from multiple byte arrays concurrently.
|
||||
///
|
||||
/// This function processes multiple byte arrays in parallel, automatically managing
|
||||
/// concurrency to prevent resource exhaustion. The concurrency limit can be
|
||||
/// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
|
||||
/// to `(num_cpus * 1.5).ceil()`.
|
||||
///
|
||||
/// Each item can optionally specify a [`FileExtractionConfig`] that overrides specific
|
||||
/// fields from the batch-level `config`. Pass `None` as the config to use
|
||||
/// the batch-level defaults for that item.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `items` - Vector of `BatchBytesItem` structs, each containing content bytes,
|
||||
/// MIME type, and optional per-item configuration overrides.
|
||||
/// * `config` - Batch-level extraction configuration
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A vector of `ExtractionResult` in the same order as the input items.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// Simple usage with no per-item overrides:
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use kreuzberg::core::extractor::batch_extract_bytes;
|
||||
/// use kreuzberg::core::config::{ExtractionConfig, BatchBytesItem};
|
||||
///
|
||||
/// let config = ExtractionConfig::default();
|
||||
/// let items = vec![
|
||||
/// BatchBytesItem { content: b"content 1".to_vec(), mime_type: "text/plain".to_string(), config: None },
|
||||
/// BatchBytesItem { content: b"content 2".to_vec(), mime_type: "text/plain".to_string(), config: None },
|
||||
/// ];
|
||||
/// let results = batch_extract_bytes(items, &config).await?;
|
||||
/// println!("Processed {} items", results.len());
|
||||
/// ```
|
||||
///
|
||||
/// Per-item configuration overrides:
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use kreuzberg::core::extractor::batch_extract_bytes;
|
||||
/// use kreuzberg::core::config::{ExtractionConfig, BatchBytesItem, FileExtractionConfig};
|
||||
///
|
||||
/// let config = ExtractionConfig::default();
|
||||
/// let items = vec![
|
||||
/// BatchBytesItem { content: b"content".to_vec(), mime_type: "text/plain".to_string(), config: None },
|
||||
/// BatchBytesItem {
|
||||
/// content: b"<html>test</html>".to_vec(),
|
||||
/// mime_type: "text/html".to_string(),
|
||||
/// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
|
||||
/// },
|
||||
/// ];
|
||||
/// let results = batch_extract_bytes(items, &config).await?;
|
||||
/// ```
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<List<ExtractionResult>> batchExtractBytes(List<BatchBytesItem> items, [ExtractionConfig? config]) async {
  // When the caller omits `config`, fall back to a fully spelled-out
  // batch-level configuration so every field is explicit.
  final batchConfig = config ??
      ExtractionConfig(
        useCache: true,
        enableQualityProcessing: true,
        ocr: null,
        forceOcr: false,
        forceOcrPages: null,
        disableOcr: false,
        chunking: null,
        contentFilter: null,
        images: null,
        pdfOptions: null,
        tokenReduction: null,
        languageDetection: null,
        pages: null,
        keywords: null,
        postprocessor: null,
        htmlOptions: null,
        htmlOutput: null,
        extractionTimeoutSecs: null,
        maxConcurrentExtractions: null,
        resultFormat: ResultFormat.unified,
        securityLimits: null,
        maxEmbeddedFileBytes: 0,
        outputFormat: OutputFormat.plain(),
        layout: null,
        useLayoutForMarkdown: false,
        includeDocumentStructure: false,
        acceleration: null,
        cacheNamespace: null,
        cacheTtlSecs: null,
        email: null,
        concurrency: null,
        maxArchiveDepth: 0,
        treeSitter: null,
        structuredExtraction: null,
        cancelToken: null,
      );

  // Forward to the generated bridge binding of the same name.
  final extractions = await rust_bridge.batchExtractBytes(
    items: items,
    config: batchConfig,
  );
  return extractions;
}
|
||||
|
||||
/// Detect MIME type from raw file bytes.
|
||||
///
|
||||
/// Uses magic byte signatures to detect file type from content.
|
||||
/// Falls back to `infer` crate for comprehensive detection.
|
||||
///
|
||||
/// For ZIP-based files, inspects contents to distinguish Office Open XML
|
||||
/// formats (DOCX, XLSX, PPTX) from plain ZIP archives.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `content` - Raw file bytes
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// The detected MIME type string.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<String> detectMimeTypeFromBytes(Uint8List content) async {
  // Thin pass-through to the generated bridge binding of the same name.
  final detectedMimeType = await rust_bridge.detectMimeTypeFromBytes(
    content: content,
  );
  return detectedMimeType;
}
|
||||
|
||||
/// Get file extensions for a given MIME type.
|
||||
///
|
||||
/// Returns all known file extensions that map to the specified MIME type.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `mime_type` - The MIME type to look up
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A vector of file extensions (without leading dot) for the MIME type.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use kreuzberg::core::mime::get_extensions_for_mime;
|
||||
///
|
||||
/// let extensions = get_extensions_for_mime("application/pdf").unwrap();
|
||||
/// assert_eq!(extensions, vec!["pdf"]);
|
||||
///
|
||||
/// let doc_extensions = get_extensions_for_mime("application/vnd.openxmlformats-officedocument.wordprocessingml.document").unwrap();
|
||||
/// assert!(doc_extensions.contains(&"docx".to_string()));
|
||||
/// ```
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<List<String>> getExtensionsForMime(String mimeType) async {
  // Thin pass-through to the generated bridge binding of the same name.
  final extensions = await rust_bridge.getExtensionsForMime(
    mimeType: mimeType,
  );
  return extensions;
}
|
||||
|
||||
/// List the names of all registered embedding backends.
|
||||
///
|
||||
/// Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
|
||||
/// bindings.
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<List<String>> listEmbeddingBackends() async {
  // Thin pass-through to the generated bridge binding of the same name.
  final backendNames = await rust_bridge.listEmbeddingBackends();
  return backendNames;
}
|
||||
|
||||
/// List names of all registered document extractors.
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<List<String>> listDocumentExtractors() async {
  // Thin pass-through to the generated bridge binding of the same name.
  final extractorNames = await rust_bridge.listDocumentExtractors();
  return extractorNames;
}
|
||||
|
||||
/// List all registered OCR backends.
|
||||
///
|
||||
/// Returns the names of all OCR backends currently registered in the global registry.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A vector of OCR backend names.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// use kreuzberg::plugins::list_ocr_backends;
|
||||
///
|
||||
/// let backends = list_ocr_backends()?;
|
||||
/// for name in backends {
|
||||
/// println!("Registered OCR backend: {}", name);
|
||||
/// }
|
||||
/// ```
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<List<String>> listOcrBackends() async {
  // Thin pass-through to the generated bridge binding of the same name.
  final backendNames = await rust_bridge.listOcrBackends();
  return backendNames;
}
|
||||
|
||||
/// List all registered post-processor names.
|
||||
///
|
||||
/// Returns a vector of all post-processor names currently registered in the
|
||||
/// global registry.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// - `Ok(Vec<String>)` - Vector of post-processor names
|
||||
/// - `Err(...)` if the registry lock is poisoned
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// use kreuzberg::plugins::list_post_processors;
|
||||
///
|
||||
/// let processors = list_post_processors()?;
|
||||
/// for name in processors {
|
||||
/// println!("Registered post-processor: {}", name);
|
||||
/// }
|
||||
/// ```
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<List<String>> listPostProcessors() async {
  // Thin pass-through to the generated bridge binding of the same name.
  final processorNames = await rust_bridge.listPostProcessors();
  return processorNames;
}
|
||||
|
||||
/// List names of all registered renderers.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if the registry lock is poisoned.
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<List<String>> listRenderers() async {
  // Thin pass-through to the generated bridge binding of the same name.
  final rendererNames = await rust_bridge.listRenderers();
  return rendererNames;
}
|
||||
|
||||
/// List names of all registered validators.
|
||||
/// throws anyhow::Error on failure
|
||||
static Future<List<String>> listValidators() async {
  // Thin pass-through to the generated bridge binding of the same name.
  final validatorNames = await rust_bridge.listValidators();
  return validatorNames;
}
|
||||
|
||||
/// Compare two extraction results and return a structured diff.
|
||||
///
|
||||
/// The comparison is purely structural — no I/O, no side effects. All fields
|
||||
/// of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `a` — the "before" extraction result
|
||||
/// * `b` — the "after" extraction result
|
||||
/// * `opts` — controls which sections are compared and optional truncation
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use kreuzberg::{ExtractionResult, diff::{compare, DiffOptions}};
|
||||
///
|
||||
/// let mut a = ExtractionResult::default();
|
||||
/// let mut b = ExtractionResult::default();
|
||||
/// a.content = "Hello world".to_string();
|
||||
/// b.content = "Hello Rust".to_string();
|
||||
///
|
||||
/// let diff = compare(&a, &b, &DiffOptions::default());
|
||||
/// assert_eq!(diff.content_diff.len(), 1);
|
||||
/// ```
|
||||
static Future<ExtractionDiff> compare(ExtractionResult a, ExtractionResult b, DiffOptions opts) async {
  // Thin pass-through: all three arguments are forwarded unchanged to the
  // generated bridge binding of the same name.
  final diff = await rust_bridge.compare(
    a: a,
    b: b,
    opts: opts,
  );
  return diff;
}
|
||||
|
||||
/// Generates embeddings for [texts], one vector per input in the same order.
///
/// This is the counterpart of [embedTexts] that maps to the native
/// `embed_texts_async` function. According to the upstream documentation the
/// blocking ONNX inference work is offloaded to a dedicated blocking thread
/// pool on the native side.
///
/// [config] specifies the model, batch size, and normalization.
///
/// Throws if the native call fails. Upstream documents these failure modes:
/// - `KreuzbergError::MissingDependency` if ONNX Runtime is not installed.
/// - `KreuzbergError::Embedding` if the preset name is unknown, the model
///   download fails, or the blocking inference task panics.
static Future<List<Float64List>> embedTextsAsync(
  List<String> texts,
  EmbeddingConfig config,
) async {
  final embeddings = await rust_bridge.embedTextsAsync(
    texts: texts,
    config: config,
  );
  return embeddings;
}
|
||||
|
||||
/// Renders a single PDF page and returns it as PNG-encoded bytes.
///
/// * [pdfBytes] - raw bytes of the PDF file.
/// * [pageIndex] - zero-based index of the page to render.
/// * [dpi] - resolution in dots per inch; upstream documents a default of
///   150.
/// * [password] - optional password for encrypted PDFs.
///
/// Upstream notes that rendering uses pdf_oxide with tiny-skia (pure Rust).
///
/// Throws if the native call fails. Upstream documents
/// `KreuzbergError::Parsing` when the PDF cannot be opened, authenticated, or
/// rendered, or when [pageIndex] is out of range.
static Future<Uint8List> renderPdfPageToPng(
  Uint8List pdfBytes,
  int pageIndex, {
  int? dpi,
  String? password,
}) async {
  final pngBytes = await rust_bridge.renderPdfPageToPng(
    pdfBytes: pdfBytes,
    pageIndex: pageIndex,
    dpi: dpi,
    password: password,
  );
  return pngBytes;
}
|
||||
|
||||
/// Detects the MIME type of the file at [path].
///
/// Detection uses the file extension and, optionally, the file content.
/// Pass `true` for [checkExists] to verify that the file exists before
/// detection.
///
/// Throws if the native call fails.
static Future<String> detectMimeType(String path, bool checkExists) async {
  final mimeType = await rust_bridge.detectMimeType(
    path: path,
    checkExists: checkExists,
  );
  return mimeType;
}
|
||||
|
||||
/// Embeds [texts] using the embedding model described by [config].
///
/// The result holds one embedding vector per input text, at the matching
/// position in the list.
///
/// Throws if the native call fails.
static Future<List<Float64List>> embedTexts(
  List<String> texts,
  EmbeddingConfig config,
) async {
  final embeddings = await rust_bridge.embedTexts(texts: texts, config: config);
  return embeddings;
}
|
||||
|
||||
/// Looks up the embedding preset called [name].
///
/// Completes with `null` when no preset with that name exists. Upstream notes
/// that the value is an owned clone, so it is safe to pass across the FFI
/// boundary.
static Future<EmbeddingPreset?> getEmbeddingPreset(String name) async =>
    await rust_bridge.getEmbeddingPreset(name: name);
|
||||
|
||||
/// Returns the names of all available embedding presets.
///
/// Upstream notes that the names are owned strings, so they are safe to pass
/// across the FFI boundary.
static Future<List<String>> listEmbeddingPresets() async =>
    await rust_bridge.listEmbeddingPresets();
|
||||
|
||||
/// Registers [impl], a Dart implementation of [OcrBackend], with the plugin
/// registry.
static Future<void> registerOcrBackend(OcrBackendDartImpl impl) async =>
    await rust_bridge.registerOcrBackend(impl: impl);
|
||||
/// Removes the previously registered [OcrBackend] plugin identified by
/// [name].
static Future<void> unregisterOcrBackend(String name) async =>
    await rust_bridge.unregisterOcrBackend(name: name);
|
||||
/// Removes every registered [OcrBackend] plugin from the registry.
static Future<void> clearOcrBackends() async =>
    await rust_bridge.clearOcrBackends();
|
||||
/// Registers [impl], a Dart implementation of [PostProcessor], with the
/// plugin registry.
static Future<void> registerPostProcessor(PostProcessorDartImpl impl) async =>
    await rust_bridge.registerPostProcessor(impl: impl);
|
||||
/// Removes the previously registered [PostProcessor] plugin identified by
/// [name].
static Future<void> unregisterPostProcessor(String name) async =>
    await rust_bridge.unregisterPostProcessor(name: name);
|
||||
/// Removes every registered [PostProcessor] plugin from the registry.
static Future<void> clearPostProcessors() async =>
    await rust_bridge.clearPostProcessors();
|
||||
/// Registers [impl], a Dart implementation of [Validator], with the plugin
/// registry.
static Future<void> registerValidator(ValidatorDartImpl impl) async =>
    await rust_bridge.registerValidator(impl: impl);
|
||||
/// Removes the previously registered [Validator] plugin identified by [name].
static Future<void> unregisterValidator(String name) async =>
    await rust_bridge.unregisterValidator(name: name);
|
||||
/// Removes every registered [Validator] plugin from the registry.
static Future<void> clearValidators() async =>
    await rust_bridge.clearValidators();
|
||||
/// Registers [impl], a Dart implementation of [EmbeddingBackend], with the
/// plugin registry.
static Future<void> registerEmbeddingBackend(
  EmbeddingBackendDartImpl impl,
) async =>
    await rust_bridge.registerEmbeddingBackend(impl: impl);
|
||||
/// Removes the previously registered [EmbeddingBackend] plugin identified by
/// [name].
static Future<void> unregisterEmbeddingBackend(String name) async =>
    await rust_bridge.unregisterEmbeddingBackend(name: name);
|
||||
/// Removes every registered [EmbeddingBackend] plugin from the registry.
static Future<void> clearEmbeddingBackends() async =>
    await rust_bridge.clearEmbeddingBackends();
|
||||
/// Registers [impl], a Dart implementation of [DocumentExtractor], with the
/// plugin registry.
static Future<void> registerDocumentExtractor(
  DocumentExtractorDartImpl impl,
) async =>
    await rust_bridge.registerDocumentExtractor(impl: impl);
|
||||
/// Removes the previously registered [DocumentExtractor] plugin identified by
/// [name].
static Future<void> unregisterDocumentExtractor(String name) async =>
    await rust_bridge.unregisterDocumentExtractor(name: name);
|
||||
/// Removes every registered [DocumentExtractor] plugin from the registry.
static Future<void> clearDocumentExtractors() async =>
    await rust_bridge.clearDocumentExtractors();
|
||||
/// Registers [impl], a Dart implementation of [Renderer], with the plugin
/// registry.
static Future<void> registerRenderer(RendererDartImpl impl) async =>
    await rust_bridge.registerRenderer(impl: impl);
|
||||
/// Removes the previously registered [Renderer] plugin identified by [name].
static Future<void> unregisterRenderer(String name) async =>
    await rust_bridge.unregisterRenderer(name: name);
|
||||
/// Removes every registered [Renderer] plugin from the registry.
static Future<void> clearRenderers() async =>
    await rust_bridge.clearRenderers();
|
||||
}
|
||||
24982
packages/dart/lib/src/kreuzberg_bridge_generated/frb_generated.dart
generated
Normal file
24982
packages/dart/lib/src/kreuzberg_bridge_generated/frb_generated.dart
generated
Normal file
File diff suppressed because it is too large
Load Diff
6311
packages/dart/lib/src/kreuzberg_bridge_generated/frb_generated.io.dart
generated
Normal file
6311
packages/dart/lib/src/kreuzberg_bridge_generated/frb_generated.io.dart
generated
Normal file
File diff suppressed because it is too large
Load Diff
6215
packages/dart/lib/src/kreuzberg_bridge_generated/frb_generated.web.dart
generated
Normal file
6215
packages/dart/lib/src/kreuzberg_bridge_generated/frb_generated.web.dart
generated
Normal file
File diff suppressed because it is too large
Load Diff
10024
packages/dart/lib/src/kreuzberg_bridge_generated/lib.dart
generated
Normal file
10024
packages/dart/lib/src/kreuzberg_bridge_generated/lib.dart
generated
Normal file
File diff suppressed because it is too large
Load Diff
6234
packages/dart/lib/src/kreuzberg_bridge_generated/lib.freezed.dart
generated
Normal file
6234
packages/dart/lib/src/kreuzberg_bridge_generated/lib.freezed.dart
generated
Normal file
File diff suppressed because it is too large
Load Diff
646
packages/dart/lib/src/traits.dart
generated
Normal file
646
packages/dart/lib/src/traits.dart
generated
Normal file
@@ -0,0 +1,646 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
|
||||
import 'kreuzberg_bridge_generated/lib.dart';
|
||||
import 'dart:typed_data';
|
||||
|
||||
/// Dart-side contract mirroring the Rust `OcrBackend` trait.
///
/// Implement this class, then wrap the implementation with
/// `createOcrBackendDartImpl` so that it can be registered:
///
/// ```dart
/// class MyOcrBackend implements OcrBackend {
///   @override
///   Future<ExtractionResult> processImage(...) async { ... }
///   @override
///   Future<ExtractionResult> processImageFile(...) async { ... }
///   @override
///   Future<bool> supportsLanguage(...) async { ... }
///   @override
///   Future<OcrBackendType> backendType(...) async { ... }
///   @override
///   Future<List<String>> supportedLanguages(...) async { ... }
///   @override
///   Future<bool> supportsTableDetection(...) async { ... }
///   @override
///   Future<bool> supportsDocumentProcessing(...) async { ... }
///   @override
///   Future<ExtractionResult> processDocument(...) async { ... }
/// }
///
/// final impl = createOcrBackendDartImpl(
///   processImage: (...) => myInstance.processImage(...),
///   processImageFile: (...) => myInstance.processImageFile(...),
///   supportsLanguage: (...) => myInstance.supportsLanguage(...),
///   backendType: (...) => myInstance.backendType(...),
///   supportedLanguages: (...) => myInstance.supportedLanguages(...),
///   supportsTableDetection: (...) => myInstance.supportsTableDetection(...),
///   supportsDocumentProcessing: (...) => myInstance.supportsDocumentProcessing(...),
///   processDocument: (...) => myInstance.processDocument(...),
/// );
/// ```
///
/// Every member below is declared without a body, so implementers must
/// provide all of them, including those the Rust trait treats as optional.
abstract class OcrBackend {
  /// Runs OCR on an in-memory image and returns the extracted text.
  ///
  /// * [imageBytes] - raw image data (JPEG, PNG, TIFF, etc.).
  /// * [config] - OCR configuration (language, PSM mode, etc.).
  ///
  /// Completes with an `ExtractionResult` containing the extracted text and
  /// metadata. Throws on failure; the Rust trait documents these error
  /// categories:
  /// - `KreuzbergError::Ocr` - OCR processing failed.
  /// - `KreuzbergError::Validation` - invalid image format or configuration.
  /// - `KreuzbergError::Io` - I/O errors (these always bubble up).
  ///
  /// Backends that support runtime tuning can read the backend options
  /// carried by [config] (`backend_options` on the Rust side) and pick out
  /// only the keys they care about. Unknown keys are meant to be ignored so
  /// that several backends can coexist in one pipeline without key conflicts.
  Future<ExtractionResult> processImage(
    Uint8List imageBytes,
    OcrConfig config,
  );

  /// Runs OCR on the image file at [path] using [config].
  ///
  /// The Rust trait documents a default implementation that reads the file
  /// and calls `process_image`; this Dart class declares the method without
  /// a body, so implementers supply it themselves.
  ///
  /// Throws on failure. The documented errors are the same as for
  /// [processImage], plus file I/O errors.
  Future<ExtractionResult> processImageFile(String path, OcrConfig config);

  /// Whether this backend supports the language code [lang].
  ///
  /// [lang] is an ISO 639-2/3 language code such as "eng", "deu", or "fra".
  /// Completes with `true` if the language is supported, `false` otherwise.
  Future<bool> supportsLanguage(String lang);

  /// Returns the backend type identifier as an `OcrBackendType` enum value.
  Future<OcrBackendType> backendType();

  /// Returns the list of all languages this backend supports.
  ///
  /// The Rust trait treats this as optional and defaults to an empty list.
  Future<List<String>> supportedLanguages();

  /// Whether the backend can detect and extract tables.
  ///
  /// The Rust trait treats this as optional and defaults to `false`.
  Future<bool> supportsTableDetection();

  /// Whether the backend supports direct document-level processing (for
  /// example, PDFs).
  ///
  /// The Rust trait defaults to `false`; return `true` when the backend has
  /// optimized document processing.
  Future<bool> supportsDocumentProcessing();

  /// Processes the document file at [path] (for example a `.pdf`) directly
  /// via OCR, using [config].
  ///
  /// Per the upstream documentation this is only called when
  /// [supportsDocumentProcessing] returns `true`.
  ///
  /// Throws on failure.
  Future<ExtractionResult> processDocument(String path, OcrConfig config);
}
|
||||
|
||||
/// Dart-side contract mirroring the Rust `PostProcessor` trait.
///
/// Implement this class, then wrap the implementation with
/// `createPostProcessorDartImpl` so that it can be registered:
///
/// ```dart
/// class MyPostProcessor implements PostProcessor {
///   @override
///   Future<void> process(...) async { ... }
///   @override
///   Future<ProcessingStage> processingStage(...) async { ... }
///   @override
///   Future<bool> shouldProcess(...) async { ... }
///   @override
///   Future<int> estimatedDurationMs(...) async { ... }
///   @override
///   Future<int> priority(...) async { ... }
/// }
///
/// final impl = createPostProcessorDartImpl(
///   process: (...) => myInstance.process(...),
///   processingStage: (...) => myInstance.processingStage(...),
///   shouldProcess: (...) => myInstance.shouldProcess(...),
///   estimatedDurationMs: (...) => myInstance.estimatedDurationMs(...),
///   priority: (...) => myInstance.priority(...),
/// );
/// ```
///
/// Every member below is declared without a body, so implementers must
/// provide all of them, including those the Rust trait treats as optional.
abstract class PostProcessor {
  /// Transforms or enriches the extraction [result].
  ///
  /// Upstream describes the fields a processor may change:
  /// - `content` - the extracted text.
  /// - `metadata` - add or update metadata fields.
  /// - `tables` - modify or enhance table data.
  ///
  /// [config] is the extraction configuration.
  ///
  /// Complete normally when processing succeeds and throw for fatal
  /// failures. Non-fatal errors should be captured in metadata directly on
  /// the result. Upstream examples of processors include language detection
  /// (recording a `detected_language` metadata entry) and whitespace
  /// normalization of the content.
  ///
  /// NOTE(review): the Rust trait takes a mutable reference and edits the
  /// result in place to avoid cloning. This Dart signature returns `void`,
  /// so confirm whether changes made to [result] on the Dart side are
  /// propagated back across the bridge.
  Future<void> process(ExtractionResult result, ExtractionConfig config);

  /// Returns the stage at which this processor runs in the pipeline.
  ///
  /// Upstream documents the `ProcessingStage` values as Early, Middle, and
  /// Late.
  Future<ProcessingStage> processingStage();

  /// Whether this processor should run for the given [result].
  ///
  /// Allows conditional processing based on MIME type, metadata, or content
  /// (for example, only handling `application/pdf`). [config] is the
  /// extraction configuration. Complete with `true` to run and `false` to
  /// skip.
  ///
  /// The Rust trait treats this as optional and defaults to `true`.
  Future<bool> shouldProcess(ExtractionResult result, ExtractionConfig config);

  /// Returns the estimated processing time for [result], in milliseconds.
  ///
  /// Used for logging and debugging. The Rust trait treats this as optional
  /// and defaults to 0 (unknown).
  Future<int> estimatedDurationMs(ExtractionResult result);

  /// Returns the execution priority within the processing stage.
  ///
  /// Higher values run first within the same `ProcessingStage`. The Rust
  /// trait defaults to 50. Upstream guidance: 0-49 for fallback processors,
  /// 50 for normal processors, and 51-255 for high-priority processors that
  /// should run early in their stage.
  Future<int> priority();
}
|
||||
|
||||
/// Dart-side contract mirroring the Rust `Validator` trait.
///
/// Implement this class, then wrap the implementation with
/// `createValidatorDartImpl` so that it can be registered:
///
/// ```dart
/// class MyValidator implements Validator {
///   @override
///   Future<void> validate(...) async { ... }
///   @override
///   Future<bool> shouldValidate(...) async { ... }
///   @override
///   Future<int> priority(...) async { ... }
/// }
///
/// final impl = createValidatorDartImpl(
///   validate: (...) => myInstance.validate(...),
///   shouldValidate: (...) => myInstance.shouldValidate(...),
///   priority: (...) => myInstance.priority(...),
/// );
/// ```
///
/// Every member below is declared without a body, so implementers must
/// provide all of them, including those the Rust trait treats as optional.
abstract class Validator {
  /// Validates the extraction [result].
  ///
  /// * [result] - the extraction result to validate.
  /// * [config] - the extraction configuration.
  ///
  /// Complete normally when validation passes. Throw when it fails; per the
  /// upstream documentation the extraction then fails as well. The Rust
  /// trait documents `KreuzbergError::Validation` for validation failures,
  /// or any other error type appropriate for the failure.
  ///
  /// Upstream examples of validators include enforcing minimum and maximum
  /// content length, requiring a minimum `quality_score` metadata value, and
  /// rejecting content that contains blocked patterns.
  Future<void> validate(ExtractionResult result, ExtractionConfig config);

  /// Whether this validator should run for the given [result].
  ///
  /// Allows conditional validation based on MIME type, metadata, or content
  /// (for example, only validating `application/pdf`). [config] is the
  /// extraction configuration. Complete with `true` to run and `false` to
  /// skip.
  ///
  /// The Rust trait treats this as optional and defaults to `true`.
  Future<bool> shouldValidate(
    ExtractionResult result,
    ExtractionConfig config,
  );

  /// Returns the validation priority; higher values run earlier.
  ///
  /// Useful for ordering validation checks, for example running cheap
  /// validations before expensive ones. The Rust trait treats this as
  /// optional and defaults to 50.
  Future<int> priority();
}
|
||||
|
||||
/// Dart-side contract mirroring the Rust `EmbeddingBackend` trait.
///
/// Implement this class, then wrap the implementation with
/// `createEmbeddingBackendDartImpl` so that it can be registered:
///
/// ```dart
/// class MyEmbeddingBackend implements EmbeddingBackend {
///   @override
///   Future<int> dimensions(...) async { ... }
///   @override
///   Future<List<Float64List>> embed(...) async { ... }
/// }
///
/// final impl = createEmbeddingBackendDartImpl(
///   dimensions: (...) => myInstance.dimensions(...),
///   embed: (...) => myInstance.embed(...),
/// );
/// ```
abstract class EmbeddingBackend {
  /// Returns the embedding vector dimension.
  ///
  /// Must be greater than 0 and must match the length of every vector
  /// returned by [embed].
  Future<int> dimensions();

  /// Embeds a batch of [texts], returning one vector per input in order.
  ///
  /// Throws on failure. Upstream asks implementations to report
  /// backend-specific failures as `Plugin` errors; the dispatcher layers its
  /// own validation (length, per-vector dimension) on top.
  Future<List<Float64List>> embed(List<String> texts);
}
|
||||
|
||||
/// Dart-side contract mirroring the Rust `DocumentExtractor` trait.
///
/// Implement this class, then wrap the implementation with
/// `createDocumentExtractorDartImpl` so that it can be registered:
///
/// ```dart
/// class MyDocumentExtractor implements DocumentExtractor {
///   @override
///   Future<InternalDocumentBridge> extractBytes(...) async { ... }
///   @override
///   Future<InternalDocumentBridge> extractFile(...) async { ... }
///   @override
///   Future<List<String>> supportedMimeTypes(...) async { ... }
///   @override
///   Future<int> priority(...) async { ... }
///   @override
///   Future<bool> canHandle(...) async { ... }
/// }
///
/// final impl = createDocumentExtractorDartImpl(
///   extractBytes: (...) => myInstance.extractBytes(...),
///   extractFile: (...) => myInstance.extractFile(...),
///   supportedMimeTypes: (...) => myInstance.supportedMimeTypes(...),
///   priority: (...) => myInstance.priority(...),
///   canHandle: (...) => myInstance.canHandle(...),
/// );
/// ```
///
/// Every member below is declared without a body, so implementers must
/// provide all of them, including those the Rust trait treats as optional.
abstract class DocumentExtractor {
  /// Extracts content from in-memory document data.
  ///
  /// * [content] - raw document bytes.
  /// * [mimeType] - MIME type of the document (already validated).
  /// * [config] - the extraction configuration.
  ///
  /// Completes with an internal document containing the extracted elements,
  /// metadata, and tables. Per the upstream documentation the pipeline
  /// converts it into the public `ExtractionResult`.
  ///
  /// Throws on failure; the Rust trait documents these error categories:
  /// - `KreuzbergError::Parsing` - document parsing failed.
  /// - `KreuzbergError::Validation` - invalid document structure.
  /// - `KreuzbergError::Io` - I/O errors (these always bubble up).
  /// - `KreuzbergError::MissingDependency` - a required dependency is not
  ///   available.
  Future<InternalDocumentBridge> extractBytes(
    Uint8List content,
    String mimeType,
    ExtractionConfig config,
  );

  /// Extracts content from the document file at [path].
  ///
  /// * [path] - path to the document file.
  /// * [mimeType] - MIME type of the document (already validated).
  /// * [config] - the extraction configuration.
  ///
  /// The Rust trait documents a default implementation that reads the file
  /// and calls `extract_bytes`; this Dart class declares the method without
  /// a body, so implementers supply it themselves (for example to add custom
  /// file handling, streaming, or memory optimizations).
  ///
  /// Completes with an internal document containing the extracted elements,
  /// metadata, and tables. Throws on failure; the documented errors are the
  /// same as for [extractBytes], plus file I/O errors.
  Future<InternalDocumentBridge> extractFile(
    String path,
    String mimeType,
    ExtractionConfig config,
  );

  /// Returns the MIME types supported by this extractor.
  ///
  /// The list can include exact MIME types and prefix patterns:
  /// - Exact: `"application/pdf"`, `"text/plain"`.
  /// - Prefix: `"image/*"` (matches any image type).
  Future<List<String>> supportedMimeTypes();

  /// Returns the priority of this extractor.
  ///
  /// Higher-priority extractors are preferred when multiple extractors
  /// support the same MIME type. Upstream guidelines:
  /// - **0-25**: fallback/low-quality extractors.
  /// - **26-49**: alternative extractors.
  /// - **50**: default priority (built-in extractors).
  /// - **51-75**: premium/enhanced extractors.
  /// - **76-100**: specialized/high-priority extractors.
  ///
  /// The Rust trait defaults to 50.
  Future<int> priority();

  /// Whether this extractor can handle the file at [path].
  ///
  /// Allows more sophisticated detection than MIME type matching alone.
  /// [mimeType] is the detected MIME type. Complete with `true` if the
  /// extractor can handle the file, `false` otherwise.
  ///
  /// The Rust trait treats this as optional and defaults to `true` (rely on
  /// MIME type matching).
  Future<bool> canHandle(String path, String mimeType);
}
|
||||
|
||||
/// Dart-side contract mirroring the Rust `Renderer` trait.
///
/// Implement this class, then wrap the implementation with
/// `createRendererDartImpl` so that it can be registered:
///
/// ```dart
/// class MyRenderer implements Renderer {
///   @override
///   Future<String> render(...) async { ... }
/// }
///
/// final impl = createRendererDartImpl(
///   render: (...) => myInstance.render(...),
/// );
/// ```
abstract class Renderer {
  /// Renders the internal document [doc] to the output format.
  ///
  /// Completes with the rendered output as a string. Throws if rendering
  /// fails.
  Future<String> render(InternalDocumentBridge doc);
}
|
||||
|
||||
|
||||
/// Stub for the synchronous extractor trait.
///
/// Declares no members; it exists for the e2e test `plugin_api` stubs.
abstract class SyncExtractor {}
|
||||
Reference in New Issue
Block a user