This commit is contained in:
12
docs/snippets/rust/plugins/clear_plugins.md
Normal file
12
docs/snippets/rust/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::{clear_document_extractors, clear_post_processors, clear_ocr_backends, clear_validators};
|
||||
|
||||
fn main() {
|
||||
clear_document_extractors();
|
||||
clear_post_processors();
|
||||
clear_ocr_backends();
|
||||
clear_validators();
|
||||
|
||||
println!("All plugins cleared");
|
||||
}
|
||||
```
|
||||
47
docs/snippets/rust/plugins/embedding_backend.md
Normal file
47
docs/snippets/rust/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{EmbeddingBackend, Plugin, register_embedding_backend};
|
||||
use kreuzberg::{EmbeddingConfig, EmbeddingModelType, Result, embed_texts};
|
||||
use async_trait::async_trait;
|
||||
use std::sync::Arc;
|
||||
|
||||
// Wrap an already-loaded embedder (e.g. a tuned ONNX session or any host-language
|
||||
// embedder) so kreuzberg can call back into it during chunking and standalone
|
||||
// embed requests.
|
||||
/// Example embedding backend. It carries no fields here; a real implementation
/// would store its model handles in this struct.
struct MyEmbedder {
    // Hold whatever model handles the host already owns.
}
|
||||
|
||||
/// Plugin identity and lifecycle hooks for `MyEmbedder`.
impl Plugin for MyEmbedder {
    /// Name reported for this plugin.
    fn name(&self) -> &str {
        "my-embedder"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Nothing to set up; always returns `Ok(())`.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down; always returns `Ok(())`.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl EmbeddingBackend for MyEmbedder {
|
||||
// Captured once at registration; used for shape validation on every dispatch.
|
||||
fn dimensions(&self) -> usize { 768 }
|
||||
|
||||
async fn embed(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>> {
|
||||
// Delegate to the already-loaded host model.
|
||||
Ok(texts.iter().map(|_| vec![0.0; 768]).collect())
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// Register once at startup.
|
||||
register_embedding_backend(Arc::new(MyEmbedder {}))?;
|
||||
|
||||
let config = EmbeddingConfig {
|
||||
model: EmbeddingModelType::Plugin { name: "my-embedder".to_string() },
|
||||
// Optional: bound the wait on a hung backend (default 60s; `None` disables).
|
||||
max_embed_duration_secs: Some(30),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let vectors = embed_texts(&["Hello, world!", "Second text"], &config)?;
|
||||
assert_eq!(vectors.len(), 2);
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
11
docs/snippets/rust/plugins/extractor_registration.md
Normal file
11
docs/snippets/rust/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::registry::get_document_extractor_registry;
|
||||
use std::sync::Arc;
|
||||
|
||||
fn register_custom_extractor() -> kreuzberg::Result<()> {
|
||||
let extractor = Arc::new(CustomJsonExtractor);
|
||||
let registry = get_document_extractor_registry();
|
||||
registry.write().unwrap().register(extractor)?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
19
docs/snippets/rust/plugins/list_plugins.md
Normal file
19
docs/snippets/rust/plugins/list_plugins.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::registry::*;
|
||||
|
||||
// For each of the four registries: fetch it, call `list()`, and print the
// result with `{:?}`.

let extractor_registry = get_document_extractor_registry();
let extractors = extractor_registry.list()?;
println!("Registered extractors: {:?}", extractors);

let processor_registry = get_post_processor_registry();
let processors = processor_registry.list()?;
println!("Registered processors: {:?}", processors);

let ocr_registry = get_ocr_backend_registry();
let backends = ocr_registry.list()?;
println!("Registered OCR backends: {:?}", backends);

let validator_registry = get_validator_registry();
let validators = validator_registry.list()?;
println!("Registered validators: {:?}", validators);
|
||||
```
|
||||
38
docs/snippets/rust/plugins/min_length_validator.md
Normal file
38
docs/snippets/rust/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{Plugin, Validator};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
|
||||
use async_trait::async_trait;
|
||||
|
||||
/// Validator configured with a minimum content length.
struct MinLengthValidator {
    /// Threshold that `validate` compares the content length against.
    min_length: usize,
}
|
||||
|
||||
/// Plugin identity and lifecycle hooks for `MinLengthValidator`.
impl Plugin for MinLengthValidator {
    /// Name reported for this plugin.
    fn name(&self) -> &str {
        "min-length-validator"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Nothing to set up; always returns `Ok(())`.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down; always returns `Ok(())`.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl Validator for MinLengthValidator {
|
||||
async fn validate(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<()> {
|
||||
if result.content.len() < self.min_length {
|
||||
return Err(KreuzbergError::validation(format!(
|
||||
"Content too short: {} < {} characters",
|
||||
result.content.len(),
|
||||
self.min_length
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn priority(&self) -> i32 {
|
||||
100
|
||||
}
|
||||
}
|
||||
```
|
||||
69
docs/snippets/rust/plugins/pdf_metadata_extractor.md
Normal file
69
docs/snippets/rust/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,69 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage, register_post_processor};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
||||
use async_trait::async_trait;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use serde_json::json;
|
||||
|
||||
/// Post-processor that keeps a running count of the results it has processed.
struct PdfMetadataExtractor {
    /// Incremented once per processed PDF result; reset to 0 in `initialize`.
    processed_count: AtomicUsize,
}
|
||||
|
||||
/// Plugin identity and lifecycle hooks for `PdfMetadataExtractor`.
impl Plugin for PdfMetadataExtractor {
    /// Name reported for this plugin.
    fn name(&self) -> &str {
        "pdf-metadata-extractor"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Resets the processed-result counter to zero.
    fn initialize(&self) -> Result<()> {
        self.processed_count.store(0, Ordering::Release);
        Ok(())
    }

    /// Nothing to tear down; always returns `Ok(())`.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl PostProcessor for PdfMetadataExtractor {
|
||||
async fn process(
|
||||
&self,
|
||||
result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<()> {
|
||||
if result.mime_type != "application/pdf" {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let order = self.processed_count.fetch_add(1, Ordering::AcqRel) + 1;
|
||||
|
||||
result.metadata.additional.insert("pdf_processed".to_string(), json!(true));
|
||||
result.metadata.additional.insert("pdf_order".to_string(), json!(order));
|
||||
result.metadata.additional.insert(
|
||||
"content_length".to_string(),
|
||||
json!(result.content.len()),
|
||||
);
|
||||
result.metadata.additional.insert(
|
||||
"pdf_processor_version".to_string(),
|
||||
json!("1.0.0"),
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn processing_stage(&self) -> ProcessingStage {
|
||||
ProcessingStage::Early
|
||||
}
|
||||
|
||||
fn should_process(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> bool {
|
||||
result.mime_type == "application/pdf"
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
register_post_processor(Arc::new(PdfMetadataExtractor {
|
||||
processed_count: AtomicUsize::new(0),
|
||||
}))?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
50
docs/snippets/rust/plugins/pdf_only_processor.md
Normal file
50
docs/snippets/rust/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,50 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage, register_post_processor};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
||||
use async_trait::async_trait;
|
||||
use std::sync::Arc;
|
||||
use serde_json::json;
|
||||
|
||||
/// Stateless post-processor that marks PDF results (see its `PostProcessor` impl).
struct PdfOnlyProcessor;
|
||||
|
||||
/// Plugin identity and lifecycle hooks for `PdfOnlyProcessor`.
impl Plugin for PdfOnlyProcessor {
    /// Name reported for this plugin.
    fn name(&self) -> &str {
        "pdf-only-processor"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Nothing to set up; always returns `Ok(())`.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down; always returns `Ok(())`.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl PostProcessor for PdfOnlyProcessor {
|
||||
async fn process(
|
||||
&self,
|
||||
result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<()> {
|
||||
if result.mime_type != "application/pdf" {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
result.metadata.additional.insert("pdf_processed".to_string(), json!(true));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn processing_stage(&self) -> ProcessingStage {
|
||||
ProcessingStage::Early
|
||||
}
|
||||
|
||||
fn should_process(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> bool {
|
||||
result.mime_type == "application/pdf"
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
register_post_processor(Arc::new(PdfOnlyProcessor))?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
53
docs/snippets/rust/plugins/plugin_extractor.md
Normal file
53
docs/snippets/rust/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{DocumentExtractor, Plugin};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, Metadata};
|
||||
use async_trait::async_trait;
|
||||
use std::path::Path;
|
||||
|
||||
/// Stateless extractor that turns JSON input into text (see its `DocumentExtractor` impl).
struct CustomJsonExtractor;
|
||||
|
||||
/// Plugin identity and lifecycle hooks for `CustomJsonExtractor`.
impl Plugin for CustomJsonExtractor {
    /// Name reported for this plugin.
    fn name(&self) -> &str {
        "custom-json-extractor"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Nothing to set up; always returns `Ok(())`.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down; always returns `Ok(())`.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl DocumentExtractor for CustomJsonExtractor {
|
||||
async fn extract_bytes(
|
||||
&self,
|
||||
content: &[u8],
|
||||
_mime_type: &str,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<ExtractionResult> {
|
||||
let json: serde_json::Value = serde_json::from_slice(content)?;
|
||||
let text = extract_text_from_json(&json);
|
||||
|
||||
Ok(ExtractionResult {
|
||||
content: text,
|
||||
mime_type: "application/json".to_string(),
|
||||
metadata: Metadata::default(),
|
||||
tables: vec![],
|
||||
detected_languages: None,
|
||||
chunks: None,
|
||||
images: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn supported_mime_types(&self) -> &[&str] {
|
||||
&["application/json", "text/json"]
|
||||
}
|
||||
|
||||
fn priority(&self) -> i32 { 50 }
|
||||
}
|
||||
|
||||
fn extract_text_from_json(value: &serde_json::Value) -> String {
|
||||
match value {
|
||||
serde_json::Value::String(s) => format!("{}\n", s),
|
||||
serde_json::Value::Array(arr) => arr.iter().map(extract_text_from_json).collect(),
|
||||
serde_json::Value::Object(obj) => obj.values().map(extract_text_from_json).collect(),
|
||||
_ => String::new(),
|
||||
}
|
||||
}
|
||||
```
|
||||
35
docs/snippets/rust/plugins/plugin_logging.md
Normal file
35
docs/snippets/rust/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```rust title="Rust"
|
||||
use log::{info, warn, error};
|
||||
|
||||
impl Plugin for MyPlugin {
|
||||
fn initialize(&self) -> Result<()> {
|
||||
info!("Initializing plugin: {}", self.name());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn shutdown(&self) -> Result<()> {
|
||||
info!("Shutting down plugin: {}", self.name());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl DocumentExtractor for MyPlugin {
|
||||
async fn extract_bytes(
|
||||
&self,
|
||||
content: &[u8],
|
||||
mime_type: &str,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<ExtractionResult> {
|
||||
info!("Extracting {} ({} bytes)", mime_type, content.len());
|
||||
|
||||
let result = ExtractionResult::default();
|
||||
|
||||
if result.content.is_empty() {
|
||||
warn!("Extraction resulted in empty content");
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
```
|
||||
22
docs/snippets/rust/plugins/plugin_testing.md
Normal file
22
docs/snippets/rust/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```rust title="Rust"
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Feeds a one-field JSON document through `CustomJsonExtractor` and checks
    /// that the string value and the MIME type come back as expected.
    #[tokio::test]
    async fn test_custom_extractor() {
        let config = ExtractionConfig::default();
        let json_data = br#"{"message": "Hello, world!"}"#;
        let extractor = CustomJsonExtractor;

        let outcome = extractor
            .extract_bytes(json_data, "application/json", &config)
            .await;
        let result = outcome.expect("Extraction failed");

        assert!(result.content.contains("Hello, world!"));
        assert_eq!(result.mime_type, "application/json");
    }
}
|
||||
```
|
||||
61
docs/snippets/rust/plugins/plugin_validator.md
Normal file
61
docs/snippets/rust/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,61 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{Plugin, Validator, register_validator};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
|
||||
use async_trait::async_trait;
|
||||
use std::sync::Arc;
|
||||
|
||||
// Generic validator pattern: every Validator has the same shape.
|
||||
// `name()` keys the registry, `priority()` orders execution (higher = earlier),
|
||||
// and `validate()` returns Err on failure.
|
||||
struct GenericValidator<F>
|
||||
where
|
||||
F: Fn(&ExtractionResult, &ExtractionConfig) -> Result<()> + Send + Sync + 'static,
|
||||
{
|
||||
plugin_name: String,
|
||||
plugin_priority: i32,
|
||||
check: F,
|
||||
}
|
||||
|
||||
impl<F> Plugin for GenericValidator<F>
|
||||
where
|
||||
F: Fn(&ExtractionResult, &ExtractionConfig) -> Result<()> + Send + Sync + 'static,
|
||||
{
|
||||
fn name(&self) -> &str { &self.plugin_name }
|
||||
fn version(&self) -> String { "1.0.0".to_string() }
|
||||
fn initialize(&self) -> Result<()> { Ok(()) }
|
||||
fn shutdown(&self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<F> Validator for GenericValidator<F>
|
||||
where
|
||||
F: Fn(&ExtractionResult, &ExtractionConfig) -> Result<()> + Send + Sync + 'static,
|
||||
{
|
||||
async fn validate(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
config: &ExtractionConfig,
|
||||
) -> Result<()> {
|
||||
(self.check)(result, config)
|
||||
}
|
||||
|
||||
fn priority(&self) -> i32 {
|
||||
self.plugin_priority
|
||||
}
|
||||
}
|
||||
|
||||
fn register_generic_validator() -> Result<()> {
|
||||
let validator = GenericValidator {
|
||||
plugin_name: "non-empty-content".to_string(),
|
||||
plugin_priority: 200,
|
||||
check: |result, _config| {
|
||||
if result.content.trim().is_empty() {
|
||||
return Err(KreuzbergError::validation("Extracted content is blank"));
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
};
|
||||
register_validator(Arc::new(validator))?;
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
25
docs/snippets/rust/plugins/quality_score_validator.md
Normal file
25
docs/snippets/rust/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```rust title="Rust"
|
||||
#[async_trait]
|
||||
impl Validator for QualityValidator {
|
||||
async fn validate(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<()> {
|
||||
let score = result.metadata
|
||||
.additional
|
||||
.get("quality_score")
|
||||
.and_then(|v| v.as_f64())
|
||||
.unwrap_or(0.0);
|
||||
|
||||
if score < 0.5 {
|
||||
return Err(KreuzbergError::validation(format!(
|
||||
"Quality score too low: {:.2} < 0.50",
|
||||
score
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
```
|
||||
47
docs/snippets/rust/plugins/stateful_plugin.md
Normal file
47
docs/snippets/rust/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```rust title="Rust"
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use kreuzberg::KreuzbergError;
|
||||
|
||||
/// Plugin state shared across calls: a call counter and a string cache.
struct StatefulPlugin {
    /// Number of `process` calls; reset in `initialize`, reported in `shutdown`.
    call_count: AtomicUsize,
    /// Mutex-guarded key/value cache. The path is fully qualified because the
    /// snippet's `use` lines do not import `HashMap`.
    cache: Mutex<std::collections::HashMap<String, String>>,
}
|
||||
|
||||
/// Plugin identity and lifecycle hooks for `StatefulPlugin`.
impl Plugin for StatefulPlugin {
    /// Name reported for this plugin.
    fn name(&self) -> &str {
        "stateful-plugin"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Resets the call counter to zero.
    fn initialize(&self) -> Result<()> {
        self.call_count.store(0, Ordering::Release);
        Ok(())
    }

    /// Prints how many times the plugin was called.
    fn shutdown(&self) -> Result<()> {
        let total_calls = self.call_count.load(Ordering::Acquire);
        println!("Plugin called {} times", total_calls);
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl PostProcessor for StatefulPlugin {
|
||||
async fn process(
|
||||
&self,
|
||||
result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig
|
||||
) -> Result<()> {
|
||||
self.call_count.fetch_add(1, Ordering::AcqRel);
|
||||
|
||||
let mut cache = self.cache.lock()
|
||||
.map_err(|_| KreuzbergError::plugin("Cache lock poisoned"))?;
|
||||
cache.insert("last_mime".to_string(), result.mime_type.clone());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn processing_stage(&self) -> ProcessingStage {
|
||||
ProcessingStage::Middle
|
||||
}
|
||||
}
|
||||
```
|
||||
6
docs/snippets/rust/plugins/unregister_plugins.md
Normal file
6
docs/snippets/rust/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,6 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::registry::get_document_extractor_registry;
|
||||
|
||||
// Fetch the document-extractor registry and remove the extractor by name.
let extractor_registry = get_document_extractor_registry();
extractor_registry.remove("custom-json-extractor")?;
|
||||
```
|
||||
44
docs/snippets/rust/plugins/word_count_processor.md
Normal file
44
docs/snippets/rust/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```rust title="Rust"
|
||||
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
||||
use async_trait::async_trait;
|
||||
|
||||
/// Stateless post-processor that records a word count (see its `PostProcessor` impl).
struct WordCountProcessor;
|
||||
|
||||
/// Plugin identity and lifecycle hooks for `WordCountProcessor`.
impl Plugin for WordCountProcessor {
    /// Name reported for this plugin.
    fn name(&self) -> &str {
        "word-count"
    }

    /// Version string reported for this plugin.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Nothing to set up; always returns `Ok(())`.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down; always returns `Ok(())`.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl PostProcessor for WordCountProcessor {
|
||||
async fn process(
|
||||
&self,
|
||||
result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig
|
||||
) -> Result<()> {
|
||||
let word_count = result.content.split_whitespace().count();
|
||||
|
||||
result.processing_warnings.push(ProcessingWarning {
|
||||
source: "word-count".to_string(),
|
||||
message: format!("Processed with word count: {}", word_count)
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn processing_stage(&self) -> ProcessingStage {
|
||||
ProcessingStage::Early
|
||||
}
|
||||
|
||||
fn should_process(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig
|
||||
) -> bool {
|
||||
!result.content.is_empty()
|
||||
}
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user