61 lines
2.1 KiB
Markdown
61 lines
2.1 KiB
Markdown
|
|
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||
|
|
Implementing custom document extractors is not supported in the Elixir binding. Extractors must be written in Rust by implementing the `DocumentExtractor` trait.
|
||
|
|
|
||
|
|
To implement a custom JSON extractor in Rust:
|
||
|
|
|
||
|
|
```rust
|
||
|
|
use kreuzberg::plugins::{DocumentExtractor, Plugin};
|
||
|
|
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, Metadata};
|
||
|
|
use async_trait::async_trait;
|
||
|
|
use std::path::Path;
|
||
|
|
|
||
|
|
/// Example extractor plugin that turns JSON documents into plain text.
///
/// Carries no state of its own; all behavior comes from its trait impls.
struct CustomJsonExtractor;
|
||
|
|
|
||
|
|
impl Plugin for CustomJsonExtractor {
    /// Name this plugin reports for itself.
    fn name(&self) -> &str {
        "custom-json-extractor"
    }

    /// Version string this plugin reports for itself.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Nothing to set up, so initialization always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// Nothing to tear down, so shutdown always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
|
||
|
|
|
||
|
|
#[async_trait]
|
||
|
|
impl DocumentExtractor for CustomJsonExtractor {
|
||
|
|
async fn extract_bytes(
|
||
|
|
&self,
|
||
|
|
content: &[u8],
|
||
|
|
_mime_type: &str,
|
||
|
|
_config: &ExtractionConfig,
|
||
|
|
) -> Result<ExtractionResult> {
|
||
|
|
let json: serde_json::Value = serde_json::from_slice(content)?;
|
||
|
|
let text = extract_text_from_json(&json);
|
||
|
|
|
||
|
|
Ok(ExtractionResult {
|
||
|
|
content: text,
|
||
|
|
mime_type: "application/json".to_string(),
|
||
|
|
..Default::default()
|
||
|
|
})
|
||
|
|
}
|
||
|
|
|
||
|
|
fn supported_mime_types(&self) -> &[&str] {
|
||
|
|
&["application/json", "text/json"]
|
||
|
|
}
|
||
|
|
|
||
|
|
fn priority(&self) -> i32 { 50 }
|
||
|
|
}
|
||
|
|
|
||
|
|
fn extract_text_from_json(value: &serde_json::Value) -> String {
|
||
|
|
match value {
|
||
|
|
serde_json::Value::String(s) => format!("{}\n", s),
|
||
|
|
serde_json::Value::Array(arr) => {
|
||
|
|
arr.iter().map(extract_text_from_json).collect()
|
||
|
|
}
|
||
|
|
serde_json::Value::Object(obj) => {
|
||
|
|
obj.values().map(extract_text_from_json).collect()
|
||
|
|
}
|
||
|
|
_ => String::new(),
|
||
|
|
}
|
||
|
|
}
|
||
|
|
```
|
||
|
|
|
||
|
|
Register this extractor in your Rust core, and it will be available from Elixir.
|