Files
fil/docs/snippets/elixir/plugins/plugin_extractor.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

2.1 KiB

Custom document extractors cannot be implemented in the Elixir binding. They must be implemented in Rust using the `DocumentExtractor` trait.

To implement a custom JSON extractor in Rust:

use kreuzberg::plugins::{DocumentExtractor, Plugin};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, Metadata};
use async_trait::async_trait;
use std::path::Path;

/// Example extractor plugin that turns JSON documents into plain text.
///
/// The type carries no state; all behavior lives in its `Plugin` and
/// `DocumentExtractor` implementations.
struct CustomJsonExtractor;

impl Plugin for CustomJsonExtractor {
    /// Returns the identifier this plugin reports: `"custom-json-extractor"`.
    fn name(&self) -> &str {
        "custom-json-extractor"
    }

    /// Returns the plugin version string, `"1.0.0"`.
    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// No setup is required, so initialization always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// No resources are held, so shutdown always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}

#[async_trait]
impl DocumentExtractor for CustomJsonExtractor {
    /// Parses `content` as JSON and returns its string values as the extracted text.
    ///
    /// The incoming MIME type and extraction config are ignored. The result always
    /// reports `"application/json"`, and every other field keeps its default value.
    ///
    /// # Errors
    ///
    /// Propagates the parse error when `content` is not valid JSON.
    async fn extract_bytes(
        &self,
        content: &[u8],
        _mime_type: &str,
        _config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        let parsed = serde_json::from_slice::<serde_json::Value>(content)?;

        let extracted = ExtractionResult {
            content: extract_text_from_json(&parsed),
            mime_type: String::from("application/json"),
            ..Default::default()
        };
        Ok(extracted)
    }

    /// Lists the MIME types this extractor declares support for.
    fn supported_mime_types(&self) -> &[&str] {
        const JSON_MIME_TYPES: &[&str] = &["application/json", "text/json"];
        JSON_MIME_TYPES
    }

    /// Returns the fixed priority value `50`.
    // NOTE(review): how the registry orders priorities is not visible here — confirm
    // against the `DocumentExtractor` trait documentation.
    fn priority(&self) -> i32 {
        50
    }
}

fn extract_text_from_json(value: &serde_json::Value) -> String {
    match value {
        serde_json::Value::String(s) => format!("{}\n", s),
        serde_json::Value::Array(arr) => {
            arr.iter().map(extract_text_from_json).collect()
        }
        serde_json::Value::Object(obj) => {
            obj.values().map(extract_text_from_json).collect()
        }
        _ => String::new(),
    }
}

Register this extractor in your Rust core; once registered, it is also available when extracting from Elixir.