Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,110 @@
```elixir title="Elixir"
# Define a stateful post-processor plugin using an Agent
defmodule MyApp.Plugins.StatefulTextProcessor do
  @moduledoc """
  A stateful post-processor that maintains a count of processed documents.

  State lives in an `Agent` registered under this module's name and has the
  shape `%{count: integer, errors: integer}`. `count` is incremented on every
  `process/2` call (successful or not); `errors` is incremented whenever the
  content cannot be normalized.

  Results handled here are maps with string keys, e.g. `%{"content" => "..."}`.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  @impl true
  def name, do: "stateful_text_processor"

  @impl true
  def version, do: "1.0.0"

  @impl true
  def processing_stage, do: :middle

  @doc """
  Starts the state Agent. Returns `:ok` when the Agent was started or is
  already running; any other start error is returned unchanged.
  """
  @impl true
  def initialize do
    case Agent.start_link(fn -> %{count: 0, errors: 0} end, name: __MODULE__) do
      {:ok, _pid} -> :ok
      {:error, {:already_started, _pid}} -> :ok
      error -> error
    end
  end

  @doc """
  Stops the state Agent. Returns `:ok` even when the Agent is not running, so
  that shutdown is as repeatable as `initialize/0`.
  """
  @impl true
  def shutdown do
    case Process.whereis(__MODULE__) do
      # Agent.stop/1 exits on a missing process; treat "not running" as done.
      nil -> :ok
      _pid -> Agent.stop(__MODULE__)
    end
  end

  @doc """
  Normalizes the whitespace of `result["content"]`.

  On success returns `result` merged with the normalized `"content"`, the
  `"processed_count"` for this call and an ISO 8601 `"processing_timestamp"`.
  When the content is not a string, returns `{:error, message}`.
  """
  @impl true
  def process(result, _config) do
    # Bump the counter and read the new value in a single Agent call, so the
    # reported count belongs to this document even with concurrent callers.
    processed_count =
      Agent.get_and_update(__MODULE__, fn state ->
        updated = %{state | count: state.count + 1}
        {updated.count, updated}
      end)

    # Results use string keys; `result.content` would raise KeyError here.
    case normalize_content(Map.get(result, "content")) do
      {:ok, normalized} ->
        Map.merge(result, %{
          "content" => normalized,
          "processed_count" => processed_count,
          "processing_timestamp" => DateTime.to_iso8601(DateTime.utc_now())
        })

      {:error, reason} ->
        Agent.update(__MODULE__, fn state -> %{state | errors: state.errors + 1} end)
        {:error, "Failed to normalize content: #{reason}"}
    end
  end

  @doc """
  Returns the current counters as `%{count: integer, errors: integer}`.
  The Agent must have been started with `initialize/0`.
  """
  def stats, do: Agent.get(__MODULE__, & &1)

  # Trims the text and collapses every whitespace run into a single space.
  defp normalize_content(content) when is_binary(content) do
    {:ok, content |> String.trim() |> String.replace(~r/\s+/, " ")}
  end

  defp normalize_content(_), do: {:error, "Content is not a string"}
end
alias MyApp.Plugins.StatefulTextProcessor, as: Processor

# Make the plugin known to Kreuzberg under the :stateful key, then initialize it.
:ok = Kreuzberg.Plugin.register_post_processor(:stateful, Processor)
:ok = Processor.initialize()

# First document (IO.inspect/2 returns its argument, so it can end the pipe).
processed1 =
  %{"content" => " Example text with spaces "}
  |> Processor.process(nil)
  |> IO.inspect(label: "First processing")

# Second document
processed2 =
  %{"content" => "Another document\nwith\tmultiple\nlines"}
  |> Processor.process(nil)
  |> IO.inspect(label: "Second processing")

# The counter reported with the second result reflects both calls.
IO.puts("Documents processed: #{processed2["processed_count"]}")

# Show what the registry currently holds.
{:ok, processors} = Kreuzberg.Plugin.list_post_processors()
IO.inspect(processors, label: "Registered processors")

# Tear down: shut the plugin down, then remove it from the registry.
:ok = Processor.shutdown()
:ok = Kreuzberg.Plugin.unregister_post_processor(:stateful)
```

View File

@@ -0,0 +1,189 @@
```elixir title="Elixir"
# Clear all plugins from the registry - useful for testing or resetting state
# Define multiple example plugins for demonstration
defmodule MyApp.Plugins.CleanupExample do
  @moduledoc """
  Example plugins to demonstrate clearing the registry.

  Contains one plugin per kind: a post-processor (`TextCleaner`), a validator
  (`ContentValidator`) and an OCR backend (`MockOCRBackend`).
  """

  defmodule TextCleaner do
    @moduledoc """
    Minimal post-processor: marks every result with `"cleaned" => true`.
    """

    @behaviour Kreuzberg.Plugin.PostProcessor

    @impl true
    def name, do: "text_cleaner"

    @impl true
    def version, do: "1.0.0"

    @impl true
    def processing_stage, do: :early

    @impl true
    def initialize, do: :ok

    @impl true
    def shutdown, do: :ok

    @doc "Returns `result` with the `\"cleaned\"` key set to `true`."
    @impl true
    def process(result, _config) do
      Map.put(result, "cleaned", true)
    end
  end

  defmodule ContentValidator do
    @moduledoc """
    Minimal validator: accepts results whose `"content"` is a non-empty string.
    """

    @behaviour Kreuzberg.Plugin.Validator

    @impl true
    def name, do: "content_validator"

    @impl true
    def version, do: "1.0.0"

    @impl true
    def priority, do: 50

    @impl true
    def initialize, do: :ok

    @impl true
    def shutdown, do: :ok

    @doc "Returns `true` only when `\"content\"` is a non-empty binary."
    @impl true
    def should_validate?(%{"content" => content}) do
      is_binary(content) and byte_size(content) > 0
    end

    def should_validate?(_), do: false

    @doc """
    Returns `:ok` for non-empty string content and `{:error, message}` for
    empty content, non-string content, or a missing `"content"` key.
    """
    @impl true
    # The guard keeps non-binary content from raising; comparing against ""
    # avoids walking the whole string just to test for emptiness.
    def validate(%{"content" => content}) when is_binary(content) do
      if content == "" do
        {:error, "Content cannot be empty"}
      else
        :ok
      end
    end

    def validate(%{"content" => _not_a_string}), do: {:error, "Content must be a string"}
    def validate(_), do: {:error, "Missing content field"}
  end

  defmodule MockOCRBackend do
    @moduledoc """
    Mock OCR backend that returns fixed text for supported languages.
    """

    @behaviour Kreuzberg.Plugin.OcrBackend

    @impl true
    def name, do: "mock_ocr"

    @impl true
    def version, do: "1.0.0"

    @impl true
    def initialize, do: :ok

    @impl true
    def shutdown, do: :ok

    @impl true
    def supported_languages, do: ["eng", "deu", "fra"]

    @doc "Returns a fixed `{:ok, text}` for supported languages, else `{:error, message}`."
    @impl true
    def process_image(_image_data, language) do
      if language in supported_languages() do
        {:ok, "OCR extracted text"}
      else
        {:error, "Unsupported language: #{language}"}
      end
    end

    @doc "File-based counterpart of `process_image/2`; the path is not read."
    @impl true
    def process_file(_path, language) do
      if language in supported_languages() do
        {:ok, "OCR extracted file text"}
      else
        {:error, "Unsupported language: #{language}"}
      end
    end
  end
end
alias Kreuzberg.Plugin
alias MyApp.Plugins.CleanupExample.{ContentValidator, MockOCRBackend, TextCleaner}

# Counting helpers: each asserts the {:ok, list} shape of a registry listing
# and returns only the number of entries.
post_processor_count = fn ->
  {:ok, entries} = Plugin.list_post_processors()
  length(entries)
end

validator_count = fn ->
  {:ok, entries} = Plugin.list_validators()
  length(entries)
end

ocr_backend_count = fn ->
  {:ok, entries} = Plugin.list_ocr_backends()
  length(entries)
end

# Register one plugin of each kind
IO.puts("=== Registering Plugins ===")
:ok = Plugin.register_post_processor(:cleaner, TextCleaner)
:ok = Plugin.register_validator(ContentValidator)
:ok = Plugin.register_ocr_backend(MockOCRBackend)

# Snapshot of the registry before anything is cleared
processors_before = post_processor_count.()
validators_before = validator_count.()
backends_before = ocr_backend_count.()
IO.puts("Before clearing:")
IO.puts(" Post-processors: #{processors_before}")
IO.puts(" Validators: #{validators_before}")
IO.puts(" OCR backends: #{backends_before}")

# Clear post-processors only
IO.puts("\n=== Clearing Post-Processors ===")
:ok = Plugin.clear_post_processors()
processors_left = post_processor_count.()
IO.puts("Post-processors after clearing: #{processors_left}")

# The other two registries are listed again to show they were not touched
validators_kept = validator_count.()
backends_kept = ocr_backend_count.()
IO.puts("Validators still registered: #{validators_kept}")
IO.puts("OCR backends still registered: #{backends_kept}")

# Clear validators only
IO.puts("\n=== Clearing Validators ===")
:ok = Plugin.clear_validators()
validators_left = validator_count.()
IO.puts("Validators after clearing: #{validators_left}")
backends_kept_again = ocr_backend_count.()
IO.puts("OCR backends still registered: #{backends_kept_again}")

# Clear OCR backends
IO.puts("\n=== Clearing OCR Backends ===")
:ok = Plugin.clear_ocr_backends()
backends_left = ocr_backend_count.()
IO.puts("OCR backends after clearing: #{backends_left}")

# Report all three registries once more
IO.puts("\n=== Final State (All Cleared) ===")
final_processors = post_processor_count.()
final_validators = validator_count.()
final_backends = ocr_backend_count.()
IO.puts("Post-processors: #{final_processors}")
IO.puts("Validators: #{final_validators}")
IO.puts("OCR backends: #{final_backends}")

# Use case: reset plugin state around a test
IO.puts("\n=== Common Use Case: Testing Setup/Teardown ===")

# Setup (return values intentionally not matched here)
Plugin.register_post_processor(:test_proc, TextCleaner)
Plugin.register_validator(ContentValidator)

# "Run" the test
setup_processors = post_processor_count.()
setup_validators = validator_count.()
IO.puts("Test setup complete: #{setup_processors} processors, #{setup_validators} validators")

# Teardown - clear every registry
Plugin.clear_post_processors()
Plugin.clear_validators()
Plugin.clear_ocr_backends()

# Verify clean state for the next test
clean_processors = post_processor_count.()
clean_validators = validator_count.()
clean_backends = ocr_backend_count.()
IO.puts("Test teardown complete: #{clean_processors} processors, #{clean_validators} validators, #{clean_backends} backends")
```

View File

@@ -0,0 +1,358 @@
```elixir title="Elixir"
# Unregister individual plugins from the registry
defmodule MyApp.Plugins.UnregisterExample do
  @moduledoc """
  Example plugins to demonstrate selective unregistration.

  Three post-processors, two validators and two OCR backends, so that single
  entries can be removed while others of the same kind stay registered.
  """

  defmodule EmailPostProcessor do
    @moduledoc """
    Post-processor that collects e-mail addresses found in `"content"`.
    """

    @behaviour Kreuzberg.Plugin.PostProcessor

    @impl true
    def name, do: "email_processor"

    @impl true
    def version, do: "1.0.0"

    @impl true
    def processing_stage, do: :middle

    @impl true
    def initialize, do: :ok

    @impl true
    def shutdown, do: :ok

    @doc """
    Returns `result` with `"extracted_emails"` set to the unique addresses
    found in `"content"`, in order of first appearance. Missing or `nil`
    content yields an empty list.
    """
    @impl true
    def process(result, _config) do
      content = Map.get(result, "content") || ""

      # Regex.scan/2 (there is no String.scan/2) returns one list per match;
      # its first element is the whole match.
      emails =
        ~r/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/
        |> Regex.scan(content)
        |> Enum.map(&List.first/1)
        |> Enum.uniq()

      Map.put(result, "extracted_emails", emails)
    end
  end

  defmodule PhonePostProcessor do
    @moduledoc """
    Post-processor that collects 10-digit phone numbers found in `"content"`.
    """

    @behaviour Kreuzberg.Plugin.PostProcessor

    @impl true
    def name, do: "phone_processor"

    @impl true
    def version, do: "1.0.0"

    @impl true
    def processing_stage, do: :middle

    @impl true
    def initialize, do: :ok

    @impl true
    def shutdown, do: :ok

    @doc """
    Returns `result` with `"extracted_phones"` set to the unique matches of the
    3-3-4 digit pattern (optionally separated by `-` or `.`) in `"content"`.
    """
    @impl true
    def process(result, _config) do
      content = Map.get(result, "content") || ""

      phones =
        ~r/\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/
        |> Regex.scan(content)
        |> Enum.map(&List.first/1)
        |> Enum.uniq()

      Map.put(result, "extracted_phones", phones)
    end
  end

  defmodule URLPostProcessor do
    @moduledoc """
    Post-processor that collects http/https URLs found in `"content"`.
    """

    @behaviour Kreuzberg.Plugin.PostProcessor

    @impl true
    def name, do: "url_processor"

    @impl true
    def version, do: "1.0.0"

    @impl true
    def processing_stage, do: :middle

    @impl true
    def initialize, do: :ok

    @impl true
    def shutdown, do: :ok

    @doc """
    Returns `result` with `"extracted_urls"` set to the unique URLs in
    `"content"`. A URL runs from `http://` or `https://` to the next whitespace.
    """
    @impl true
    def process(result, _config) do
      content = Map.get(result, "content") || ""

      urls =
        ~r/https?:\/\/\S+/
        |> Regex.scan(content)
        |> Enum.map(&List.first/1)
        |> Enum.uniq()

      Map.put(result, "extracted_urls", urls)
    end
  end

  defmodule StrictLengthValidator do
    @moduledoc """
    Validator that requires `"content"` to be between 10 and 10000 bytes.
    """

    @behaviour Kreuzberg.Plugin.Validator

    @impl true
    def name, do: "strict_length_validator"

    @impl true
    def version, do: "1.0.0"

    @impl true
    def priority, do: 100

    @impl true
    def initialize, do: :ok

    @impl true
    def shutdown, do: :ok

    @impl true
    def should_validate?(%{"content" => content}) do
      is_binary(content)
    end

    def should_validate?(_), do: false

    @doc """
    Returns `:ok` when the byte size of `"content"` is within the limits,
    otherwise `{:error, message}` naming the violated bound.
    """
    @impl true
    def validate(%{"content" => content}) do
      min_length = 10
      max_length = 10000

      cond do
        byte_size(content) < min_length ->
          {:error, "Content too short (minimum #{min_length} bytes)"}

        byte_size(content) > max_length ->
          {:error, "Content too long (maximum #{max_length} bytes)"}

        true ->
          :ok
      end
    end

    def validate(_), do: {:error, "Missing content field"}
  end

  defmodule EncodingValidator do
    @moduledoc """
    Validator that requires `"content"` to be valid UTF-8.
    """

    @behaviour Kreuzberg.Plugin.Validator

    @impl true
    def name, do: "encoding_validator"

    @impl true
    def version, do: "1.0.0"

    @impl true
    def priority, do: 50

    @impl true
    def initialize, do: :ok

    @impl true
    def shutdown, do: :ok

    @impl true
    def should_validate?(%{"content" => content}) do
      is_binary(content)
    end

    def should_validate?(_), do: false

    @doc "Returns `:ok` when `String.valid?/1` accepts the content, else an error tuple."
    @impl true
    def validate(%{"content" => content}) do
      if String.valid?(content) do
        :ok
      else
        {:error, "Content contains invalid UTF-8 encoding"}
      end
    end

    def validate(_), do: {:error, "Missing content field"}
  end

  defmodule BasicOCR do
    @moduledoc """
    Mock OCR backend supporting English and French.
    """

    @behaviour Kreuzberg.Plugin.OcrBackend

    @impl true
    def name, do: "basic_ocr"

    @impl true
    def version, do: "1.0.0"

    @impl true
    def initialize, do: :ok

    @impl true
    def shutdown, do: :ok

    @impl true
    def supported_languages, do: ["eng", "fra"]

    @impl true
    def process_image(_image_data, language) do
      if language in supported_languages() do
        {:ok, "Extracted text"}
      else
        {:error, "Unsupported language"}
      end
    end

    @impl true
    def process_file(_path, language) do
      if language in supported_languages() do
        {:ok, "Extracted file text"}
      else
        {:error, "Unsupported language"}
      end
    end
  end

  defmodule AdvancedOCR do
    @moduledoc """
    Mock OCR backend with a larger language list than `BasicOCR`.
    """

    @behaviour Kreuzberg.Plugin.OcrBackend

    @impl true
    def name, do: "advanced_ocr"

    @impl true
    def version, do: "2.0.0"

    @impl true
    def initialize, do: :ok

    @impl true
    def shutdown, do: :ok

    @impl true
    def supported_languages do
      [
        "eng",
        "fra",
        "deu",
        "spa",
        "ita",
        "jpn",
        "chi",
        "chi_tra",
        "kor"
      ]
    end

    @impl true
    def process_image(_image_data, language) do
      if language in supported_languages() do
        {:ok, "Advanced extracted text"}
      else
        {:error, "Unsupported language"}
      end
    end

    @impl true
    def process_file(_path, language) do
      if language in supported_languages() do
        {:ok, "Advanced extracted file text"}
      else
        {:error, "Unsupported language"}
      end
    end
  end
end
alias Kreuzberg.Plugin
alias MyApp.Plugins.UnregisterExample, as: Example

IO.puts("=== Plugin Unregistration Example ===\n")

# Register multiple plugins of each type
IO.puts("Registering plugins...")
:ok = Plugin.register_post_processor(:emails, Example.EmailPostProcessor)
:ok = Plugin.register_post_processor(:phones, Example.PhonePostProcessor)
:ok = Plugin.register_post_processor(:urls, Example.URLPostProcessor)
:ok = Plugin.register_validator(Example.StrictLengthValidator)
:ok = Plugin.register_validator(Example.EncodingValidator)
:ok = Plugin.register_ocr_backend(Example.BasicOCR)
:ok = Plugin.register_ocr_backend(Example.AdvancedOCR)

# List all registered plugins
{:ok, procs} = Plugin.list_post_processors()
{:ok, vals} = Plugin.list_validators()
{:ok, backends} = Plugin.list_ocr_backends()

# Post-processor entries are read as tuples whose first element is the
# registration key. Validator and OCR backend entries are asked for name/0
# directly: the previous `elem(&1, :__struct__)` always raised, because elem/2
# needs a tuple and an integer index.
# NOTE(review): assumes list_validators/0 and list_ocr_backends/0 return the
# registered modules themselves - confirm against the Kreuzberg.Plugin docs.
validator_names = Enum.map(vals, & &1.name())
backend_names = Enum.map(backends, & &1.name())

IO.puts("Initial registration:")
IO.puts(" Post-processors: #{length(procs)} - #{inspect(Enum.map(procs, &elem(&1, 0)))}")
IO.puts(" Validators: #{length(vals)} - #{inspect(validator_names)}")
IO.puts(" OCR backends: #{length(backends)} - #{inspect(backend_names)}\n")

# Unregister individual post-processor
IO.puts("Unregistering post-processor ':phones'...")
:ok = Plugin.unregister_post_processor(:phones)
{:ok, procs_after1} = Plugin.list_post_processors()
IO.puts("Post-processors: #{length(procs_after1)} - #{inspect(Enum.map(procs_after1, &elem(&1, 0)))}\n")

# Unregister another post-processor
IO.puts("Unregistering post-processor ':urls'...")
:ok = Plugin.unregister_post_processor(:urls)
{:ok, procs_after2} = Plugin.list_post_processors()
IO.puts("Post-processors: #{length(procs_after2)} - #{inspect(Enum.map(procs_after2, &elem(&1, 0)))}\n")

# Unregister a validator
IO.puts("Unregistering validator 'EncodingValidator'...")
:ok = Plugin.unregister_validator(Example.EncodingValidator)
{:ok, vals_after} = Plugin.list_validators()
IO.puts("Validators: #{length(vals_after)}\n")

# Unregister an OCR backend
IO.puts("Unregistering OCR backend 'BasicOCR'...")
:ok = Plugin.unregister_ocr_backend(Example.BasicOCR)
{:ok, backends_after} = Plugin.list_ocr_backends()
IO.puts("OCR backends: #{length(backends_after)}\n")

# Idempotent unregistration - unregistering non-existent plugin
IO.puts("Unregistering already-unregistered plugin ':phones' (idempotent)...")
:ok = Plugin.unregister_post_processor(:phones)
IO.puts("Still returns :ok\n")

# Final state
IO.puts("=== Final State ===")
{:ok, final_procs} = Plugin.list_post_processors()
{:ok, final_vals} = Plugin.list_validators()
{:ok, final_backends} = Plugin.list_ocr_backends()
IO.puts("Remaining post-processors: #{length(final_procs)}")
IO.puts("Remaining validators: #{length(final_vals)}")
IO.puts("Remaining OCR backends: #{length(final_backends)}\n")

# Cleanup - unregister remaining plugins
IO.puts("=== Cleanup ===")
:ok = Plugin.unregister_post_processor(:emails)
:ok = Plugin.unregister_validator(Example.StrictLengthValidator)
:ok = Plugin.unregister_ocr_backend(Example.AdvancedOCR)
{:ok, final_clean_procs} = Plugin.list_post_processors()
{:ok, final_clean_vals} = Plugin.list_validators()
{:ok, final_clean_backends} = Plugin.list_ocr_backends()
IO.puts("After cleanup:")
IO.puts(" Post-processors: #{length(final_clean_procs)}")
IO.puts(" Validators: #{length(final_clean_vals)}")
IO.puts(" OCR backends: #{length(final_clean_backends)}")
```

View File

@@ -0,0 +1,15 @@
```elixir title="Elixir"
# Empty every global registry in turn: document extractors, OCR backends,
# post-processors, validators. Each call must answer with an {:ok, _} tuple;
# anything else stops the script with a MatchError.
[
  &Kreuzberg.clear_document_extractors/0,
  &Kreuzberg.clear_ocr_backends/0,
  &Kreuzberg.clear_post_processors/0,
  &Kreuzberg.clear_validators/0
]
|> Enum.each(fn clear_registry ->
  {:ok, _} = clear_registry.()
end)

IO.puts("All plugins cleared")
```

View File

@@ -0,0 +1,10 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Embedding backend registration is not available in the Elixir binding. Custom embedding backends must be implemented in Rust using the `EmbeddingBackend` trait and registered in the Rust core before being used by Elixir.
To use custom embeddings in Elixir:
1. Implement the embedding backend in Rust (in `crates/kreuzberg/src/plugins/embedding.rs` or a separate Rust crate)
2. Register the backend in the Rust core initialization
3. Call the embeddings functions from Elixir with the appropriate config
See the Rust plugin documentation for implementing custom `EmbeddingBackend` plugins.

View File

@@ -0,0 +1,61 @@
```elixir title="Elixir"
defmodule MyApp.MetadataEnricher do
  @moduledoc """
  Post-processor that stamps the result metadata with an `"enriched_at"` time.

  Expects `result` to expose atom keys `:metadata` (a map or `nil`).
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  # name/0 and processing_stage/0 are defined by every other post-processor in
  # these examples; they were missing here.
  @impl true
  def name, do: "metadata_enricher"

  @impl true
  def version, do: "1.0.0"

  # NOTE(review): stage picked for the example - choose what your pipeline needs.
  @impl true
  def processing_stage, do: :late

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Returns `{:ok, result}` with `"enriched_at"` (current UTC `DateTime`) added
  to `result.metadata`; a `nil` metadata is treated as an empty map.
  """
  @impl true
  def process(result, _config) do
    enriched_metadata = Map.put(result.metadata || %{}, "enriched_at", DateTime.utc_now())
    {:ok, %{result | metadata: enriched_metadata}}
  end
end
defmodule MyApp.LinkExtractor do
  @moduledoc """
  Post-processor that stores the http/https links found in the content under
  the `"links"` metadata key.

  Expects `result` to expose atom keys `:content` (a string) and `:metadata`
  (a map or `nil`).
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  # name/0 and processing_stage/0 are defined by every other post-processor in
  # these examples; they were missing here.
  @impl true
  def name, do: "link_extractor"

  @impl true
  def version, do: "1.0.0"

  # NOTE(review): stage picked for the example - choose what your pipeline needs.
  @impl true
  def processing_stage, do: :late

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Returns `{:ok, result}` with `result.metadata["links"]` set to the unique
  links in `result.content`, in order of first appearance.
  """
  @impl true
  def process(result, _config) do
    metadata = Map.put(result.metadata || %{}, "links", extract_links(result.content))
    {:ok, %{result | metadata: metadata}}
  end

  # A link runs from "http://" or "https://" up to the next whitespace.
  defp extract_links(content) do
    ~r/https?:\/\/\S+/
    |> Regex.scan(content)
    |> Enum.map(&List.first/1)
    |> Enum.uniq()
  end
end
defmodule MyApp.QualityValidator do
  @moduledoc """
  Validator that rejects results whose content is 100 characters or shorter.

  Expects `result` to expose the atom key `:content` holding a string.
  """

  @behaviour Kreuzberg.Plugin.Validator

  # name/0 is defined by the other validators in these examples; it was
  # missing here.
  @impl true
  def name, do: "quality_validator"

  @impl true
  def version, do: "1.0.0"

  @impl true
  def priority, do: 5

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @impl true
  def should_validate?(_result), do: true

  @doc """
  Returns `:ok` when `result.content` is longer than 100 characters
  (`String.length/1`), otherwise `{:error, "Content quality too low"}`.
  """
  @impl true
  def validate(result) do
    if String.length(result.content) > 100 do
      :ok
    else
      {:error, "Content quality too low"}
    end
  end
end
# Register multiple plugins. Each result is matched against :ok so the success
# message below is only printed when every registration actually succeeded.
:ok = Kreuzberg.Plugin.register_post_processor(:metadata_enricher, MyApp.MetadataEnricher)
:ok = Kreuzberg.Plugin.register_post_processor(:link_extractor, MyApp.LinkExtractor)
:ok = Kreuzberg.Plugin.register_validator(MyApp.QualityValidator)
IO.puts("Plugins registered successfully")
```

View File

@@ -0,0 +1,10 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Document extractor registration is not available in the Elixir binding. Custom extractors must be implemented in Rust using the `DocumentExtractor` trait and registered in the Rust core.
To use custom extractors in Elixir:
1. Implement the extractor in Rust using the `DocumentExtractor` trait
2. Register the extractor in the Rust core's registry
3. Call the extraction functions from Elixir
See the Rust plugin documentation for implementing custom `DocumentExtractor` plugins.

View File

@@ -0,0 +1,23 @@
```elixir title="Elixir"
# Fetch the three registries up front; each listing must be an {:ok, list} tuple.
{:ok, post_processors} = Kreuzberg.Plugin.list_post_processors()
{:ok, validators} = Kreuzberg.Plugin.list_validators()
{:ok, ocr_backends} = Kreuzberg.Plugin.list_ocr_backends()

# Post-processor entries are destructured as {name, module} pairs.
IO.puts("Post-processors:")

for {key, plugin} <- post_processors do
  IO.puts(" - #{key}: #{plugin}")
end

# Validator and OCR backend entries are printed as-is.
IO.puts("\nValidators:")
for validator <- validators, do: IO.puts(" - #{validator}")

IO.puts("\nOCR backends:")
for backend <- ocr_backends, do: IO.puts(" - #{backend}")

processor_total = length(post_processors)
validator_total = length(validators)
backend_total = length(ocr_backends)

IO.puts("\nTotal: #{processor_total} post-processors, #{validator_total} validators, #{backend_total} OCR backends")
```

View File

@@ -0,0 +1,17 @@
```elixir title="Elixir"
# Walk the four global registries in a fixed order (document extractors, OCR
# backends, post-processors, validators) and inspect each listing under its
# own label. Every listing must be an {:ok, entries} tuple.
[
  {"Document extractors", &Kreuzberg.list_document_extractors/0},
  {"OCR backends", &Kreuzberg.list_ocr_backends/0},
  {"Post-processors", &Kreuzberg.list_post_processors/0},
  {"Validators", &Kreuzberg.list_validators/0}
]
|> Enum.each(fn {label, list_registry} ->
  {:ok, entries} = list_registry.()
  IO.inspect(entries, label: label)
end)
```

View File

@@ -0,0 +1,38 @@
```elixir title="Elixir"
defmodule MinLengthValidator do
  # Validator plugin: content must be at least @min_length characters long,
  # measured with String.length/1. Expects `result` to expose `:content`.
  @behaviour Kreuzberg.Plugin.Validator

  @min_length 50

  def name, do: "min_length_validator"
  def version, do: "1.0.0"
  def priority, do: 1

  def initialize, do: :ok
  def shutdown, do: :ok

  # Every result is validated.
  def should_validate?(_result), do: true

  # :ok when the content is long enough, otherwise an error tuple.
  def validate(result) do
    case String.length(result.content) do
      char_count when char_count >= @min_length -> :ok
      _too_short -> {:error, "Content too short"}
    end
  end
end
# Register the validator; matching on :ok surfaces a failed registration here
# instead of silently extracting without the validator.
:ok = Kreuzberg.Plugin.register_validator(MinLengthValidator)

# Example usage with extraction. The {:ok, result} match already asserts
# success, so no further `case` on the result is needed.
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil)

IO.puts("Extraction successful")
IO.puts("Content length: #{String.length(result.content)}")
```

View File

@@ -0,0 +1,45 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Custom validator implementation is not available in the Elixir binding. Validators must be implemented in Rust using the `Validator` trait.
To implement a minimum length validator in Rust:
```rust
use kreuzberg::plugins::{Plugin, Validator};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
use async_trait::async_trait;
/// Validator configuration: the minimum content length to accept.
struct MinLengthValidator {
    min_length: usize,
}

/// Plugin identity and lifecycle hooks; this validator needs no setup or teardown.
impl Plugin for MinLengthValidator {
    fn name(&self) -> &str {
        "min-length-validator"
    }

    fn version(&self) -> String {
        String::from("1.0.0")
    }

    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[async_trait]
impl Validator for MinLengthValidator {
    /// Fails with a validation error when the content has fewer than
    /// `min_length` characters; otherwise returns `Ok(())`.
    async fn validate(
        &self,
        result: &ExtractionResult,
        _config: &ExtractionConfig,
    ) -> Result<()> {
        // `len()` counts bytes, not characters. Count `char`s so that both the
        // check and the "characters" wording of the message hold for
        // non-ASCII text as well.
        let char_count = result.content.chars().count();
        if char_count < self.min_length {
            return Err(KreuzbergError::validation(format!(
                "Content too short: {} < {} characters",
                char_count, self.min_length
            )));
        }
        Ok(())
    }

    fn priority(&self) -> i32 {
        100
    }
}
```
Register this in your Rust initialization and Elixir will be able to use it.

View File

@@ -0,0 +1,11 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Custom PDF metadata extractor implementation is not available in the Elixir binding. Document extractors must be implemented in Rust using the `DocumentExtractor` trait.
To implement a custom PDF metadata extractor in Rust:
1. Implement the `Plugin` and `DocumentExtractor` traits
2. Add support for PDF MIME types: `application/pdf`
3. Use a PDF library (e.g., pdfium-render, pdf crate) to extract metadata
4. Register the extractor in the Rust core
See the Rust plugin documentation for implementing custom `DocumentExtractor` plugins for PDF files.

View File

@@ -0,0 +1,97 @@
```elixir title="Elixir"
alias Kreuzberg.Plugin
# PDF-Only Post-Processor Plugin
# This conditional post-processor only processes PDF files.
# It can be used to apply PDF-specific transformations.
defmodule MyApp.Plugins.PdfOnlyProcessor do
  @moduledoc """
  Conditional post-processor: only results whose `"mime_type"` is
  `"application/pdf"` are changed; everything else passes through untouched.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  require Logger

  @impl true
  def name, do: "PdfOnlyProcessor"

  # The other post-processors in these examples use :early, :middle or :late;
  # :post is not among them.
  # NOTE(review): confirm the accepted stage atoms in the Kreuzberg docs.
  @impl true
  def processing_stage, do: :late

  @impl true
  def version, do: "1.0.0"

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Returns `{:ok, result}`. For PDF results the `"metadata"` map gains
  `"processed_by_pdf_processor" => true`; other results are returned as-is.
  """
  @impl true
  def process(%{"mime_type" => "application/pdf"} = result, _options) do
    Logger.info("Processing PDF content with custom PDF processor")

    # A missing or nil metadata entry is treated as an empty map.
    metadata = Map.get(result, "metadata") || %{}
    updated_metadata = Map.put(metadata, "processed_by_pdf_processor", true)

    {:ok, Map.put(result, "metadata", updated_metadata)}
  end

  def process(result, _options), do: {:ok, result}
end
# Make the processor known to the registry under :pdf_only_processor.
Kreuzberg.Plugin.register_post_processor(:pdf_only_processor, MyApp.Plugins.PdfOnlyProcessor)

# A result as it would look for a PDF document
pdf_result = %{
  "content" => "PDF extracted content here",
  "mime_type" => "application/pdf",
  "metadata" => %{
    "source" => "document.pdf",
    "pages" => 5
  }
}

# Feed the PDF result through the processor and report the outcome
pdf_result
|> MyApp.Plugins.PdfOnlyProcessor.process(%{})
|> case do
  {:error, reason} ->
    IO.puts("PDF processing failed: #{reason}")

  {:ok, processed_result} ->
    IO.puts("PDF processing complete")
    IO.inspect(processed_result, label: "PDF Result")
end

# A non-PDF result: the processor is expected to hand it back unchanged
non_pdf_result = %{
  "content" => "Image extracted content",
  "mime_type" => "image/png",
  "metadata" => %{}
}

non_pdf_result
|> MyApp.Plugins.PdfOnlyProcessor.process(%{})
|> case do
  {:error, reason} ->
    IO.puts("Processing failed: #{reason}")

  {:ok, processed_result} ->
    IO.puts("Processing complete (skipped for non-PDF)")
    IO.inspect(processed_result, label: "Non-PDF Result")
end
```

View File

@@ -0,0 +1,41 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Custom post-processor implementation is not available in the Elixir binding. Post-processors must be implemented in Rust using the `PostProcessor` trait.
To implement a PDF-only post-processor in Rust:
```rust
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
use async_trait::async_trait;
/// Stateless post-processor that only applies to PDF results.
struct PdfOnlyProcessor;

/// Plugin identity and lifecycle hooks; nothing to set up or tear down.
impl Plugin for PdfOnlyProcessor {
    fn name(&self) -> &str {
        "pdf-only"
    }

    fn version(&self) -> String {
        String::from("1.0.0")
    }

    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[async_trait]
impl PostProcessor for PdfOnlyProcessor {
async fn process(
&self,
result: &mut ExtractionResult,
_config: &ExtractionConfig
) -> Result<()> {
// Custom processing logic for PDF documents
Ok(())
}
fn should_process(
&self,
result: &ExtractionResult,
_config: &ExtractionConfig
) -> bool {
result.mime_type == "application/pdf"
}
}
```
Register this in your Rust initialization and Elixir will use it automatically during extraction.

View File

@@ -0,0 +1,35 @@
```elixir title="Elixir"
defmodule MyApp.WordCountProcessor do
  # Post-processor plugin: records how many whitespace-separated words the
  # content has under the "word_count" metadata key. Expects `result` to
  # expose `:content` (a string) and `:metadata` (a map or nil).
  @behaviour Kreuzberg.Plugin.PostProcessor

  def name, do: "word_count_processor"
  def version, do: "1.0.0"
  def processing_stage, do: :late

  def initialize, do: :ok
  def shutdown, do: :ok

  # Returns the result with metadata["word_count"] set; nil metadata starts
  # from an empty map.
  def process(result, _config) do
    words = String.split(result.content)
    base_metadata = result.metadata || %{}

    %{result | metadata: Map.put(base_metadata, "word_count", length(words))}
  end
end
# Register post-processor. Every other example registers post-processors as
# (name, module); the one-argument call used here before did not match that.
Kreuzberg.Plugin.register_post_processor(:word_count, MyApp.WordCountProcessor)

# Use with extraction
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil)

IO.puts("Word count: #{result.metadata["word_count"]}")
```

View File

@@ -0,0 +1,60 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Custom document extractor implementation is not available in the Elixir binding. Extractors must be implemented in Rust using the `DocumentExtractor` trait.
To implement a custom JSON extractor in Rust:
```rust
use kreuzberg::plugins::{DocumentExtractor, Plugin};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, Metadata};
use async_trait::async_trait;
use std::path::Path;
/// Stateless extractor that turns JSON documents into plain text.
struct CustomJsonExtractor;

/// Plugin identity and lifecycle hooks; nothing to set up or tear down.
impl Plugin for CustomJsonExtractor {
    fn name(&self) -> &str {
        "custom-json-extractor"
    }

    fn version(&self) -> String {
        String::from("1.0.0")
    }

    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[async_trait]
impl DocumentExtractor for CustomJsonExtractor {
    /// Parses `content` as JSON and returns the text gathered from its string
    /// values. A parse failure is propagated to the caller via `?`.
    async fn extract_bytes(
        &self,
        content: &[u8],
        _mime_type: &str,
        _config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        let parsed: serde_json::Value = serde_json::from_slice(content)?;

        // The result is always labelled application/json, whatever MIME type
        // was passed in; all remaining fields keep their defaults.
        let extraction = ExtractionResult {
            content: extract_text_from_json(&parsed),
            mime_type: String::from("application/json"),
            ..Default::default()
        };

        Ok(extraction)
    }

    /// MIME types this extractor declares support for.
    fn supported_mime_types(&self) -> &[&str] {
        &["application/json", "text/json"]
    }

    fn priority(&self) -> i32 {
        50
    }
}
fn extract_text_from_json(value: &serde_json::Value) -> String {
match value {
serde_json::Value::String(s) => format!("{}\n", s),
serde_json::Value::Array(arr) => {
arr.iter().map(extract_text_from_json).collect()
}
serde_json::Value::Object(obj) => {
obj.values().map(extract_text_from_json).collect()
}
_ => String::new(),
}
}
```
Register this in your Rust core and it will be available in Elixir.

View File

@@ -0,0 +1,125 @@
```elixir title="Elixir"
alias Kreuzberg.Plugin
require Logger
# Plugin with Logging
# This example demonstrates best practices for logging in plugins.
# Proper logging helps with debugging and monitoring plugin execution.
defmodule MyApp.Plugins.LoggingProcessor do
  @moduledoc """
  Post-processor that normalizes whitespace in `"content"` and logs each step.

  Debug messages mark the start and end of processing, an info message reports
  the MIME type and input size, and any exception is logged as an error and
  turned into an `{:error, message}` return value.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  require Logger

  @impl true
  def name, do: "LoggingProcessor"

  # The other post-processors in these examples use :early, :middle or :late;
  # :post is not among them.
  # NOTE(review): confirm the accepted stage atoms in the Kreuzberg docs.
  @impl true
  def processing_stage, do: :late

  @impl true
  def version, do: "1.0.0"

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Returns `{:ok, result}` with the cleaned `"content"` and with `"metadata"`
  extended by `"processed_at"`, `"original_size"` and `"processed_size"`
  (sizes in bytes). Returns `{:error, message}` when processing raises.
  """
  @impl true
  def process(result, _options) do
    Logger.debug("Starting content processing", plugin: "LoggingProcessor")

    content = result["content"] || ""
    content_size = byte_size(content)
    mime_type = result["mime_type"] || "unknown"

    Logger.info("Processing extraction result",
      mime_type: mime_type,
      content_size: content_size
    )

    processed_content = clean_content(content)
    processed_size = byte_size(processed_content)

    updated_metadata =
      result
      |> Map.get("metadata", %{})
      |> Map.put("processed_at", DateTime.utc_now())
      |> Map.put("original_size", content_size)
      |> Map.put("processed_size", processed_size)

    Logger.debug("Processing complete",
      original_size: content_size,
      processed_size: processed_size
    )

    # Return the cleaned content as well; before, it was measured and then
    # discarded, so "processed_size" described text that was not in the result.
    processed_result =
      result
      |> Map.put("content", processed_content)
      |> Map.put("metadata", updated_metadata)

    {:ok, processed_result}
  rescue
    # Plugin boundary: report the failure instead of crashing the caller.
    error ->
      Logger.error("Processing error in LoggingProcessor",
        error: inspect(error),
        stacktrace: __STACKTRACE__
      )

      {:error, "Processing failed: #{inspect(error)}"}
  end

  # Trims the text and collapses every whitespace run into a single space.
  defp clean_content(content) do
    content
    |> String.trim()
    |> String.replace(~r/\s+/, " ")
  end
end
# Register the logging processor
Plugin.register_post_processor(:logging_processor, MyApp.Plugins.LoggingProcessor)

# Configure logging (use :info or :debug for verbosity)
Logger.configure(level: :debug)

IO.puts("=== Plugin Logging Example ===\n")

# Example usage
result = %{
  "content" => " Sample document with irregular spacing ",
  "mime_type" => "application/pdf",
  "metadata" => %{"source" => "document.pdf"}
}

Logger.info("Starting extraction processing example")

case MyApp.Plugins.LoggingProcessor.process(result, %{}) do
  {:ok, processed_result} ->
    IO.puts("\nProcessing succeeded!")
    IO.inspect(processed_result, label: "Processed Result")

  {:error, reason} ->
    IO.puts("Processing failed: #{reason}")
end

# Demonstrate error handling with logging.
# The content has to be a truthy non-binary value to reach the error path:
# process/2 replaces a nil content with "" (so nil succeeds), whereas
# byte_size/1 raises for an integer, which process/2 rescues, logs and reports.
Logger.info("Testing error handling")

invalid_result = %{
  "content" => 12_345,
  "mime_type" => "application/pdf"
}

case MyApp.Plugins.LoggingProcessor.process(invalid_result, %{}) do
  {:ok, _processed_result} ->
    IO.puts("Processing succeeded")

  {:error, reason} ->
    IO.puts("Processing failed as expected: #{reason}")
end
```

View File

@@ -0,0 +1,27 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Plugin logging is not directly available in the Elixir binding. Logging must be implemented in the Rust plugin code itself using the `tracing` crate.
To add logging to a Rust plugin:
```rust
use tracing::{debug, info, warn};
// Skeleton `DocumentExtractor` implementation showing where `tracing` events go
// relative to the (elided) extraction logic.
#[async_trait]
impl DocumentExtractor for MyExtractor {
// Emits a `debug!` event before and an `info!` event after extraction,
// both tagged with the incoming `mime_type`.
async fn extract_bytes(
&self,
content: &[u8],
mime_type: &str,
_config: &ExtractionConfig,
) -> Result<ExtractionResult> {
debug!("Starting extraction for {}", mime_type);
// Extraction logic...
// NOTE(review): `result` is not defined in this snippet; presumably the
// elided extraction logic above is expected to produce it. Confirm.
info!("Extraction completed for {}", mime_type);
Ok(result)
}
}
```
The logs are emitted through Kreuzberg's tracing infrastructure and can be monitored from Elixir via the structured log output.

View File

@@ -0,0 +1,157 @@
```elixir title="Elixir"
alias Kreuzberg.Plugin
# Plugin Testing Example
# This demonstrates how to test custom plugins with various scenarios.
defmodule MyApp.Plugins.CustomJsonExtractor do
  @moduledoc """
  Post-processor that flags results whose MIME type is `application/json`.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  require Logger

  @impl true
  def name, do: "CustomJsonExtractor"

  @impl true
  def processing_stage, do: :post

  @impl true
  def version, do: "1.0.0"

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  # Adds `"is_json" => true` when the result's "mime_type" is exactly
  # "application/json"; any other value (including a missing one) passes the
  # result through unchanged.
  @impl true
  def process(result, _options) do
    case result["mime_type"] do
      "application/json" ->
        Logger.debug("Processing JSON content")
        {:ok, Map.put(result, "is_json", true)}

      _other ->
        Logger.debug("Not JSON content, skipping")
        {:ok, result}
    end
  end
end
# Test setup
defmodule MyApp.Plugins.Test do
  @moduledoc """
  Hand-rolled scenario runner for `MyApp.Plugins.CustomJsonExtractor`.

  Each scenario prints a PASS/FAIL line instead of raising.
  """

  require Logger

  alias MyApp.Plugins.CustomJsonExtractor

  # Runs the four scenarios in order, framed by banner lines.
  def run_tests do
    IO.puts("\n=== Running Plugin Tests ===\n")

    test_json_processing()
    test_non_json_processing()
    test_empty_content()
    test_missing_mime_type()

    IO.puts("\n=== All Tests Completed ===\n")
  end

  # JSON input must come back with the "is_json" flag set to true.
  defp test_json_processing do
    IO.puts("Test 1: JSON Content Processing")

    input = %{
      "content" => ~s({"message": "Hello, world!"}),
      "mime_type" => "application/json",
      "metadata" => %{}
    }

    case CustomJsonExtractor.process(input, %{}) do
      {:ok, %{"is_json" => true}} ->
        IO.puts(" PASS: JSON content marked correctly\n")

      {:ok, _unflagged} ->
        IO.puts(" FAIL: JSON flag not set\n")

      {:error, reason} ->
        IO.puts(" FAIL: #{reason}\n")
    end
  end

  # Non-JSON input must not carry a truthy "is_json" flag.
  defp test_non_json_processing do
    IO.puts("Test 2: Non-JSON Content Processing")

    input = %{
      "content" => "Plain text content",
      "mime_type" => "text/plain",
      "metadata" => %{}
    }

    case CustomJsonExtractor.process(input, %{}) do
      {:ok, processed} ->
        if processed["is_json"] do
          IO.puts(" FAIL: Non-JSON content incorrectly marked as JSON\n")
        else
          IO.puts(" PASS: Non-JSON content not marked\n")
        end

      {:error, reason} ->
        IO.puts(" FAIL: #{reason}\n")
    end
  end

  # Empty content only has to be accepted without an error.
  defp test_empty_content do
    IO.puts("Test 3: Empty Content")

    input = %{
      "content" => "",
      "mime_type" => "application/json",
      "metadata" => %{}
    }

    case CustomJsonExtractor.process(input, %{}) do
      {:ok, _processed} -> IO.puts(" PASS: Empty content handled gracefully\n")
      {:error, reason} -> IO.puts(" FAIL: #{reason}\n")
    end
  end

  # A result without a "mime_type" key only has to be accepted without an error.
  defp test_missing_mime_type do
    IO.puts("Test 4: Missing MIME Type")

    input = %{
      "content" => "Some content",
      "metadata" => %{}
    }

    case CustomJsonExtractor.process(input, %{}) do
      {:ok, _processed} -> IO.puts(" PASS: Missing MIME type handled gracefully\n")
      {:error, reason} -> IO.puts(" FAIL: #{reason}\n")
    end
  end
end
# Register the custom plugin
Plugin.register_post_processor(:custom_json_extractor, MyApp.Plugins.CustomJsonExtractor)

# Run the test suite
MyApp.Plugins.Test.run_tests()

# List all registered post-processors
{:ok, processors} = Plugin.list_post_processors()
IO.puts("Registered Post-Processors:")

Enum.each(processors, fn {name, module} ->
  # inspect/1 renders a module atom as `MyApp.Plugins.Foo`; plain interpolation
  # would print it with the internal `Elixir.` prefix.
  IO.puts(" - #{name}: #{inspect(module)}")
end)
```

View File

@@ -0,0 +1,21 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Plugin testing in Elixir is limited since custom plugins cannot be implemented in Elixir. Plugin testing should be done in Rust using `#[cfg(test)]` test modules.
When testing custom plugins in Rust:
```rust
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: extracting a small plain-text payload with the default
    /// configuration must succeed.
    #[tokio::test]
    async fn test_custom_extractor() {
        let extractor = MyExtractor;
        let config = ExtractionConfig::default();
        let payload = b"test content";

        let outcome = extractor
            .extract_bytes(payload, "text/plain", &config)
            .await;

        assert!(outcome.is_ok());
    }
}
```
From Elixir, you can still test extraction results using built-in functions such as `Kreuzberg.extract_bytes_async/3` and `Kreuzberg.extract_file_async/3`.

View File

@@ -0,0 +1,32 @@
```elixir title="Elixir"
defmodule MyApp.MinLengthValidator do
  @moduledoc """
  Validator that rejects results whose content is shorter than 50 characters.
  """

  @behaviour Kreuzberg.Plugin.Validator

  def name do
    "min_length_validator"
  end

  def version do
    "1.0.0"
  end

  def priority do
    10
  end

  def initialize do
    :ok
  end

  def shutdown do
    :ok
  end

  # Every result is validated.
  def should_validate?(_result) do
    true
  end

  # Returns :ok when `result.content` has at least 50 characters, otherwise an
  # error tuple reporting the actual character count.
  def validate(result) do
    case String.length(result.content) do
      char_count when char_count >= 50 -> :ok
      char_count -> {:error, "Content too short: #{char_count} chars"}
    end
  end
end
# Make the validator known to Kreuzberg
Kreuzberg.Plugin.register_validator(MyApp.MinLengthValidator)

# Extract a document (no explicit configuration) and report how long its content is
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil)

content_length = String.length(result.content)
IO.puts("Content length: #{content_length}")
```

View File

@@ -0,0 +1,10 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Custom validator implementation is not available in the Elixir binding. Validators must be implemented in Rust using the `Validator` trait.
To implement a custom validator in Rust and use it from Elixir:
1. Implement the `Plugin` and `Validator` traits in Rust
2. Register the validator in the Rust core
3. Call extraction functions from Elixir, which will automatically apply registered validators
The validator will run after extraction completes and can reject results that don't meet validation criteria.

View File

@@ -0,0 +1,75 @@
```elixir title="Elixir"
alias Kreuzberg.Plugin
# Quality Score Validator Plugin
# This validator ensures extracted content meets a minimum quality threshold.
# It checks the quality_score metadata field and rejects low-quality extractions.
defmodule MyApp.Plugins.QualityScoreValidator do
  @moduledoc """
  Validator that rejects extraction results whose `quality_score` is below
  the minimum threshold of 0.5.
  """

  @behaviour Kreuzberg.Plugin.Validator

  require Logger

  # Lowest quality score that still passes validation.
  @min_quality_score 0.5

  @impl true
  def name do
    "QualityScoreValidator"
  end

  @impl true
  def initialize do
    :ok
  end

  @impl true
  def shutdown do
    :ok
  end

  @impl true
  def priority do
    100
  end

  @impl true
  def should_validate?(_result) do
    true
  end

  @doc """
  Checks the `quality_score` field of `result`.

  Returns `:ok` when the score is a number of at least 0.5. Returns
  `{:error, message}` when the score is too low, and also when it is missing
  or not a number, instead of raising in those cases.
  """
  @impl true
  def validate(result) do
    # Map.get/2 works for structs and plain maps alike and yields nil rather
    # than raising when the field is absent.
    case Map.get(result, :quality_score) do
      score when is_number(score) and score >= @min_quality_score ->
        :ok

      score when is_number(score) ->
        # `score / 1` turns an integer score into a float; Float.round/2 only
        # accepts floats.
        {:error,
         "Quality score too low: #{Float.round(score / 1, 2)}. Minimum required: #{@min_quality_score}"}

      other ->
        {:error,
         "Quality score missing or invalid: #{inspect(other)}. Minimum required: #{@min_quality_score}"}
    end
  end
end
# Register the quality validator plugin
Plugin.register_validator(MyApp.Plugins.QualityScoreValidator)

# Example usage with extraction
# Note: In real usage, result will be an ExtractionResult struct, not a map.
# validate/1 reads the score from the `quality_score` field of its argument,
# so this stand-in map exposes that field as a top-level atom key.
result = %{
  content: "Extracted document content",
  quality_score: 0.85,
  metadata: %{
    "pages" => 1
  }
}

case MyApp.Plugins.QualityScoreValidator.validate(result) do
  :ok ->
    IO.puts("Quality validation passed: #{result.quality_score}")

  {:error, reason} ->
    IO.puts("Quality validation failed: #{reason}")
end

# List all registered validators
{:ok, validators} = Plugin.list_validators()
IO.inspect(validators, label: "Registered Validators")
```

View File

@@ -0,0 +1,48 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Quality score validator implementation is not available in the Elixir binding. Custom validators must be implemented in Rust using the `Validator` trait.
To implement a quality score validator in Rust:
```rust
use kreuzberg::plugins::{Plugin, Validator};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
use async_trait::async_trait;
/// Validator configured with the lowest quality score it accepts.
struct QualityScoreValidator {
    min_score: f32,
}

/// Plugin identity and lifecycle hooks; both hooks are no-ops.
impl Plugin for QualityScoreValidator {
    fn name(&self) -> &str {
        "quality-validator"
    }

    fn version(&self) -> String {
        String::from("1.0.0")
    }

    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[async_trait]
impl Validator for QualityScoreValidator {
    /// Scores `result` with `calculate_quality_score` and fails with a
    /// validation error when the score is below `self.min_score`.
    async fn validate(
        &self,
        result: &ExtractionResult,
        _config: &ExtractionConfig,
    ) -> Result<()> {
        let score = calculate_quality_score(result);

        if score < self.min_score {
            let message = format!("Quality score too low: {} < {}", score, self.min_score);
            Err(KreuzbergError::validation(message))
        } else {
            Ok(())
        }
    }

    fn priority(&self) -> i32 {
        50
    }
}
/// Placeholder scoring function for this example: it does not inspect
/// `result` and always returns `0.8`. Replace the body with real scoring logic.
fn calculate_quality_score(result: &ExtractionResult) -> f32 {
// Implement quality scoring logic
0.8
}
```
Register this validator in Rust and Elixir will use it automatically.

View File

@@ -0,0 +1,45 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Stateful plugin implementation is not available in the Elixir binding. Stateful plugins must be implemented in Rust using `Arc<Mutex<>>` or `Arc<RwLock<>>` for thread-safe state management.
To implement a stateful plugin in Rust:
```rust
use kreuzberg::plugins::{Plugin, PostProcessor};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
use std::sync::{Arc, Mutex};
use async_trait::async_trait;
/// Post-processor whose mutable state sits behind an `Arc<Mutex<_>>`.
struct StatefulProcessor {
    state: Arc<Mutex<ProcessorState>>,
}

/// State guarded by the processor's mutex.
struct ProcessorState {
    extraction_count: usize,
}

impl Plugin for StatefulProcessor {
    fn name(&self) -> &str {
        "stateful-processor"
    }

    fn version(&self) -> String {
        String::from("1.0.0")
    }

    /// Resets the extraction counter to zero.
    fn initialize(&self) -> Result<()> {
        // `unwrap` panics if the mutex has been poisoned.
        self.state.lock().unwrap().extraction_count = 0;
        Ok(())
    }

    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[async_trait]
impl PostProcessor for StatefulProcessor {
    /// Increments the extraction counter; the result itself is left untouched.
    async fn process(
        &self,
        _result: &mut ExtractionResult,
        _config: &ExtractionConfig,
    ) -> Result<()> {
        // The guard is a temporary, so the lock is released at the end of
        // this statement. `unwrap` panics if the mutex has been poisoned.
        self.state.lock().unwrap().extraction_count += 1;
        Ok(())
    }
}
```
Register this in Rust and use it from Elixir.

View File

@@ -0,0 +1,20 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Plugin unregistration is not available in the Elixir binding. Plugin unregistration must be done in Rust using the registry APIs.
To unregister a specific plugin in Rust:
```rust
use kreuzberg::plugins::registry::get_document_extractor_registry;
let registry = get_document_extractor_registry();
registry.remove("custom-json-extractor")?;
```
In Elixir, you can only clear all plugins of a specific type using:
- `Kreuzberg.clear_document_extractors()`
- `Kreuzberg.clear_post_processors()`
- `Kreuzberg.clear_ocr_backends()`
- `Kreuzberg.clear_validators()`
To remove a single plugin, you must do so from the Rust core before Elixir starts using it.

View File

@@ -0,0 +1,77 @@
```elixir title="Elixir"
alias Kreuzberg.Plugin
# Word Count Post-Processor Plugin
# This post-processor automatically counts words in extracted content
# and adds the word count to the metadata.
defmodule MyApp.Plugins.WordCountProcessor do
  @moduledoc """
  Post-processor that stores the number of words in the extracted content
  under the `"word_count"` metadata key.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  require Logger

  @impl true
  def name, do: "WordCountProcessor"

  @impl true
  def processing_stage, do: :post

  @impl true
  def version, do: "1.0.0"

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  # Counts the words in result["content"] (a missing or nil content counts as
  # empty) and writes the total into result["metadata"]["word_count"],
  # creating the metadata map when the result has none.
  @impl true
  def process(result, _options) do
    total = count_words(result["content"] || "")

    with_count =
      Map.update(result, "metadata", %{"word_count" => total}, fn metadata ->
        Map.put(metadata, "word_count", total)
      end)

    {:ok, with_count}
  end

  # A word is any non-empty run of characters between whitespace.
  defp count_words(text) do
    words = String.split(text, ~r/\s+/, trim: true)
    length(words)
  end
end
# Make the word count post-processor known to Kreuzberg
Plugin.register_post_processor(:word_count_processor, MyApp.Plugins.WordCountProcessor)

# Sample input: a string-keyed result map with some existing metadata
result = %{
  "content" => "The quick brown fox jumps over the lazy dog. This is a sample document with multiple words.",
  "metadata" => %{"source" => "document.pdf", "pages" => 1}
}

# Run the processor directly and report what it produced
outcome = MyApp.Plugins.WordCountProcessor.process(result, %{})

case outcome do
  {:ok, processed_result} ->
    metadata = processed_result["metadata"]
    IO.puts("Word count added: #{metadata["word_count"]} words")
    IO.inspect(processed_result, label: "Processed Result")

  {:error, reason} ->
    IO.puts("Processing failed: #{reason}")
end

# Show every post-processor that is currently registered
{:ok, processors} = Plugin.list_post_processors()
IO.inspect(processors, label: "Registered Post-Processors")
```

View File

@@ -0,0 +1,46 @@
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
Custom post-processor implementation is not available in the Elixir binding. Post-processors must be implemented in Rust using the `PostProcessor` trait.
To implement a word count processor in Rust:
```rust
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
use async_trait::async_trait;
/// Stateless post-processor that counts the words of extracted content.
struct WordCountProcessor;

/// Plugin identity and lifecycle hooks; both hooks are no-ops.
impl Plugin for WordCountProcessor {
    fn name(&self) -> &str {
        "word-count"
    }

    fn version(&self) -> String {
        String::from("1.0.0")
    }

    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}
#[async_trait]
impl PostProcessor for WordCountProcessor {
    /// Counts the whitespace-separated words in `result.content`.
    /// In this example the count is computed but not yet stored anywhere.
    async fn process(
        &self,
        result: &mut ExtractionResult,
        _config: &ExtractionConfig,
    ) -> Result<()> {
        let word_count = result.content.split_whitespace().count();
        // Store word count in metadata or processing warnings

        Ok(())
    }

    /// This processor declares the early processing stage.
    fn processing_stage(&self) -> ProcessingStage {
        ProcessingStage::Early
    }

    /// Only results with non-empty content are processed.
    fn should_process(
        &self,
        result: &ExtractionResult,
        _config: &ExtractionConfig,
    ) -> bool {
        let has_content = !result.content.is_empty();
        has_content
    }
}
```
Register this processor in Rust and it will be applied during extraction in Elixir.