This commit is contained in:
110
docs/snippets/elixir/plugins/11_stateful_plugin.exs
Normal file
110
docs/snippets/elixir/plugins/11_stateful_plugin.exs
Normal file
@@ -0,0 +1,110 @@
|
||||
```elixir title="Elixir"
|
||||
# Define a stateful post-processor plugin using an Agent
|
||||
defmodule MyApp.Plugins.StatefulTextProcessor do
  @moduledoc """
  A stateful post-processor that maintains a count of processed documents.

  Demonstrates how to use an Agent to store state across multiple processing calls.
  The Agent is registered under this module's name and holds
  `%{count: integer, errors: integer}`: `count` is bumped on every `process/2`
  call, `errors` only when the content could not be normalized.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  @impl true
  def name, do: "stateful_text_processor"

  @impl true
  def version, do: "1.0.0"

  @impl true
  def processing_stage, do: :middle

  @doc """
  Starts the Agent that backs this plugin.

  Returns `:ok` when the Agent was started or is already running, otherwise the
  error value returned by `Agent.start_link/2`.
  """
  @impl true
  def initialize do
    case Agent.start_link(fn -> %{count: 0, errors: 0} end, name: __MODULE__) do
      {:ok, _pid} -> :ok
      {:error, {:already_started, _pid}} -> :ok
      error -> error
    end
  end

  @doc """
  Stops the backing Agent. Returns `:ok` whether or not it was running.
  """
  @impl true
  def shutdown do
    # Agent.stop/1 exits when nothing is registered under the name, so look the
    # process up first to keep shutdown safe to call twice.
    case Process.whereis(__MODULE__) do
      nil -> :ok
      pid -> Agent.stop(pid)
    end
  end

  @doc """
  Normalizes the whitespace of the result's content and tags the result with
  processing metadata.

  The content is read from the `"content"` key (falling back to `:content`).
  On success the result is returned with `"content"`, `"processed_count"` and
  `"processing_timestamp"` merged in. When the content is missing or not a
  string, the error counter is bumped and `{:error, message}` is returned.
  """
  @impl true
  def process(result, _config) do
    # Increment and read in a single Agent call so the reported count is the
    # one produced by this invocation.
    processed = increment(:count)

    case normalize_content(fetch_content(result)) do
      {:ok, normalized} ->
        Map.merge(result, %{
          "content" => normalized,
          "processed_count" => processed,
          "processing_timestamp" => DateTime.utc_now() |> DateTime.to_iso8601()
        })

      {:error, reason} ->
        increment(:errors)
        {:error, "Failed to normalize content: #{reason}"}
    end
  end

  @doc """
  Returns the Agent's current state map (`:count` and `:errors`).
  """
  def stats, do: Agent.get(__MODULE__, & &1)

  # Bumps the given counter by one and returns its new value.
  defp increment(key) do
    Agent.get_and_update(__MODULE__, fn state ->
      updated = Map.update!(state, key, &(&1 + 1))
      {Map.fetch!(updated, key), updated}
    end)
  end

  # Results in this example are string-keyed maps; atom keys are accepted too
  # so that callers which previously relied on `result.content` keep working.
  defp fetch_content(%{"content" => content}), do: content
  defp fetch_content(%{content: content}), do: content
  defp fetch_content(_), do: nil

  # Trims the text and collapses every whitespace run into a single space.
  defp normalize_content(content) when is_binary(content) do
    normalized =
      content
      |> String.trim()
      |> String.replace(~r/\s+/, " ")

    {:ok, normalized}
  end

  defp normalize_content(_), do: {:error, "Content is not a string"}
end
|
||||
|
||||
# Wire the plugin into the registry and start its backing Agent
alias MyApp.Plugins.StatefulTextProcessor, as: Processor

:ok = Kreuzberg.Plugin.register_post_processor(:stateful, Processor)
:ok = Processor.initialize()

# Run two documents through the same plugin
first_output = Processor.process(%{"content" => " Example text with spaces "}, nil)
IO.inspect(first_output, label: "First processing")

second_output =
  Processor.process(%{"content" => "Another document\nwith\tmultiple\nlines"}, nil)

IO.inspect(second_output, label: "Second processing")

# The counter is kept in the Agent, so it carries over between calls
IO.puts("Documents processed: #{second_output["processed_count"]}")

# Confirm the registry lists the plugin
{:ok, registered} = Kreuzberg.Plugin.list_post_processors()
IO.inspect(registered, label: "Registered processors")

# Tear down: stop the Agent, then drop the registration
:ok = Processor.shutdown()
:ok = Kreuzberg.Plugin.unregister_post_processor(:stateful)
|
||||
```
|
||||
189
docs/snippets/elixir/plugins/12_clear_plugins.exs
Normal file
189
docs/snippets/elixir/plugins/12_clear_plugins.exs
Normal file
@@ -0,0 +1,189 @@
|
||||
```elixir title="Elixir"
|
||||
# Clear all plugins from the registry - useful for testing or resetting state
|
||||
|
||||
# Define multiple example plugins for demonstration
|
||||
defmodule MyApp.Plugins.CleanupExample do
|
||||
@moduledoc """
|
||||
Example plugins to demonstrate clearing the registry.
|
||||
"""
|
||||
|
||||
# Simple post-processor
|
||||
defmodule TextCleaner do
  @moduledoc """
  Minimal post-processor used by the registry demo: it flags every result it
  receives with `"cleaned" => true`.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  @impl true
  def name do
    "text_cleaner"
  end

  @impl true
  def version do
    "1.0.0"
  end

  @impl true
  def processing_stage do
    :early
  end

  @impl true
  def initialize do
    :ok
  end

  @impl true
  def shutdown do
    :ok
  end

  # Adds (or overwrites) the "cleaned" marker; everything else is passed through.
  @impl true
  def process(result, _config), do: Map.put(result, "cleaned", true)
end
|
||||
|
||||
# Simple validator
|
||||
defmodule ContentValidator do
  @moduledoc """
  Example validator that rejects results without non-empty string content.
  """

  @behaviour Kreuzberg.Plugin.Validator

  @impl true
  def name, do: "content_validator"

  @impl true
  def version, do: "1.0.0"

  @impl true
  def priority, do: 50

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Returns `true` only for results whose `"content"` is a non-empty binary.
  """
  @impl true
  def should_validate?(%{"content" => content}) do
    is_binary(content) and byte_size(content) > 0
  end

  def should_validate?(_), do: false

  @doc """
  Returns `:ok` for non-empty string content, otherwise `{:error, message}`.

  Matching on the empty binary avoids the O(n) grapheme walk of
  `String.length/1`, and the `is_binary/1` guard keeps a non-string value from
  crashing the validator when `validate/1` is called directly.
  """
  @impl true
  def validate(%{"content" => ""}), do: {:error, "Content cannot be empty"}
  def validate(%{"content" => content}) when is_binary(content), do: :ok
  def validate(%{"content" => _other}), do: {:error, "Content must be a string"}
  def validate(_), do: {:error, "Missing content field"}
end
|
||||
|
||||
# Simple OCR backend
|
||||
defmodule MockOCRBackend do
  @moduledoc """
  Stand-in OCR backend for the registry demo. It never reads its input; it
  returns canned text for a supported language and an error otherwise.
  """

  @behaviour Kreuzberg.Plugin.OcrBackend

  @impl true
  def name do
    "mock_ocr"
  end

  @impl true
  def version do
    "1.0.0"
  end

  @impl true
  def initialize do
    :ok
  end

  @impl true
  def shutdown do
    :ok
  end

  @impl true
  def supported_languages do
    ["eng", "deu", "fra"]
  end

  @impl true
  def process_image(_image_data, language), do: canned(language, "OCR extracted text")

  @impl true
  def process_file(_path, language), do: canned(language, "OCR extracted file text")

  # Shared language gate for both entry points.
  defp canned(language, text) do
    case Enum.member?(supported_languages(), language) do
      true -> {:ok, text}
      false -> {:error, "Unsupported language: #{language}"}
    end
  end
end
|
||||
end
|
||||
|
||||
alias Kreuzberg.Plugin
alias MyApp.Plugins.CleanupExample.{ContentValidator, MockOCRBackend, TextCleaner}

# Unwraps a `{:ok, list}` listing result and returns the number of entries
size_of = fn listing ->
  {:ok, entries} = listing
  length(entries)
end

# Populate each of the three registries with one plugin
IO.puts("=== Registering Plugins ===")
:ok = Plugin.register_post_processor(:cleaner, TextCleaner)
:ok = Plugin.register_validator(ContentValidator)
:ok = Plugin.register_ocr_backend(MockOCRBackend)

# Snapshot the registry sizes before anything is cleared
processors_before = size_of.(Plugin.list_post_processors())
validators_before = size_of.(Plugin.list_validators())
backends_before = size_of.(Plugin.list_ocr_backends())

IO.puts("Before clearing:")
IO.puts(" Post-processors: #{processors_before}")
IO.puts(" Validators: #{validators_before}")
IO.puts(" OCR backends: #{backends_before}")

# Step 1: clear only the post-processors
IO.puts("\n=== Clearing Post-Processors ===")
:ok = Plugin.clear_post_processors()
IO.puts("Post-processors after clearing: #{size_of.(Plugin.list_post_processors())}")

# The other two registries are listed again to show their sizes after step 1
validators_left = size_of.(Plugin.list_validators())
backends_left = size_of.(Plugin.list_ocr_backends())
IO.puts("Validators still registered: #{validators_left}")
IO.puts("OCR backends still registered: #{backends_left}")

# Step 2: clear the validators
IO.puts("\n=== Clearing Validators ===")
:ok = Plugin.clear_validators()
IO.puts("Validators after clearing: #{size_of.(Plugin.list_validators())}")

# List the OCR backends again to show their count after step 2
IO.puts("OCR backends still registered: #{size_of.(Plugin.list_ocr_backends())}")

# Step 3: clear the OCR backends
IO.puts("\n=== Clearing OCR Backends ===")
:ok = Plugin.clear_ocr_backends()
IO.puts("OCR backends after clearing: #{size_of.(Plugin.list_ocr_backends())}")

# Final check across all three registries
IO.puts("\n=== Final State (All Cleared) ===")
processors_final = size_of.(Plugin.list_post_processors())
validators_final = size_of.(Plugin.list_validators())
backends_final = size_of.(Plugin.list_ocr_backends())

IO.puts("Post-processors: #{processors_final}")
IO.puts("Validators: #{validators_final}")
IO.puts("OCR backends: #{backends_final}")

# Typical use: reset the registries around a test
IO.puts("\n=== Common Use Case: Testing Setup/Teardown ===")

# Setup
Plugin.register_post_processor(:test_proc, TextCleaner)
Plugin.register_validator(ContentValidator)

# Exercise
processors_in_test = size_of.(Plugin.list_post_processors())
validators_in_test = size_of.(Plugin.list_validators())

IO.puts(
  "Test setup complete: #{processors_in_test} processors, #{validators_in_test} validators"
)

# Teardown: wipe every registry
Plugin.clear_post_processors()
Plugin.clear_validators()
Plugin.clear_ocr_backends()

# Confirm the next test starts from an empty registry
processors_clean = size_of.(Plugin.list_post_processors())
validators_clean = size_of.(Plugin.list_validators())
backends_clean = size_of.(Plugin.list_ocr_backends())

IO.puts(
  "Test teardown complete: #{processors_clean} processors, #{validators_clean} validators, #{backends_clean} backends"
)
|
||||
```
|
||||
358
docs/snippets/elixir/plugins/13_unregister_plugins.exs
Normal file
358
docs/snippets/elixir/plugins/13_unregister_plugins.exs
Normal file
@@ -0,0 +1,358 @@
|
||||
```elixir title="Elixir"
|
||||
# Unregister individual plugins from the registry
|
||||
|
||||
defmodule MyApp.Plugins.UnregisterExample do
|
||||
@moduledoc """
|
||||
Example plugins to demonstrate selective unregistration.
|
||||
"""
|
||||
|
||||
# Email processor post-processor
|
||||
defmodule EmailPostProcessor do
  @moduledoc """
  Post-processor that collects the e-mail addresses found in a result's content.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  @impl true
  def name, do: "email_processor"

  @impl true
  def version, do: "1.0.0"

  @impl true
  def processing_stage, do: :middle

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Adds an `"extracted_emails"` key holding the unique addresses found in
  `result["content"]`, in order of first appearance. A missing `"content"` key
  is treated as empty text.
  """
  @impl true
  def process(result, _config) do
    content = Map.get(result, "content", "")

    # The String module has no scan/2; regex scanning lives in Regex.scan/2,
    # which returns one list per match (here a single-element list, since the
    # pattern has no capture groups).
    emails =
      ~r/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/
      |> Regex.scan(content)
      |> Enum.map(&List.first/1)
      |> Enum.uniq()

    Map.put(result, "extracted_emails", emails)
  end
end
|
||||
|
||||
# Phone number processor post-processor
|
||||
defmodule PhonePostProcessor do
  @moduledoc """
  Post-processor that collects 10-digit phone numbers found in a result's content.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  @impl true
  def name, do: "phone_processor"

  @impl true
  def version, do: "1.0.0"

  @impl true
  def processing_stage, do: :middle

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Adds an `"extracted_phones"` key holding the unique phone numbers found in
  `result["content"]`, in order of first appearance. Numbers are 3-3-4 digit
  groups optionally separated by `-` or `.`. A missing `"content"` key is
  treated as empty text.
  """
  @impl true
  def process(result, _config) do
    content = Map.get(result, "content", "")

    # The String module has no scan/2; regex scanning lives in Regex.scan/2.
    phones =
      ~r/\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/
      |> Regex.scan(content)
      |> Enum.map(&List.first/1)
      |> Enum.uniq()

    Map.put(result, "extracted_phones", phones)
  end
end
|
||||
|
||||
# URL processor post-processor
|
||||
defmodule URLPostProcessor do
  @moduledoc """
  Post-processor that collects the http/https URLs found in a result's content.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  @impl true
  def name, do: "url_processor"

  @impl true
  def version, do: "1.0.0"

  @impl true
  def processing_stage, do: :middle

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Adds an `"extracted_urls"` key holding the unique URLs found in
  `result["content"]`, in order of first appearance. A URL runs from
  `http://` or `https://` to the next whitespace. A missing `"content"` key is
  treated as empty text.
  """
  @impl true
  def process(result, _config) do
    content = Map.get(result, "content", "")

    # The String module has no scan/2; regex scanning lives in Regex.scan/2.
    urls =
      ~r/https?:\/\/\S+/
      |> Regex.scan(content)
      |> Enum.map(&List.first/1)
      |> Enum.uniq()

    Map.put(result, "extracted_urls", urls)
  end
end
|
||||
|
||||
# Strict length validator
|
||||
defmodule StrictLengthValidator do
  @moduledoc """
  Validator that accepts content only when its byte size lies between the
  minimum (10) and maximum (10000) limits, inclusive.
  """

  @behaviour Kreuzberg.Plugin.Validator

  @min_length 10
  @max_length 10000

  @impl true
  def name do
    "strict_length_validator"
  end

  @impl true
  def version do
    "1.0.0"
  end

  @impl true
  def priority do
    100
  end

  @impl true
  def initialize do
    :ok
  end

  @impl true
  def shutdown do
    :ok
  end

  # Only results carrying binary content are candidates for validation.
  @impl true
  def should_validate?(%{"content" => content}), do: is_binary(content)
  def should_validate?(_), do: false

  # Limits are checked against the byte size, not the grapheme count.
  @impl true
  def validate(%{"content" => content}) do
    case byte_size(content) do
      size when size < @min_length ->
        {:error, "Content too short (minimum #{@min_length} bytes)"}

      size when size > @max_length ->
        {:error, "Content too long (maximum #{@max_length} bytes)"}

      _within_limits ->
        :ok
    end
  end

  def validate(_), do: {:error, "Missing content field"}
end
|
||||
|
||||
# Encoding validator
|
||||
defmodule EncodingValidator do
  @moduledoc """
  Validator that accepts content only when it is valid UTF-8.
  """

  @behaviour Kreuzberg.Plugin.Validator

  @impl true
  def name do
    "encoding_validator"
  end

  @impl true
  def version do
    "1.0.0"
  end

  @impl true
  def priority do
    50
  end

  @impl true
  def initialize do
    :ok
  end

  @impl true
  def shutdown do
    :ok
  end

  # Only results carrying binary content are candidates for validation.
  @impl true
  def should_validate?(%{"content" => content}), do: is_binary(content)
  def should_validate?(_), do: false

  @impl true
  def validate(%{"content" => content}) do
    case String.valid?(content) do
      true -> :ok
      false -> {:error, "Content contains invalid UTF-8 encoding"}
    end
  end

  def validate(_), do: {:error, "Missing content field"}
end
|
||||
|
||||
# Basic OCR backend
|
||||
defmodule BasicOCR do
  @moduledoc """
  Example OCR backend with a two-language list. It never reads its input; it
  returns canned text for a supported language and an error otherwise.
  """

  @behaviour Kreuzberg.Plugin.OcrBackend

  @impl true
  def name do
    "basic_ocr"
  end

  @impl true
  def version do
    "1.0.0"
  end

  @impl true
  def initialize do
    :ok
  end

  @impl true
  def shutdown do
    :ok
  end

  @impl true
  def supported_languages do
    ["eng", "fra"]
  end

  @impl true
  def process_image(_image_data, language) do
    case Enum.member?(supported_languages(), language) do
      true -> {:ok, "Extracted text"}
      false -> {:error, "Unsupported language"}
    end
  end

  @impl true
  def process_file(_path, language) do
    case Enum.member?(supported_languages(), language) do
      true -> {:ok, "Extracted file text"}
      false -> {:error, "Unsupported language"}
    end
  end
end
|
||||
|
||||
# Advanced OCR backend
|
||||
defmodule AdvancedOCR do
  @moduledoc """
  Example OCR backend with a wider language list. It never reads its input; it
  returns canned text for a supported language and an error otherwise.
  """

  @behaviour Kreuzberg.Plugin.OcrBackend

  @impl true
  def name do
    "advanced_ocr"
  end

  @impl true
  def version do
    "2.0.0"
  end

  @impl true
  def initialize do
    :ok
  end

  @impl true
  def shutdown do
    :ok
  end

  @impl true
  def supported_languages, do: ~w(eng fra deu spa ita jpn chi chi_tra kor)

  @impl true
  def process_image(_image_data, language), do: canned(language, "Advanced extracted text")

  @impl true
  def process_file(_path, language), do: canned(language, "Advanced extracted file text")

  # Shared language gate for both entry points.
  defp canned(language, text) do
    case Enum.member?(supported_languages(), language) do
      true -> {:ok, text}
      false -> {:error, "Unsupported language"}
    end
  end
end
|
||||
end
|
||||
|
||||
IO.puts("=== Plugin Unregistration Example ===\n")

# Register multiple plugins of each type
IO.puts("Registering plugins...")
:ok = Kreuzberg.Plugin.register_post_processor(:emails, MyApp.Plugins.UnregisterExample.EmailPostProcessor)
:ok = Kreuzberg.Plugin.register_post_processor(:phones, MyApp.Plugins.UnregisterExample.PhonePostProcessor)
:ok = Kreuzberg.Plugin.register_post_processor(:urls, MyApp.Plugins.UnregisterExample.URLPostProcessor)

:ok = Kreuzberg.Plugin.register_validator(MyApp.Plugins.UnregisterExample.StrictLengthValidator)
:ok = Kreuzberg.Plugin.register_validator(MyApp.Plugins.UnregisterExample.EncodingValidator)

:ok = Kreuzberg.Plugin.register_ocr_backend(MyApp.Plugins.UnregisterExample.BasicOCR)
:ok = Kreuzberg.Plugin.register_ocr_backend(MyApp.Plugins.UnregisterExample.AdvancedOCR)

# List all registered plugins
{:ok, procs} = Kreuzberg.Plugin.list_post_processors()
{:ok, vals} = Kreuzberg.Plugin.list_validators()
{:ok, backends} = Kreuzberg.Plugin.list_ocr_backends()

IO.puts("Initial registration:")
IO.puts(" Post-processors: #{length(procs)} - #{inspect(Enum.map(procs, &elem(&1, 0)))}")

# elem/2 only accepts an integer index into a tuple, so the validator and
# backend entries are printed with inspect/1, which works whatever their shape.
IO.puts(" Validators: #{length(vals)} - #{inspect(vals)}")
IO.puts(" OCR backends: #{length(backends)} - #{inspect(backends)}\n")

# Unregister individual post-processor
IO.puts("Unregistering post-processor ':phones'...")
:ok = Kreuzberg.Plugin.unregister_post_processor(:phones)
{:ok, procs_after1} = Kreuzberg.Plugin.list_post_processors()
IO.puts("Post-processors: #{length(procs_after1)} - #{inspect(Enum.map(procs_after1, &elem(&1, 0)))}\n")

# Unregister another post-processor
IO.puts("Unregistering post-processor ':urls'...")
:ok = Kreuzberg.Plugin.unregister_post_processor(:urls)
{:ok, procs_after2} = Kreuzberg.Plugin.list_post_processors()
IO.puts("Post-processors: #{length(procs_after2)} - #{inspect(Enum.map(procs_after2, &elem(&1, 0)))}\n")

# Unregister a validator
IO.puts("Unregistering validator 'EncodingValidator'...")
:ok = Kreuzberg.Plugin.unregister_validator(MyApp.Plugins.UnregisterExample.EncodingValidator)
{:ok, vals_after} = Kreuzberg.Plugin.list_validators()
IO.puts("Validators: #{length(vals_after)}\n")

# Unregister an OCR backend
IO.puts("Unregistering OCR backend 'BasicOCR'...")
:ok = Kreuzberg.Plugin.unregister_ocr_backend(MyApp.Plugins.UnregisterExample.BasicOCR)
{:ok, backends_after} = Kreuzberg.Plugin.list_ocr_backends()
IO.puts("OCR backends: #{length(backends_after)}\n")

# Idempotent unregistration - unregistering non-existent plugin
IO.puts("Unregistering already-unregistered plugin ':phones' (idempotent)...")
:ok = Kreuzberg.Plugin.unregister_post_processor(:phones)
IO.puts("Still returns :ok\n")

# Final state
IO.puts("=== Final State ===")
{:ok, final_procs} = Kreuzberg.Plugin.list_post_processors()
{:ok, final_vals} = Kreuzberg.Plugin.list_validators()
{:ok, final_backends} = Kreuzberg.Plugin.list_ocr_backends()

IO.puts("Remaining post-processors: #{length(final_procs)}")
IO.puts("Remaining validators: #{length(final_vals)}")
IO.puts("Remaining OCR backends: #{length(final_backends)}\n")

# Cleanup - unregister remaining plugins
IO.puts("=== Cleanup ===")
:ok = Kreuzberg.Plugin.unregister_post_processor(:emails)
:ok = Kreuzberg.Plugin.unregister_validator(MyApp.Plugins.UnregisterExample.StrictLengthValidator)
:ok = Kreuzberg.Plugin.unregister_ocr_backend(MyApp.Plugins.UnregisterExample.AdvancedOCR)

{:ok, final_clean_procs} = Kreuzberg.Plugin.list_post_processors()
{:ok, final_clean_vals} = Kreuzberg.Plugin.list_validators()
{:ok, final_clean_backends} = Kreuzberg.Plugin.list_ocr_backends()

IO.puts("After cleanup:")
IO.puts(" Post-processors: #{length(final_clean_procs)}")
IO.puts(" Validators: #{length(final_clean_vals)}")
IO.puts(" OCR backends: #{length(final_clean_backends)}")
|
||||
```
|
||||
15
docs/snippets/elixir/plugins/clear_plugins.md
Normal file
15
docs/snippets/elixir/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```elixir title="Elixir"
|
||||
# Clear the global registries in turn: document extractors, OCR backends,
# post-processors, then validators. Each call must return an `{:ok, _}` tuple.
clear_functions = [
  &Kreuzberg.clear_document_extractors/0,
  &Kreuzberg.clear_ocr_backends/0,
  &Kreuzberg.clear_post_processors/0,
  &Kreuzberg.clear_validators/0
]

Enum.each(clear_functions, fn clear ->
  {:ok, _} = clear.()
end)

IO.puts("All plugins cleared")
|
||||
```
|
||||
10
docs/snippets/elixir/plugins/embedding_backend.md
Normal file
10
docs/snippets/elixir/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,10 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Embedding backend registration is not available in the Elixir binding. Custom embedding backends must be implemented in Rust using the `EmbeddingBackend` trait and registered in the Rust core before being used by Elixir.
|
||||
|
||||
To use custom embeddings in Elixir:
|
||||
|
||||
1. Implement the embedding backend in Rust (in `crates/kreuzberg/src/plugins/embedding.rs` or a separate Rust crate)
|
||||
2. Register the backend in the Rust core initialization
|
||||
3. Call the embeddings functions from Elixir with the appropriate config
|
||||
|
||||
See the Rust plugin documentation for implementing custom `EmbeddingBackend` plugins.
|
||||
61
docs/snippets/elixir/plugins/extractor_registration.exs
Normal file
61
docs/snippets/elixir/plugins/extractor_registration.exs
Normal file
@@ -0,0 +1,61 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule MyApp.MetadataEnricher do
  @moduledoc """
  Post-processor that stamps each result's metadata with the time it was processed.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  @impl true
  def name, do: "metadata_enricher"

  @impl true
  def version, do: "1.0.0"

  # NOTE(review): :middle mirrors the other post-processor examples; pick a
  # different stage if enrichment has to run earlier or later.
  @impl true
  def processing_stage, do: :middle

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Returns `{:ok, result}` with an `"enriched_at"` timestamp (UTC `DateTime`)
  added to `result.metadata`. A `nil` metadata value is replaced by a new map.
  """
  @impl true
  def process(result, _config) do
    enriched_metadata =
      Map.put(result.metadata || %{}, "enriched_at", DateTime.utc_now())

    {:ok, %{result | metadata: enriched_metadata}}
  end
end
|
||||
|
||||
defmodule MyApp.LinkExtractor do
  @moduledoc """
  Post-processor that records the http/https links found in a result's content.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  @impl true
  def name, do: "link_extractor"

  @impl true
  def version, do: "1.0.0"

  # NOTE(review): :middle mirrors the other post-processor examples; pick a
  # different stage if link extraction has to run earlier or later.
  @impl true
  def processing_stage, do: :middle

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Returns `{:ok, result}` with the unique links from `result.content` stored
  under the `"links"` key of `result.metadata`. A `nil` metadata value is
  replaced by a new map.
  """
  @impl true
  def process(result, _config) do
    metadata = Map.put(result.metadata || %{}, "links", extract_links(result.content))
    {:ok, %{result | metadata: metadata}}
  end

  # A link runs from "http://" or "https://" to the next whitespace.
  defp extract_links(content) do
    ~r/https?:\/\/\S+/
    |> Regex.scan(content)
    |> Enum.map(&List.first/1)
    |> Enum.uniq()
  end
end
|
||||
|
||||
defmodule MyApp.QualityValidator do
  @moduledoc """
  Validator that rejects results whose content is 100 characters or shorter.
  """

  @behaviour Kreuzberg.Plugin.Validator

  @impl true
  def name, do: "quality_validator"

  @impl true
  def version, do: "1.0.0"

  @impl true
  def priority, do: 5

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @impl true
  def should_validate?(_result), do: true

  @doc """
  Returns `:ok` when `result.content` is longer than 100 characters (as counted
  by `String.length/1`), otherwise `{:error, "Content quality too low"}`.
  """
  @impl true
  def validate(result) do
    if String.length(result.content) > 100 do
      :ok
    else
      {:error, "Content quality too low"}
    end
  end
end
|
||||
|
||||
# Register multiple plugins. Matching on :ok makes the script stop at the first
# rejected registration instead of printing the success message regardless.
:ok = Kreuzberg.Plugin.register_post_processor(:metadata_enricher, MyApp.MetadataEnricher)
:ok = Kreuzberg.Plugin.register_post_processor(:link_extractor, MyApp.LinkExtractor)
:ok = Kreuzberg.Plugin.register_validator(MyApp.QualityValidator)

IO.puts("Plugins registered successfully")
|
||||
```
|
||||
10
docs/snippets/elixir/plugins/extractor_registration.md
Normal file
10
docs/snippets/elixir/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,10 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Document extractor registration is not available in the Elixir binding. Custom extractors must be implemented in Rust using the `DocumentExtractor` trait and registered in the Rust core.
|
||||
|
||||
To use custom extractors in Elixir:
|
||||
|
||||
1. Implement the extractor in Rust using the `DocumentExtractor` trait
|
||||
2. Register the extractor in the Rust core's registry
|
||||
3. Call the extraction functions from Elixir
|
||||
|
||||
See the Rust plugin documentation for implementing custom `DocumentExtractor` plugins.
|
||||
23
docs/snippets/elixir/plugins/list_plugins.exs
Normal file
23
docs/snippets/elixir/plugins/list_plugins.exs
Normal file
@@ -0,0 +1,23 @@
|
||||
```elixir title="Elixir"
|
||||
# Fetch the current contents of each plugin registry
{:ok, post_processors} = Kreuzberg.Plugin.list_post_processors()
{:ok, validators} = Kreuzberg.Plugin.list_validators()
{:ok, ocr_backends} = Kreuzberg.Plugin.list_ocr_backends()

# Post-processors are listed as {name, module} pairs
IO.puts("Post-processors:")

Enum.each(post_processors, fn {registered_name, plugin} ->
  IO.puts(" - #{registered_name}: #{plugin}")
end)

# Validators and OCR backends are printed one entry per line
IO.puts("\nValidators:")
Enum.each(validators, &IO.puts(" - #{&1}"))

IO.puts("\nOCR backends:")
Enum.each(ocr_backends, &IO.puts(" - #{&1}"))

processor_count = length(post_processors)
validator_count = length(validators)
backend_count = length(ocr_backends)

IO.puts(
  "\nTotal: #{processor_count} post-processors, #{validator_count} validators, #{backend_count} OCR backends"
)
|
||||
```
|
||||
17
docs/snippets/elixir/plugins/list_plugins.md
Normal file
17
docs/snippets/elixir/plugins/list_plugins.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```elixir title="Elixir"
|
||||
# Print the contents of each global registry in turn: document extractors,
# OCR backends, post-processors, then validators.
listings = [
  {"Document extractors", &Kreuzberg.list_document_extractors/0},
  {"OCR backends", &Kreuzberg.list_ocr_backends/0},
  {"Post-processors", &Kreuzberg.list_post_processors/0},
  {"Validators", &Kreuzberg.list_validators/0}
]

Enum.each(listings, fn {label, list_registry} ->
  {:ok, entries} = list_registry.()
  IO.inspect(entries, label: label)
end)
|
||||
```
|
||||
38
docs/snippets/elixir/plugins/min_length_validator.exs
Normal file
38
docs/snippets/elixir/plugins/min_length_validator.exs
Normal file
@@ -0,0 +1,38 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule MinLengthValidator do
  @moduledoc """
  Validator that rejects results whose content is shorter than 50 characters,
  as counted by `String.length/1`.
  """

  @behaviour Kreuzberg.Plugin.Validator

  @min_length 50

  def name do
    "min_length_validator"
  end

  def version do
    "1.0.0"
  end

  def priority do
    1
  end

  def initialize do
    :ok
  end

  def shutdown do
    :ok
  end

  # Every result is validated, whatever its shape.
  def should_validate?(_result) do
    true
  end

  def validate(result) do
    case String.length(result.content) >= @min_length do
      true -> :ok
      false -> {:error, "Content too short"}
    end
  end
end
|
||||
|
||||
# Register the validator; matching on :ok stops the script if registration is rejected
:ok = Kreuzberg.Plugin.register_validator(MinLengthValidator)

# Example usage with extraction. The match raises unless extraction returns {:ok, result}.
{:ok, result} = Kreuzberg.extract_file("document.pdf", nil)

IO.puts("Extraction successful")
IO.puts("Content length: #{String.length(result.content)}")
|
||||
```
|
||||
45
docs/snippets/elixir/plugins/min_length_validator.md
Normal file
45
docs/snippets/elixir/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,45 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Custom validator implementation is not available in the Elixir binding. Validators must be implemented in Rust using the `Validator` trait.
|
||||
|
||||
To implement a minimum length validator in Rust:
|
||||
|
||||
```rust
|
||||
use kreuzberg::plugins::{Plugin, Validator};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
|
||||
use async_trait::async_trait;
|
||||
|
||||
/// Validator configured with the minimum content length it accepts.
struct MinLengthValidator {
    /// Threshold that the content length is compared against in `validate`.
    min_length: usize,
}

/// Plugin identity and lifecycle hooks; this validator needs no setup or teardown.
impl Plugin for MinLengthValidator {
    /// Name reported for this plugin.
    fn name(&self) -> &str { "min-length-validator" }
    /// Version string reported for this plugin.
    fn version(&self) -> String { "1.0.0".to_string() }
    /// No resources to acquire; always succeeds.
    fn initialize(&self) -> Result<()> { Ok(()) }
    /// No resources to release; always succeeds.
    fn shutdown(&self) -> Result<()> { Ok(()) }
}
|
||||
|
||||
#[async_trait]
|
||||
impl Validator for MinLengthValidator {
|
||||
async fn validate(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<()> {
|
||||
if result.content.len() < self.min_length {
|
||||
return Err(KreuzbergError::validation(format!(
|
||||
"Content too short: {} < {} characters",
|
||||
result.content.len(),
|
||||
self.min_length
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn priority(&self) -> i32 {
|
||||
100
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Register this validator during your Rust core initialization; extraction calls made from Elixir will then run it.
|
||||
11
docs/snippets/elixir/plugins/pdf_metadata_extractor.md
Normal file
11
docs/snippets/elixir/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,11 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Custom PDF metadata extractor implementation is not available in the Elixir binding. Document extractors must be implemented in Rust using the `DocumentExtractor` trait.
|
||||
|
||||
To implement a custom PDF metadata extractor in Rust:
|
||||
|
||||
1. Implement the `Plugin` and `DocumentExtractor` traits
|
||||
2. Add support for PDF MIME types: `application/pdf`
|
||||
3. Use a PDF library (e.g., pdfium-render, pdf crate) to extract metadata
|
||||
4. Register the extractor in the Rust core
|
||||
|
||||
See the Rust plugin documentation for implementing custom `DocumentExtractor` plugins for PDF files.
|
||||
97
docs/snippets/elixir/plugins/pdf_only_processor.exs
Normal file
97
docs/snippets/elixir/plugins/pdf_only_processor.exs
Normal file
@@ -0,0 +1,97 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.Plugin
|
||||
|
||||
# PDF-Only Post-Processor Plugin
|
||||
# This conditional post-processor only processes PDF files.
|
||||
# It can be used to apply PDF-specific transformations.
|
||||
|
||||
defmodule MyApp.Plugins.PdfOnlyProcessor do
  @moduledoc """
  Conditional post-processor that only touches results whose `"mime_type"` is
  `"application/pdf"`. Every other result is passed through unchanged.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor
  require Logger

  @pdf_mime_type "application/pdf"

  @impl true
  def name, do: "PdfOnlyProcessor"

  # NOTE(review): sibling snippets in this directory use :middle and :late as
  # stages; confirm that :post is accepted by the behaviour.
  @impl true
  def processing_stage, do: :post

  @impl true
  def version, do: "1.0.0"

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Marks PDF results as processed.

  For a PDF result, returns `{:ok, result}` with
  `"processed_by_pdf_processor" => true` added to its `"metadata"` map (an
  empty map is used when the key is absent). Any other result is returned as
  `{:ok, result}` untouched.
  """
  @impl true
  def process(result, _options) do
    # A missing "mime_type" yields nil, which simply fails the comparison.
    if result["mime_type"] == @pdf_mime_type do
      Logger.info("Processing PDF content with custom PDF processor")

      # Add PDF processing marker to metadata
      updated_metadata =
        result
        |> Map.get("metadata", %{})
        |> Map.put("processed_by_pdf_processor", true)

      {:ok, Map.put(result, "metadata", updated_metadata)}
    else
      {:ok, result}
    end
  end
end
|
||||
|
||||
# Register the PDF-only post-processor
|
||||
Plugin.register_post_processor(:pdf_only_processor, MyApp.Plugins.PdfOnlyProcessor)
|
||||
|
||||
# Example usage with PDF result
|
||||
pdf_result = %{
|
||||
"content" => "PDF extracted content here",
|
||||
"mime_type" => "application/pdf",
|
||||
"metadata" => %{
|
||||
"source" => "document.pdf",
|
||||
"pages" => 5
|
||||
}
|
||||
}
|
||||
|
||||
# Process PDF result
|
||||
case MyApp.Plugins.PdfOnlyProcessor.process(pdf_result, %{}) do
|
||||
{:ok, processed_result} ->
|
||||
IO.puts("PDF processing complete")
|
||||
IO.inspect(processed_result, label: "PDF Result")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("PDF processing failed: #{reason}")
|
||||
end
|
||||
|
||||
# Example with non-PDF result (processor will skip processing)
|
||||
non_pdf_result = %{
|
||||
"content" => "Image extracted content",
|
||||
"mime_type" => "image/png",
|
||||
"metadata" => %{}
|
||||
}
|
||||
|
||||
case MyApp.Plugins.PdfOnlyProcessor.process(non_pdf_result, %{}) do
|
||||
{:ok, processed_result} ->
|
||||
IO.puts("Processing complete (skipped for non-PDF)")
|
||||
IO.inspect(processed_result, label: "Non-PDF Result")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Processing failed: #{reason}")
|
||||
end
|
||||
```
|
||||
41
docs/snippets/elixir/plugins/pdf_only_processor.md
Normal file
41
docs/snippets/elixir/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,41 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Custom post-processor implementation is not available in the Elixir binding. Post-processors must be implemented in Rust using the `PostProcessor` trait.
|
||||
|
||||
To implement a PDF-only post-processor in Rust:
|
||||
|
||||
```rust
|
||||
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
||||
use async_trait::async_trait;
|
||||
|
||||
struct PdfOnlyProcessor;
|
||||
|
||||
impl Plugin for PdfOnlyProcessor {
|
||||
fn name(&self) -> &str { "pdf-only" }
|
||||
fn version(&self) -> String { "1.0.0".to_string() }
|
||||
fn initialize(&self) -> Result<()> { Ok(()) }
|
||||
fn shutdown(&self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PostProcessor for PdfOnlyProcessor {
|
||||
async fn process(
|
||||
&self,
|
||||
result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig
|
||||
) -> Result<()> {
|
||||
// Custom processing logic for PDF documents
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn should_process(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig
|
||||
) -> bool {
|
||||
result.mime_type == "application/pdf"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Register this in your Rust initialization and Elixir will use it automatically during extraction.
|
||||
35
docs/snippets/elixir/plugins/plugin_extractor.exs
Normal file
35
docs/snippets/elixir/plugins/plugin_extractor.exs
Normal file
@@ -0,0 +1,35 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule MyApp.WordCountProcessor do
  @moduledoc """
  Post-processor that counts the whitespace-separated words in `result.content`
  and stores the total under the `"word_count"` key of `result.metadata`.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor

  @impl true
  def name, do: "word_count_processor"

  @impl true
  def version, do: "1.0.0"

  @impl true
  def processing_stage, do: :late

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Returns `result` with `"word_count"` added to its metadata.

  A `nil` metadata value is treated as an empty map. The `:metadata` key must
  already exist on `result`, because the update syntax raises on unknown keys.
  """
  @impl true
  def process(result, _config) do
    word_count =
      result.content
      |> String.split()
      |> length()

    metadata = Map.put(result.metadata || %{}, "word_count", word_count)
    %{result | metadata: metadata}
  end
end
|
||||
|
||||
# Register post-processor under an explicit name
# NOTE(review): the other snippets in this directory register with (name, module); confirm the arity.
Kreuzberg.Plugin.register_post_processor(:word_count_processor, MyApp.WordCountProcessor)
|
||||
|
||||
# Use with extraction
|
||||
{:ok, result} = Kreuzberg.extract_file(
|
||||
"document.pdf",
|
||||
nil
|
||||
)
|
||||
|
||||
IO.puts("Word count: #{result.metadata["word_count"]}")
|
||||
```
|
||||
60
docs/snippets/elixir/plugins/plugin_extractor.md
Normal file
60
docs/snippets/elixir/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,60 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Custom document extractor implementation is not available in the Elixir binding. Extractors must be implemented in Rust using the `DocumentExtractor` trait.
|
||||
|
||||
To implement a custom JSON extractor in Rust:
|
||||
|
||||
```rust
|
||||
use kreuzberg::plugins::{DocumentExtractor, Plugin};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, Metadata};
|
||||
use async_trait::async_trait;
|
||||
use std::path::Path;
|
||||
|
||||
struct CustomJsonExtractor;
|
||||
|
||||
impl Plugin for CustomJsonExtractor {
|
||||
fn name(&self) -> &str { "custom-json-extractor" }
|
||||
fn version(&self) -> String { "1.0.0".to_string() }
|
||||
fn initialize(&self) -> Result<()> { Ok(()) }
|
||||
fn shutdown(&self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl DocumentExtractor for CustomJsonExtractor {
|
||||
async fn extract_bytes(
|
||||
&self,
|
||||
content: &[u8],
|
||||
_mime_type: &str,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<ExtractionResult> {
|
||||
let json: serde_json::Value = serde_json::from_slice(content)?;
|
||||
let text = extract_text_from_json(&json);
|
||||
|
||||
Ok(ExtractionResult {
|
||||
content: text,
|
||||
mime_type: "application/json".to_string(),
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
|
||||
fn supported_mime_types(&self) -> &[&str] {
|
||||
&["application/json", "text/json"]
|
||||
}
|
||||
|
||||
fn priority(&self) -> i32 { 50 }
|
||||
}
|
||||
|
||||
fn extract_text_from_json(value: &serde_json::Value) -> String {
|
||||
match value {
|
||||
serde_json::Value::String(s) => format!("{}\n", s),
|
||||
serde_json::Value::Array(arr) => {
|
||||
arr.iter().map(extract_text_from_json).collect()
|
||||
}
|
||||
serde_json::Value::Object(obj) => {
|
||||
obj.values().map(extract_text_from_json).collect()
|
||||
}
|
||||
_ => String::new(),
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Register this in your Rust core and it will be available in Elixir.
|
||||
125
docs/snippets/elixir/plugins/plugin_logging.exs
Normal file
125
docs/snippets/elixir/plugins/plugin_logging.exs
Normal file
@@ -0,0 +1,125 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.Plugin
|
||||
require Logger
|
||||
|
||||
# Plugin with Logging
|
||||
# This example demonstrates best practices for logging in plugins.
|
||||
# Proper logging helps with debugging and monitoring plugin execution.
|
||||
|
||||
defmodule MyApp.Plugins.LoggingProcessor do
  @moduledoc """
  Post-processor that normalizes whitespace in the extracted content and logs
  each step, demonstrating debug/info/error logging inside a plugin.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor
  require Logger

  @impl true
  def name, do: "LoggingProcessor"

  # NOTE(review): sibling snippets in this directory use :middle and :late as
  # stages; confirm that :post is accepted by the behaviour.
  @impl true
  def processing_stage, do: :post

  @impl true
  def version, do: "1.0.0"

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Cleans the `"content"` of `result` and records size information.

  Returns `{:ok, result}` where `"content"` holds the cleaned text and
  `"metadata"` gains `"processed_at"`, `"original_size"` and
  `"processed_size"` (sizes are in bytes). A missing or `nil` content is
  treated as `""`.

  Any exception raised while processing is logged and turned into
  `{:error, message}`.
  """
  @impl true
  def process(result, _options) do
    Logger.debug("Starting content processing", plugin: "LoggingProcessor")

    content = result["content"] || ""
    content_size = byte_size(content)
    mime_type = result["mime_type"] || "unknown"

    Logger.info("Processing extraction result",
      mime_type: mime_type,
      content_size: content_size
    )

    processed_content = clean_content(content)
    processed_size = byte_size(processed_content)

    updated_metadata =
      result
      |> Map.get("metadata", %{})
      |> Map.put("processed_at", DateTime.utc_now())
      |> Map.put("original_size", content_size)
      |> Map.put("processed_size", processed_size)

    Logger.debug("Processing complete",
      original_size: content_size,
      processed_size: processed_size
    )

    # Write the cleaned text back; otherwise "processed_size" would describe
    # content that the caller never receives.
    processed_result =
      result
      |> Map.put("content", processed_content)
      |> Map.put("metadata", updated_metadata)

    {:ok, processed_result}
  rescue
    # Plugin boundary: report the failure to the caller instead of crashing.
    error ->
      Logger.error("Processing error in LoggingProcessor",
        error: inspect(error),
        stacktrace: __STACKTRACE__
      )

      {:error, "Processing failed: #{inspect(error)}"}
  end

  # Trims the text and collapses every whitespace run into a single space.
  defp clean_content(content) do
    content
    |> String.trim()
    |> String.replace(~r/\s+/, " ")
  end
end
|
||||
|
||||
# Register the logging processor
|
||||
Plugin.register_post_processor(:logging_processor, MyApp.Plugins.LoggingProcessor)
|
||||
|
||||
# Configure logging (use :info or :debug for verbosity)
|
||||
Logger.configure(level: :debug)
|
||||
|
||||
IO.puts("=== Plugin Logging Example ===\n")
|
||||
|
||||
# Example usage
|
||||
result = %{
|
||||
"content" => " Sample document with irregular spacing ",
|
||||
"mime_type" => "application/pdf",
|
||||
"metadata" => %{"source" => "document.pdf"}
|
||||
}
|
||||
|
||||
Logger.info("Starting extraction processing example")
|
||||
|
||||
case MyApp.Plugins.LoggingProcessor.process(result, %{}) do
|
||||
{:ok, processed_result} ->
|
||||
IO.puts("\nProcessing succeeded!")
|
||||
IO.inspect(processed_result, label: "Processed Result")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Processing failed: #{reason}")
|
||||
end
|
||||
|
||||
# Demonstrate error handling with logging
Logger.info("Testing error handling")

# A non-binary "content" makes byte_size/1 raise inside process/2, which
# exercises the rescue branch. A nil content would not: process/2 replaces it
# with "" and succeeds.
invalid_result = %{
  "content" => 12_345,
  "mime_type" => "application/pdf"
}

case MyApp.Plugins.LoggingProcessor.process(invalid_result, %{}) do
  {:ok, _processed_result} ->
    IO.puts("Processing succeeded")

  {:error, reason} ->
    IO.puts("Processing failed as expected: #{reason}")
end
|
||||
```
|
||||
27
docs/snippets/elixir/plugins/plugin_logging.md
Normal file
27
docs/snippets/elixir/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,27 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Plugin logging is not directly available in the Elixir binding. Logging must be implemented in the Rust plugin code itself using the `tracing` crate.
|
||||
|
||||
To add logging to a Rust plugin:
|
||||
|
||||
```rust
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
#[async_trait]
|
||||
impl DocumentExtractor for MyExtractor {
|
||||
async fn extract_bytes(
|
||||
&self,
|
||||
content: &[u8],
|
||||
mime_type: &str,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<ExtractionResult> {
|
||||
debug!("Starting extraction for {}", mime_type);
|
||||
|
||||
// Extraction logic...
|
||||
|
||||
info!("Extraction completed for {}", mime_type);
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The logs will be captured by Kreuzberg's tracing infrastructure and can be monitored from Elixir through structured logging in the output.
|
||||
157
docs/snippets/elixir/plugins/plugin_testing.exs
Normal file
157
docs/snippets/elixir/plugins/plugin_testing.exs
Normal file
@@ -0,0 +1,157 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.Plugin
|
||||
|
||||
# Plugin Testing Example
|
||||
# This demonstrates how to test custom plugins with various scenarios.
|
||||
|
||||
defmodule MyApp.Plugins.CustomJsonExtractor do
  @moduledoc """
  Post-processor used by the plugin test example: it flags results whose
  `"mime_type"` is `"application/json"` and leaves all others untouched.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor
  require Logger

  @impl true
  def name, do: "CustomJsonExtractor"

  # NOTE(review): sibling snippets in this directory use :middle and :late as
  # stages; confirm that :post is accepted by the behaviour.
  @impl true
  def processing_stage, do: :post

  @impl true
  def version, do: "1.0.0"

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Returns `{:ok, result}`, adding `"is_json" => true` when the result's
  `"mime_type"` is exactly `"application/json"`.
  """
  @impl true
  def process(result, _options) do
    case result["mime_type"] do
      "application/json" ->
        Logger.debug("Processing JSON content")
        {:ok, Map.put(result, "is_json", true)}

      _other_or_missing ->
        Logger.debug("Not JSON content, skipping")
        {:ok, result}
    end
  end
end
|
||||
|
||||
# Test setup
|
||||
defmodule MyApp.Plugins.Test do
  @moduledoc """
  Hand-rolled smoke tests for `MyApp.Plugins.CustomJsonExtractor`. Each
  scenario prints its title, calls `process/2`, and prints a PASS/FAIL line.
  """

  require Logger

  @doc """
  Runs the four scenarios in order, framed by start and end banners.
  """
  def run_tests do
    IO.puts("\n=== Running Plugin Tests ===\n")

    scenarios = [
      &test_json_processing/0,
      &test_non_json_processing/0,
      &test_empty_content/0,
      &test_missing_mime_type/0
    ]

    Enum.each(scenarios, fn scenario -> scenario.() end)

    IO.puts("\n=== All Tests Completed ===\n")
  end

  # JSON input must come back flagged with "is_json" => true.
  defp test_json_processing do
    input = %{
      "content" => ~s({"message": "Hello, world!"}),
      "mime_type" => "application/json",
      "metadata" => %{}
    }

    run_scenario("Test 1: JSON Content Processing", input, fn processed ->
      if processed["is_json"] == true do
        " PASS: JSON content marked correctly\n"
      else
        " FAIL: JSON flag not set\n"
      end
    end)
  end

  # Non-JSON input must not carry a truthy "is_json" flag (absent counts as unset).
  defp test_non_json_processing do
    input = %{
      "content" => "Plain text content",
      "mime_type" => "text/plain",
      "metadata" => %{}
    }

    run_scenario("Test 2: Non-JSON Content Processing", input, fn processed ->
      if processed["is_json"] do
        " FAIL: Non-JSON content incorrectly marked as JSON\n"
      else
        " PASS: Non-JSON content not marked\n"
      end
    end)
  end

  # Empty content only has to be processed without an error tuple.
  defp test_empty_content do
    input = %{
      "content" => "",
      "mime_type" => "application/json",
      "metadata" => %{}
    }

    run_scenario("Test 3: Empty Content", input, fn _processed ->
      " PASS: Empty content handled gracefully\n"
    end)
  end

  # A result without "mime_type" only has to be processed without an error tuple.
  defp test_missing_mime_type do
    input = %{
      "content" => "Some content",
      "metadata" => %{}
    }

    run_scenario("Test 4: Missing MIME Type", input, fn _processed ->
      " PASS: Missing MIME type handled gracefully\n"
    end)
  end

  # Prints the title, runs the processor on `input`, and prints either the
  # message chosen by `verdict` for a successful result or a FAIL line for an error.
  defp run_scenario(title, input, verdict) do
    IO.puts(title)

    message =
      case MyApp.Plugins.CustomJsonExtractor.process(input, %{}) do
        {:ok, processed} -> verdict.(processed)
        {:error, reason} -> " FAIL: #{reason}\n"
      end

    IO.puts(message)
  end
end
|
||||
|
||||
# Register the custom plugin
|
||||
Plugin.register_post_processor(:custom_json_extractor, MyApp.Plugins.CustomJsonExtractor)
|
||||
|
||||
# Run the test suite
|
||||
MyApp.Plugins.Test.run_tests()
|
||||
|
||||
# List all registered post-processors
|
||||
{:ok, processors} = Plugin.list_post_processors()
|
||||
IO.puts("Registered Post-Processors:")
|
||||
Enum.each(processors, fn {name, module} ->
|
||||
IO.puts(" - #{name}: #{module}")
|
||||
end)
|
||||
```
|
||||
21
docs/snippets/elixir/plugins/plugin_testing.md
Normal file
21
docs/snippets/elixir/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,21 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Plugin testing in Elixir is limited since custom plugins cannot be implemented in Elixir. Plugin testing should be done in Rust using `#[cfg(test)]` test modules.
|
||||
|
||||
When testing custom plugins in Rust:
|
||||
|
||||
```rust
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_custom_extractor() {
|
||||
let extractor = MyExtractor;
|
||||
let content = b"test content";
|
||||
let result = extractor.extract_bytes(content, "text/plain", &ExtractionConfig::default()).await;
|
||||
assert!(result.is_ok());
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
For Elixir, you can test the extraction results using the built-in functions like `Kreuzberg.extract_bytes_async/3` and `Kreuzberg.extract_file_async/3`.
|
||||
32
docs/snippets/elixir/plugins/plugin_validator.exs
Normal file
32
docs/snippets/elixir/plugins/plugin_validator.exs
Normal file
@@ -0,0 +1,32 @@
|
||||
```elixir title="Elixir"
|
||||
defmodule MyApp.MinLengthValidator do
  @moduledoc """
  Validator that accepts a result only when `result.content` is at least
  50 characters long (as counted by `String.length/1`).
  """

  @behaviour Kreuzberg.Plugin.Validator

  def name, do: "min_length_validator"

  def version, do: "1.0.0"

  def priority, do: 10

  def initialize, do: :ok

  def shutdown, do: :ok

  # Every result is validated; there is no opt-out condition.
  def should_validate?(_result), do: true

  @doc """
  Returns `:ok` for content of 50 or more characters, otherwise
  `{:error, message}` with the measured length.
  """
  def validate(result) do
    case String.length(result.content) do
      char_count when char_count >= 50 ->
        :ok

      char_count ->
        {:error, "Content too short: #{char_count} chars"}
    end
  end
end
|
||||
|
||||
# Register validator
|
||||
Kreuzberg.Plugin.register_validator(MyApp.MinLengthValidator)
|
||||
|
||||
# Use with extraction
|
||||
{:ok, result} = Kreuzberg.extract_file(
|
||||
"document.pdf",
|
||||
nil
|
||||
)
|
||||
|
||||
IO.puts("Content length: #{String.length(result.content)}")
|
||||
```
|
||||
10
docs/snippets/elixir/plugins/plugin_validator.md
Normal file
10
docs/snippets/elixir/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,10 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Custom validator implementation is not available in the Elixir binding. Validators must be implemented in Rust using the `Validator` trait.
|
||||
|
||||
To implement a custom validator in Rust and use it from Elixir:
|
||||
|
||||
1. Implement the `Plugin` and `Validator` traits in Rust
|
||||
2. Register the validator in the Rust core
|
||||
3. Call extraction functions from Elixir, which will automatically apply registered validators
|
||||
|
||||
The validator will run after extraction completes and can reject results that don't meet validation criteria.
|
||||
75
docs/snippets/elixir/plugins/quality_score_validator.exs
Normal file
75
docs/snippets/elixir/plugins/quality_score_validator.exs
Normal file
@@ -0,0 +1,75 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.Plugin
|
||||
|
||||
# Quality Score Validator Plugin
|
||||
# This validator ensures extracted content meets a minimum quality threshold.
|
||||
# It checks the quality_score metadata field and rejects low-quality extractions.
|
||||
|
||||
defmodule MyApp.Plugins.QualityScoreValidator do
  @moduledoc """
  Validator that rejects results whose `:quality_score` is below 0.5, or is
  missing / not a number.
  """

  @behaviour Kreuzberg.Plugin.Validator
  require Logger

  @min_quality_score 0.5

  @impl true
  def name, do: "QualityScoreValidator"

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @impl true
  def priority, do: 100

  # Every result is validated; the argument is intentionally ignored.
  @impl true
  def should_validate?(_result), do: true

  @doc """
  Checks the `:quality_score` of `result` (a struct or a map with atom keys).

  Returns `:ok` when the score is a number of at least 0.5, otherwise
  `{:error, message}`. A missing, `nil` or non-numeric score is reported as an
  error instead of raising.
  """
  @impl true
  def validate(result) do
    case Map.get(result, :quality_score) do
      score when is_number(score) and score >= @min_quality_score ->
        :ok

      score when is_number(score) ->
        # Float.round/2 only accepts floats; `/ 1` converts an integer score.
        {:error,
         "Quality score too low: #{Float.round(score / 1, 2)}. Minimum required: #{@min_quality_score}"}

      _missing_or_invalid ->
        {:error,
         "Quality score is missing or not a number. Minimum required: #{@min_quality_score}"}
    end
  end
end
|
||||
|
||||
# Register the quality validator plugin
|
||||
Plugin.register_validator(MyApp.Plugins.QualityScoreValidator)
|
||||
|
||||
# Example usage with extraction
# Note: In real usage, result will be an ExtractionResult struct. A plain map
# with atom keys stands in for it here, because validate/1 looks up the
# :quality_score key on the result itself (not inside "metadata").
result = %{
  content: "Extracted document content",
  quality_score: 0.85,
  metadata: %{"pages" => 1}
}

case MyApp.Plugins.QualityScoreValidator.validate(result) do
  :ok ->
    IO.puts("Quality validation passed: #{result.quality_score}")

  {:error, reason} ->
    IO.puts("Quality validation failed: #{reason}")
end
|
||||
|
||||
# List all registered validators
|
||||
{:ok, validators} = Plugin.list_validators()
|
||||
IO.inspect(validators, label: "Registered Validators")
|
||||
```
|
||||
48
docs/snippets/elixir/plugins/quality_score_validator.md
Normal file
48
docs/snippets/elixir/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,48 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Quality score validator implementation is not available in the Elixir binding. Custom validators must be implemented in Rust using the `Validator` trait.
|
||||
|
||||
To implement a quality score validator in Rust:
|
||||
|
||||
```rust
|
||||
use kreuzberg::plugins::{Plugin, Validator};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
|
||||
use async_trait::async_trait;
|
||||
|
||||
struct QualityScoreValidator {
|
||||
min_score: f32,
|
||||
}
|
||||
|
||||
impl Plugin for QualityScoreValidator {
|
||||
fn name(&self) -> &str { "quality-validator" }
|
||||
fn version(&self) -> String { "1.0.0".to_string() }
|
||||
fn initialize(&self) -> Result<()> { Ok(()) }
|
||||
fn shutdown(&self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Validator for QualityScoreValidator {
|
||||
async fn validate(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig,
|
||||
) -> Result<()> {
|
||||
let quality = calculate_quality_score(result);
|
||||
if quality < self.min_score {
|
||||
return Err(KreuzbergError::validation(format!(
|
||||
"Quality score too low: {} < {}",
|
||||
quality, self.min_score
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn priority(&self) -> i32 { 50 }
|
||||
}
|
||||
|
||||
fn calculate_quality_score(result: &ExtractionResult) -> f32 {
|
||||
// Implement quality scoring logic
|
||||
0.8
|
||||
}
|
||||
```
|
||||
|
||||
Register this validator in Rust and Elixir will use it automatically.
|
||||
45
docs/snippets/elixir/plugins/stateful_plugin.md
Normal file
45
docs/snippets/elixir/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,45 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Stateful plugin implementation is not available in the Elixir binding. Stateful plugins must be implemented in Rust using `Arc<Mutex<>>` or `Arc<RwLock<>>` for thread-safe state management.
|
||||
|
||||
To implement a stateful plugin in Rust:
|
||||
|
||||
```rust
|
||||
use kreuzberg::plugins::{Plugin, PostProcessor};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use async_trait::async_trait;
|
||||
|
||||
struct StatefulProcessor {
|
||||
state: Arc<Mutex<ProcessorState>>,
|
||||
}
|
||||
|
||||
struct ProcessorState {
|
||||
extraction_count: usize,
|
||||
}
|
||||
|
||||
impl Plugin for StatefulProcessor {
|
||||
fn name(&self) -> &str { "stateful-processor" }
|
||||
fn version(&self) -> String { "1.0.0".to_string() }
|
||||
fn initialize(&self) -> Result<()> {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.extraction_count = 0;
|
||||
Ok(())
|
||||
}
|
||||
fn shutdown(&self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PostProcessor for StatefulProcessor {
|
||||
async fn process(
|
||||
&self,
|
||||
_result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig
|
||||
) -> Result<()> {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.extraction_count += 1;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Register this in Rust and use it from Elixir.
|
||||
20
docs/snippets/elixir/plugins/unregister_plugins.md
Normal file
20
docs/snippets/elixir/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,20 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Unregistering an individual plugin is not available in the Elixir binding; it must be done in Rust using the registry APIs.
|
||||
|
||||
To unregister a specific plugin in Rust:
|
||||
|
||||
```rust
|
||||
use kreuzberg::plugins::registry::get_document_extractor_registry;
|
||||
|
||||
let registry = get_document_extractor_registry();
|
||||
registry.remove("custom-json-extractor")?;
|
||||
```
|
||||
|
||||
In Elixir, you can only clear all plugins of a specific type using:
|
||||
|
||||
- `Kreuzberg.clear_document_extractors()`
|
||||
- `Kreuzberg.clear_post_processors()`
|
||||
- `Kreuzberg.clear_ocr_backends()`
|
||||
- `Kreuzberg.clear_validators()`
|
||||
|
||||
To remove a single plugin, you must do so from the Rust core before Elixir starts using it.
|
||||
77
docs/snippets/elixir/plugins/word_count_processor.exs
Normal file
77
docs/snippets/elixir/plugins/word_count_processor.exs
Normal file
@@ -0,0 +1,77 @@
|
||||
```elixir title="Elixir"
|
||||
alias Kreuzberg.Plugin
|
||||
|
||||
# Word Count Post-Processor Plugin
|
||||
# This post-processor automatically counts words in extracted content
|
||||
# and adds the word count to the metadata.
|
||||
|
||||
defmodule MyApp.Plugins.WordCountProcessor do
  @moduledoc """
  Post-processor that counts the words in a result's `"content"` and records
  the total as `"word_count"` in the result's `"metadata"` map.
  """

  @behaviour Kreuzberg.Plugin.PostProcessor
  require Logger

  @impl true
  def name, do: "WordCountProcessor"

  # NOTE(review): sibling snippets in this directory use :middle and :late as
  # stages; confirm that :post is accepted by the behaviour.
  @impl true
  def processing_stage, do: :post

  @impl true
  def version, do: "1.0.0"

  @impl true
  def initialize, do: :ok

  @impl true
  def shutdown, do: :ok

  @doc """
  Returns `{:ok, result}` with `"word_count"` set in `"metadata"`.

  Words are the non-empty pieces obtained by splitting the content on
  whitespace runs. Missing content counts as zero words, and a missing
  `"metadata"` key starts from an empty map.
  """
  @impl true
  def process(result, _options) do
    text = result["content"] || ""
    words = String.split(text, ~r/\s+/, trim: true)

    enriched_metadata =
      result
      |> Map.get("metadata", %{})
      |> Map.put("word_count", Enum.count(words))

    {:ok, Map.put(result, "metadata", enriched_metadata)}
  end
end
|
||||
|
||||
# Register the word count post-processor
|
||||
Plugin.register_post_processor(:word_count_processor, MyApp.Plugins.WordCountProcessor)
|
||||
|
||||
# Example usage
|
||||
result = %{
|
||||
"content" => "The quick brown fox jumps over the lazy dog. This is a sample document with multiple words.",
|
||||
"metadata" => %{
|
||||
"source" => "document.pdf",
|
||||
"pages" => 1
|
||||
}
|
||||
}
|
||||
|
||||
case MyApp.Plugins.WordCountProcessor.process(result, %{}) do
|
||||
{:ok, processed_result} ->
|
||||
word_count = processed_result["metadata"]["word_count"]
|
||||
IO.puts("Word count added: #{word_count} words")
|
||||
IO.inspect(processed_result, label: "Processed Result")
|
||||
|
||||
{:error, reason} ->
|
||||
IO.puts("Processing failed: #{reason}")
|
||||
end
|
||||
|
||||
# List all registered post-processors
|
||||
{:ok, processors} = Plugin.list_post_processors()
|
||||
IO.inspect(processors, label: "Registered Post-Processors")
|
||||
```
|
||||
46
docs/snippets/elixir/plugins/word_count_processor.md
Normal file
46
docs/snippets/elixir/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,46 @@
|
||||
<!-- snippet:skip reason="Elixir Rustler NIFs cannot host async Send + Sync + 'static Rust trait objects via callbacks; the BEAM actor-model boundary requires plugin work to live in the Rust core. The alef-generated Elixir trait_call macro additionally has a backslash/encoding bug (separate alef-codegen ticket). Custom plugins must be implemented in Rust." -->
|
||||
Custom post-processor implementation is not available in the Elixir binding. Post-processors must be implemented in Rust using the `PostProcessor` trait.
|
||||
|
||||
To implement a word count processor in Rust:
|
||||
|
||||
```rust
|
||||
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
||||
use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
||||
use async_trait::async_trait;
|
||||
|
||||
struct WordCountProcessor;
|
||||
|
||||
impl Plugin for WordCountProcessor {
|
||||
fn name(&self) -> &str { "word-count" }
|
||||
fn version(&self) -> String { "1.0.0".to_string() }
|
||||
fn initialize(&self) -> Result<()> { Ok(()) }
|
||||
fn shutdown(&self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PostProcessor for WordCountProcessor {
|
||||
async fn process(
|
||||
&self,
|
||||
result: &mut ExtractionResult,
|
||||
_config: &ExtractionConfig
|
||||
) -> Result<()> {
|
||||
let word_count = result.content.split_whitespace().count();
|
||||
// Store word count in metadata or processing warnings
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn processing_stage(&self) -> ProcessingStage {
|
||||
ProcessingStage::Early
|
||||
}
|
||||
|
||||
fn should_process(
|
||||
&self,
|
||||
result: &ExtractionResult,
|
||||
_config: &ExtractionConfig
|
||||
) -> bool {
|
||||
!result.content.is_empty()
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Register this processor in Rust and it will be applied during extraction in Elixir.
|
||||
Reference in New Issue
Block a user