Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,312 @@
```elixir title="Elixir"
# MCP Custom Client - Connect to Kreuzberg MCP servers
# Demonstrates creating a reusable MCP client for document extraction
defmodule KreuzbergMCPClient do
@moduledoc """
MCP client for communicating with Kreuzberg extraction servers.
Provides methods for extracting documents from remote MCP servers
with support for caching, retry logic, and error handling.
"""
require Logger
defmodule Config do
  @moduledoc """
  Configuration for MCP client connections.

  ## Fields

    * `:host` - server host name (default: `"localhost"`)
    * `:port` - server TCP port (default: `8080`)
    * `:timeout_ms` - HTTP timeout per request, in milliseconds (default: `30_000`)
    * `:max_retries` - maximum number of attempts per request (default: `3`)
    * `:retry_delay_ms` - pause between attempts, in milliseconds (default: `1_000`)
    * `:cache_dir` - directory for cached results; `nil` disables caching (default: `nil`)
  """

  # Defaults live on the struct itself so that a bare `%Config{}` is usable
  # too, instead of carrying `nil` for every field.
  defstruct host: "localhost",
            port: 8080,
            timeout_ms: 30_000,
            max_retries: 3,
            retry_delay_ms: 1_000,
            cache_dir: nil

  @type t :: %__MODULE__{
          host: String.t(),
          port: :inet.port_number(),
          timeout_ms: non_neg_integer(),
          max_retries: non_neg_integer(),
          retry_delay_ms: non_neg_integer(),
          cache_dir: String.t() | nil
        }

  @doc """
  Builds a configuration from keyword options.

  Options that are not given fall back to the defaults listed in the module
  documentation; unknown keys are ignored.
  """
  @spec new(keyword()) :: t()
  def new(opts \\ []) do
    struct(__MODULE__, opts)
  end
end
@doc """
Extract document from file via MCP server.

Sends an extraction request to the remote Kreuzberg MCP server and returns
the decoded result. When caching is enabled and the configuration has a
`cache_dir`, a previously stored result is returned without contacting the
server, and a freshly fetched result is stored for later calls.

The cache key is derived from the file path, MIME type and extraction
configuration only, so a cached result is not invalidated when the file's
content changes.

## Options

  * `:mime_type` - MIME type of document
  * `:config` - Extraction configuration map
  * `:use_cache` - Enable result caching (default: false)
"""
@spec extract_file(Config.t(), String.t(), keyword()) ::
        {:ok, map()} | {:error, String.t()}
def extract_file(config, file_path, opts \\ []) do
  mime_type = Keyword.get(opts, :mime_type)
  extraction_config = Keyword.get(opts, :config)
  use_cache = Keyword.get(opts, :use_cache, false)

  # `&&` instead of `and`: `and` raises BadBooleanError when `:use_cache` is
  # passed as a non-boolean such as `nil`.
  if use_cache && config.cache_dir do
    cache_key = compute_cache_key(file_path, mime_type, extraction_config)

    case get_from_cache(config.cache_dir, cache_key) do
      {:ok, cached_result} ->
        Logger.debug("Cache hit for #{file_path}")
        {:ok, cached_result}

      :miss ->
        # Errors from the server fall through `with` unchanged; only
        # successful results are written to the cache.
        with {:ok, result} <-
               fetch_from_server(config, file_path, mime_type, extraction_config) do
          store_in_cache(config.cache_dir, cache_key, result)
          {:ok, result}
        end
    end
  else
    fetch_from_server(config, file_path, mime_type, extraction_config)
  end
end
@doc """
Upload and extract document via MCP server.

Reads the whole file from disk into memory, posts its bytes to the server's
`/extract/file` endpoint as `application/octet-stream`, and returns the
extraction result. The file's base name is sent in the `X-File-Name` header.

The third argument is accepted for signature parity with `extract_file/3`
but is currently not used.
"""
@spec upload_and_extract(Config.t(), String.t(), keyword()) ::
        {:ok, map()} | {:error, String.t()}
def upload_and_extract(config, file_path, _opts \\ []) do
  # Read directly instead of checking File.exists?/1 first: the separate
  # check is redundant and racy, and File.read/1 already reports a missing file.
  case File.read(file_path) do
    {:ok, body} ->
      url = "http://#{config.host}:#{config.port}/extract/file"

      headers = [
        {"Content-Type", "application/octet-stream"},
        {"X-File-Name", Path.basename(file_path)}
      ]

      # HTTPoison's `:timeout` only bounds connection setup; `:recv_timeout`
      # bounds the wait for the response and otherwise stays at its short default.
      request_opts = [timeout: config.timeout_ms, recv_timeout: config.timeout_ms]

      case HTTPoison.post(url, body, headers, request_opts) do
        {:ok, response} ->
          handle_response(response)

        {:error, reason} ->
          Logger.error("Upload failed: #{inspect(reason)}")
          {:error, "Upload failed: #{inspect(reason)}"}
      end

    {:error, reason} when reason in [:enoent, :enotdir] ->
      {:error, "File not found: #{file_path}"}

    {:error, reason} ->
      {:error, "Failed to read file: #{inspect(reason)}"}
  end
end
@doc """
Check health status of MCP server.

Issues `GET /health` and returns the decoded JSON object on a 200 response.
Unlike extraction responses, a health payload is not required to carry a
`"success"` flag; it is only treated as a failure when it explicitly
contains `"success": false`.
"""
@spec health_check(Config.t()) :: {:ok, map()} | {:error, String.t()}
def health_check(config) do
  url = "http://#{config.host}:#{config.port}/health"
  # `:recv_timeout` bounds the wait for the response; `:timeout` alone only
  # covers connection setup.
  request_opts = [timeout: config.timeout_ms, recv_timeout: config.timeout_ms]

  case HTTPoison.get(url, [], request_opts) do
    {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
      case Jason.decode(body) do
        {:ok, %{"success" => false} = data} ->
          {:error, Map.get(data, "error", "Unknown error")}

        {:ok, %{} = data} ->
          {:ok, data}

        {:ok, other} ->
          {:error, "Unexpected health payload: #{inspect(other)}"}

        {:error, reason} ->
          {:error, "Failed to decode response: #{inspect(reason)}"}
      end

    {:ok, %HTTPoison.Response{status_code: status, body: body}} ->
      {:error, "Server error (#{status}): #{body}"}

    {:error, reason} ->
      {:error, "Health check failed: #{inspect(reason)}"}
  end
end
@doc """
Batch extract multiple documents with parallel requests.

Runs `extract_file/3` concurrently for every path and returns
`{:ok, results}`, where `results` holds one `{:ok, map}` or
`{:error, message}` entry per input path, in input order. A failure of an
individual document does not fail the batch.

`opts` is passed through unchanged to `extract_file/3`.
"""
@spec batch_extract(Config.t(), [String.t()], keyword()) ::
        {:ok, [{:ok, map()} | {:error, String.t()}]}
def batch_extract(config, file_paths, opts \\ []) do
  total = length(file_paths)
  Logger.info("Batch extracting #{total} documents")

  results =
    file_paths
    # The default 5 s task timeout would exit the caller on any slow
    # extraction; each task is already bounded by the HTTP timeouts and the
    # retry limit, so no extra task-level deadline is imposed.
    |> Task.async_stream(&extract_file(config, &1, opts), timeout: :infinity)
    |> Enum.map(fn
      {:ok, result} ->
        result

      # Emitted instead of crashing when the calling process traps exits.
      {:exit, reason} ->
        {:error, "Extraction task exited: #{inspect(reason)}"}
    end)

  success_count = Enum.count(results, &match?({:ok, _}, &1))
  Logger.info("Batch extraction complete: #{success_count}/#{total} succeeded")
  {:ok, results}
end
# Private helpers

# Posts a JSON extraction request to the server's `/extract` endpoint,
# retrying transport failures via `retry_request/2`, and interprets the
# response with `handle_response/1`. Only the path is sent, not the file's
# bytes.
defp fetch_from_server(config, file_path, mime_type, extraction_config) do
  url = "http://#{config.host}:#{config.port}/extract"
  headers = [{"Content-Type", "application/json"}]

  body =
    Jason.encode!(%{
      file_path: file_path,
      mime_type: mime_type,
      config: extraction_config
    })

  # `:timeout` only bounds connection setup; without `:recv_timeout` the wait
  # for the extraction result stays at HTTPoison's short default.
  request_opts = [timeout: config.timeout_ms, recv_timeout: config.timeout_ms]

  with {:ok, response} <-
         retry_request(config, fn -> HTTPoison.post(url, body, headers, request_opts) end) do
    handle_response(response)
  end
end
# Runs `request_fn` until it returns `{:ok, response}` or `config.max_retries`
# attempts have failed, pausing `config.retry_delay_ms` between attempts.
# Returns `{:ok, response}` or `{:error, "Max retries exceeded"}`.
defp retry_request(config, request_fn) do
  retry_request(config, request_fn, 0)
end

defp retry_request(config, request_fn, attempt) when attempt < config.max_retries do
  case request_fn.() do
    {:ok, response} ->
      {:ok, response}

    {:error, reason} ->
      Logger.warning("Request failed (attempt #{attempt + 1}): #{inspect(reason)}")

      # Pause only when another attempt will actually follow; sleeping after
      # the final failure would just delay the error.
      if attempt + 1 < config.max_retries do
        Process.sleep(config.retry_delay_ms)
      end

      retry_request(config, request_fn, attempt + 1)
  end
end

defp retry_request(_config, _request_fn, _attempt) do
  {:error, "Max retries exceeded"}
end
# Interprets an HTTP response from the server.
#
# A 200 response must carry a JSON object: a truthy "success" field yields
# `{:ok, data}`, otherwise the object's "error" field (or "Unknown error") is
# returned as the error. Any other status yields an error that includes the
# status code and raw body.
defp handle_response(%HTTPoison.Response{status_code: 200, body: body}) do
  case Jason.decode(body) do
    {:ok, %{} = data} ->
      if data["success"] do
        {:ok, data}
      else
        {:error, Map.get(data, "error", "Unknown error")}
      end

    # Valid JSON that is not an object (list, number, ...) has no "success"
    # field to inspect; report it rather than raising on a non-map.
    {:ok, other} ->
      {:error, "Unexpected response format: #{inspect(other)}"}

    {:error, reason} ->
      {:error, "Failed to decode response: #{inspect(reason)}"}
  end
end

defp handle_response(%HTTPoison.Response{status_code: status, body: body}) do
  {:error, "Server error (#{status}): #{body}"}
end
# Derives a cache key as the lowercase hex SHA-256 of the file path, MIME type
# and inspected extraction config, joined with "|". The file's content is not
# part of the key.
defp compute_cache_key(file_path, mime_type, config) do
  fingerprint = Enum.join([file_path, mime_type, inspect(config)], "|")
  digest = :crypto.hash(:sha256, fingerprint)
  Base.encode16(digest, case: :lower)
end
# Looks up a cached result by key.
#
# Returns `{:ok, decoded}` when `<cache_dir>/<cache_key>.json` can be read and
# parsed as JSON, and `:miss` in every other case (file absent, unreadable, or
# not valid JSON), so a damaged cache entry falls back to a fresh fetch.
defp get_from_cache(cache_dir, cache_key) do
  cache_file = Path.join(cache_dir, "#{cache_key}.json")

  # File.read/1 reports failure as `{:error, reason}` (never a bare `:error`),
  # and a missing file is just one of those reasons, so no prior existence
  # check is needed.
  with {:ok, content} <- File.read(cache_file),
       {:ok, cached} <- Jason.decode(content) do
    {:ok, cached}
  else
    _unreadable_or_corrupt -> :miss
  end
end
# Writes `result` as JSON to `<cache_dir>/<cache_key>.json`, creating the
# directory first if needed. Raises if the directory cannot be created, the
# result cannot be encoded, or the file cannot be written.
defp store_in_cache(cache_dir, cache_key, result) do
  File.mkdir_p!(cache_dir)
  target = Path.join(cache_dir, "#{cache_key}.json")
  encoded = Jason.encode!(result)
  File.write!(target, encoded)
end
end
# Usage examples: health check, one cached extraction, then a batch run.
IO.puts("=== Kreuzberg MCP Client ===\n")

# Settings shared by every call below
client_config =
  KreuzbergMCPClient.Config.new(
    host: "localhost",
    port: 8080,
    timeout_ms: 30000,
    max_retries: 3,
    cache_dir: "/tmp/kreuzberg_cache"
  )

# Probe the server before doing any work
IO.puts("Checking server health...")
health_outcome = KreuzbergMCPClient.health_check(client_config)

case health_outcome do
  {:error, why} ->
    IO.puts("Health check failed: #{why}\n")

  {:ok, info} ->
    IO.puts("Server status: #{info["status"]}")
    IO.puts("Service: #{info["service"]}\n")
end

# Single document, with result caching switched on
IO.puts("Extracting document...")
single_outcome = KreuzbergMCPClient.extract_file(client_config, "document.pdf", use_cache: true)

case single_outcome do
  {:error, why} ->
    IO.puts("Extraction failed: #{why}")

  {:ok, extraction} ->
    IO.puts("Success!")
    IO.puts("Content size: #{byte_size(extraction["content"])} bytes")
    IO.puts("MIME type: #{extraction["mime_type"]}")
    IO.puts("Tables found: #{length(extraction["tables"])}")
end

IO.puts("")

# Several documents in one concurrent batch
IO.puts("Batch extracting multiple documents...")
pdf_paths = ~w(doc1.pdf doc2.pdf doc3.pdf)

case KreuzbergMCPClient.batch_extract(client_config, pdf_paths) do
  {:error, why} ->
    IO.puts("Batch extraction failed: #{why}")

  {:ok, outcomes} ->
    IO.puts("Batch extraction complete!")
    ok_total = Enum.count(outcomes, fn outcome -> match?({:ok, _}, outcome) end)
    IO.puts("Successful: #{ok_total}/#{length(outcomes)}")
end
```

View File

@@ -0,0 +1,300 @@
```elixir title="Elixir"
# MCP Server Integration - Start a Kreuzberg MCP server
# Demonstrates how to set up and manage an MCP server for remote document extraction
defmodule KreuzbergMCPServer do
@moduledoc """
MCP (Model Context Protocol) server for Kreuzberg document extraction.
Provides a standardized interface for remote clients to extract documents
using the Kreuzberg library via the Model Context Protocol.
"""
require Logger
alias Kreuzberg.ExtractionConfig
@doc """
Start the MCP server on the specified host and port.

The server accepts extraction requests from MCP clients and returns
structured document data including content, metadata, and extracted elements.

## Options

  * `:host` - Address to bind to, as an IP string or resolvable host name
    (default: "127.0.0.1")
  * `:port` - Server port (default: 8080)
  * `:max_connections` - Maximum concurrent connections (default: 10)

Returns `{:ok, message}` once the listener is running, or `{:error, reason}`
when the host cannot be resolved or the listener cannot be started.
"""
def start_server(opts \\ []) do
  host = Keyword.get(opts, :host, "127.0.0.1")
  port = Keyword.get(opts, :port, 8080)
  max_connections = Keyword.get(opts, :max_connections, 10)
  Logger.info("Starting Kreuzberg MCP server on #{host}:#{port}")

  # Cowboy routes through a *compiled* dispatch table stored under the
  # `:dispatch` key of the `env` map; a raw route list is not usable.
  dispatch =
    :cowboy_router.compile([
      {:_,
       [
         {"/extract", KreuzbergMCPServer.Handler, []},
         {"/extract/file", KreuzbergMCPServer.FileHandler, []},
         {"/health", KreuzbergMCPServer.HealthHandler, []}
       ]}
    ])

  with {:ok, ip} <- resolve_host(host),
       # `:ip` makes the listener bind to the requested host instead of every
       # interface; `:max_connections` is a listener option, not a socket option.
       transport_opts = %{
         socket_opts: [ip: ip, port: port],
         max_connections: max_connections
       },
       {:ok, _pid} <-
         :cowboy.start_clear(:kreuzberg_http, transport_opts, %{env: %{dispatch: dispatch}}) do
    Logger.info("MCP server started successfully")
    {:ok, "Server running on #{host}:#{port}"}
  else
    {:error, reason} ->
      Logger.error("Failed to start MCP server: #{inspect(reason)}")
      {:error, reason}
  end
end

# Turns the `:host` option into an IP tuple: literal IPv4/IPv6 addresses are
# parsed directly, anything else is resolved as an IPv4 host name.
defp resolve_host(host) do
  host_chars = String.to_charlist(host)

  case :inet.parse_address(host_chars) do
    {:ok, ip} -> {:ok, ip}
    {:error, _} -> :inet.getaddr(host_chars, :inet)
  end
end
@doc """
Stop the MCP server's HTTP listener.

Asks Cowboy to stop the `:kreuzberg_http` listener and logs before and
after. The listener's own return value is discarded; this function always
returns `:ok`.
"""
def stop_server do
  Logger.info("Stopping Kreuzberg MCP server")
  _listener_result = :cowboy.stop_listener(:kreuzberg_http)
  Logger.info("MCP server stopped")
  :ok
end
end
# Handler for extraction requests
defmodule KreuzbergMCPServer.Handler do
  @moduledoc """
  HTTP handler for MCP extraction requests.

  Accepts `POST` requests whose JSON body is an object with a required
  `"file_path"` string plus optional `"mime_type"` and `"config"` entries,
  runs `Kreuzberg.extract_file/3` on that path, and replies with JSON.

  Every reply, success or failure, is sent on the request that was actually
  received, and `init/2` always returns `{:ok, req, state}`.
  """
  require Logger

  @json_headers %{"content-type" => "application/json"}

  @doc """
  Cowboy handler entry point: dispatches on the HTTP method, sends the reply,
  and hands the replied request back to Cowboy.
  """
  def init(req, state) do
    {:ok, handle_request(req), state}
  end

  # Only POST is supported; everything else gets a 405.
  defp handle_request(%{method: "POST"} = req), do: handle_extraction(req)
  defp handle_request(req), do: error_response(405, "Method not allowed", req)

  # Reads and decodes the JSON body, then runs the extraction.
  defp handle_extraction(req) do
    # read_body/1 returns either `{:ok, data, req}` (body complete) or
    # `{:more, data, req}` (body exceeds the read limit); it has no error tuple.
    case :cowboy_req.read_body(req) do
      {:ok, body, req} ->
        case Jason.decode(body) do
          {:ok, %{} = params} ->
            extract_from_params(params, req)

          {:ok, _not_an_object} ->
            error_response(400, "Invalid JSON: expected an object", req)

          {:error, reason} ->
            error_response(400, "Invalid JSON: #{inspect(reason)}", req)
        end

      {:more, _partial, req} ->
        # A request that only names a file should never be this large.
        error_response(413, "Request body too large", req)
    end
  end

  # Runs the extraction for a decoded request and replies with its result.
  defp extract_from_params(%{"file_path" => file_path} = params, req)
       when is_binary(file_path) do
    mime_type = Map.get(params, "mime_type")
    config = params |> Map.get("config", %{}) |> build_config()

    # NOTE(security): `file_path` comes straight from the client and is opened
    # on this machine. Restrict it to an allowed directory before exposing
    # this endpoint beyond trusted callers.
    case Kreuzberg.extract_file(file_path, mime_type, config) do
      {:ok, result} ->
        json_reply(
          200,
          %{
            success: true,
            content: result.content,
            mime_type: result.mime_type,
            metadata: result.metadata || %{},
            tables: result.tables || [],
            chunks: result.chunks || [],
            images: result.images || [],
            detected_languages: result.detected_languages || []
          },
          req
        )

      {:error, reason} ->
        error_response(400, "Extraction failed: #{inspect(reason)}", req)
    end
  end

  defp extract_from_params(_params, req) do
    error_response(400, "Missing required parameter: file_path", req)
  end

  # Builds the extraction config from the request's "config" object; any
  # non-map value (including JSON null) means "no config".
  defp build_config(opts) when is_map(opts) do
    %Kreuzberg.ExtractionConfig{
      ocr: opts["ocr"],
      chunking: opts["chunking"],
      quality_processing: opts["quality_processing"],
      language_detection: opts["language_detection"],
      images: opts["images"],
      use_cache: Map.get(opts, "use_cache", true)
    }
  end

  defp build_config(_), do: nil

  # Sends `payload` as a JSON reply and returns the replied request.
  defp json_reply(status, payload, req) do
    :cowboy_req.reply(status, @json_headers, Jason.encode!(payload), req)
  end

  # Sends the standard failure envelope and returns the replied request.
  defp error_response(status, message, req) do
    json_reply(status, %{success: false, error: message}, req)
  end
end
# Health check handler
defmodule KreuzbergMCPServer.HealthHandler do
  @moduledoc """
  Health check endpoint for the MCP server.

  Replies to every request with a 200 JSON object containing a fixed status
  and service name plus the current UTC time in ISO 8601 form.
  """

  def init(req, state) do
    now_iso = DateTime.to_iso8601(DateTime.utc_now())

    payload = %{
      status: "healthy",
      service: "kreuzberg-mcp",
      timestamp: now_iso
    }

    headers = %{"content-type" => "application/json"}
    replied = :cowboy_req.reply(200, headers, Jason.encode!(payload), req)
    {:ok, replied, state}
  end
end
# File upload handler
defmodule KreuzbergMCPServer.FileHandler do
  @moduledoc """
  Handler for raw file uploads for extraction.

  The request body is taken verbatim as the file's bytes (no multipart
  parsing), written to a temporary file, passed to
  `Kreuzberg.extract_file/1`, and the temporary file is removed afterwards
  whether or not extraction succeeds. Replies are JSON.
  """
  require Logger

  @json_headers %{"content-type" => "application/json"}

  @doc """
  Cowboy handler entry point. Only `POST` is accepted; other methods receive
  a 405. Always returns `{:ok, req, state}`.
  """
  def init(%{method: "POST"} = req, state) do
    {:ok, handle_file_upload(req), state}
  end

  def init(req, state) do
    {:ok, :cowboy_req.reply(405, %{}, "Method not allowed", req), state}
  end

  # Stores the upload in a temp file, extracts it, replies, and cleans up.
  defp handle_file_upload(req) do
    {body, req} = read_full_body(req, [])

    temp_path =
      Path.join(System.tmp_dir!(), "kreuzberg_#{System.unique_integer([:positive])}")

    try do
      case File.write(temp_path, body) do
        :ok ->
          extract_and_reply(temp_path, req)

        {:error, reason} ->
          Logger.error("Could not store upload at #{temp_path}: #{inspect(reason)}")
          json_reply(500, %{success: false, error: inspect(reason)}, req)
      end
    after
      # Runs on every path, including a raise inside extraction, so the
      # temporary file is never left behind.
      File.rm(temp_path)
    end
  end

  # Runs the extraction on the stored upload and sends the JSON reply.
  defp extract_and_reply(temp_path, req) do
    case Kreuzberg.extract_file(temp_path) do
      {:ok, result} ->
        json_reply(
          200,
          %{
            success: true,
            content_size: byte_size(result.content),
            mime_type: result.mime_type,
            metadata: result.metadata
          },
          req
        )

      {:error, reason} ->
        json_reply(400, %{success: false, error: inspect(reason)}, req)
    end
  end

  # Collects the whole request body as iodata. read_body/1 returns
  # `{:more, chunk, req}` while data remains and `{:ok, chunk, req}` for the
  # final part, so a single call is not enough for large uploads.
  # NOTE(review): the full upload is held in memory; consider a size cap.
  defp read_full_body(req, acc) do
    case :cowboy_req.read_body(req) do
      {:ok, chunk, req} -> {[acc, chunk], req}
      {:more, chunk, req} -> read_full_body(req, [acc, chunk])
    end
  end

  # Sends `payload` as a JSON reply and returns the replied request.
  defp json_reply(status, payload, req) do
    :cowboy_req.reply(status, @json_headers, Jason.encode!(payload), req)
  end
end
# Usage example - boot the server and block until interrupted
IO.puts("=== Kreuzberg MCP Server ===\n")

startup_outcome = KreuzbergMCPServer.start_server(port: 8080)

case startup_outcome do
  {:error, reason} ->
    IO.puts("Failed to start server: #{inspect(reason)}")

  {:ok, message} ->
    IO.puts(message)
    IO.puts("\nServer is running and ready to accept requests:")

    Enum.each(
      [
        " - POST /extract - Extract from file path",
        " - POST /extract/file - Upload and extract",
        " - GET /health - Health check"
      ],
      &IO.puts/1
    )

    # Park this process so the listener stays up
    IO.puts("\nServer started. Press Ctrl+C to stop.")
    Process.sleep(:infinity)
end
```