Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,54 @@
```elixir title="Elixir"
# Extract from multiple binary inputs in a batch operation.
# Useful for processing multiple documents that are already in memory.

# Prepare binary data from multiple sources.
# The assertive `{:ok, _} =` matches make the script stop right away if a file cannot be read.
{:ok, pdf_data_1} = File.read("document1.pdf")
{:ok, pdf_data_2} = File.read("document2.pdf")
{:ok, pdf_data_3} = File.read("document3.pdf")
data_list = [pdf_data_1, pdf_data_2, pdf_data_3]

# Option 1: Use a single MIME type for all inputs
{:ok, results} = Kreuzberg.batch_extract_bytes(data_list, "application/pdf")

# Process results
Enum.each(results, fn result ->
  # byte_size/1 counts bytes, not characters, so the label says "bytes"
  IO.puts("Content length: #{byte_size(result.content)} bytes")
  IO.puts("MIME type: #{result.mime_type}")
  IO.puts("Tables found: #{length(result.tables)}")
  IO.puts("---")
end)

IO.puts("Total documents processed: #{length(results)}")

# Option 2: Use a different MIME type for each input.
# Each payload has to really be of the type declared for it, so this batch
# combines two of the PDFs with an HTML document instead of labelling a PDF as HTML.
{:ok, html_data} = File.read("page.html")
mixed_data_list = [pdf_data_1, html_data, pdf_data_3]
mime_types = ["application/pdf", "text/html", "application/pdf"]
{:ok, mixed_results} = Kreuzberg.batch_extract_bytes(mixed_data_list, mime_types)
IO.puts("Mixed batch processed: #{length(mixed_results)} documents")

# Option 3: Batch extraction with configuration
config = %Kreuzberg.ExtractionConfig{
  ocr: %{"enabled" => true, "backend" => "tesseract"},
  extract_images: true
}

case Kreuzberg.batch_extract_bytes(data_list, "application/pdf", config) do
  {:ok, results} ->
    IO.puts("Successfully extracted #{length(results)} documents")

    Enum.each(results, fn result ->
      # Print only a short preview of each document
      IO.puts("Content: #{String.slice(result.content, 0..100)}...")
    end)

  {:error, reason} ->
    IO.puts("Batch extraction failed: #{reason}")
end

# Option 4: Using the bang variant (raises on error)
try do
  results = Kreuzberg.batch_extract_bytes!(data_list, "application/pdf")
  IO.puts("Extracted #{length(results)} documents successfully")
rescue
  error in Kreuzberg.Error ->
    IO.puts("Error: #{error.message}")
end
```

View File

@@ -0,0 +1,14 @@
```elixir title="Elixir"
# Extract several files with one batch call and print a short summary of each result.
file_paths = ["document1.pdf", "document2.pdf", "document3.pdf"]

# Assertive match: the script stops here if the batch call returns an error tuple.
{:ok, results} = Kreuzberg.batch_extract_files(file_paths)

Enum.each(results, fn result ->
  # This line prints the result's MIME type, not a file path, so it is labelled accordingly
  IO.puts("MIME type: #{result.mime_type}")
  # byte_size/1 counts bytes, not characters
  IO.puts("Content length: #{byte_size(result.content)} bytes")
  IO.puts("Tables: #{length(result.tables)}")
  IO.puts("---")
end)

IO.puts("Total files processed: #{length(results)}")
```

View File

@@ -0,0 +1,163 @@
```elixir title="Elixir"
# Client wrapper for batch file extraction
# Provides error handling, logging, and result aggregation
defmodule BatchDocumentClient do
  @moduledoc """
  Client wrapper for batch file document extraction.

  Wraps `Kreuzberg.batch_extract_files/3` with logging, aggregate statistics,
  and a helper for transforming every extraction result.
  """

  # Logger's functions are macros, so the module must be required before use.
  require Logger

  alias Kreuzberg.ExtractionResult

  @doc """
  Extracts content from multiple files in a single batch call.

  The return value of `Kreuzberg.batch_extract_files/3` is passed through
  unchanged: `{:ok, results}` on success or `{:error, reason}` on failure.
  A success is logged at debug level; a failure is logged at error level
  unless `:log_errors` is disabled.

  ## Options

    * `:mime_type` - MIME type for all files (optional; `nil` is passed when omitted)
    * `:config` - ExtractionConfig struct with options (optional; `nil` is passed when omitted)
    * `:log_errors` - Whether to log extraction errors (default: true)

  ## Examples

      {:ok, results} = BatchDocumentClient.extract_files(
        ["doc1.pdf", "doc2.pdf", "doc3.pdf"],
        mime_type: "application/pdf"
      )
  """
  @spec extract_files([String.t()], keyword()) ::
          {:ok, [ExtractionResult.t()]} | {:error, String.t()}
  def extract_files(paths, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    config = Keyword.get(opts, :config)
    log_errors? = Keyword.get(opts, :log_errors, true)

    case Kreuzberg.batch_extract_files(paths, mime_type, config) do
      {:ok, results} ->
        Logger.debug("Successfully extracted #{length(results)} files")
        {:ok, results}

      {:error, reason} ->
        if log_errors?, do: Logger.error("Batch extraction error: #{reason}")
        {:error, reason}
    end
  end

  @doc """
  Extracts files and returns aggregated statistics.

  On success returns `{:ok, stats}` where `stats` is a map with the keys
  `:total_files`, `:total_content_size` (bytes), `:total_tables`,
  `:total_images`, `:processing_time_ms`, `:avg_time_per_file_ms`, and
  `:results` (the raw extraction results). Errors from `extract_files/2`
  are returned unchanged.

  Accepts the same options as `extract_files/2`.
  """
  @spec extract_files_with_stats([String.t()], keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_files_with_stats(paths, opts \\ []) do
    start_time = System.monotonic_time(:millisecond)

    # `extract_files/2` only returns `{:ok, _}` or `{:error, _}`, so the error
    # tuple falls through `with` untouched.
    with {:ok, results} <- extract_files(paths, opts) do
      elapsed_ms = System.monotonic_time(:millisecond) - start_time
      file_count = length(results)

      {:ok,
       %{
         total_files: file_count,
         total_content_size: sum_by(results, &byte_size(&1.content)),
         total_tables: sum_by(results, &length(&1.tables)),
         # `images` may be nil, in which case it counts as zero images
         total_images: sum_by(results, &length(&1.images || [])),
         processing_time_ms: elapsed_ms,
         # max/2 guards against division by zero for an empty batch
         avg_time_per_file_ms: div(elapsed_ms, max(file_count, 1)),
         results: results
       }}
    end
  end

  @doc """
  Extracts files and applies `transform_fn` to every extraction result.

  Returns `{:ok, transformed}` with the transformed values in result order.
  If `transform_fn` raises for any result, the exception is logged and
  `{:error, "Transform failed: ..."}` describing the first failure is
  returned. Errors from `extract_files/2` are returned unchanged.

  Accepts the same options as `extract_files/2`.
  """
  @spec extract_and_transform([String.t()], (ExtractionResult.t() -> any()), keyword()) ::
          {:ok, [any()]} | {:error, String.t()}
  def extract_and_transform(paths, transform_fn, opts \\ []) do
    with {:ok, results} <- extract_files(paths, opts) do
      results
      |> Enum.map(&safe_transform(&1, transform_fn))
      |> collect_transformed()
    end
  end

  # Sums the integer `fun` yields for each result.
  defp sum_by(results, fun) do
    Enum.reduce(results, 0, fn result, total -> fun.(result) + total end)
  end

  # Runs the caller-supplied transform, turning a raised exception into an error tuple.
  # The broad rescue is deliberate: `transform_fn` is arbitrary caller code.
  defp safe_transform(result, transform_fn) do
    {:ok, transform_fn.(result)}
  rescue
    error ->
      Logger.error("Transform error: #{inspect(error)}")
      {:error, error}
  end

  # Unwraps the transformed values, or reports the first failed transform.
  defp collect_transformed(outcomes) do
    case Enum.find(outcomes, &match?({:error, _}, &1)) do
      nil ->
        {:ok, Enum.map(outcomes, fn {:ok, value} -> value end)}

      {:error, error} ->
        {:error, "Transform failed: #{inspect(error)}"}
    end
  end
end
# Usage examples

# Extract multiple files and report the size of each extracted document
batch_outcome = BatchDocumentClient.extract_files(["doc1.pdf", "doc2.pdf", "doc3.pdf"])

case batch_outcome do
  {:error, why} ->
    IO.puts("Error: #{why}")

  {:ok, documents} ->
    Enum.each(documents, &IO.puts("Extracted: #{byte_size(&1.content)} bytes"))
end

# Extract with statistics and print the aggregate numbers
stats_outcome = BatchDocumentClient.extract_files_with_stats(["doc1.pdf", "doc2.pdf"])

case stats_outcome do
  {:error, why} ->
    IO.puts("Error: #{why}")

  {:ok, summary} ->
    IO.puts("Total files: #{summary.total_files}")
    IO.puts("Total size: #{summary.total_content_size} bytes")
    IO.puts("Processing time: #{summary.processing_time_ms}ms")
end

# Extract and transform: reduce every result to a small summary map
summarize = fn extraction ->
  %{
    mime: extraction.mime_type,
    size: byte_size(extraction.content),
    tables: length(extraction.tables)
  }
end

transform_outcome =
  BatchDocumentClient.extract_and_transform(["doc1.pdf", "doc2.pdf"], summarize)

case transform_outcome do
  {:error, why} ->
    IO.puts("Error: #{why}")

  {:ok, summaries} ->
    IO.inspect(summaries)
end
```

View File

@@ -0,0 +1,107 @@
```elixir title="Elixir"
# Reusable client pattern for single file extraction
# Encapsulates extraction logic with consistent error handling
defmodule DocumentClient do
  @moduledoc """
  Client wrapper for single file document extraction.

  Provides a consistent interface around `Kreuzberg.extract_file/3` and
  `Kreuzberg.extract_file!/3`, adding logging and basic timing statistics.
  """

  # Logger's functions are macros, so the module must be required before use.
  require Logger

  alias Kreuzberg.ExtractionResult

  @doc """
  Extracts content from a single file.

  The return value of `Kreuzberg.extract_file/3` is passed through unchanged:
  `{:ok, result}` on success or `{:error, reason}` on failure. A success is
  logged at debug level and a failure at error level.

  ## Options

    * `:mime_type` - MIME type of the file (optional; `nil` is passed when omitted)
    * `:config` - ExtractionConfig struct with options (optional; `nil` is passed when omitted)

  ## Examples

      {:ok, result} = DocumentClient.extract_file("document.pdf")
      {:ok, result} = DocumentClient.extract_file("document.pdf", mime_type: "application/pdf")
  """
  @spec extract_file(String.t(), keyword()) ::
          {:ok, ExtractionResult.t()} | {:error, String.t()}
  def extract_file(path, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    config = Keyword.get(opts, :config)

    case Kreuzberg.extract_file(path, mime_type, config) do
      {:ok, result} ->
        Logger.debug("Successfully extracted file: #{path}")
        {:ok, result}

      {:error, reason} ->
        Logger.error("Failed to extract file: #{path} - #{reason}")
        {:error, reason}
    end
  end

  @doc """
  Extracts content from a file, raising on error.

  Delegates to `Kreuzberg.extract_file!/3` with the same `:mime_type` and
  `:config` options as `extract_file/2`; any exception it raises propagates
  to the caller.
  """
  @spec extract_file!(String.t(), keyword()) :: ExtractionResult.t()
  def extract_file!(path, opts \\ []) do
    mime_type = Keyword.get(opts, :mime_type)
    config = Keyword.get(opts, :config)
    Kreuzberg.extract_file!(path, mime_type, config)
  end

  @doc """
  Extracts a file and returns its content together with processing statistics.

  On success returns `{:ok, map}` with the keys `:content`, `:mime_type`,
  `:metadata`, `:table_count`, `:image_count`, and `:processing_time_ms`.
  Errors from `extract_file/2` are returned unchanged.

  Accepts the same options as `extract_file/2`.
  """
  @spec extract_with_stats(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_with_stats(path, opts \\ []) do
    start_time = System.monotonic_time(:millisecond)

    # `extract_file/2` only returns `{:ok, _}` or `{:error, _}`, so the error
    # tuple falls through `with` untouched.
    with {:ok, result} <- extract_file(path, opts) do
      elapsed_ms = System.monotonic_time(:millisecond) - start_time

      {:ok,
       %{
         content: result.content,
         mime_type: result.mime_type,
         metadata: result.metadata,
         table_count: length(result.tables),
         # `images` may be nil, in which case it counts as zero images
         image_count: length(result.images || []),
         processing_time_ms: elapsed_ms
       }}
    end
  end
end
# Usage examples

# Plain extraction: report the extracted size or the failure reason
file_outcome = DocumentClient.extract_file("document.pdf")

case file_outcome do
  {:error, why} ->
    IO.puts("Extraction failed: #{why}")

  {:ok, extraction} ->
    IO.puts("Content length: #{byte_size(extraction.content)} bytes")
end

# Extract with statistics
stats_outcome = DocumentClient.extract_with_stats("document.pdf")

case stats_outcome do
  {:error, why} ->
    IO.puts("Error: #{why}")

  {:ok, summary} ->
    IO.puts("Processing time: #{summary.processing_time_ms}ms")
    IO.puts("Tables found: #{summary.table_count}")
end
```

View File

@@ -0,0 +1,247 @@
```elixir title="Elixir"
# Client wrapper with comprehensive configuration
# Demonstrates advanced extraction patterns with OCR, chunking, and validation
defmodule ConfiguredDocumentClient do
  @moduledoc """
  Document extraction helpers built around preconfigured `ExtractionConfig` structs.

  Covers OCR, chunking, language detection, a combined "full" configuration,
  and a validate-then-extract entry point.
  """

  alias Kreuzberg.{ExtractionConfig, ExtractionResult}

  @doc """
  Runs an extraction with OCR switched on.

  `input` is treated as a file path when it is a binary naming an existing
  file; otherwise it is handed to `Kreuzberg.extract/3` as raw data, with the
  MIME type falling back to `"application/pdf"` when none is given.

  ## Options

    * `:ocr_backend` - OCR backend name (default: `"tesseract"`)
    * `:force_ocr` - value for the config's `force_ocr` field (default: `false`)
    * `:mime_type` - MIME type of the input (optional)
  """
  @spec extract_with_ocr(String.t() | binary(), keyword()) ::
          {:ok, ExtractionResult.t()} | {:error, String.t()}
  def extract_with_ocr(input, opts \\ []) do
    ocr_config = %ExtractionConfig{
      ocr: ocr_settings(true, opts),
      force_ocr: Keyword.get(opts, :force_ocr, false)
    }

    declared_mime = mime_option(opts)

    if is_binary(input) and File.exists?(input) do
      Kreuzberg.extract_file(input, declared_mime, ocr_config)
    else
      Kreuzberg.extract(input, declared_mime || "application/pdf", ocr_config)
    end
  end

  @doc """
  Extracts a file with text chunking configured.

  On success returns `{:ok, map}` with `:content`, `:chunks`, `:chunk_count`,
  and `:metadata`; a nil `chunks` field on the result is reported as an empty
  list. Errors are returned unchanged.

  ## Options

    * `:chunk_size` - value for `"max_characters"` (default: 1000)
    * `:chunk_overlap` - value for `"overlap"` (default: 100)
    * `:mime_type` - MIME type of the file (optional)
  """
  @spec extract_with_chunking(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_with_chunking(path, opts \\ []) do
    chunk_config = %ExtractionConfig{chunking: chunking_settings(opts)}

    case Kreuzberg.extract_file(path, mime_option(opts), chunk_config) do
      {:error, reason} ->
        {:error, reason}

      {:ok, result} ->
        pieces = result.chunks || []

        {:ok,
         %{
           content: result.content,
           chunks: pieces,
           chunk_count: length(pieces),
           metadata: result.metadata
         }}
    end
  end

  @doc """
  Extracts a file with language detection and image extraction enabled.

  On success returns `{:ok, map}` with `:content`, `:detected_languages`,
  `:mime_type`, `:tables`, and `:images`; nil language or image fields are
  reported as empty lists. Errors are returned unchanged.

  ## Options

    * `:mime_type` - MIME type of the file (optional)
  """
  @spec extract_with_language_detection(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_with_language_detection(path, opts \\ []) do
    language_config = %ExtractionConfig{
      language_detection: %{"enabled" => true},
      extract_images: true
    }

    case Kreuzberg.extract_file(path, mime_option(opts), language_config) do
      {:error, reason} ->
        {:error, reason}

      {:ok, result} ->
        {:ok,
         %{
           content: result.content,
           detected_languages: result.detected_languages || [],
           mime_type: result.mime_type,
           tables: result.tables,
           images: result.images || []
         }}
    end
  end

  @doc """
  Extracts a file with OCR, chunking, language detection, caching, and
  image/table extraction all set from options.

  On success returns `{:ok, summary}`, a map describing the extraction
  (path, MIME type, content length in bytes, a content preview, detected
  languages, table/image/chunk counts, and metadata). On failure returns
  `{:error, "Extraction failed: ..."}`.

  ## Options

    * `:ocr_enabled` (default: `true`), `:ocr_backend` (default: `"tesseract"`)
    * `:chunk_size` (default: 1000), `:chunk_overlap` (default: 100)
    * `:detect_language` (default: `true`)
    * `:use_cache` (default: `true`)
    * `:extract_images` (default: `true`)
    * `:mime_type` - MIME type of the file (optional)
  """
  @spec extract_with_full_config(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_with_full_config(path, opts \\ []) do
    full_config = %ExtractionConfig{
      ocr: ocr_settings(Keyword.get(opts, :ocr_enabled, true), opts),
      chunking: chunking_settings(opts),
      language_detection: %{"enabled" => Keyword.get(opts, :detect_language, true)},
      use_cache: Keyword.get(opts, :use_cache, true),
      extract_images: Keyword.get(opts, :extract_images, true),
      extract_tables: true
    }

    case Kreuzberg.extract_file(path, mime_option(opts), full_config) do
      {:error, reason} ->
        {:error, "Extraction failed: #{reason}"}

      {:ok, result} ->
        {:ok, summarize(path, result)}
    end
  end

  @doc """
  Checks that a file can be extracted.

  Returns `:ok` when the file exists and
  `Kreuzberg.detect_mime_type_from_path/1` succeeds for it; otherwise returns
  `{:error, message}` describing which of the two checks failed.
  """
  @spec validate_file(String.t()) :: :ok | {:error, String.t()}
  def validate_file(path) do
    if File.exists?(path) do
      case Kreuzberg.detect_mime_type_from_path(path) do
        {:error, reason} ->
          {:error, "Cannot determine MIME type: #{reason}"}

        {:ok, _mime_type} ->
          :ok
      end
    else
      {:error, "File not found: #{path}"}
    end
  end

  @doc """
  Validates the file, then runs `extract_with_full_config/2`.

  Returns the first `{:error, message}` produced by either step, or the
  `{:ok, summary}` from the extraction. Accepts the same options as
  `extract_with_full_config/2`.
  """
  @spec extract_safely(String.t(), keyword()) ::
          {:ok, map()} | {:error, String.t()}
  def extract_safely(path, opts \\ []) do
    # Both steps only produce :ok / {:ok, _} / {:error, _}, so a failed match
    # is always an error tuple and is returned as-is.
    with :ok <- validate_file(path) do
      extract_with_full_config(path, opts)
    end
  end

  # Reads the optional MIME type; nil when the caller did not supply one.
  defp mime_option(opts), do: Keyword.get(opts, :mime_type)

  # Builds the OCR settings map with the given enabled flag.
  defp ocr_settings(enabled, opts) do
    %{
      "enabled" => enabled,
      "backend" => Keyword.get(opts, :ocr_backend, "tesseract")
    }
  end

  # Builds the chunking settings map from the chunk options.
  defp chunking_settings(opts) do
    %{
      "max_characters" => Keyword.get(opts, :chunk_size, 1000),
      "overlap" => Keyword.get(opts, :chunk_overlap, 100)
    }
  end

  # Condenses an extraction result into the summary map returned by
  # extract_with_full_config/2.
  defp summarize(path, result) do
    %{
      file_path: path,
      mime_type: result.mime_type,
      content_length: byte_size(result.content),
      content_preview: String.slice(result.content, 0..200),
      detected_languages: result.detected_languages || [],
      table_count: length(result.tables),
      image_count: length(result.images || []),
      chunk_count: length(result.chunks || []),
      metadata: result.metadata
    }
  end
end
# Usage examples

# Extract with OCR and show a short preview of the recognised text
ocr_outcome =
  ConfiguredDocumentClient.extract_with_ocr("scanned_document.pdf", ocr_backend: "tesseract")

case ocr_outcome do
  {:error, why} ->
    IO.puts("Error: #{why}")

  {:ok, extraction} ->
    IO.puts("OCR extraction successful")
    IO.puts("Content: #{String.slice(extraction.content, 0..100)}...")
end

# Extract with chunking
chunk_options = [chunk_size: 500, chunk_overlap: 50]

case ConfiguredDocumentClient.extract_with_chunking("document.pdf", chunk_options) do
  {:error, why} ->
    IO.puts("Error: #{why}")

  {:ok, chunked} ->
    IO.puts("Chunks: #{chunked.chunk_count}")
end

# Extract with language detection
case ConfiguredDocumentClient.extract_with_language_detection("multilingual.pdf") do
  {:error, why} ->
    IO.puts("Error: #{why}")

  {:ok, detected} ->
    IO.puts("Detected languages: #{inspect(detected.detected_languages)}")
end

# Comprehensive extraction with validation
safe_options = [
  ocr_enabled: true,
  detect_language: true,
  extract_images: true,
  use_cache: true
]

case ConfiguredDocumentClient.extract_safely("document.pdf", safe_options) do
  {:error, why} ->
    IO.puts("Error: #{why}")

  {:ok, report} ->
    IO.puts("File: #{report.file_path}")
    IO.puts("MIME: #{report.mime_type}")
    IO.puts("Size: #{report.content_length} bytes")
    IO.puts("Tables: #{report.table_count}")
    IO.puts("Languages: #{inspect(report.detected_languages)}")
end
```

View File

@@ -0,0 +1,31 @@
```elixir title="Elixir"
# Example: Handling extraction errors
case Kreuzberg.extract_file("document.pdf") do
  {:ok, result} ->
    IO.puts("Successfully extracted content")
    # byte_size/1 counts bytes, not characters
    IO.puts("Content length: #{byte_size(result.content)} bytes")

  {:error, reason} ->
    IO.puts("Extraction failed: #{reason}")
end

# Example: Handling with custom error message
result = Kreuzberg.extract_file("nonexistent.pdf")

case result do
  # The payload is not used here; the underscore prefix avoids an unused-variable warning
  {:ok, _data} ->
    IO.puts("File processed successfully")

  {:error, error} ->
    # inspect/1 renders the error whatever its shape
    IO.puts("Error details: #{inspect(error)}")
end

# Example: Extract with pattern matching
# An empty binary is passed on purpose so the error branches can be demonstrated.
case Kreuzberg.extract(<<>>, "application/pdf") do
  {:ok, result} ->
    IO.puts("Content: #{result.content}")

  # String reasons can be interpolated directly
  {:error, msg} when is_binary(msg) ->
    IO.puts("Validation error: #{msg}")

  # Anything else is rendered with inspect/1
  {:error, reason} ->
    IO.puts("Unknown error: #{inspect(reason)}")
end
```

View File

@@ -0,0 +1,13 @@
```elixir title="Elixir"
# Load the document into memory; the assertive match stops the script if the read fails
{:ok, pdf_bytes} = File.read("document.pdf")

# Run the extraction on the in-memory binary, declaring its MIME type explicitly
{:ok, extraction} = Kreuzberg.extract(pdf_bytes, "application/pdf")

IO.puts("Extracted content:")
IO.puts(extraction.content)
IO.puts("MIME type: #{extraction.mime_type}")
IO.puts("Tables found: #{length(extraction.tables)}")
```

View File

@@ -0,0 +1,12 @@
```elixir title="Elixir"
# Start the extraction in a task and wait for its result.
task = Kreuzberg.extract_file_async("document.pdf")

# Task.await/1 uses the default 5000 ms timeout and exits the caller if it is exceeded;
# pass a larger timeout as the second argument for slow documents.
{:ok, result} = Task.await(task)

content = result.content
table_count = length(result.tables)
metadata = result.metadata

# byte_size/1 counts bytes, not characters
IO.puts("Content length: #{byte_size(content)} bytes")
IO.puts("Tables: #{table_count}")
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
```

View File

@@ -0,0 +1,11 @@
```elixir title="Elixir"
# Extract a single file; the assertive match stops the script if extraction fails.
{:ok, result} = Kreuzberg.extract_file("document.pdf")

content = result.content
table_count = length(result.tables)
metadata = result.metadata

# byte_size/1 counts bytes, not characters
IO.puts("Content length: #{byte_size(content)} bytes")
IO.puts("Tables: #{table_count}")
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
```