Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,92 @@
# Utility functions for working with chunks
defmodule ChunkingUtils do
  @moduledoc """
  Helpers for inspecting and reshaping lists of chunk maps.

  Every function reads a chunk's text from the `"content"` key. A missing
  or `nil` value is treated as an empty string.
  """

  @doc """
  Calculate statistics for a list of chunks.

  Returns a map with `:min`, `:max`, `:avg` and `:count`, where the sizes are
  grapheme counts of each chunk's `"content"` and `:avg` uses integer
  division. An empty list yields all zeros, so callers always get the same
  map shape.
  """
  def chunk_size_stats([]), do: %{min: 0, max: 0, avg: 0, count: 0}

  def chunk_size_stats(chunks) when is_list(chunks) do
    sizes = Enum.map(chunks, &chunk_text_length/1)
    {smallest, largest} = Enum.min_max(sizes)
    count = length(sizes)

    %{
      min: smallest,
      max: largest,
      avg: div(Enum.sum(sizes), count),
      count: count
    }
  end

  @doc """
  Filter chunks by a minimum size threshold.

  Keeps only the chunks whose `"content"` length is at least `min_size`.
  """
  def filter_by_min_size(chunks, min_size) do
    Enum.filter(chunks, &(chunk_text_length(&1) >= min_size))
  end

  @doc """
  Merge adjacent chunks if they are below a size threshold.

  Walks the list left to right. Whenever the most recently emitted chunk is
  shorter than `threshold`, the next chunk is folded into it: the texts are
  joined with a single space and every other field is taken from the later
  chunk. A short chunk at the very end of the list has nothing to merge into
  and is returned unchanged.
  """
  def merge_small_chunks(chunks, threshold) do
    chunks
    |> Enum.reduce([], &merge_step(&1, &2, threshold))
    |> Enum.reverse()
  end

  @doc """
  Group chunks by document section (if metadata contains section info).

  Chunks without `"metadata"` or without a `"section"` entry are grouped
  under `"general"`.
  """
  def group_by_section(chunks) do
    Enum.group_by(chunks, fn chunk ->
      # `|| %{}` guards against an explicit `"metadata" => nil`.
      metadata = Map.get(chunk, "metadata") || %{}
      Map.get(metadata, "section", "general")
    end)
  end

  # One reduction step of merge_small_chunks/2. The accumulator is kept in
  # reverse order, so its head is the chunk emitted last.
  defp merge_step(chunk, [], _threshold), do: [chunk]

  defp merge_step(chunk, [last | rest] = acc, threshold) do
    if chunk_text_length(last) < threshold do
      merged_text = "#{last["content"]} #{chunk["content"]}"
      [Map.put(chunk, "content", merged_text) | rest]
    else
      [chunk | acc]
    end
  end

  # Grapheme length of a chunk's text; missing or nil content counts as 0.
  defp chunk_text_length(chunk) do
    String.length(Map.get(chunk, "content") || "")
  end
end
# Example usage
# Build the extraction config with chunking switched on.
chunking_opts = %{"enabled" => true, "max_characters" => 1000}
config = %Kreuzberg.ExtractionConfig{chunking: chunking_opts}

# Extract the document; a missing chunk list is treated as empty.
{:ok, result} = Kreuzberg.extract_file("doc.pdf", nil, config)
chunks = result.chunks || []

IO.puts("=== Chunk Statistics ===")
chunks |> ChunkingUtils.chunk_size_stats() |> IO.inspect()

IO.puts("\n=== Chunks by Section ===")
chunks |> ChunkingUtils.group_by_section() |> IO.inspect()

IO.puts("\n=== Filtering chunks >= 500 chars ===")
chunks |> ChunkingUtils.filter_by_min_size(500) |> IO.inspect()

IO.puts("\n=== Merging small chunks < 200 chars ===")
chunks |> ChunkingUtils.merge_small_chunks(200) |> IO.inspect()

View File

@@ -0,0 +1,31 @@
<!-- snippet:syntax-only -->
```elixir
# Chunking configuration with an embedding model attached to every chunk.
embedding_opts = %{
  "model" => %{"preset" => %{"name" => "balanced"}},
  "normalize" => true,
  "batch_size" => 16
}

config_json =
  Jason.encode!(%{
    "chunking" => %{
      "max_characters" => 500,
      "overlap" => 50,
      "embedding" => embedding_opts
    }
  })

{:ok, json} = Kreuzberg.extract_file_async("research_paper.pdf", nil, config_json)
result = Jason.decode!(json)

# Keep only chunks that carry a list-valued embedding and summarise each one.
chunks_with_embeddings =
  (result["chunks"] || [])
  |> Enum.filter(fn chunk -> is_list(chunk["embedding"]) end)
  |> Enum.map(fn chunk ->
    %{
      content: String.slice(chunk["content"] || "", 0, 100),
      embedding_dims: length(chunk["embedding"])
    }
  end)

IO.puts("Chunks with embeddings: #{length(chunks_with_embeddings)}")
```

View File

@@ -0,0 +1,21 @@
<!-- snippet:syntax-only -->
```elixir
# Embedding settings applied to each chunk.
embedding_opts = %{
  "model" => %{"preset" => %{"name" => "balanced"}},
  "normalize" => true,
  "batch_size" => 32,
  "show_download_progress" => false
}

chunking_opts = %{
  "max_characters" => 1024,
  "overlap" => 100,
  "embedding" => embedding_opts
}

config_json = Jason.encode!(%{"chunking" => chunking_opts})

{:ok, json} = Kreuzberg.extract_file_async("document.pdf", nil, config_json)
result = Jason.decode!(json)

# A missing "chunks" entry is reported as zero chunks.
chunk_count = length(result["chunks"] || [])
IO.puts("chunks: #{chunk_count}")
```

View File

@@ -0,0 +1,155 @@
# Error handling patterns for Kreuzberg extraction
defmodule ErrorHandlingUtils do
  @moduledoc """
  Error handling patterns around `Kreuzberg.extract_file/3`: retrying,
  batch extraction, result validation, error formatting and metric logging.
  """

  @doc """
  Safely extract a file with retry logic and error recovery.

  Calls `Kreuzberg.extract_file/3` up to `max_retries` times. Each failure is
  printed, and between attempts the process sleeps `100 * attempt`
  milliseconds (linear backoff). No sleep happens after the final attempt.

  Returns `{:ok, result}` on the first success, or
  `{:error, "Max retries (N) exceeded"}` once every attempt has failed.
  """
  def extract_with_retry(file_path, config, max_retries \\ 3) do
    do_extract_with_retry(file_path, config, max_retries, 1)
  end

  defp do_extract_with_retry(_file_path, _config, max_retries, attempt)
       when attempt > max_retries do
    {:error, "Max retries (#{max_retries}) exceeded"}
  end

  defp do_extract_with_retry(file_path, config, max_retries, attempt) do
    case Kreuzberg.extract_file(file_path, nil, config) do
      {:ok, result} ->
        {:ok, result}

      {:error, reason} ->
        IO.puts("Attempt #{attempt} failed: #{inspect(reason)}")

        # Linear backoff; skipped when no further attempt will follow so the
        # caller is not delayed just to receive the error.
        if attempt < max_retries, do: Process.sleep(100 * attempt)

        do_extract_with_retry(file_path, config, max_retries, attempt + 1)
    end
  end

  @doc """
  Extract multiple files and collect results and errors separately.

  Returns `%{successes: [{file, data}], failures: [{file, reason}]}` with
  both lists in the same order as `files`.
  """
  def extract_multiple(files, config) do
    {succeeded, failed} =
      files
      |> Enum.map(fn file -> {file, Kreuzberg.extract_file(file, nil, config)} end)
      |> Enum.split_with(fn {_file, outcome} -> match?({:ok, _}, outcome) end)

    %{
      successes: Enum.map(succeeded, fn {file, {:ok, data}} -> {file, data} end),
      failures: Enum.map(failed, fn {file, {:error, reason}} -> {file, reason} end)
    }
  end

  @doc """
  Validate extraction result and return detailed error information.

  `result` is the tagged tuple returned by an extraction call. For
  `{:ok, data}` every entry of `required_fields` must be present as a key of
  `data`. A field name matches a key when they are equal, or when one is an
  atom and the other is the string with the same name, so string field names
  also work against atom-keyed results.

  Returns `{:ok, data}`, `{:error, "Missing required fields: [...]"}`, or
  `{:error, message}` with the original error passed through `format_error/1`.
  """
  def validate_result(result, required_fields \\ ["text", "metadata"]) do
    case result do
      {:ok, data} ->
        case Enum.reject(required_fields, &has_field?(data, &1)) do
          [] -> {:ok, data}
          missing -> {:error, "Missing required fields: #{inspect(missing)}"}
        end

      {:error, reason} ->
        {:error, format_error(reason)}
    end
  end

  @doc """
  Format errors into human-readable messages.

  Binaries are returned unchanged, known atoms map to fixed messages, other
  atoms become `"Unknown error: ..."`, and any other term is `inspect/1`ed.
  """
  def format_error(reason) when is_binary(reason), do: reason

  def format_error(reason) when is_atom(reason) do
    case reason do
      :file_not_found -> "The specified file could not be found"
      :invalid_format -> "The file format is not supported"
      :extraction_failed -> "Failed to extract content from the file"
      :timeout -> "Extraction operation timed out"
      :permission_denied -> "Permission denied when accessing the file"
      other -> "Unknown error: #{inspect(other)}"
    end
  end

  def format_error(reason), do: inspect(reason)

  @doc """
  Log extraction metrics for debugging and monitoring.

  Prints a timestamped header with the file, status and duration, followed
  by chunk count and text length on success or the formatted error on
  failure. On success `data` is read through `data.chunks` and
  `data.content`, so it must expose those atom keys.
  """
  def log_metrics(file_path, result, duration_ms) do
    status =
      case result do
        {:ok, _} -> "success"
        {:error, _} -> "failure"
      end

    IO.puts("""
    [#{DateTime.utc_now()}] Extraction Metrics
    - File: #{file_path}
    - Status: #{status}
    - Duration: #{duration_ms}ms
    """)

    case result do
      {:ok, data} ->
        IO.puts("- Chunks: #{length(data.chunks || [])}")
        IO.puts("- Text length: #{String.length(data.content || "")}")

      {:error, reason} ->
        IO.puts("- Error: #{format_error(reason)}")
    end
  end

  # True when `data` has `field` as a key, comparing atom and string names
  # by their text so "chunks" matches :chunks and vice versa.
  defp has_field?(data, field) do
    Map.has_key?(data, field) or Enum.any?(Map.keys(data), &same_name?(&1, field))
  end

  defp same_name?(key, field) when is_atom(key) and is_binary(field),
    do: Atom.to_string(key) == field

  defp same_name?(key, field) when is_binary(key) and is_atom(field),
    do: key == Atom.to_string(field)

  defp same_name?(_key, _field), do: false
end
# Example usage with error handling
config = %Kreuzberg.ExtractionConfig{
  chunking: %{"enabled" => true, "max_characters" => 1000}
}

IO.puts("=== Extract with Retry ===")

case ErrorHandlingUtils.extract_with_retry("document.pdf", config, 3) do
  {:ok, result} ->
    IO.puts("Extraction succeeded")
    IO.inspect(result)

  {:error, reason} ->
    IO.puts("Extraction failed: #{reason}")
end

IO.puts("\n=== Extract Multiple Files ===")
files = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]

# extract_multiple/2 always returns the summary map, so no `case` is needed.
results = ErrorHandlingUtils.extract_multiple(files, config)
IO.puts("Successes: #{length(results.successes)}")
IO.puts("Failures: #{length(results.failures)}")
IO.inspect(results)

IO.puts("\n=== Validate Result ===")

# validate_result/2 matches on the tagged tuple, so pass the extraction
# outcome through as-is instead of unwrapping it first. This also lets an
# extraction error reach the {:error, reason} branch below.
extraction = Kreuzberg.extract_file("test.pdf", nil, config)

case ErrorHandlingUtils.validate_result(extraction, ["text", "chunks"]) do
  {:ok, _data} ->
    IO.puts("Validation passed")

  {:error, reason} ->
    IO.puts("Validation failed: #{reason}")
end

View File

@@ -0,0 +1,21 @@
<!-- snippet:syntax-only -->
```elixir
# YAKE keyword extraction: at most 10 keywords scoring 0.3 or higher.
keyword_opts = %{
  "algorithm" => "yake",
  "max_keywords" => 10,
  "min_score" => 0.3
}

config_json = Jason.encode!(%{"keywords" => keyword_opts})

{:ok, json} = Kreuzberg.extract_file_async("research_paper.pdf", nil, config_json)
result = Jason.decode!(json)

# Print each keyword with its score formatted to three decimals.
Enum.each(result["extracted_keywords"] || [], fn keyword ->
  text = keyword["text"] || ""
  score = keyword["score"] || 0.0
  formatted_score = :io_lib.format("~.3f", [score])
  IO.puts("#{text}: #{formatted_score}")
end)
```

View File

@@ -0,0 +1,187 @@
# Language detection and processing utilities
defmodule LanguageDetectionUtils do
  @moduledoc """
  Heuristic language detection for extracted text and chunk lists.

  Detection counts how many words of the text appear in a small per-language
  list of frequent words. It is a rough signal, not a real classifier.
  """

  # Only `:patterns` and `:min_match` are read by the detection code below;
  # `:common_words` is currently informational.
  @language_indicators %{
    "en" => %{
      patterns: ~w[the and to of a in is that],
      common_words: ~w[english language detection],
      min_match: 2
    },
    "es" => %{
      patterns: ~w[de la el y en que],
      common_words: ~w[español detección idioma],
      min_match: 2
    },
    "fr" => %{
      patterns: ~w[le la de et une est],
      common_words: ~w[français détection langue],
      min_match: 2
    },
    "de" => %{
      patterns: ~w[der die und das in ein],
      common_words: ~w[deutsch erkennung sprache],
      min_match: 2
    }
  }

  @doc """
  Detect the language of extracted text using pattern matching.

  Returns a `{language_code, confidence}` tuple where confidence is a float
  percentage between 0.0 and 100.0. Anything that is not a non-empty binary,
  or text with too few matches, yields `{"unknown", 0.0}`.

  Confidence saturates once `min_match` words are found, so languages are
  ranked by confidence first and raw match count second. Otherwise texts
  containing a few words shared between languages (for example "in") would
  be decided by map ordering rather than by the evidence.
  """
  def detect_language(text) when is_binary(text) and byte_size(text) > 0 do
    # The `u` modifier makes \W Unicode-aware, so accented words such as
    # "está" stay whole instead of being split into "est" plus leftovers.
    words =
      text
      |> String.downcase()
      |> String.split(~r/\W+/u, trim: true)

    {lang, confidence, _matched} =
      @language_indicators
      |> Enum.map(fn {lang, indicators} ->
        matched = count_pattern_matches(words, indicators.patterns)
        {lang, min(matched / indicators.min_match, 1.0), matched}
      end)
      |> Enum.max_by(fn {_lang, confidence, matched} -> {confidence, matched} end)

    if confidence > 0.3 do
      {lang, confidence * 100}
    else
      {"unknown", 0.0}
    end
  end

  def detect_language(_), do: {"unknown", 0.0}

  @doc """
  Detect language for each chunk and add it to metadata.

  The chunk text is read from `"content"`, falling back to `"text"`. The
  result is stored under `"detected_language"` and `"language_confidence"`
  (a percentage) inside the chunk's `"metadata"` map, which is created when
  absent.
  """
  def detect_chunk_languages(chunks) when is_list(chunks) do
    Enum.map(chunks, fn chunk ->
      text = Map.get(chunk, "content") || Map.get(chunk, "text", "")
      {lang, confidence} = detect_language(text)

      updated_metadata =
        (Map.get(chunk, "metadata") || %{})
        |> Map.put("detected_language", lang)
        |> Map.put("language_confidence", confidence)

      Map.put(chunk, "metadata", updated_metadata)
    end)
  end

  @doc """
  Group chunks by detected language.

  Returns a map from language code to the annotated chunks of that language.
  """
  def group_chunks_by_language(chunks) do
    chunks
    |> detect_chunk_languages()
    |> Enum.group_by(fn chunk ->
      chunk
      |> Map.get("metadata", %{})
      |> Map.get("detected_language", "unknown")
    end)
  end

  @doc """
  Filter chunks by language with optional confidence threshold.

  `min_confidence` is a fraction between 0.0 and 1.0; the stored percentage
  is divided by 100 before comparing. Returned chunks carry the detection
  metadata.
  """
  def filter_by_language(chunks, target_language, min_confidence \\ 0.7) do
    chunks
    |> detect_chunk_languages()
    |> Enum.filter(fn chunk ->
      metadata = Map.get(chunk, "metadata", %{})
      detected = Map.get(metadata, "detected_language", "unknown")
      confidence = Map.get(metadata, "language_confidence", 0.0) / 100

      detected == target_language and confidence >= min_confidence
    end)
  end

  @doc """
  Summarize language distribution across chunks.

  Returns a list of `%{language:, chunk_count:, avg_confidence:}` maps sorted
  by descending chunk count. `avg_confidence` is a percentage rounded to two
  decimals.
  """
  def language_summary(chunks) do
    chunks
    |> group_chunks_by_language()
    |> Enum.map(fn {lang, group} ->
      # Groups produced by Enum.group_by/2 are never empty, so the division
      # below is safe.
      confidences =
        Enum.map(group, fn chunk ->
          chunk
          |> Map.get("metadata", %{})
          |> Map.get("language_confidence", 0.0)
        end)

      %{
        language: lang,
        chunk_count: length(group),
        avg_confidence: Float.round(Enum.sum(confidences) / length(confidences), 2)
      }
    end)
    |> Enum.sort_by(& &1.chunk_count, :desc)
  end

  @doc """
  Determine if text is mostly in a single language.

  True when the most common language accounts for at least `threshold`
  (a fraction) of all chunks. An empty chunk list returns `false`.
  """
  def is_single_language?(chunks, threshold \\ 0.8) do
    case language_summary(chunks) do
      [top | _] = summary ->
        total = summary |> Enum.map(& &1.chunk_count) |> Enum.sum()
        top.chunk_count / total >= threshold

      [] ->
        false
    end
  end

  # Number of words (counting repeats) that appear in the pattern list.
  defp count_pattern_matches(words, patterns) do
    Enum.count(words, &(&1 in patterns))
  end
end
# Example usage
# Build the extraction config with chunking switched on.
chunking_opts = %{"enabled" => true, "max_characters" => 1000}
config = %Kreuzberg.ExtractionConfig{chunking: chunking_opts}

{:ok, result} = Kreuzberg.extract_file("multilingual_doc.pdf", nil, config)
chunks = result.chunks || []

IO.puts("=== Language Detection ===")

# Detect the language of the whole document text.
{lang, confidence} = LanguageDetectionUtils.detect_language(result.content || "")
IO.puts("Detected Language: #{lang}")
IO.puts("Confidence: #{Float.round(confidence, 2)}%")

IO.puts("\n=== Language Summary ===")
chunks |> LanguageDetectionUtils.language_summary() |> IO.inspect()

IO.puts("\n=== Group by Language ===")

for {group_lang, group} <- LanguageDetectionUtils.group_chunks_by_language(chunks) do
  IO.puts("Language: #{group_lang} - Chunks: #{length(group)}")
end

IO.puts("\n=== Filter English Chunks (min 80% confidence) ===")
chunks |> LanguageDetectionUtils.filter_by_language("en", 0.8) |> IO.inspect()

IO.puts("\n=== Check if Single Language ===")
chunks |> LanguageDetectionUtils.is_single_language?(0.8) |> IO.puts()

View File

@@ -0,0 +1,17 @@
<!-- snippet:syntax-only -->
```elixir
# Enable quality processing so the result carries a quality score.
config_json = Jason.encode!(%{"enable_quality_processing" => true})
{:ok, json} = Kreuzberg.extract_file_async("scanned_document.pdf", nil, config_json)
result = Jason.decode!(json)

# A missing score is treated as 0.0, which falls into the low-quality branch.
quality_score = result["quality_score"] || 0.0
formatted_score = :io_lib.format("~.2f", [quality_score])

if quality_score >= 0.5 do
  IO.puts("Quality score: #{formatted_score}")
else
  IO.puts("Warning: Low quality extraction (#{formatted_score})")
  IO.puts("Consider re-scanning with higher DPI or adjusting OCR settings")
end
```

View File

@@ -0,0 +1,11 @@
```elixir
# Default configuration: embed a batch of texts
texts = ["Hello world", "How are you?"]
{:ok, embeddings} = Kreuzberg.embed(texts)

# Explicit preset: pass an EmbeddingConfig as the second argument
config = %Kreuzberg.EmbeddingConfig{model: {:preset, "fast"}}
{:ok, embeddings} = Kreuzberg.embed(["Hello world"], config)

# Bang variant: returns the embeddings directly and raises on error
embeddings = Kreuzberg.embed!(["Hello world"])
```

View File

@@ -0,0 +1,15 @@
<!-- snippet:syntax-only -->
```elixir
# Moderate token reduction that keeps important words intact.
reduction_opts = %{
  "mode" => "moderate",
  "preserve_important_words" => true
}

config_json = Jason.encode!(%{"token_reduction" => reduction_opts})

{:ok, json} = Kreuzberg.extract_file_async("document.pdf", nil, config_json)
result = Jason.decode!(json)

# Missing content is reported as length 0.
content_length = String.length(result["content"] || "")
IO.puts("Content length: #{content_length}")
```

View File

@@ -0,0 +1,22 @@
<!-- snippet:syntax-only -->
```elixir
# Moderate token reduction that keeps important words intact.
reduction_opts = %{
  "mode" => "moderate",
  "preserve_important_words" => true
}

config_json = Jason.encode!(%{"token_reduction" => reduction_opts})

{:ok, json} = Kreuzberg.extract_file_async("verbose_document.pdf", nil, config_json)
result = Jason.decode!(json)

# Read the token statistics from the metadata, defaulting each to zero.
metadata = result["metadata"] || %{}
original = metadata["original_token_count"] || 0
reduced = metadata["token_count"] || 0
ratio = metadata["token_reduction_ratio"] || 0.0

IO.puts("Reduced from #{original} to #{reduced} tokens")

percent = :io_lib.format("~.1f", [ratio * 100])
IO.puts("Reduction: #{percent}%")
```

View File

@@ -0,0 +1,26 @@
<!-- snippet:syntax-only -->
```elixir
# Chunking with embeddings from the "balanced" preset.
embedding_opts = %{
  "model" => %{"preset" => %{"name" => "balanced"}},
  "normalize" => true
}

config_json =
  Jason.encode!(%{
    "chunking" => %{
      "max_characters" => 512,
      "overlap" => 50,
      "embedding" => embedding_opts
    }
  })

{:ok, json} = Kreuzberg.extract_file_async("document.pdf", nil, config_json)
result = Jason.decode!(json)

# Give every chunk a positional id and print a 50-character preview.
for {chunk, i} <- Enum.with_index(result["chunks"] || []) do
  chunk_id = "doc_chunk_#{i}"
  preview = String.slice(chunk["content"] || "", 0, 50)
  IO.puts("Chunk #{chunk_id}: #{preview}")
end
```