Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/elixir/utils/chunking.exs
+++ b/docs/snippets/elixir/utils/chunking.exs
@@ -0,0 +1,92 @@
+# Utility functions for working with chunks
+defmodule ChunkingUtils do
+  @doc """
+  Calculate statistics for a list of chunks.
+
+  Returns a map with `:min`, `:max`, `:avg` (truncating integer division) and
+  `:count`. An empty list yields 0 for every key, `:count` included, so the
+  result has the same keys in both cases.
+  """
+  def chunk_size_stats([]), do: %{min: 0, max: 0, avg: 0, count: 0}
+
+  def chunk_size_stats(chunks) when is_list(chunks) do
+    sizes = Enum.map(chunks, &chunk_text_length/1)
+    count = length(sizes)
+
+    %{
+      min: Enum.min(sizes),
+      max: Enum.max(sizes),
+      avg: div(Enum.sum(sizes), count),
+      count: count
+    }
+  end
+
+  @doc """
+  Keep only the chunks whose text length is at least `min_size`.
+  """
+  def filter_by_min_size(chunks, min_size) do
+    Enum.reject(chunks, fn chunk -> chunk_text_length(chunk) < min_size end)
+  end
+
+  @doc """
+  Fold each undersized chunk into the chunk that follows it.
+
+  Whenever the most recently kept chunk is shorter than `threshold`, it is
+  replaced by the incoming chunk with both "content" values joined by a space.
+  All other fields of the merged chunk come from the incoming (later) chunk.
+  """
+  def merge_small_chunks(chunks, threshold) do
+    merged =
+      Enum.reduce(chunks, [], fn
+        chunk, [] ->
+          [chunk]
+
+        chunk, [previous | earlier] = kept ->
+          if chunk_text_length(previous) < threshold do
+            combined = "#{previous["content"]} #{chunk["content"]}"
+            [Map.put(chunk, "content", combined) | earlier]
+          else
+            [chunk | kept]
+          end
+      end)
+
+    Enum.reverse(merged)
+  end
+
+  @doc """
+  Group chunks by the "section" entry of their "metadata" map.
+  Chunks lacking metadata or a section key are grouped under "general".
+  """
+  def group_by_section(chunks) do
+    section_of = fn chunk -> Map.get(Map.get(chunk, "metadata", %{}), "section", "general") end
+
+    Enum.group_by(chunks, section_of)
+  end
+
+  # Private helper: grapheme count of a chunk's "content" (0 when missing or nil)
+  defp chunk_text_length(chunk) do
+    # `|| ""` also covers a key that is present but nil, which would crash String.length/1.
+    content = Map.get(chunk, "content") || ""
+    String.length(content)
+  end
+end
+
+# Example usage
+config = %Kreuzberg.ExtractionConfig{
+  chunking: %{"enabled" => true, "max_characters" => 1000}
+}
+
+{:ok, result} = Kreuzberg.extract_file("doc.pdf", nil, config)
+chunks = result.chunks || []
+
+IO.puts("=== Chunk Statistics ===")
+IO.inspect(ChunkingUtils.chunk_size_stats(chunks))
+
+IO.puts("\n=== Chunks by Section ===")
+IO.inspect(ChunkingUtils.group_by_section(chunks))
+
+IO.puts("\n=== Filtering chunks >= 500 chars ===")
+IO.inspect(ChunkingUtils.filter_by_min_size(chunks, 500))
+
+IO.puts("\n=== Merging small chunks < 200 chars ===")
+IO.inspect(ChunkingUtils.merge_small_chunks(chunks, 200))
--- a/docs/snippets/elixir/utils/chunking_rag.md
+++ b/docs/snippets/elixir/utils/chunking_rag.md
@@ -0,0 +1,31 @@
+<!-- snippet:syntax-only -->
+
+```elixir
+config_json =
+  Jason.encode!(%{
+    "chunking" => %{
+      "max_characters" => 500,
+      "overlap" => 50,
+      "embedding" => %{
+        "model" => %{"preset" => %{"name" => "balanced"}},
+        "normalize" => true,
+        "batch_size" => 16
+      }
+    }
+  })
+
+{:ok, json} = Kreuzberg.extract_file_async("research_paper.pdf", nil, config_json)
+result = Jason.decode!(json)
+
+chunks_with_embeddings =
+  for chunk <- result["chunks"] || [],
+      embedding = chunk["embedding"],
+      is_list(embedding) do
+    %{
+      content: String.slice(chunk["content"] || "", 0, 100),
+      embedding_dims: length(embedding)
+    }
+  end
+
+IO.puts("Chunks with embeddings: #{length(chunks_with_embeddings)}")
+```
--- a/docs/snippets/elixir/utils/embedding_with_chunking.md
+++ b/docs/snippets/elixir/utils/embedding_with_chunking.md
@@ -0,0 +1,21 @@
+<!-- snippet:syntax-only -->
+
+```elixir
+config_json =
+  Jason.encode!(%{
+    "chunking" => %{
+      "max_characters" => 1024,
+      "overlap" => 100,
+      "embedding" => %{
+        "model" => %{"preset" => %{"name" => "balanced"}},
+        "normalize" => true,
+        "batch_size" => 32,
+        "show_download_progress" => false
+      }
+    }
+  })
+
+{:ok, json} = Kreuzberg.extract_file_async("document.pdf", nil, config_json)
+result = Jason.decode!(json)
+IO.puts("chunks: #{length(result["chunks"] || [])}")
+```
--- a/docs/snippets/elixir/utils/error_handling.exs
+++ b/docs/snippets/elixir/utils/error_handling.exs
@@ -0,0 +1,155 @@
+# Error handling patterns for Kreuzberg extraction
+defmodule ErrorHandlingUtils do
+  @doc """
+  Safely extract a file with retry logic and error recovery.
+  """
+  def extract_with_retry(file_path, config, max_retries \\ 3) do
+    extract_with_retry(file_path, config, max_retries, 1, nil)
+  end
+
+  defp extract_with_retry(_file_path, _config, max_retries, attempt, _error)
+       when attempt > max_retries do
+    {:error, "Max retries (#{max_retries}) exceeded"}
+  end
+
+  defp extract_with_retry(file_path, config, max_retries, attempt, _prev_error) do
+    case Kreuzberg.extract_file(file_path, nil, config) do
+      {:ok, result} -> {:ok, result}
+
+      {:error, reason} ->
+        IO.puts("Attempt #{attempt} failed: #{inspect(reason)}")
+        # Linear backoff (100ms x attempt); skipped once no retry is left.
+        if attempt < max_retries, do: Process.sleep(100 * attempt)
+        extract_with_retry(file_path, config, max_retries, attempt + 1, reason)
+    end
+  end
+
+  @doc """
+  Extract multiple files and collect results and errors separately.
+
+  Returns `%{successes: [{file, data}], failures: [{file, reason}]}`; both lists
+  keep the order of `files`.
+  """
+  def extract_multiple(files, config) do
+    outcomes = Enum.map(files, &{&1, Kreuzberg.extract_file(&1, nil, config)})
+
+    {succeeded, failed} =
+      Enum.reduce(outcomes, {[], []}, fn {file, outcome}, {oks, errors} ->
+        case outcome do
+          {:ok, data} ->
+            {[{file, data} | oks], errors}
+
+          {:error, reason} ->
+            {oks, [{file, reason} | errors]}
+        end
+      end)
+
+    %{
+      successes: Enum.reverse(succeeded),
+      failures: Enum.reverse(failed)
+    }
+  end
+
+  @doc """
+  Validate extraction result and return detailed error information.
+
+  Takes the tagged tuple itself (`{:ok, data}` or `{:error, reason}`), not the
+  unwrapped data; every entry of `required_fields` must be a key of `data`.
+  """
+  def validate_result(result, required_fields \\ ["text", "metadata"]) do
+    case result do
+      {:error, reason} ->
+        {:error, format_error(reason)}
+
+      {:ok, data} ->
+        case Enum.reject(required_fields, &Map.has_key?(data, &1)) do
+          [] -> {:ok, data}
+          missing -> {:error, "Missing required fields: #{inspect(missing)}"}
+        end
+    end
+  end
+
+  @doc """
+  Format errors into human-readable messages.
+
+  Binaries are returned unchanged, the known error atoms map to fixed
+  descriptions, any other atom becomes `"Unknown error: " <> inspect(atom)`,
+  and every other term is rendered with `inspect/1`.
+  """
+  def format_error(reason) when is_binary(reason), do: reason
+
+  def format_error(:file_not_found), do: "The specified file could not be found"
+  def format_error(:invalid_format), do: "The file format is not supported"
+  def format_error(:extraction_failed), do: "Failed to extract content from the file"
+  def format_error(:timeout), do: "Extraction operation timed out"
+  def format_error(:permission_denied), do: "Permission denied when accessing the file"
+  def format_error(reason) when is_atom(reason), do: "Unknown error: #{inspect(reason)}"
+
+  def format_error(reason), do: inspect(reason)
+
+  @doc """
+  Log extraction metrics for debugging and monitoring.
+  """
+  def log_metrics(file_path, result, duration_ms) do
+    status =
+      case result do
+        {:ok, _} -> "success"
+        {:error, _} -> "failure"
+      end
+
+    IO.puts("""
+    [#{DateTime.utc_now()}] Extraction Metrics
+    - File: #{file_path}
+    - Status: #{status}
+    - Duration: #{duration_ms}ms
+    """)
+
+    case result do
+      {:ok, data} ->
+        IO.puts("- Chunks: #{length(data.chunks || [])}")
+        IO.puts("- Text length: #{String.length(data.content || "")}")
+
+      {:error, reason} ->
+        IO.puts("- Error: #{format_error(reason)}")
+    end
+  end
+end
+
+# Example usage with error handling
+config = %Kreuzberg.ExtractionConfig{
+  chunking: %{"enabled" => true, "max_characters" => 1000}
+}
+
+IO.puts("=== Extract with Retry ===")
+
+case ErrorHandlingUtils.extract_with_retry("document.pdf", config, 3) do
+  {:ok, result} ->
+    IO.puts("Extraction succeeded")
+    IO.inspect(result)
+
+  {:error, reason} ->
+    IO.puts("Extraction failed: #{reason}")
+end
+
+IO.puts("\n=== Extract Multiple Files ===")
+
+files = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
+
+case ErrorHandlingUtils.extract_multiple(files, config) do
+  results ->
+    IO.puts("Successes: #{length(results.successes)}")
+    IO.puts("Failures: #{length(results.failures)}")
+    IO.inspect(results)
+end
+
+IO.puts("\n=== Validate Result ===")
+# validate_result/2 matches on the tagged tuple, so the call result is passed through as-is.
+result = Kreuzberg.extract_file("test.pdf", nil, config)
+
+case ErrorHandlingUtils.validate_result(result, ["text", "chunks"]) do
+  {:ok, _data} ->
+    IO.puts("Validation passed")
+
+  {:error, reason} ->
+    IO.puts("Validation failed: #{reason}")
+end
--- a/docs/snippets/elixir/utils/keyword_extraction_example.md
+++ b/docs/snippets/elixir/utils/keyword_extraction_example.md
@@ -0,0 +1,21 @@
+<!-- snippet:syntax-only -->
+
+```elixir
+config_json =
+  Jason.encode!(%{
+    "keywords" => %{
+      "algorithm" => "yake",
+      "max_keywords" => 10,
+      "min_score" => 0.3
+    }
+  })
+
+{:ok, json} = Kreuzberg.extract_file_async("research_paper.pdf", nil, config_json)
+result = Jason.decode!(json)
+
+for keyword <- result["extracted_keywords"] || [] do
+  text = keyword["text"] || ""
+  score = keyword["score"] || 0.0
+  IO.puts("#{text}: #{:io_lib.format("~.3f", [score])}")
+end
+```
--- a/docs/snippets/elixir/utils/language_detection.exs
+++ b/docs/snippets/elixir/utils/language_detection.exs
@@ -0,0 +1,187 @@
+# Language detection and processing utilities
+defmodule LanguageDetectionUtils do
+  @default_confidence 0.8
+
+  @language_indicators %{
+    "en" => %{
+      patterns: ~w[the and to of a in is that],
+      common_words: ~w[english language detection],
+      min_match: 2
+    },
+    "es" => %{
+      patterns: ~w[de la el y en que],
+      common_words: ~w[español detección idioma],
+      min_match: 2
+    },
+    "fr" => %{
+      patterns: ~w[le la de et une est],
+      common_words: ~w[français détection langue],
+      min_match: 2
+    },
+    "de" => %{
+      patterns: ~w[der die und das in ein],
+      common_words: ~w[deutsch erkennung sprache],
+      min_match: 2
+    }
+  }
+
+  @doc """
+  Detect the language of extracted text using pattern matching.
+
+  Returns a `{language_code, confidence}` tuple; confidence is a percentage from
+  0.0 to 100.0, and `{"unknown", 0.0}` is returned when no language scores above 30%.
+  """
+  def detect_language(text) when is_binary(text) and byte_size(text) > 0 do
+    # The `u` modifier makes \W Unicode-aware, so accented letters no longer split words.
+    words = text |> String.downcase() |> String.split(~r/\W+/u)
+
+    {lang, confidence, _matched} =
+      @language_indicators
+      |> Enum.map(fn {lang, indicators} ->
+        matched = count_pattern_matches(words, indicators.patterns)
+        {lang, min(matched / indicators.min_match, 1.0), matched}
+      end)
+      # Confidence saturates at 1.0, so the raw match count breaks ties between languages.
+      |> Enum.max_by(fn {_lang, confidence, matched} -> {confidence, matched} end)
+
+    if confidence > 0.3 do
+      {lang, min(confidence * 100, 100)}
+    else
+      {"unknown", 0.0}
+    end
+  end
+
+  def detect_language(_), do: {"unknown", 0.0}
+
+  @doc """
+  Detect language for each chunk and add it to metadata.
+  """
+  def detect_chunk_languages(chunks) when is_list(chunks) do
+    Enum.map(chunks, fn chunk ->
+      # Prefer "content" (the key decoded extraction chunks use for text); fall back to "text".
+      text = Map.get(chunk, "content") || Map.get(chunk, "text", "")
+      {lang, confidence} = detect_language(text)
+
+      updated_metadata =
+        chunk
+        |> Map.get("metadata", %{})
+        |> Map.put("detected_language", lang)
+        |> Map.put("language_confidence", confidence)
+
+      Map.put(chunk, "metadata", updated_metadata)
+    end)
+  end
+
+  @doc """
+  Group chunks by detected language.
+  """
+  def group_chunks_by_language(chunks) do
+    chunks
+    |> detect_chunk_languages()
+    |> Enum.group_by(fn chunk ->
+      chunk
+      |> Map.get("metadata", %{})
+      |> Map.get("detected_language", "unknown")
+    end)
+  end
+
+  @doc """
+  Filter chunks by language with optional confidence threshold.
+  """
+  def filter_by_language(chunks, target_language, min_confidence \\ 0.7) do
+    chunks
+    |> detect_chunk_languages()
+    |> Enum.filter(fn chunk ->
+      metadata = Map.get(chunk, "metadata", %{})
+      detected = Map.get(metadata, "detected_language", "unknown")
+      confidence = Map.get(metadata, "language_confidence", 0.0) / 100
+
+      detected == target_language and confidence >= min_confidence
+    end)
+  end
+
+  @doc """
+  Summarize language distribution across chunks.
+  """
+  def language_summary(chunks) do
+    chunks
+    |> group_chunks_by_language()
+    |> Enum.map(fn {lang, group} ->
+      avg_confidence =
+        group
+        |> Enum.map(fn chunk ->
+          chunk
+          |> Map.get("metadata", %{})
+          |> Map.get("language_confidence", 0.0)
+        end)
+        |> then(fn confidences ->
+          if Enum.empty?(confidences) do
+            0.0
+          else
+            Enum.sum(confidences) / length(confidences)
+          end
+        end)
+
+      %{
+        language: lang,
+        chunk_count: length(group),
+        avg_confidence: Float.round(avg_confidence, 2)
+      }
+    end)
+    |> Enum.sort_by(&Map.get(&1, :chunk_count), :desc)
+  end
+
+  @doc """
+  Determine if text is mostly in a single language.
+
+  True when the most frequent language covers at least `threshold` (a ratio) of all chunks.
+  """
+  def is_single_language?(chunks, threshold \\ 0.8) do
+    case language_summary(chunks) do
+      [] ->
+        false
+
+      [dominant | _] = summary ->
+        total_chunks = summary |> Enum.map(& &1.chunk_count) |> Enum.sum()
+
+        dominant.chunk_count / total_chunks >= threshold
+    end
+  end
+
+  # Private helpers
+  defp count_pattern_matches(words, patterns) do
+    Enum.count(words, &Enum.member?(patterns, &1))
+  end
+end
+
+# Example usage
+config = %Kreuzberg.ExtractionConfig{
+  chunking: %{"enabled" => true, "max_characters" => 1000}
+}
+
+{:ok, result} = Kreuzberg.extract_file("multilingual_doc.pdf", nil, config)
+chunks = result.chunks || []
+
+IO.puts("=== Language Detection ===")
+
+case LanguageDetectionUtils.detect_language(result.content || "") do
+  {lang, confidence} ->
+    IO.puts("Detected Language: #{lang}")
+    IO.puts("Confidence: #{Float.round(confidence, 2)}%")
+end
+
+IO.puts("\n=== Language Summary ===")
+IO.inspect(LanguageDetectionUtils.language_summary(chunks))
+
+IO.puts("\n=== Group by Language ===")
+
+LanguageDetectionUtils.group_chunks_by_language(chunks)
+|> Enum.each(fn {lang, group} ->
+  IO.puts("Language: #{lang} - Chunks: #{length(group)}")
+end)
+
+IO.puts("\n=== Filter English Chunks (min 80% confidence) ===")
+IO.inspect(LanguageDetectionUtils.filter_by_language(chunks, "en", 0.8))
+
+IO.puts("\n=== Check if Single Language ===")
+IO.puts(LanguageDetectionUtils.is_single_language?(chunks, 0.8))
--- a/docs/snippets/elixir/utils/quality_processing_example.md
+++ b/docs/snippets/elixir/utils/quality_processing_example.md
@@ -0,0 +1,17 @@
+<!-- snippet:syntax-only -->
+
+```elixir
+config_json = Jason.encode!(%{"enable_quality_processing" => true})
+
+{:ok, json} = Kreuzberg.extract_file_async("scanned_document.pdf", nil, config_json)
+result = Jason.decode!(json)
+
+quality_score = result["quality_score"] || 0.0
+
+if quality_score < 0.5 do
+  IO.puts("Warning: Low quality extraction (#{:io_lib.format("~.2f", [quality_score])})")
+  IO.puts("Consider re-scanning with higher DPI or adjusting OCR settings")
+else
+  IO.puts("Quality score: #{:io_lib.format("~.2f", [quality_score])}")
+end
+```
--- a/docs/snippets/elixir/utils/standalone_embed.md
+++ b/docs/snippets/elixir/utils/standalone_embed.md
@@ -0,0 +1,11 @@
+```elixir
+# Embed with default config
+{:ok, embeddings} = Kreuzberg.embed(["Hello world", "How are you?"])
+
+# Embed with specific preset
+config = %Kreuzberg.EmbeddingConfig{model: {:preset, "fast"}}
+{:ok, embeddings} = Kreuzberg.embed(["Hello world"], config)
+
+# Raise on error
+embeddings = Kreuzberg.embed!(["Hello world"])
+```
--- a/docs/snippets/elixir/utils/token_reduction.md
+++ b/docs/snippets/elixir/utils/token_reduction.md
@@ -0,0 +1,15 @@
+<!-- snippet:syntax-only -->
+
+```elixir
+config_json =
+  Jason.encode!(%{
+    "token_reduction" => %{
+      "mode" => "moderate",
+      "preserve_important_words" => true
+    }
+  })
+
+{:ok, json} = Kreuzberg.extract_file_async("document.pdf", nil, config_json)
+result = Jason.decode!(json)
+IO.puts("Content length: #{String.length(result["content"] || "")}")
+```
--- a/docs/snippets/elixir/utils/token_reduction_example.md
+++ b/docs/snippets/elixir/utils/token_reduction_example.md
@@ -0,0 +1,22 @@
+<!-- snippet:syntax-only -->
+
+```elixir
+config_json =
+  Jason.encode!(%{
+    "token_reduction" => %{
+      "mode" => "moderate",
+      "preserve_important_words" => true
+    }
+  })
+
+{:ok, json} = Kreuzberg.extract_file_async("verbose_document.pdf", nil, config_json)
+result = Jason.decode!(json)
+metadata = result["metadata"] || %{}
+
+original = metadata["original_token_count"] || 0
+reduced = metadata["token_count"] || 0
+ratio = metadata["token_reduction_ratio"] || 0.0
+
+IO.puts("Reduced from #{original} to #{reduced} tokens")
+IO.puts("Reduction: #{:io_lib.format("~.1f", [ratio * 100])}%")
+```
--- a/docs/snippets/elixir/utils/vector_database_integration.md
+++ b/docs/snippets/elixir/utils/vector_database_integration.md
@@ -0,0 +1,26 @@
+<!-- snippet:syntax-only -->
+
+```elixir
+config_json =
+  Jason.encode!(%{
+    "chunking" => %{
+      "max_characters" => 512,
+      "overlap" => 50,
+      "embedding" => %{
+        "model" => %{"preset" => %{"name" => "balanced"}},
+        "normalize" => true
+      }
+    }
+  })
+
+{:ok, json} = Kreuzberg.extract_file_async("document.pdf", nil, config_json)
+result = Jason.decode!(json)
+
+(result["chunks"] || [])
+|> Enum.with_index()
+|> Enum.each(fn {chunk, i} ->
+  chunk_id = "doc_chunk_#{i}"
+  preview = String.slice(chunk["content"] || "", 0, 50)
+  IO.puts("Chunk #{chunk_id}: #{preview}")
+end)
+```