Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/elixir/core/batch_extract_bytes_sync.exs
+++ b/docs/snippets/elixir/core/batch_extract_bytes_sync.exs
@@ -0,0 +1,54 @@
+```elixir title="Elixir"
+# Extract from multiple binary inputs in a batch operation
+# Useful for processing multiple documents in memory
+
+# Prepare binary data from multiple sources
+{:ok, pdf_data_1} = File.read("document1.pdf")
+{:ok, pdf_data_2} = File.read("document2.pdf")
+{:ok, pdf_data_3} = File.read("document3.pdf")
+
+data_list = [pdf_data_1, pdf_data_2, pdf_data_3]
+
+# Option 1: Use single MIME type for all inputs
+{:ok, results} = Kreuzberg.batch_extract_bytes(data_list, "application/pdf")
+
+# Process results
+Enum.each(results, fn result ->
+  # byte_size/1 counts bytes, which differs from the character count for
+  # multi-byte UTF-8 content, so the label says "bytes"
+  IO.puts("Content length: #{byte_size(result.content)} bytes")
+  IO.puts("MIME type: #{result.mime_type}")
+  IO.puts("Tables found: #{length(result.tables)}")
+  IO.puts("---")
+end)
+
+IO.puts("Total documents processed: #{length(results)}")
+
+# Option 2: Use different MIME types for each input
+mime_types = ["application/pdf", "text/html", "application/pdf"]
+{:ok, mixed_results} = Kreuzberg.batch_extract_bytes(data_list, mime_types)
+
+IO.puts("Mixed batch processed: #{length(mixed_results)} documents")
+
+# Option 3: Batch extraction with configuration
+config = %Kreuzberg.ExtractionConfig{
+  ocr: %{"enabled" => true, "backend" => "tesseract"},
+  extract_images: true
+}
+
+case Kreuzberg.batch_extract_bytes(data_list, "application/pdf", config) do
+  {:ok, results} ->
+    IO.puts("Successfully extracted #{length(results)} documents")
+    Enum.each(results, fn result ->
+      IO.puts("Content: #{String.slice(result.content, 0..100)}...")
+    end)
+
+  {:error, reason} ->
+    IO.puts("Batch extraction failed: #{reason}")
+end
+
+# Option 4: Using the bang variant (raises on error)
+try do
+  results = Kreuzberg.batch_extract_bytes!(data_list, "application/pdf")
+  IO.puts("Extracted #{length(results)} documents successfully")
+rescue
+  error in Kreuzberg.Error ->
+    IO.puts("Error: #{error.message}")
+end
+```
--- a/docs/snippets/elixir/core/batch_extract_files_sync.exs
+++ b/docs/snippets/elixir/core/batch_extract_files_sync.exs
@@ -0,0 +1,14 @@
+```elixir title="Elixir"
+# Extract several files in one batch call and print a summary per result
+file_paths = ["document1.pdf", "document2.pdf", "document3.pdf"]
+
+{:ok, results} = Kreuzberg.batch_extract_files(file_paths)
+
+Enum.each(results, fn result ->
+  # The value printed here is the MIME type, so label it as such
+  IO.puts("MIME type: #{result.mime_type}")
+  # byte_size/1 counts bytes, not characters
+  IO.puts("Content length: #{byte_size(result.content)} bytes")
+  IO.puts("Tables: #{length(result.tables)}")
+  IO.puts("---")
+end)
+
+IO.puts("Total files processed: #{length(results)}")
+```
--- a/docs/snippets/elixir/core/client_extract_multiple_files.exs
+++ b/docs/snippets/elixir/core/client_extract_multiple_files.exs
@@ -0,0 +1,163 @@
+```elixir title="Elixir"
+# Client wrapper for batch file extraction
+# Provides error handling, logging, and result aggregation
+
+defmodule BatchDocumentClient do
+  @moduledoc """
+  Client wrapper for batch file document extraction.
+
+  Wraps `Kreuzberg.batch_extract_files/3` with logging, result aggregation,
+  and result transformation helpers.
+  """
+
+  # Logger's macros must be required before use. `IO.debug/1` does not exist.
+  require Logger
+
+  alias Kreuzberg.ExtractionResult
+
+  @doc """
+  Extract content from multiple files in batch.
+
+  Returns `{:ok, results}` when the batch call succeeds. When the batch call
+  fails, the error is optionally logged and returned unchanged as
+  `{:error, reason}`.
+
+  ## Options
+
+    * `:mime_type` - MIME type for all files (optional, defaults to auto-detection)
+    * `:config` - ExtractionConfig struct with options (optional)
+    * `:log_errors` - Whether to log extraction errors (default: true)
+
+  ## Examples
+
+      {:ok, results} = BatchDocumentClient.extract_files(
+        ["doc1.pdf", "doc2.pdf", "doc3.pdf"],
+        mime_type: "application/pdf"
+      )
+  """
+  @spec extract_files([String.t()], keyword()) ::
+          {:ok, [ExtractionResult.t()]} | {:error, String.t()}
+  def extract_files(paths, opts \\ []) do
+    mime_type = Keyword.get(opts, :mime_type, nil)
+    config = Keyword.get(opts, :config, nil)
+    log_errors = Keyword.get(opts, :log_errors, true)
+
+    case Kreuzberg.batch_extract_files(paths, mime_type, config) do
+      {:ok, results} ->
+        Logger.debug("Successfully extracted #{length(results)} files")
+        {:ok, results}
+
+      {:error, reason} ->
+        if log_errors do
+          Logger.error("Batch extraction error: #{reason}")
+        end
+
+        {:error, reason}
+    end
+  end
+
+  @doc """
+  Extract files and return detailed statistics.
+
+  On success returns `{:ok, stats}` where `stats` is a map with the keys
+  `:total_files`, `:total_content_size` (bytes), `:total_tables`,
+  `:total_images`, `:processing_time_ms`, `:avg_time_per_file_ms`, and
+  `:results` (the raw extraction results). Accepts the same options as
+  `extract_files/2`.
+  """
+  @spec extract_files_with_stats([String.t()], keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_files_with_stats(paths, opts \\ []) do
+    start_time = System.monotonic_time(:millisecond)
+
+    case extract_files(paths, opts) do
+      {:ok, results} ->
+        elapsed_ms = System.monotonic_time(:millisecond) - start_time
+        file_count = length(results)
+
+        stats = %{
+          total_files: file_count,
+          total_content_size: Enum.reduce(results, 0, &(byte_size(&1.content) + &2)),
+          total_tables: Enum.reduce(results, 0, &(length(&1.tables) + &2)),
+          # `images` may be nil, so treat that as an empty list
+          total_images: Enum.reduce(results, 0, &(length(&1.images || []) + &2)),
+          processing_time_ms: elapsed_ms,
+          # max/2 guards against division by zero for an empty result list
+          avg_time_per_file_ms: div(elapsed_ms, max(file_count, 1)),
+          results: results
+        }
+
+        {:ok, stats}
+
+      {:error, reason} ->
+        {:error, reason}
+    end
+  end
+
+  @doc """
+  Extract files and transform results.
+
+  Applies `transform_fn` to each extraction result. Every result is
+  transformed even if an earlier one raised. If any call raised, the first
+  failure is returned as `{:error, "Transform failed: ..."}`; otherwise the
+  transformed values are returned in order as `{:ok, values}`.
+  Accepts the same options as `extract_files/2`.
+  """
+  @spec extract_and_transform([String.t()], (ExtractionResult.t() -> any()), keyword()) ::
+          {:ok, [any()]} | {:error, String.t()}
+  def extract_and_transform(paths, transform_fn, opts \\ []) do
+    case extract_files(paths, opts) do
+      {:ok, results} ->
+        transformed =
+          Enum.map(results, fn result ->
+            # The callback is caller-supplied code, so exceptions are captured
+            # here and reported as an error tuple instead of crashing the batch.
+            try do
+              {:ok, transform_fn.(result)}
+            rescue
+              error ->
+                Logger.error("Transform error: #{inspect(error)}")
+                {:error, error}
+            end
+          end)
+
+        # Check if any transforms failed
+        case Enum.find(transformed, &match?({:error, _}, &1)) do
+          nil ->
+            # All succeeded
+            {:ok, Enum.map(transformed, fn {:ok, value} -> value end)}
+
+          {:error, error} ->
+            {:error, "Transform failed: #{inspect(error)}"}
+        end
+
+      {:error, reason} ->
+        {:error, reason}
+    end
+  end
+end
+
+# Usage examples
+
+# Extract multiple files and report the size of each extracted document
+batch_outcome = BatchDocumentClient.extract_files(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
+
+case batch_outcome do
+  {:error, why} ->
+    IO.puts("Error: #{why}")
+
+  {:ok, documents} ->
+    Enum.each(documents, &IO.puts("Extracted: #{byte_size(&1.content)} bytes"))
+end
+
+# Extract with statistics and print the aggregated metrics
+stats_outcome = BatchDocumentClient.extract_files_with_stats(["doc1.pdf", "doc2.pdf"])
+
+case stats_outcome do
+  {:error, why} ->
+    IO.puts("Error: #{why}")
+
+  {:ok, metrics} ->
+    IO.puts("Total files: #{metrics.total_files}")
+    IO.puts("Total size: #{metrics.total_content_size} bytes")
+    IO.puts("Processing time: #{metrics.processing_time_ms}ms")
+end
+
+# Extract and transform each result into a compact summary map
+summarize = fn doc ->
+  %{
+    tables: length(doc.tables),
+    size: byte_size(doc.content),
+    mime: doc.mime_type
+  }
+end
+
+case BatchDocumentClient.extract_and_transform(["doc1.pdf", "doc2.pdf"], summarize) do
+  {:error, why} ->
+    IO.puts("Error: #{why}")
+
+  {:ok, summaries} ->
+    IO.inspect(summaries)
+end
+```
--- a/docs/snippets/elixir/core/client_extract_single_file.exs
+++ b/docs/snippets/elixir/core/client_extract_single_file.exs
@@ -0,0 +1,107 @@
+```elixir title="Elixir"
+# Reusable client pattern for single file extraction
+# Encapsulates extraction logic with consistent error handling
+
+defmodule DocumentClient do
+  @moduledoc """
+  Client wrapper for single file document extraction.
+  Provides a consistent interface for extracting content from files.
+  """
+
+  # Logger's macros must be required before use. `IO.debug/1` does not exist.
+  require Logger
+
+  alias Kreuzberg.ExtractionResult
+
+  @doc """
+  Extract content from a single file.
+
+  Returns a result tuple with the extraction outcome and logs whether the
+  extraction succeeded or failed.
+  Supports explicit MIME type specification or auto-detection.
+
+  ## Options
+
+    * `:mime_type` - MIME type of the file (optional, defaults to auto-detection)
+    * `:config` - ExtractionConfig struct with options (optional)
+
+  ## Examples
+
+      {:ok, result} = DocumentClient.extract_file("document.pdf")
+      {:ok, result} = DocumentClient.extract_file("document.pdf", mime_type: "application/pdf")
+  """
+  @spec extract_file(String.t(), keyword()) ::
+          {:ok, ExtractionResult.t()} | {:error, String.t()}
+  def extract_file(path, opts \\ []) do
+    mime_type = Keyword.get(opts, :mime_type, nil)
+    config = Keyword.get(opts, :config, nil)
+
+    case Kreuzberg.extract_file(path, mime_type, config) do
+      {:ok, result} ->
+        Logger.debug("Successfully extracted file: #{path}")
+        {:ok, result}
+
+      {:error, reason} ->
+        Logger.error("Failed to extract file: #{path} - #{reason}")
+        {:error, reason}
+    end
+  end
+
+  @doc """
+  Extract content from a file, raising on error.
+
+  Accepts the same options as `extract_file/2`.
+  Raises Kreuzberg.Error if extraction fails.
+  """
+  @spec extract_file!(String.t(), keyword()) :: ExtractionResult.t()
+  def extract_file!(path, opts \\ []) do
+    mime_type = Keyword.get(opts, :mime_type, nil)
+    config = Keyword.get(opts, :config, nil)
+
+    Kreuzberg.extract_file!(path, mime_type, config)
+  end
+
+  @doc """
+  Extract and process file content.
+
+  On success returns `{:ok, map}` with the keys `:content`, `:mime_type`,
+  `:metadata`, `:table_count`, `:image_count`, and `:processing_time_ms`.
+  Accepts the same options as `extract_file/2`.
+  """
+  @spec extract_with_stats(String.t(), keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_with_stats(path, opts \\ []) do
+    start_time = System.monotonic_time(:millisecond)
+
+    case extract_file(path, opts) do
+      {:ok, result} ->
+        elapsed_ms = System.monotonic_time(:millisecond) - start_time
+
+        {:ok,
+         %{
+           content: result.content,
+           mime_type: result.mime_type,
+           metadata: result.metadata,
+           table_count: length(result.tables),
+           # `images` may be nil, so treat that as an empty list
+           image_count: length(result.images || []),
+           processing_time_ms: elapsed_ms
+         }}
+
+      {:error, reason} ->
+        {:error, reason}
+    end
+  end
+end
+
+# Usage examples
+
+# Plain extraction: report the extracted size or the failure reason
+file_outcome = DocumentClient.extract_file("document.pdf")
+
+case file_outcome do
+  {:error, why} ->
+    IO.puts("Extraction failed: #{why}")
+
+  {:ok, doc} ->
+    IO.puts("Content length: #{byte_size(doc.content)} bytes")
+end
+
+# Extraction with statistics: report timing and table count
+stats_outcome = DocumentClient.extract_with_stats("document.pdf")
+
+case stats_outcome do
+  {:error, why} ->
+    IO.puts("Error: #{why}")
+
+  {:ok, metrics} ->
+    IO.puts("Processing time: #{metrics.processing_time_ms}ms")
+    IO.puts("Tables found: #{metrics.table_count}")
+end
+```
--- a/docs/snippets/elixir/core/client_extract_with_config.exs
+++ b/docs/snippets/elixir/core/client_extract_with_config.exs
@@ -0,0 +1,247 @@
+```elixir title="Elixir"
+# Client wrapper with comprehensive configuration
+# Demonstrates advanced extraction patterns with OCR, chunking, and validation
+
+defmodule ConfiguredDocumentClient do
+  @moduledoc """
+  Client wrapper for document extraction with advanced configuration.
+  Supports OCR, chunking, language detection, and custom error handling.
+  """
+
+  alias Kreuzberg.{ExtractionConfig, ExtractionResult}
+
+  @doc """
+  Extract with OCR enabled for scanned documents.
+
+  `input` is treated as a file path when it names an existing file. Otherwise
+  it is passed to `Kreuzberg.extract/3` as raw document data, with the MIME
+  type defaulting to `"application/pdf"` when none is given.
+
+  ## Options
+
+    * `:ocr_backend` - OCR backend name (default: `"tesseract"`)
+    * `:force_ocr` - value for the config's `force_ocr` field (default: `false`)
+    * `:mime_type` - MIME type of the input (optional)
+  """
+  @spec extract_with_ocr(String.t() | binary(), keyword()) ::
+          {:ok, ExtractionResult.t()} | {:error, String.t()}
+  def extract_with_ocr(input, opts \\ []) do
+    config = %ExtractionConfig{
+      ocr: %{
+        "enabled" => true,
+        "backend" => Keyword.get(opts, :ocr_backend, "tesseract")
+      },
+      force_ocr: Keyword.get(opts, :force_ocr, false)
+    }
+
+    mime_type = Keyword.get(opts, :mime_type, nil)
+
+    # A binary that names an existing file is a path; anything else is raw data.
+    if is_binary(input) and File.exists?(input) do
+      Kreuzberg.extract_file(input, mime_type, config)
+    else
+      Kreuzberg.extract(input, mime_type || "application/pdf", config)
+    end
+  end
+
+  @doc """
+  Extract with text chunking for embedding or RAG pipelines.
+
+  On success returns `{:ok, map}` with the keys `:content`, `:chunks`,
+  `:chunk_count`, and `:metadata`.
+
+  ## Options
+
+    * `:chunk_size` - value for the `"max_characters"` chunking setting (default: 1000)
+    * `:chunk_overlap` - value for the `"overlap"` chunking setting (default: 100)
+    * `:mime_type` - MIME type of the file (optional)
+  """
+  @spec extract_with_chunking(String.t(), keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_with_chunking(path, opts \\ []) do
+    config = %ExtractionConfig{
+      chunking: %{
+        "max_characters" => Keyword.get(opts, :chunk_size, 1000),
+        "overlap" => Keyword.get(opts, :chunk_overlap, 100)
+      }
+    }
+
+    mime_type = Keyword.get(opts, :mime_type, nil)
+
+    case Kreuzberg.extract_file(path, mime_type, config) do
+      {:ok, result} ->
+        # `chunks` may be nil, so treat that as an empty list
+        chunks = result.chunks || []
+
+        {:ok,
+         %{
+           content: result.content,
+           chunks: chunks,
+           chunk_count: length(chunks),
+           metadata: result.metadata
+         }}
+
+      {:error, reason} ->
+        {:error, reason}
+    end
+  end
+
+  @doc """
+  Extract with language detection enabled.
+
+  Also enables image extraction. On success returns `{:ok, map}` with the
+  keys `:content`, `:detected_languages`, `:mime_type`, `:tables`, and
+  `:images`.
+
+  ## Options
+
+    * `:mime_type` - MIME type of the file (optional)
+  """
+  @spec extract_with_language_detection(String.t(), keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_with_language_detection(path, opts \\ []) do
+    config = %ExtractionConfig{
+      language_detection: %{"enabled" => true},
+      extract_images: true
+    }
+
+    mime_type = Keyword.get(opts, :mime_type, nil)
+
+    case Kreuzberg.extract_file(path, mime_type, config) do
+      {:ok, result} ->
+        {:ok,
+         %{
+           content: result.content,
+           detected_languages: result.detected_languages || [],
+           mime_type: result.mime_type,
+           tables: result.tables,
+           images: result.images || []
+         }}
+
+      {:error, reason} ->
+        {:error, reason}
+    end
+  end
+
+  @doc """
+  Extract with comprehensive configuration for production use.
+
+  Combines OCR, chunking, caching, and language detection. On success returns
+  `{:ok, summary}` where `summary` is a map describing the extraction. On
+  failure the reason is prefixed with `"Extraction failed: "`.
+
+  ## Options
+
+    * `:ocr_enabled` - enable OCR (default: true)
+    * `:ocr_backend` - OCR backend name (default: `"tesseract"`)
+    * `:chunk_size` - value for the `"max_characters"` chunking setting (default: 1000)
+    * `:chunk_overlap` - value for the `"overlap"` chunking setting (default: 100)
+    * `:detect_language` - enable language detection (default: true)
+    * `:use_cache` - value for the config's `use_cache` field (default: true)
+    * `:extract_images` - value for the config's `extract_images` field (default: true)
+    * `:mime_type` - MIME type of the file (optional)
+  """
+  @spec extract_with_full_config(String.t(), keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_with_full_config(path, opts \\ []) do
+    config = %ExtractionConfig{
+      # OCR settings
+      ocr: %{
+        "enabled" => Keyword.get(opts, :ocr_enabled, true),
+        "backend" => Keyword.get(opts, :ocr_backend, "tesseract")
+      },
+      # Chunking for embeddings
+      chunking: %{
+        "max_characters" => Keyword.get(opts, :chunk_size, 1000),
+        "overlap" => Keyword.get(opts, :chunk_overlap, 100)
+      },
+      # Language detection
+      language_detection: %{"enabled" => Keyword.get(opts, :detect_language, true)},
+      # Cache results
+      use_cache: Keyword.get(opts, :use_cache, true),
+      # Extract various content types
+      extract_images: Keyword.get(opts, :extract_images, true),
+      extract_tables: true
+    }
+
+    mime_type = Keyword.get(opts, :mime_type, nil)
+
+    case Kreuzberg.extract_file(path, mime_type, config) do
+      {:ok, result} ->
+        summary = %{
+          file_path: path,
+          mime_type: result.mime_type,
+          content_length: byte_size(result.content),
+          content_preview: String.slice(result.content, 0..200),
+          detected_languages: result.detected_languages || [],
+          table_count: length(result.tables),
+          image_count: length(result.images || []),
+          chunk_count: length(result.chunks || []),
+          metadata: result.metadata
+        }
+
+        {:ok, summary}
+
+      {:error, reason} ->
+        {:error, "Extraction failed: #{reason}"}
+    end
+  end
+
+  @doc """
+  Validate file before extraction.
+
+  Returns `:ok` when the file exists and its MIME type can be detected from
+  the path, otherwise `{:error, message}`.
+  """
+  @spec validate_file(String.t()) :: :ok | {:error, String.t()}
+  def validate_file(path) do
+    if File.exists?(path) do
+      case Kreuzberg.detect_mime_type_from_path(path) do
+        {:ok, _mime_type} ->
+          :ok
+
+        {:error, reason} ->
+          {:error, "Cannot determine MIME type: #{reason}"}
+      end
+    else
+      {:error, "File not found: #{path}"}
+    end
+  end
+
+  @doc """
+  Extract with validation.
+
+  Runs `validate_file/1` first and only then `extract_with_full_config/2`.
+  The first `{:error, message}` encountered is returned unchanged. Accepts the
+  same options as `extract_with_full_config/2`.
+  """
+  @spec extract_safely(String.t(), keyword()) ::
+          {:ok, map()} | {:error, String.t()}
+  def extract_safely(path, opts \\ []) do
+    # A non-matching validation result falls through `with` as-is, so no
+    # `else` clause is needed to pass the error tuple along.
+    with :ok <- validate_file(path) do
+      extract_with_full_config(path, opts)
+    end
+  end
+end
+
+# Usage examples
+
+# OCR extraction: print a short preview of the recognised text
+ocr_outcome =
+  ConfiguredDocumentClient.extract_with_ocr("scanned_document.pdf", ocr_backend: "tesseract")
+
+case ocr_outcome do
+  {:error, why} ->
+    IO.puts("Error: #{why}")
+
+  {:ok, doc} ->
+    IO.puts("OCR extraction successful")
+    IO.puts("Content: #{String.slice(doc.content, 0..100)}...")
+end
+
+# Chunked extraction: print how many chunks were produced
+chunk_opts = [chunk_size: 500, chunk_overlap: 50]
+
+case ConfiguredDocumentClient.extract_with_chunking("document.pdf", chunk_opts) do
+  {:error, why} ->
+    IO.puts("Error: #{why}")
+
+  {:ok, chunked} ->
+    IO.puts("Chunks: #{chunked.chunk_count}")
+end
+
+# Language detection: print the detected languages
+language_outcome = ConfiguredDocumentClient.extract_with_language_detection("multilingual.pdf")
+
+case language_outcome do
+  {:error, why} ->
+    IO.puts("Error: #{why}")
+
+  {:ok, detected} ->
+    IO.puts("Detected languages: #{inspect(detected.detected_languages)}")
+end
+
+# Comprehensive extraction with validation: print the summary fields
+safe_opts = [
+  ocr_enabled: true,
+  detect_language: true,
+  extract_images: true,
+  use_cache: true
+]
+
+case ConfiguredDocumentClient.extract_safely("document.pdf", safe_opts) do
+  {:error, why} ->
+    IO.puts("Error: #{why}")
+
+  {:ok, report} ->
+    IO.puts("File: #{report.file_path}")
+    IO.puts("MIME: #{report.mime_type}")
+    IO.puts("Size: #{report.content_length} bytes")
+    IO.puts("Tables: #{report.table_count}")
+    IO.puts("Languages: #{inspect(report.detected_languages)}")
+end
+```
--- a/docs/snippets/elixir/core/error_handling.exs
+++ b/docs/snippets/elixir/core/error_handling.exs
@@ -0,0 +1,31 @@
+```elixir title="Elixir"
+# Example: Handling extraction errors
+case Kreuzberg.extract_file("document.pdf") do
+  {:ok, result} ->
+    IO.puts("Successfully extracted content")
+    # byte_size/1 counts bytes, not characters
+    IO.puts("Content length: #{byte_size(result.content)} bytes")
+
+  {:error, reason} ->
+    IO.puts("Extraction failed: #{reason}")
+end
+
+# Example: Handling with custom error message
+result = Kreuzberg.extract_file("nonexistent.pdf")
+
+case result do
+  # The extracted data is not needed here; the underscore avoids an
+  # unused-variable warning
+  {:ok, _data} ->
+    IO.puts("File processed successfully")
+
+  {:error, error} ->
+    IO.puts("Error details: #{inspect(error)}")
+end
+
+# Example: Extract with pattern matching
+case Kreuzberg.extract(<<>>, "application/pdf") do
+  {:ok, result} ->
+    IO.puts("Content: #{result.content}")
+
+  {:error, msg} when is_binary(msg) ->
+    IO.puts("Validation error: #{msg}")
+
+  # Non-string reasons cannot be interpolated directly, so inspect them
+  {:error, reason} ->
+    IO.puts("Unknown error: #{inspect(reason)}")
+end
+```
--- a/docs/snippets/elixir/core/extract_bytes_sync.exs
+++ b/docs/snippets/elixir/core/extract_bytes_sync.exs
@@ -0,0 +1,13 @@
+```elixir title="Elixir"
+# Load the whole document into memory as a binary
+{:ok, pdf_bytes} = File.read("document.pdf")
+
+# Run extraction on the in-memory binary, stating its MIME type explicitly
+{:ok, extraction} = Kreuzberg.extract(pdf_bytes, "application/pdf")
+
+# Print the extracted text followed by a short summary
+IO.puts("Extracted content:")
+IO.puts(extraction.content)
+IO.puts("MIME type: #{extraction.mime_type}")
+
+table_total = length(extraction.tables)
+IO.puts("Tables found: #{table_total}")
+```
--- a/docs/snippets/elixir/core/extract_file_async.exs
+++ b/docs/snippets/elixir/core/extract_file_async.exs
@@ -0,0 +1,12 @@
+```elixir title="Elixir"
+# Start the extraction in a Task and wait for its result
+task = Kreuzberg.extract_file_async("document.pdf")
+{:ok, result} = Task.await(task)
+
+content = result.content
+table_count = length(result.tables)
+metadata = result.metadata
+
+# byte_size/1 counts bytes, not characters
+IO.puts("Content length: #{byte_size(content)} bytes")
+IO.puts("Tables: #{table_count}")
+IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
+```
--- a/docs/snippets/elixir/core/extract_file_sync.exs
+++ b/docs/snippets/elixir/core/extract_file_sync.exs
@@ -0,0 +1,11 @@
+```elixir title="Elixir"
+# Extract a single file synchronously and print a short summary
+{:ok, result} = Kreuzberg.extract_file("document.pdf")
+
+content = result.content
+table_count = length(result.tables)
+metadata = result.metadata
+
+# byte_size/1 counts bytes, not characters
+IO.puts("Content length: #{byte_size(content)} bytes")
+IO.puts("Tables: #{table_count}")
+IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
+```