Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/elixir/ocr/cloud_ocr_backend.md
+++ b/docs/snippets/elixir/ocr/cloud_ocr_backend.md
@@ -0,0 +1,39 @@
+```elixir title="Elixir"
+defmodule MyApp.CloudOcrBackend do
+  @moduledoc """
+  Example OCR backend that hands image recognition to a cloud service.
+
+  The struct carries the API key and the list of language codes the
+  backend accepts. `call_cloud_api/3` is a stub that always returns
+  `{:ok, "Extracted text"}`; replace it with a real request.
+  """
+
+  @behaviour Kreuzberg.Plugin
+
+  # Keep the API key out of `inspect/1` output so it cannot leak into logs.
+  @derive {Inspect, except: [:api_key]}
+  defstruct api_key: nil, supported_langs: []
+
+  @type t :: %__MODULE__{api_key: String.t() | nil, supported_langs: [String.t()]}
+
+  @doc """
+  Builds a backend from an API key and a list of supported language codes.
+  """
+  @spec new(String.t() | nil, [String.t()]) :: t()
+  def new(api_key, supported_langs) do
+    %__MODULE__{api_key: api_key, supported_langs: supported_langs}
+  end
+
+  @impl Kreuzberg.Plugin
+  def name(_backend), do: "cloud-ocr"
+
+  @impl Kreuzberg.Plugin
+  def version(_backend), do: "1.0.0"
+
+  @impl Kreuzberg.Plugin
+  def initialize(_backend), do: :ok
+
+  @impl Kreuzberg.Plugin
+  def shutdown(_backend), do: :ok
+
+  @doc """
+  Runs OCR on `image_bytes` for `language` and returns the result of
+  `call_cloud_api/3`, which in this example is always `{:ok, "Extracted text"}`.
+  """
+  def process_image(backend, image_bytes, language) do
+    call_cloud_api(backend, image_bytes, language)
+  end
+
+  @doc """
+  Returns `true` when `lang` is one of the backend's `supported_langs`.
+  """
+  def supports_language(backend, lang) do
+    lang in backend.supported_langs
+  end
+
+  # Stub for the remote call; ignores its arguments and returns fixed text.
+  defp call_cloud_api(_backend, _image, _language) do
+    {:ok, "Extracted text"}
+  end
+end
+
+# Build the custom backend struct (API key plus accepted language codes)
+supported_langs = ["en", "de", "fr"]
+backend = MyApp.CloudOcrBackend.new("api-key", supported_langs)
+# Use with Kreuzberg extraction...
+```
--- a/docs/snippets/elixir/ocr/image_extraction.exs
+++ b/docs/snippets/elixir/ocr/image_extraction.exs
@@ -0,0 +1,133 @@
+```elixir title="Elixir"
+# Extract images from documents for further processing
+# Images are returned as base64-encoded data with format and size information
+
+alias Kreuzberg.ExtractionConfig
+
+defmodule ImageExtractor do
+  @moduledoc """
+  Extract and process images from documents.
+  Provides utilities for working with extracted image data.
+  """
+
+  @doc """
+  Extract all images from a document.
+
+  Calls `Kreuzberg.extract_file/3` with image extraction switched on.
+  Returns `{:ok, images}` (an empty list when the result carries no images)
+  or passes through the `{:error, reason}` tuple from the extraction call.
+  """
+  def extract_images(file_path) do
+    config = %ExtractionConfig{
+      # Enable image extraction. "extract_images" is the key used by the
+      # JSON-config image snippet in this documentation set.
+      # NOTE(review): confirm against the library's image config schema.
+      images: %{
+        "extract_images" => true
+      },
+      use_cache: true
+    }
+
+    with {:ok, result} <- Kreuzberg.extract_file(file_path, nil, config) do
+      {:ok, result.images || []}
+    end
+  end
+
+  @doc """
+  Save extracted images to disk.
+
+  Creates `output_dir` if needed (raising when that fails) and writes each
+  image to `image_<n>.<format>`, numbering from 1 and defaulting the format
+  to `"png"`. Returns one entry per image: `{:ok, filename}` on success or
+  `{:error, message}` when the payload is missing, is not valid base64, or
+  cannot be written.
+  """
+  def save_images(images, output_dir) do
+    File.mkdir_p!(output_dir)
+
+    images
+    |> Enum.with_index(1)
+    |> Enum.map(fn {image, idx} -> save_image(image, idx, output_dir) end)
+  end
+
+  @doc """
+  Get image statistics from extracted images.
+
+  Returns a map with `:total_images`, `:total_bytes` (sum of each image's
+  `"size"`, treating a missing size as 0), `:formats` (count per `"format"`,
+  with `"unknown"` for images lacking one) and `:avg_size` (integer
+  division, 0 for an empty collection).
+  """
+  def get_image_stats(images) do
+    count = Enum.count(images)
+    total_size = images |> Enum.map(&(&1["size"] || 0)) |> Enum.sum()
+
+    %{
+      total_images: count,
+      total_bytes: total_size,
+      formats: Enum.frequencies_by(images, &(&1["format"] || "unknown")),
+      avg_size: if(count > 0, do: div(total_size, count), else: 0)
+    }
+  end
+
+  # Decodes one image's base64 payload and writes it under `output_dir`.
+  defp save_image(image, idx, output_dir) do
+    format = image["format"] || "png"
+    filename = "#{output_dir}/image_#{idx}.#{format}"
+
+    with {:ok, binary_data} <- decode_image_data(image["data"]),
+         :ok <- File.write(filename, binary_data) do
+      {:ok, filename}
+    else
+      :error -> {:error, "Failed to decode image #{idx}"}
+      {:error, reason} -> {:error, "Failed to write image #{idx}: #{inspect(reason)}"}
+    end
+  end
+
+  # A missing or non-binary payload is reported as a decode failure instead
+  # of raising inside `Base.decode64/1`.
+  defp decode_image_data(data) when is_binary(data), do: Base.decode64(data)
+  defp decode_image_data(_data), do: :error
+end
+
+# Usage example
+file_path = "document_with_images.pdf"
+
+IO.puts("Extracting images from: #{file_path}\n")
+
+case ImageExtractor.extract_images(file_path) do
+  {:error, reason} ->
+    IO.puts("Error extracting images: #{reason}")
+
+  {:ok, images} ->
+    IO.puts("Found #{length(images)} image(s)\n")
+
+    # Summarize the whole batch before listing individual images
+    stats = ImageExtractor.get_image_stats(images)
+    total_kb = div(stats.total_bytes, 1024)
+
+    IO.puts("=== Image Statistics ===")
+    IO.puts("Total images: #{stats.total_images}")
+    IO.puts("Total size: #{stats.total_bytes} bytes (#{total_kb} KB)")
+    IO.puts("Average size: #{stats.avg_size} bytes")
+    IO.puts("Formats: #{inspect(stats.formats)}")
+    IO.puts("")
+
+    # One entry per image, numbered from 1
+    IO.puts("=== Individual Images ===")
+
+    images
+    |> Enum.with_index(1)
+    |> Enum.each(fn {image, idx} ->
+      IO.puts("Image #{idx}:")
+      IO.puts("  Format: #{image["format"]}")
+      IO.puts("  Size: #{image["size"]} bytes")
+
+      # Dimensions are printed only when both width and height are present
+      width = image["width"]
+      height = image["height"]
+      if width && height, do: IO.puts("  Dimensions: #{width}x#{height} pixels")
+
+      # DPI is optional as well
+      dpi = image["dpi"]
+      if dpi, do: IO.puts("  DPI: #{dpi}")
+
+      IO.puts("")
+    end)
+
+    # Write the images to disk and count the entries tagged {:ok, path}
+    results = ImageExtractor.save_images(images, "/tmp/extracted_images")
+    successful = Enum.count(results, &match?({:ok, _path}, &1))
+    IO.puts("Saved #{successful}/#{length(results)} images to /tmp/extracted_images")
+end
+```
--- a/docs/snippets/elixir/ocr/image_extraction.md
+++ b/docs/snippets/elixir/ocr/image_extraction.md
@@ -0,0 +1,17 @@
+```elixir title="Elixir"
+# Image extraction settings, JSON-encoded before being handed to the extractor
+image_settings = %{
+  "extract_images" => true,
+  "target_dpi" => 200,
+  "max_image_dimension" => 2048,
+  "inject_placeholders" => true,
+  "auto_adjust_dpi" => true
+}
+
+config_json = Jason.encode!(%{"images" => image_settings})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+IO.puts("Content length: #{String.length(result.content)}")
+
+# Report the image count only when the result carries an image list
+case result.images do
+  absent when absent in [nil, false] -> :ok
+  images -> IO.puts("Images extracted: #{length(images)}")
+end
+```
--- a/docs/snippets/elixir/ocr/image_preprocessing.exs
+++ b/docs/snippets/elixir/ocr/image_preprocessing.exs
@@ -0,0 +1,174 @@
+```elixir title="Elixir"
+# Preprocess images before OCR for improved text extraction
+# Image preprocessing can enhance OCR accuracy on poor quality scans
+
+alias Kreuzberg.ExtractionConfig
+
+defmodule ImagePreprocessor do
+  @moduledoc """
+  Preprocess images for optimal OCR results.
+  Provides utilities for image enhancement before text extraction.
+  """
+
+  @doc """
+  Extract with image preprocessing enabled.
+
+  Preprocessing improves OCR accuracy on documents with:
+  - Low contrast text
+  - Skewed pages
+  - Poor image quality
+  - Faded or faint text
+
+  ## Options
+
+    * `:deskew` - defaults to `true`
+    * `:denoise` - defaults to `true`
+    * `:binarize` - defaults to `false`
+    * `:brightness_threshold` - defaults to `50`
+
+  Returns whatever `Kreuzberg.extract_file/3` returns.
+  """
+  def extract_with_preprocessing(file_path, opts \\ []) do
+    config = %ExtractionConfig{
+      ocr: %{
+        "enabled" => true,
+        "backend" => "tesseract",
+        "language" => "eng",
+        # Enable preprocessing for better OCR results
+        "preprocessing" => preprocessing_settings(opts)
+      },
+      chunking: %{
+        "enabled" => true,
+        "max_characters" => 1500,
+        "overlap" => 150
+      },
+      language_detection: %{
+        "enabled" => true
+      },
+      # Disable caching for preprocessing runs
+      use_cache: false
+    }
+
+    Kreuzberg.extract_file(file_path, nil, config)
+  end
+
+  @doc """
+  Extract with aggressive preprocessing for poor quality documents.
+
+  Use this for heavily degraded or difficult-to-read documents. Enables
+  deskew, denoise and binarize and raises the brightness threshold to 75.
+  """
+  def extract_with_aggressive_preprocessing(file_path) do
+    extract_with_preprocessing(file_path,
+      deskew: true,
+      denoise: true,
+      binarize: true,
+      brightness_threshold: 75
+    )
+  end
+
+  @doc """
+  Compare extraction quality with and without preprocessing.
+
+  Useful for determining optimal preprocessing settings. Prints a report
+  and returns a map with both contents, both byte sizes and `:improvement`,
+  the size change of the preprocessed content relative to the standard
+  content as a percentage rounded to two decimals (`0.0` when the standard
+  content is empty).
+
+  Raises `MatchError` when either extraction does not return
+  `{:ok, result}`.
+  """
+  def compare_preprocessing_quality(file_path) do
+    IO.puts("Comparing preprocessing options...\n")
+
+    # Extract without preprocessing
+    IO.puts("Extracting without preprocessing...")
+    {:ok, result_standard} = Kreuzberg.extract_file(file_path, nil, standard_config())
+
+    # Extract with preprocessing
+    IO.puts("Extracting with preprocessing...")
+    {:ok, result_preprocessed} = extract_with_aggressive_preprocessing(file_path)
+
+    # Compare results
+    standard_size = byte_size(result_standard.content)
+    preprocessed_size = byte_size(result_preprocessed.content)
+
+    IO.puts("\n=== Preprocessing Comparison ===")
+    IO.puts("Standard extraction: #{standard_size} bytes")
+    IO.puts("Preprocessed extraction: #{preprocessed_size} bytes")
+    IO.puts("Size difference: #{abs(preprocessed_size - standard_size)} bytes")
+
+    # Compare chunk counts
+    standard_chunks = result_standard.chunks || []
+    preprocessed_chunks = result_preprocessed.chunks || []
+    IO.puts("\nStandard chunks: #{length(standard_chunks)}")
+    IO.puts("Preprocessed chunks: #{length(preprocessed_chunks)}")
+
+    # Show content comparison
+    IO.puts("\n=== Content Comparison ===")
+    print_preview("Standard preview:", result_standard.content)
+    print_preview("\nPreprocessed preview:", result_preprocessed.content)
+
+    # Return comparison data
+    %{
+      standard_content: result_standard.content,
+      preprocessed_content: result_preprocessed.content,
+      standard_size: standard_size,
+      preprocessed_size: preprocessed_size,
+      improvement: improvement_percent(standard_size, preprocessed_size)
+    }
+  end
+
+  # Builds the "preprocessing" map from the caller's options.
+  # NOTE(review): the JSON-config preprocessing snippet in this documentation
+  # set nests its options under "tesseract_config" with different keys —
+  # confirm which shape the library accepts.
+  defp preprocessing_settings(opts) do
+    %{
+      "enabled" => true,
+      "deskew" => Keyword.get(opts, :deskew, true),
+      "denoise" => Keyword.get(opts, :denoise, true),
+      "binarize" => Keyword.get(opts, :binarize, false),
+      "brightness_threshold" => Keyword.get(opts, :brightness_threshold, 50)
+    }
+  end
+
+  # Baseline config for the comparison: OCR on, preprocessing off, no cache.
+  defp standard_config do
+    %ExtractionConfig{
+      ocr: %{
+        "enabled" => true,
+        "backend" => "tesseract",
+        "language" => "eng",
+        "preprocessing" => %{"enabled" => false}
+      },
+      use_cache: false
+    }
+  end
+
+  # Prints a heading, the first 200 characters of `content`, and an ellipsis.
+  defp print_preview(heading, content) do
+    IO.puts(heading)
+    IO.puts(String.slice(content, 0..199))
+    IO.puts("\n...")
+  end
+
+  # Always returns a float so callers see one type in both branches.
+  defp improvement_percent(0, _preprocessed_size), do: 0.0
+
+  defp improvement_percent(standard_size, preprocessed_size) do
+    Float.round((preprocessed_size - standard_size) / standard_size * 100, 2)
+  end
+end
+
+# Usage examples
+
+# Shared by all three examples below
+file_path = "scanned_document.pdf"
+
+# Prints a failed extraction's reason, followed by a blank line
+report_error = fn reason -> IO.puts("Error: #{reason}\n") end
+
+# Example 1: Standard preprocessing
+IO.puts("Example 1: Standard Extraction with Preprocessing\n")
+
+case ImagePreprocessor.extract_with_preprocessing(file_path) do
+  {:error, reason} ->
+    report_error.(reason)
+
+  {:ok, result} ->
+    chunk_count = length(result.chunks || [])
+    preview = String.slice(result.content, 0..100)
+
+    IO.puts("Extraction successful!")
+    IO.puts("Content length: #{byte_size(result.content)} bytes")
+    IO.puts("Chunks created: #{chunk_count}")
+    IO.puts("Preview: #{preview}...\n")
+end
+
+# Example 2: Aggressive preprocessing for difficult documents
+IO.puts("Example 2: Aggressive Preprocessing for Poor Quality\n")
+
+case ImagePreprocessor.extract_with_aggressive_preprocessing(file_path) do
+  {:error, reason} ->
+    report_error.(reason)
+
+  {:ok, result} ->
+    IO.puts("Aggressive preprocessing extraction successful!")
+    IO.puts("Content length: #{byte_size(result.content)} bytes")
+end
+
+# Example 3: Compare preprocessing options
+# Anything raised during the comparison is caught and printed as a notice.
+IO.puts("Example 3: Compare Preprocessing Quality\n")
+
+try do
+  improvement =
+    file_path
+    |> ImagePreprocessor.compare_preprocessing_quality()
+    |> Map.fetch!(:improvement)
+
+  IO.puts("\nImprovement with preprocessing: #{improvement}%")
+rescue
+  exception ->
+    IO.puts("Comparison completed with notice: #{inspect(exception)}")
+end
+```
--- a/docs/snippets/elixir/ocr/image_preprocessing.md
+++ b/docs/snippets/elixir/ocr/image_preprocessing.md
@@ -0,0 +1,19 @@
+```elixir title="Elixir"
+# Preprocessing options nested under the Tesseract-specific config
+preprocessing = %{
+  "target_dpi" => 300,
+  "denoise" => true,
+  "deskew" => true,
+  "contrast_enhance" => true,
+  "binarization_method" => "otsu"
+}
+
+ocr_settings = %{
+  "backend" => "tesseract",
+  "tesseract_config" => %{"preprocessing" => preprocessing}
+}
+
+config_json = Jason.encode!(%{"ocr" => ocr_settings})
+
+{:ok, result} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)
+IO.puts(result.content)
+```
--- a/docs/snippets/elixir/ocr/ocr_easyocr.md
+++ b/docs/snippets/elixir/ocr/ocr_easyocr.md
@@ -0,0 +1,11 @@
+```elixir title="Elixir"
+# OCR settings: "easyocr" backend with the "en" language code
+ocr_settings = %{"backend" => "easyocr", "language" => "en"}
+config_json = Jason.encode!(%{"ocr" => ocr_settings})
+
+# NOTE(review): the return value of extract_file_async/3 is matched directly
+# as {:ok, result} — confirm the async variant returns that shape rather than
+# something that has to be awaited first.
+{:ok, result} = Kreuzberg.extract_file_async("document.pdf", "application/pdf", config_json)
+IO.puts("Extracted text: #{result.content}")
+```
--- a/docs/snippets/elixir/ocr/ocr_elements.md
+++ b/docs/snippets/elixir/ocr/ocr_elements.md
@@ -0,0 +1,21 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+ocr_settings = %{"enabled" => true, "backend" => "paddle-ocr", "language" => "en"}
+config = %ExtractionConfig{ocr: ocr_settings}
+
+{:ok, result} = Kreuzberg.extract_file("scanned.pdf", nil, config)
+
+# Print each OCR element; an absent element list is treated as empty
+Enum.each(result.ocr_elements || [], fn element ->
+  confidence = Float.round(element.confidence.recognition, 2)
+
+  IO.puts("Text: #{element.text}")
+  IO.puts("Confidence: #{confidence}")
+  IO.puts("Geometry: #{inspect(element.geometry)}")
+
+  # Rotation is printed only when the element carries one
+  rotation = element.rotation
+  if rotation, do: IO.puts("Rotation: #{rotation.angle}°")
+
+  IO.puts("")
+end)
+```
--- a/docs/snippets/elixir/ocr/ocr_extraction.exs
+++ b/docs/snippets/elixir/ocr/ocr_extraction.exs
@@ -0,0 +1,34 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Extract text from a scanned PDF using OCR
+# Tesseract processes the document and returns structured content
+
+config = %ExtractionConfig{
+  ocr: %{"enabled" => true, "backend" => "tesseract"},
+  chunking: %{"max_characters" => 1500, "overlap" => 150},
+  language_detection: %{"enabled" => true},
+  use_cache: true
+}
+
+{:ok, result} = Kreuzberg.extract_file("scanned_invoice.pdf", nil, config)
+
+# Pull out the pieces of the result that get reported below;
+# absent chunks and metadata fall back to empty collections
+content = result.content
+chunks = result.chunks || []
+metadata = result.metadata || %{}
+
+creation_date = metadata["creation_date"] || "N/A"
+preview = String.slice(content, 0..199)
+
+IO.puts("OCR Extraction Complete:")
+IO.puts("Content length: #{byte_size(content)} bytes")
+IO.puts("Number of chunks: #{length(chunks)}")
+IO.puts("Detected languages: #{inspect(result.detected_languages)}")
+IO.puts("Creation date: #{creation_date}")
+IO.puts("\nFirst 200 characters of extracted text:")
+IO.puts(preview)
+```
--- a/docs/snippets/elixir/ocr/ocr_extraction.md
+++ b/docs/snippets/elixir/ocr/ocr_extraction.md
@@ -0,0 +1,11 @@
+```elixir title="Elixir"
+# OCR settings: "tesseract" backend with the "eng" language code
+ocr_settings = %{"backend" => "tesseract", "language" => "eng"}
+config_json = Jason.encode!(%{"ocr" => ocr_settings})
+
+{:ok, result} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)
+IO.puts(result.content)
+```
--- a/docs/snippets/elixir/ocr/ocr_force_all_pages.exs
+++ b/docs/snippets/elixir/ocr/ocr_force_all_pages.exs
@@ -0,0 +1,96 @@
+```elixir title="Elixir"
+# Force OCR processing on all pages of a document
+# By default, OCR is only applied when needed (scanned content detected)
+# Use force_all_pages to ensure OCR runs on every page regardless
+
+alias Kreuzberg.ExtractionConfig
+
+# Configuration with OCR forced on all pages
+ocr_settings = %{
+  "enabled" => true,
+  "backend" => "tesseract",
+  "language" => "eng",
+  # Force OCR to run on every page, not just scanned content
+  "force_all_pages" => true
+}
+
+config = %ExtractionConfig{
+  ocr: ocr_settings,
+  chunking: %{"enabled" => true, "max_characters" => 1500, "overlap" => 150},
+  language_detection: %{"enabled" => true},
+  use_cache: true
+}
+
+IO.puts("Starting OCR extraction with force_all_pages enabled...")
+start_time = System.monotonic_time(:millisecond)
+
+# Milliseconds elapsed since `start_time`
+elapsed_since_start = fn -> System.monotonic_time(:millisecond) - start_time end
+
+case Kreuzberg.extract_file("mixed_document.pdf", nil, config) do
+  {:error, reason} ->
+    IO.puts("OCR extraction failed after #{elapsed_since_start.()}ms")
+    IO.puts("Error: #{inspect(reason)}")
+
+  {:ok, result} ->
+    elapsed_ms = elapsed_since_start.()
+
+    IO.puts("\n=== OCR Extraction Complete ===\n")
+
+    # Timing and size of the extracted text
+    IO.puts("Processing time: #{elapsed_ms}ms")
+    IO.puts("Content extracted: #{byte_size(result.content)} bytes")
+
+    # The OCR line is printed only when the metadata sets "ocr_applied"
+    metadata = result.metadata || %{}
+    if metadata["ocr_applied"], do: IO.puts("OCR applied to all pages: Yes")
+
+    # Language detection results
+    languages = result.detected_languages || []
+    IO.puts("\nDetected languages (#{length(languages)}):")
+    Enum.each(languages, &IO.puts("  - #{&1}"))
+
+    # Chunking results (useful for RAG/search)
+    chunks = result.chunks || []
+    IO.puts("\nChunks created: #{length(chunks)}")
+
+    # NOTE(review): byte_size/1 assumes every chunk is a binary — confirm
+    avg_chunk_size =
+      case chunks do
+        [] -> 0
+        _ -> chunks |> Enum.map(&byte_size/1) |> Enum.sum() |> div(length(chunks))
+      end
+
+    IO.puts("Average chunk size: #{avg_chunk_size} bytes")
+
+    # Content preview
+    IO.puts("\nContent preview (first 300 characters):")
+    IO.puts(String.slice(result.content, 0..299))
+    IO.puts("...\n")
+
+    # Extracted tables, if any
+    tables = result.tables || []
+
+    if tables != [] do
+      IO.puts("Tables found: #{length(tables)}")
+
+      tables
+      |> Enum.with_index(1)
+      |> Enum.each(fn {table, idx} ->
+        row_count = length(table["cells"] || [])
+        IO.puts("  Table #{idx}: #{row_count} rows")
+      end)
+
+      IO.puts("")
+    end
+
+    # Extracted images, if any
+    images = result.images || []
+
+    if images != [] do
+      IO.puts("Images extracted: #{length(images)}")
+
+      images
+      |> Enum.with_index(1)
+      |> Enum.each(fn {image, idx} ->
+        IO.puts("  Image #{idx}: #{image["format"]} - #{image["size"]} bytes")
+      end)
+    end
+end
+```
--- a/docs/snippets/elixir/ocr/ocr_force_all_pages.md
+++ b/docs/snippets/elixir/ocr/ocr_force_all_pages.md
@@ -0,0 +1,11 @@
+```elixir title="Elixir"
+# "force_ocr" sits at the top level of the config, next to the "ocr" settings
+settings = %{
+  "ocr" => %{"backend" => "tesseract"},
+  "force_ocr" => true
+}
+
+config_json = Jason.encode!(settings)
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts(result.content)
+```
--- a/docs/snippets/elixir/ocr/ocr_multi_language.exs
+++ b/docs/snippets/elixir/ocr/ocr_multi_language.exs
@@ -0,0 +1,39 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Perform OCR extraction with multiple languages
+# Tesseract can recognize text in multiple languages simultaneously
+# Combine language codes with '+' separator: "eng+fra+deu" for English, French, German
+
+ocr_settings = %{
+  "enabled" => true,
+  "backend" => "tesseract",
+  "language" => "eng+fra+deu"
+}
+
+config = %ExtractionConfig{
+  ocr: ocr_settings,
+  chunking: %{"max_characters" => 2000, "overlap" => 200},
+  language_detection: %{"enabled" => true},
+  use_cache: true,
+  force_ocr: true
+}
+
+{:ok, result} = Kreuzberg.extract_file("multilingual_document.pdf", nil, config)
+
+# An absent chunk list counts as zero chunks
+chunk_count = length(result.chunks || [])
+
+# Results will contain text recognized in all specified languages
+IO.puts("Multi-language OCR Extraction:")
+IO.puts("Supported languages: English, French, German")
+IO.puts("Content extracted: #{byte_size(result.content)} bytes")
+IO.puts("Detected languages: #{inspect(result.detected_languages)}")
+IO.puts("Chunks created: #{chunk_count}")
+IO.puts("\nExtracted content preview:")
+IO.puts(String.slice(result.content, 0..250))
+
+# Metadata fields fall back to "Unknown" when missing
+metadata = result.metadata || %{}
+page_info = metadata["pages"] || "Unknown"
+format_info = metadata["format"] || "Unknown"
+
+IO.puts("\nMetadata:")
+IO.puts("Pages: #{page_info}")
+IO.puts("Format: #{format_info}")
+```
--- a/docs/snippets/elixir/ocr/ocr_multi_language.md
+++ b/docs/snippets/elixir/ocr/ocr_multi_language.md
@@ -0,0 +1,11 @@
+```elixir title="Elixir"
+# Several language codes are joined with "+" in a single "language" string
+ocr_settings = %{"backend" => "tesseract", "language" => "eng+deu+fra"}
+config_json = Jason.encode!(%{"ocr" => ocr_settings})
+
+{:ok, result} = Kreuzberg.extract_file_sync("multilingual.pdf", "application/pdf", config_json)
+IO.puts(result.content)
+```
--- a/docs/snippets/elixir/ocr/ocr_paddleocr.md
+++ b/docs/snippets/elixir/ocr/ocr_paddleocr.md
@@ -0,0 +1,13 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+ocr_settings = %{"enabled" => true, "backend" => "paddle-ocr", "language" => "en"}
+
+config = %ExtractionConfig{
+  # Add "model_tier" => "server" for max accuracy
+  ocr: ocr_settings
+}
+
+{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)
+
+# Heading first, then the extracted text
+IO.puts("OCR Extracted content:")
+IO.puts(result.content)
+```
--- a/docs/snippets/elixir/ocr/tesseract_basic.exs
+++ b/docs/snippets/elixir/ocr/tesseract_basic.exs
@@ -0,0 +1,14 @@
+```elixir title="Elixir"
+alias Kreuzberg.ExtractionConfig
+
+# Minimal OCR setup: enable OCR and select the "tesseract" backend
+ocr_settings = %{"enabled" => true, "backend" => "tesseract"}
+config = %ExtractionConfig{ocr: ocr_settings}
+
+{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)
+
+# Print the extracted text, then the raw metadata for inspection
+content = result.content
+metadata_dump = inspect(result.metadata)
+
+IO.puts("OCR Extracted content:")
+IO.puts(content)
+IO.puts("Metadata: #{metadata_dump}")
+```