Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/elixir/ocr/ocr_force_all_pages.exs
+++ b/docs/snippets/elixir/ocr/ocr_force_all_pages.exs
@@ -0,0 +1,96 @@
+```elixir title="Elixir"
+# Force OCR processing on all pages of a document
+# By default, OCR is only applied when needed (scanned content detected)
+# Use force_all_pages to ensure OCR runs on every page regardless
+
+alias Kreuzberg.ExtractionConfig
+
+# Configuration with OCR forced on all pages
+config = %ExtractionConfig{
+  ocr: %{
+    "enabled" => true,
+    "backend" => "tesseract",
+    "language" => "eng",
+    # Force OCR to run on every page, not just scanned content
+    "force_all_pages" => true
+  },
+  chunking: %{
+    "enabled" => true,
+    "max_characters" => 1500,
+    "overlap" => 150
+  },
+  language_detection: %{
+    "enabled" => true
+  },
+  use_cache: true
+}
+
+IO.puts("Starting OCR extraction with force_all_pages enabled...")
+start_time = System.monotonic_time(:millisecond)
+
+case Kreuzberg.extract_file("mixed_document.pdf", nil, config) do
+  {:ok, result} ->
+    elapsed_ms = System.monotonic_time(:millisecond) - start_time
+
+    IO.puts("\n=== OCR Extraction Complete ===\n")
+
+    # Show timing information
+    IO.puts("Processing time: #{elapsed_ms}ms")
+    IO.puts("Content extracted: #{byte_size(result.content)} bytes")
+
+    # Metadata shows OCR was performed
+    metadata = result.metadata || %{}
+    if metadata["ocr_applied"] do
+      IO.puts("OCR applied to all pages: Yes")
+    end
+
+    # Show language detection results
+    languages = result.detected_languages || []
+    IO.puts("\nDetected languages (#{length(languages)}):")
+    Enum.each(languages, fn lang ->
+      IO.puts("  - #{lang}")
+    end)
+
+    # Show chunking results (useful for RAG/search)
+    chunks = result.chunks || []
+    IO.puts("\nChunks created: #{length(chunks)}")
+    avg_chunk_size = if Enum.empty?(chunks) do
+      0
+    else
+      total_size = Enum.reduce(chunks, 0, &(byte_size(&1) + &2))
+      div(total_size, length(chunks))
+    end
+    IO.puts("Average chunk size: #{avg_chunk_size} bytes")
+
+    # Display content preview
+    IO.puts("\nContent preview (first 300 characters):")
+    preview = String.slice(result.content, 0..299)
+    IO.puts(preview)
+    IO.puts("...\n")
+
+    # Show any extracted tables
+    tables = result.tables || []
+    if not Enum.empty?(tables) do
+      IO.puts("Tables found: #{length(tables)}")
+      Enum.with_index(tables, 1) |> Enum.each(fn {table, idx} ->
+        cells = table["cells"] || []
+        IO.puts("  Table #{idx}: #{length(cells)} rows")
+      end)
+      IO.puts("")
+    end
+
+    # Show any extracted images
+    images = result.images || []
+    if not Enum.empty?(images) do
+      IO.puts("Images extracted: #{length(images)}")
+      Enum.with_index(images, 1) |> Enum.each(fn {image, idx} ->
+        IO.puts("  Image #{idx}: #{image["format"]} - #{image["size"]} bytes")
+      end)
+    end
+
+  {:error, reason} ->
+    elapsed_ms = System.monotonic_time(:millisecond) - start_time
+    IO.puts("OCR extraction failed after #{elapsed_ms}ms")
+    IO.puts("Error: #{inspect(reason)}")
+end
+```