Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/elixir/config/advanced_config.md
+++ b/docs/snippets/elixir/config/advanced_config.md
@@ -0,0 +1,53 @@
+```elixir title="Elixir"
+# Each feature's settings are built as a named map, then combined into one JSON configuration.
+ocr_options = %{"backend" => "tesseract", "language" => "eng"}
+
+embedding_options = %{
+  "model" => %{"preset" => %{"name" => "balanced"}},
+  "batch_size" => 32,
+  "normalize" => true,
+  "show_download_progress" => false
+}
+
+chunking_options = %{
+  "max_characters" => 1000,
+  "overlap" => 200,
+  "embedding" => embedding_options
+}
+
+language_detection_options = %{
+  "enabled" => true,
+  "min_confidence" => 0.8,
+  "detect_multiple" => false
+}
+
+keyword_options = %{
+  "algorithm" => "Yake",
+  "max_keywords" => 10,
+  "min_score" => 0.1,
+  "ngram_range" => [1, 3],
+  "language" => "en"
+}
+
+token_reduction_options = %{"mode" => "moderate", "preserve_important_words" => true}
+
+config_json =
+  Jason.encode!(%{
+    "use_cache" => true,
+    "enable_quality_processing" => true,
+    "ocr" => ocr_options,
+    "chunking" => chunking_options,
+    "language_detection" => language_detection_options,
+    "keywords" => keyword_options,
+    "token_reduction" => token_reduction_options,
+    "postprocessor" => %{"enabled" => true}
+  })
+
+{:ok, extraction} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Content: #{extraction.content}")
+
+languages = extraction.detected_languages
+if languages, do: IO.puts("Languages: #{inspect(languages)}")
+chunk_total = length(extraction.chunks || [])
+IO.puts("Chunks: #{chunk_total}")
+```
--- a/docs/snippets/elixir/config/chunking_config.md
+++ b/docs/snippets/elixir/config/chunking_config.md
@@ -0,0 +1,15 @@
+```elixir title="Elixir"
+config_json = Jason.encode!(%{
+  "chunking" => %{
+    "max_characters" => 1000,
+    "overlap" => 200
+  }
+})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+# `result.chunks` may be nil when no chunks are returned; fall back to an empty list.
+chunks = result.chunks || []
+IO.puts("Chunks: #{length(chunks)}")
+
+Enum.each(chunks, fn chunk -> IO.puts("Length: #{String.length(chunk.content)}") end)
+```
--- a/docs/snippets/elixir/config/config_basic.md
+++ b/docs/snippets/elixir/config/config_basic.md
@@ -0,0 +1,9 @@
+```elixir title="Elixir"
+# Options are written as a plain map and handed to the extractor as a JSON string.
+options = %{"use_cache" => true, "enable_quality_processing" => true}
+config_json = Jason.encode!(options)
+
+# Assert success: any return other than {:ok, _} raises a MatchError here.
+{:ok, extraction} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts(extraction.content)
+```
--- a/docs/snippets/elixir/config/config_discover.md
+++ b/docs/snippets/elixir/config/config_discover.md
@@ -0,0 +1,5 @@
+```elixir title="Elixir"
+# A nil config argument leaves the library on its discovered/default configuration.
+{:ok, extraction} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", nil)
+IO.puts(extraction.content)
+```
--- a/docs/snippets/elixir/config/config_ocr.md
+++ b/docs/snippets/elixir/config/config_ocr.md
@@ -0,0 +1,12 @@
+```elixir title="Elixir"
+# OCR settings live under the "ocr" key of the JSON configuration.
+ocr_options = %{"backend" => "tesseract", "language" => "eng"}
+config_json = Jason.encode!(%{"ocr" => ocr_options})
+
+{:ok, extraction} =
+  Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)
+
+# String.length/1 counts graphemes, not bytes.
+IO.puts("Content length: #{String.length(extraction.content)}")
+IO.puts("Tables detected: #{length(extraction.tables)}")
+```
--- a/docs/snippets/elixir/config/config_programmatic.md
+++ b/docs/snippets/elixir/config/config_programmatic.md
@@ -0,0 +1,20 @@
+```elixir title="Elixir"
+# The OCR and chunking sections are assembled first, then combined into one configuration.
+ocr_options = %{
+  "backend" => "tesseract",
+  "language" => "eng+deu",
+  "tesseract_config" => %{"psm" => 6}
+}
+chunking_options = %{"max_characters" => 1000, "overlap" => 200}
+
+config_json =
+  Jason.encode!(%{
+    "use_cache" => true,
+    "ocr" => ocr_options,
+    "chunking" => chunking_options,
+    "enable_quality_processing" => true
+  })
+
+{:ok, extraction} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Content length: #{String.length(extraction.content)}")
+```
--- a/docs/snippets/elixir/config/document_structure_config.md
+++ b/docs/snippets/elixir/config/document_structure_config.md
@@ -0,0 +1,13 @@
+```elixir title="Document Structure Config (Elixir)"
+# The config is JSON-encoded and passed after the MIME type, matching extract_file_sync/3.
+config_json = Jason.encode!(%{"include_document_structure" => true})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+# `result.document` is nil-checked because the structure tree may be absent from the result.
+if result.document do
+  Enum.each(result.document.nodes, fn node ->
+    IO.puts("[#{node.content.node_type}]")
+  end)
+end
+```
--- a/docs/snippets/elixir/config/element_based_output.md
+++ b/docs/snippets/elixir/config/element_based_output.md
@@ -0,0 +1,36 @@
+```elixir title="Element-Based Output (Elixir)"
+# Ask for element-based output through the config struct's `output_format` field.
+config = %Kreuzberg.ExtractionConfig{
+  output_format: :element_based
+}
+
+# NOTE(review): sibling snippets call extract_file_sync(path, mime_type, config_json) — confirm this 2-argument struct form.
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", config)
+
+# Print each element's type, the first 100 characters of its text, and its page/coordinates when set.
+Enum.each(result.elements, fn element ->
+  IO.puts("Type: #{element.element_type}")
+
+  text = String.slice(element.text, 0, 100)
+  IO.puts("Text: #{text}")
+
+  if element.metadata.page_number do
+    IO.puts("Page: #{element.metadata.page_number}")
+  end
+
+  if element.metadata.coordinates do
+    coords = element.metadata.coordinates
+    IO.puts("Coords: (#{coords.left}, #{coords.top}) - (#{coords.right}, #{coords.bottom})")
+  end
+
+  IO.puts("---")
+end)
+
+# Keep only elements whose type is :title and print each with its "level" ("unknown" when absent).
+titles = Enum.filter(result.elements, fn e -> e.element_type == :title end)
+
+Enum.each(titles, fn title ->
+  level = Map.get(title.metadata.additional, "level", "unknown")
+  IO.puts("[#{level}] #{title.text}")
+end)
+```
--- a/docs/snippets/elixir/config/embedding_config.md
+++ b/docs/snippets/elixir/config/embedding_config.md
@@ -0,0 +1,22 @@
+```elixir title="Elixir"
+# The embedding settings are nested inside the chunking section of the configuration.
+embedding_options = %{
+  "model" => %{"preset" => %{"name" => "balanced"}},
+  "batch_size" => 16,
+  "normalize" => true,
+  "show_download_progress" => true
+}
+
+chunking_options = %{
+  "max_characters" => 1000,
+  "overlap" => 200,
+  "embedding" => embedding_options
+}
+
+config_json = Jason.encode!(%{"chunking" => chunking_options})
+
+{:ok, extraction} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+# A nil chunk list is reported as zero chunks.
+chunk_total = length(extraction.chunks || [])
+IO.puts("Chunks with embeddings: #{chunk_total}")
+```
--- a/docs/snippets/elixir/config/html_output.md
+++ b/docs/snippets/elixir/config/html_output.md
@@ -0,0 +1,11 @@
+```elixir title="Elixir"
+# Select HTML as the output format; the theme is set in the "html_output" section.
+html_options = %{"theme" => "GitHub"}
+config_json =
+  Jason.encode!(%{"output_format" => "Html", "html_output" => html_options})
+
+{:ok, extraction} =
+  Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+IO.puts(extraction.content)
+```
--- a/docs/snippets/elixir/config/keyword_extraction_config.md
+++ b/docs/snippets/elixir/config/keyword_extraction_config.md
@@ -0,0 +1,14 @@
+```elixir title="Elixir"
+# Keyword extraction is configured under the "keywords" key.
+keyword_options = %{
+  "algorithm" => "Yake",
+  "max_keywords" => 10,
+  "min_score" => 0.1,
+  "ngram_range" => [1, 3],
+  "language" => "en"
+}
+config_json = Jason.encode!(%{"keywords" => keyword_options})
+
+{:ok, extraction} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Keywords: #{inspect(extraction.keywords)}")
+```
--- a/docs/snippets/elixir/config/language_detection_config.md
+++ b/docs/snippets/elixir/config/language_detection_config.md
@@ -0,0 +1,13 @@
+```elixir title="Elixir"
+config_json = Jason.encode!(%{
+  "language_detection" => %{
+    "enabled" => true,
+    "min_confidence" => 0.8,
+    "detect_multiple" => true
+  }
+})
+
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+# Detection output is read from `result.detected_languages`, which may be nil; print [] in that case.
+IO.puts("Detected languages: #{inspect(result.detected_languages || [])}")
+```
--- a/docs/snippets/elixir/config/ocr_dpi_config.md
+++ b/docs/snippets/elixir/config/ocr_dpi_config.md
@@ -0,0 +1,15 @@
+```elixir title="Elixir"
+# Image extraction and DPI settings are grouped under the "images" key.
+image_options = %{
+  "extract_images" => true,
+  "target_dpi" => 300,
+  "max_image_dimension" => 4096,
+  "auto_adjust_dpi" => true,
+  "min_dpi" => 150,
+  "max_dpi" => 600
+}
+config_json = Jason.encode!(%{"images" => image_options})
+
+{:ok, extraction} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Extracted images: #{length(extraction.images)}")
+```
--- a/docs/snippets/elixir/config/pdf_config.md
+++ b/docs/snippets/elixir/config/pdf_config.md
@@ -0,0 +1,14 @@
+```elixir title="Elixir"
+# PDF-specific settings (including the passwords to try) go under "pdf_options".
+pdf_options = %{
+  "extract_images" => true,
+  "passwords" => ["password123"],
+  "extract_metadata" => true,
+  "hierarchy" => %{}
+}
+config_json = Jason.encode!(%{"pdf_options" => pdf_options})
+
+{:ok, extraction} = Kreuzberg.extract_file_sync("encrypted.pdf", "application/pdf", config_json)
+IO.puts("Title: #{inspect(extraction.metadata.title)}")
+IO.puts("Authors: #{inspect(extraction.metadata.authors)}")
+```
--- a/docs/snippets/elixir/config/pdf_hierarchy_config.md
+++ b/docs/snippets/elixir/config/pdf_hierarchy_config.md
@@ -0,0 +1,16 @@
+```elixir title="Elixir"
+config_json = Jason.encode!(%{
+  "pdf_options" => %{
+    "hierarchy" => %{
+      "enabled" => true,
+      "detection_threshold" => 0.75,
+      "ocr_coverage_threshold" => 0.8,
+      "min_level" => 1,
+      "max_level" => 5
+    }
+  }
+})
+# Extract, then count `result.hierarchy`. NOTE(review): confirm the result exposes this list.
+{:ok, result} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Hierarchy levels: #{length(result.hierarchy)}")
+```
--- a/docs/snippets/elixir/config/postprocessor_config.md
+++ b/docs/snippets/elixir/config/postprocessor_config.md
@@ -0,0 +1,14 @@
+```elixir title="Elixir"
+# The processor names are listed first, then referenced from the "postprocessor" section.
+processors = ["whitespace_normalizer", "unicode_normalizer"]
+
+postprocessor_options = %{
+  "enabled" => true,
+  "enabled_processors" => processors
+}
+
+config_json = Jason.encode!(%{"postprocessor" => postprocessor_options})
+
+{:ok, extraction} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Processed content: #{extraction.content}")
+```
--- a/docs/snippets/elixir/config/quality_processing_config.md
+++ b/docs/snippets/elixir/config/quality_processing_config.md
@@ -0,0 +1,10 @@
+```elixir title="Elixir"
+# Turn on quality processing (with caching) through a flat options map.
+options = %{"enable_quality_processing" => true, "use_cache" => true}
+config_json = Jason.encode!(options)
+
+{:ok, extraction} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+
+IO.puts("Quality score: #{extraction.quality_score}")
+IO.puts("Processing time: #{inspect(extraction.processing_time)}")
+```
--- a/docs/snippets/elixir/config/tesseract_config.md
+++ b/docs/snippets/elixir/config/tesseract_config.md
@@ -0,0 +1,15 @@
+```elixir title="Elixir"
+# Tesseract-specific settings are nested under "tesseract_config" inside the OCR section.
+tesseract_options = %{"psm" => 6, "oem" => 3}
+
+ocr_options = %{
+  "backend" => "tesseract",
+  "language" => "eng+deu",
+  "tesseract_config" => tesseract_options
+}
+
+config_json = Jason.encode!(%{"ocr" => ocr_options})
+
+{:ok, extraction} = Kreuzberg.extract_file_sync("scanned.pdf", "application/pdf", config_json)
+IO.puts("OCR text: #{extraction.content}")
+```
--- a/docs/snippets/elixir/config/token_reduction_config.md
+++ b/docs/snippets/elixir/config/token_reduction_config.md
@@ -0,0 +1,12 @@
+```elixir title="Elixir"
+# Token reduction has its own section in the JSON configuration.
+reduction_options = %{
+  "mode" => "moderate",
+  "preserve_important_words" => true
+}
+config_json = Jason.encode!(%{"token_reduction" => reduction_options})
+
+{:ok, extraction} = Kreuzberg.extract_file_sync("document.pdf", "application/pdf", config_json)
+IO.puts("Original tokens: #{extraction.token_count}")
+IO.puts("Reduced content: #{extraction.content}")
+```