Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/ruby/config/advanced_config.md
+++ b/docs/snippets/ruby/config/advanced_config.md
@@ -0,0 +1,22 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  ocr: Kreuzberg::OcrConfig.new(
+    backend: 'tesseract',
+    language: 'eng+deu'
+  ),
+  chunking: Kreuzberg::ChunkingConfig.new(
+    max_characters: 1000,
+    overlap: 100
+  ),
+  language_detection: Kreuzberg::LanguageDetectionConfig.new,
+  use_cache: true,
+  enable_quality_processing: true
+)
+
+result = Kreuzberg.extract_file_sync('document.pdf', config: config)
+
+result.chunks&.each { |chunk| puts chunk.content[0..100] } # preview of each chunk's text
+puts "Languages: #{result.detected_languages.inspect}"
+```
--- a/docs/snippets/ruby/config/chunking_config.md
+++ b/docs/snippets/ruby/config/chunking_config.md
@@ -0,0 +1,55 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  chunking: Kreuzberg::ChunkingConfig.new(
+    max_characters: 1000,
+    overlap: 200
+  )
+)
+```
+
+```ruby title="Ruby - Markdown with Heading Context"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  chunking: Kreuzberg::ChunkingConfig.new(
+    chunker_type: "markdown",
+    max_characters: 500,
+    overlap: 50,
+    sizing_type: "tokenizer",
+    sizing_model: "Xenova/gpt-4o"
+  )
+)
+
+result = Kreuzberg.extract_file("document.md", config: config) # config goes in by keyword, as in the other snippets
+
+result.chunks.each do |chunk|
+  if chunk.metadata.heading_context
+    puts "Headings:"
+    chunk.metadata.heading_context.headings.each do |heading|
+      puts "  #{' ' * (heading.level - 1) * 2}Level #{heading.level}: #{heading.text}"
+    end
+  end
+end
+```
+
+```ruby title="Ruby - Prepend Heading Context"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  chunking: Kreuzberg::ChunkingConfig.new(
+    chunker_type: "markdown",
+    max_characters: 500,
+    overlap: 50,
+    prepend_heading_context: true
+  )
+)
+
+result = Kreuzberg.extract_file("document.md", config: config) # config goes in by keyword, as in the other snippets
+
+result.chunks.each do |chunk|
+  # Each chunk's content is prefixed with its heading breadcrumb
+  puts chunk.content[0, 100]
+end
+```
--- a/docs/snippets/ruby/config/config_basic.md
+++ b/docs/snippets/ruby/config/config_basic.md
@@ -0,0 +1,10 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  use_cache: true,
+  enable_quality_processing: true
+)
+
+result = Kreuzberg.extract_file_sync('document.pdf', config: config)
+```
--- a/docs/snippets/ruby/config/config_discover.md
+++ b/docs/snippets/ruby/config/config_discover.md
@@ -0,0 +1,6 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.discover
+result = Kreuzberg.extract_file_sync('document.pdf', config: config)
+```
--- a/docs/snippets/ruby/config/config_ocr.md
+++ b/docs/snippets/ruby/config/config_ocr.md
@@ -0,0 +1,11 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  ocr: Kreuzberg::OcrConfig.new(
+    backend: 'tesseract',
+    language: 'eng+fra',
+    tesseract_config: Kreuzberg::TesseractConfig.new(psm: 3)
+  )
+)
+```
--- a/docs/snippets/ruby/config/config_programmatic.md
+++ b/docs/snippets/ruby/config/config_programmatic.md
@@ -0,0 +1,19 @@
+```ruby title="Ruby"
+require "kreuzberg"
+
+config = Kreuzberg::ExtractionConfig.new(
+  use_cache: true,
+  ocr: Kreuzberg::OcrConfig.new(
+    backend: "tesseract",
+    language: "eng+deu",
+    tesseract: Kreuzberg::TesseractConfig.new(psm: 6)
+  ),
+  chunking: Kreuzberg::ChunkingConfig.new(
+    max_characters: 1000,
+    overlap: 200
+  )
+)
+
+result = Kreuzberg.extract_file_sync("document.pdf", config: config) # config goes in by keyword, as in the other snippets
+puts "Content length: #{result.content.length}"
+```
--- a/docs/snippets/ruby/config/document_structure_config.md
+++ b/docs/snippets/ruby/config/document_structure_config.md
@@ -0,0 +1,15 @@
+```ruby title="Document Structure Config (Ruby)"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(include_document_structure: true)
+
+result = Kreuzberg.extract_file_sync('document.pdf', config: config)
+
+if result.document
+  result.document['nodes'].each do |node|
+    node_type = node['content']['node_type']
+    text = node['content']['text'] || ''
+    puts "[#{node_type}] #{text[0...80]}"
+  end
+end
+```
--- a/docs/snippets/ruby/config/element_based_output.md
+++ b/docs/snippets/ruby/config/element_based_output.md
@@ -0,0 +1,31 @@
+```ruby title="Element-Based Output (Ruby)"
+require 'kreuzberg'
+
+# Configure element-based output
+config = Kreuzberg::ExtractionConfig.new(output_format: 'element_based')
+
+# Extract document
+result = Kreuzberg.extract_file_sync('document.pdf', config: config)
+
+# Access elements
+result.elements.each do |element|
+  puts "Type: #{element.element_type}"
+  puts "Text: #{element.text[0...100]}"
+
+  puts "Page: #{element.metadata.page_number}" if element.metadata.page_number
+
+  if element.metadata.coordinates
+    coords = element.metadata.coordinates
+    puts "Coords: (#{coords.left}, #{coords.top}) - (#{coords.right}, #{coords.bottom})"
+  end
+
+  puts "---"
+end
+
+# Filter by element type
+titles = result.elements.select { |e| e.element_type == 'title' }
+titles.each do |title|
+  level = title.metadata.additional['level'] || 'unknown'
+  puts "[#{level}] #{title.text}"
+end
+```
--- a/docs/snippets/ruby/config/embedding_config.md
+++ b/docs/snippets/ruby/config/embedding_config.md
@@ -0,0 +1,18 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  chunking: Kreuzberg::ChunkingConfig.new(
+    max_characters: 1000,
+    embedding: Kreuzberg::EmbeddingConfig.new(
+      model: Kreuzberg::EmbeddingModelType.new(
+        type: 'preset',
+        name: 'all-mpnet-base-v2'
+      ),
+      batch_size: 16,
+      normalize: true,
+      show_download_progress: true
+    )
+  )
+)
+```
--- a/docs/snippets/ruby/config/embedding_config.rb
+++ b/docs/snippets/ruby/config/embedding_config.rb
@@ -0,0 +1,89 @@
+require 'kreuzberg'
+
+# Example 1: Preset model (recommended)
+# Fast, balanced, or quality preset configurations optimized for common use cases.
+embedding_config = Kreuzberg::EmbeddingConfig.new(
+  model: { type: :preset, name: "balanced" },
+  batch_size: 32,
+  normalize: true,
+  show_download_progress: true,
+  cache_dir: "~/.cache/kreuzberg/embeddings"
+)
+
+# Available presets:
+# - "fast" (384 dims): Quick prototyping, development, resource-constrained
+# - "balanced" (768 dims): Production, general-purpose RAG, English documents
+# - "quality" (1024 dims): Complex documents, maximum accuracy
+# - "multilingual" (768 dims): International documents, 100+ languages
+
+
+# Example 2: Custom ONNX model (requires embeddings feature)
+# Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
+embedding_config = Kreuzberg::EmbeddingConfig.new(
+  model: {
+    type: :custom,
+    model_id: "BAAI/bge-small-en-v1.5",
+    dimensions: 384
+  },
+  batch_size: 32,
+  normalize: true,
+  show_download_progress: true,
+  cache_dir: nil  # Uses default: .kreuzberg/embeddings/
+)
+
+# Popular ONNX-compatible models:
+# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
+# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
+# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
+# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
+
+
+# Example 3: Alternative Custom ONNX Model
+# For advanced users wanting different ONNX embedding models.
+embedding_config = Kreuzberg::EmbeddingConfig.new(
+  model: {
+    type: :custom,
+    model_id: "sentence-transformers/all-mpnet-base-v2",
+    dimensions: 768
+  },
+  batch_size: 16,  # Larger model requires smaller batch size
+  normalize: true,
+  show_download_progress: true,
+  cache_dir: "/var/cache/embeddings"
+)
+
+
+# Integration with ChunkingConfig
+# Add embeddings to your chunking configuration:
+chunking_config = Kreuzberg::ChunkingConfig.new(
+  max_characters: 1024,
+  overlap: 100,
+  preset: "balanced",
+  embedding: Kreuzberg::EmbeddingConfig.new(
+    model: { type: :preset, name: "balanced" },
+    batch_size: 32,
+    normalize: true
+  )
+)
+
+extraction_config = Kreuzberg::ExtractionConfig.new(
+  chunking: chunking_config
+)
+
+
+# Key parameter explanations:
+#
+# batch_size: Number of texts to embed at once (32-128 typical)
+#   - Larger batches are faster but use more memory
+#   - Smaller batches for resource-constrained environments
+#
+# normalize: Whether to normalize vectors (L2 norm)
+#   - true (recommended): Enables cosine similarity in vector DBs
+#   - false: Raw embedding values
+#
+# cache_dir: Where to store downloaded models
+#   - nil: Uses .kreuzberg/embeddings/ in current directory
+#   - String: Custom directory for model storage
+#
+# show_download_progress: Display download progress bar
+#   - Useful for monitoring large model downloads
--- a/docs/snippets/ruby/config/hierarchy_config.rb
+++ b/docs/snippets/ruby/config/hierarchy_config.rb
@@ -0,0 +1,87 @@
+require 'kreuzberg'
+
+# Example 1: Basic hierarchy extraction
+# Enabled with default k_clusters=6 for standard H1-H6 heading hierarchy.
+# Extract bounding box information for spatial layout awareness.
+hierarchy_config_basic = Kreuzberg::HierarchyConfig.new(
+  enabled: true,
+  k_clusters: 6,  # Default: creates 6 font size clusters (H1-H6 structure)
+  include_bbox: true,  # Include bounding box coordinates
+  ocr_coverage_threshold: nil  # No OCR coverage threshold
+)
+
+pdf_config_basic = Kreuzberg::PdfConfig.new(
+  hierarchy: hierarchy_config_basic
+)
+
+extraction_config_basic = Kreuzberg::ExtractionConfig.new(
+  pdf_options: pdf_config_basic
+)
+
+# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_basic)
+
+
+# Example 2: Custom k_clusters for minimal structure
+# Use 3 clusters for simpler hierarchy with minimal structure.
+# Useful when you only need major section divisions (Main, Subsection, Detail).
+hierarchy_config_minimal = Kreuzberg::HierarchyConfig.new(
+  enabled: true,
+  k_clusters: 3,  # Minimal clustering: just 3 levels
+  include_bbox: true,
+  ocr_coverage_threshold: nil
+)
+
+pdf_config_minimal = Kreuzberg::PdfConfig.new(
+  hierarchy: hierarchy_config_minimal
+)
+
+extraction_config_minimal = Kreuzberg::ExtractionConfig.new(
+  pdf_options: pdf_config_minimal
+)
+
+# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_minimal)
+
+
+# Example 3: With OCR coverage threshold
+# Trigger OCR if less than 50% of text has font data.
+# Useful for documents with mixed digital and scanned content.
+hierarchy_config_ocr = Kreuzberg::HierarchyConfig.new(
+  enabled: true,
+  k_clusters: 6,
+  include_bbox: true,
+  ocr_coverage_threshold: 0.5  # Trigger OCR if text coverage < 50%
+)
+
+pdf_config_ocr = Kreuzberg::PdfConfig.new(
+  hierarchy: hierarchy_config_ocr
+)
+
+extraction_config_ocr = Kreuzberg::ExtractionConfig.new(
+  pdf_options: pdf_config_ocr
+)
+
+# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_ocr)
+
+
+# Field descriptions:
+#
+# enabled: boolean (default: true)
+#   - Enable or disable hierarchy extraction
+#   - When false, hierarchy structure is not analyzed
+#
+# k_clusters: integer (default: 6, valid: 1-7)
+#   - Number of font size clusters for hierarchy levels
+#   - 6 provides H1-H6 heading levels with body text
+#   - Higher values create more fine-grained hierarchy
+#   - Lower values create simpler structure
+#
+# include_bbox: boolean (default: true)
+#   - Include bounding box coordinates in hierarchy blocks
+#   - Required for spatial layout awareness and document structure
+#   - Set to false only if space optimization is critical
+#
+# ocr_coverage_threshold: float | nil (default: nil)
+#   - Range: 0.0 to 1.0
+#   - Triggers OCR when text block coverage falls below this fraction
+#   - Example: 0.5 means "run OCR if less than 50% of page has text data"
+#   - nil means no OCR coverage-based triggering
--- a/docs/snippets/ruby/config/html_output.md
+++ b/docs/snippets/ruby/config/html_output.md
@@ -0,0 +1,14 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  output_format: 'html',
+  html_output: Kreuzberg::HtmlOutputConfig.new(
+    theme: 'git_hub',
+    embed_css: true
+  )
+)
+
+result = Kreuzberg.extract_file_sync('document.pdf', config: config) # config goes in by keyword, as in the other snippets
+puts result.content # HTML with kb-* classes
+```
--- a/docs/snippets/ruby/config/keyword_config.rb
+++ b/docs/snippets/ruby/config/keyword_config.rb
@@ -0,0 +1,65 @@
+require 'kreuzberg'
+
+# Example 1: Basic YAKE configuration
+# Uses YAKE algorithm with default parameters and English stopword filtering
+def basic_yake
+  config = Kreuzberg::ExtractionConfig.new(
+    keywords: Kreuzberg::KeywordConfig.new(
+      algorithm: :yake,
+      max_keywords: 10,
+      min_score: 0.0,
+      ngram_range: [1, 3],
+      language: 'en',
+      yake_params: nil,
+      rake_params: nil
+    )
+  )
+
+  result = Kreuzberg.extract_file('document.pdf', config: config)
+  puts "Keywords: #{result.keywords}"
+end
+
+# Example 2: Advanced YAKE with custom parameters
+# Fine-tunes YAKE with custom window size for co-occurrence analysis
+def advanced_yake
+  config = Kreuzberg::ExtractionConfig.new(
+    keywords: Kreuzberg::KeywordConfig.new(
+      algorithm: :yake,
+      max_keywords: 15,
+      min_score: 0.1,
+      ngram_range: [1, 2],
+      language: 'en',
+      yake_params: Kreuzberg::YakeParams.new(
+        window_size: 1
+      ),
+      rake_params: nil
+    )
+  )
+
+  result = Kreuzberg.extract_file('document.pdf', config: config)
+  puts "Keywords: #{result.keywords}"
+end
+
+# Example 3: RAKE configuration
+# Uses RAKE algorithm for rapid keyword extraction with phrase constraints
+def rake_config
+  config = Kreuzberg::ExtractionConfig.new(
+    keywords: Kreuzberg::KeywordConfig.new(
+      algorithm: :rake,
+      max_keywords: 10,
+      min_score: 5.0,
+      ngram_range: [1, 3],
+      language: 'en',
+      yake_params: nil,
+      rake_params: Kreuzberg::RakeParams.new(
+        min_word_length: 1,
+        max_words_per_phrase: 3
+      )
+    )
+  )
+
+  result = Kreuzberg.extract_file('document.pdf', config: config)
+  puts "Keywords: #{result.keywords}"
+end
+
+basic_yake if __FILE__ == $PROGRAM_NAME # run the basic example only when executed directly
--- a/docs/snippets/ruby/config/keyword_extraction_config.md
+++ b/docs/snippets/ruby/config/keyword_extraction_config.md
@@ -0,0 +1,13 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  keywords: Kreuzberg::KeywordConfig.new(
+    algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
+    max_keywords: 10,
+    min_score: 0.3,
+    ngram_range: [1, 3],
+    language: 'en'
+  )
+)
+```
--- a/docs/snippets/ruby/config/language_detection_config.md
+++ b/docs/snippets/ruby/config/language_detection_config.md
@@ -0,0 +1,11 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  language_detection: Kreuzberg::LanguageDetectionConfig.new(
+    enabled: true,
+    min_confidence: 0.8,
+    detect_multiple: false
+  )
+)
+```
--- a/docs/snippets/ruby/config/ocr_dpi_config.md
+++ b/docs/snippets/ruby/config/ocr_dpi_config.md
@@ -0,0 +1,10 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  ocr: Kreuzberg::OcrConfig.new(backend: 'tesseract'),
+  pdf: Kreuzberg::PdfConfig.new(dpi: 300)
+)
+
+result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
+```
--- a/docs/snippets/ruby/config/pdf_config.md
+++ b/docs/snippets/ruby/config/pdf_config.md
@@ -0,0 +1,16 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  pdf_options: Kreuzberg::PdfConfig.new(
+    extract_images: true,
+    extract_metadata: true,
+    passwords: ['password1', 'password2'],
+    hierarchy: Kreuzberg::HierarchyConfig.new(
+      enabled: true,
+      k_clusters: 6,
+      include_bbox: true
+    )
+  )
+)
+```
--- a/docs/snippets/ruby/config/pdf_hierarchy_config.md
+++ b/docs/snippets/ruby/config/pdf_hierarchy_config.md
@@ -0,0 +1,29 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+# Using keyword arguments with defaults
+config = Kreuzberg::ExtractionConfig.new(
+  pdf_options: Kreuzberg::PdfConfig.new(
+    extract_images: true,
+    hierarchy: Kreuzberg::HierarchyConfig.new(
+      enabled: true,
+      k_clusters: 6,
+      include_bbox: true,
+      ocr_coverage_threshold: 0.8
+    )
+  )
+)
+
+# Using hash syntax alternative
+config = Kreuzberg::ExtractionConfig.new(
+  pdf_options: Kreuzberg::PdfConfig.new(
+    extract_images: true,
+    hierarchy: {
+      enabled: true,
+      k_clusters: 6,
+      include_bbox: true,
+      ocr_coverage_threshold: 0.8
+    }
+  )
+)
+```
--- a/docs/snippets/ruby/config/postprocessor_config.md
+++ b/docs/snippets/ruby/config/postprocessor_config.md
@@ -0,0 +1,11 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  postprocessor: Kreuzberg::PostProcessorConfig.new(
+    enabled: true,
+    enabled_processors: ['deduplication', 'whitespace_normalization'],
+    disabled_processors: ['mojibake_fix']
+  )
+)
+```
--- a/docs/snippets/ruby/config/quality_processing_config.md
+++ b/docs/snippets/ruby/config/quality_processing_config.md
@@ -0,0 +1,7 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  enable_quality_processing: true
+)
+```
--- a/docs/snippets/ruby/config/tesseract_config.md
+++ b/docs/snippets/ruby/config/tesseract_config.md
@@ -0,0 +1,16 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  ocr: Kreuzberg::OcrConfig.new(
+    language: 'eng+fra+deu',
+    tesseract_config: Kreuzberg::TesseractConfig.new(
+      psm: 6,
+      oem: 1,
+      min_confidence: 0.8,
+      tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?',
+      enable_table_detection: true
+    )
+  )
+)
+```
--- a/docs/snippets/ruby/config/token_reduction_config.md
+++ b/docs/snippets/ruby/config/token_reduction_config.md
@@ -0,0 +1,12 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  token_reduction: Kreuzberg::TokenReductionConfig.new(
+    mode: 'moderate',
+    preserve_markdown: true,
+    preserve_code: true,
+    language_hint: 'eng'
+  )
+)
+```