Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/ruby/ocr/cloud_ocr_backend.md
+++ b/docs/snippets/ruby/ocr/cloud_ocr_backend.md
@@ -0,0 +1,32 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+require 'net/http'
+
+class CloudOcrBackend
+  def name
+    'cloud-ocr'
+  end
+
+  def supported_languages
+    %w[eng fra deu]
+  end
+
+  def process_image(image_data, language)
+    uri = URI('https://api.example.com/ocr')
+    req = Net::HTTP::Post.new(uri)
+    req['Authorization'] = "Bearer #{ENV['OCR_API_KEY']}"
+    req.body = image_data
+    res = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { |h| h.request(req) }
+    raise StandardError, res.message unless res.is_a?(Net::HTTPSuccess)
+    { content: JSON.parse(res.body)['text'] }
+  rescue StandardError => e
+    raise StandardError, e.message
+  end
+end
+
+Kreuzberg.register_ocr_backend(CloudOcrBackend.new)
+config = Kreuzberg::ExtractionConfig.new(
+  ocr: Kreuzberg::OcrConfig.new(backend: 'cloud-ocr')
+)
+Kreuzberg.extract_file_sync('doc.pdf', config: config)
+```
--- a/docs/snippets/ruby/ocr/image_extraction.md
+++ b/docs/snippets/ruby/ocr/image_extraction.md
@@ -0,0 +1,13 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  images: Kreuzberg::ImageExtractionConfig.new(
+    extract_images: true,
+    target_dpi: 200,
+    max_image_dimension: 2048,
+    inject_placeholders: true, # set to false to extract images without markdown references
+    auto_adjust_dpi: true
+  )
+)
+```
--- a/docs/snippets/ruby/ocr/image_preprocessing.md
+++ b/docs/snippets/ruby/ocr/image_preprocessing.md
@@ -0,0 +1,17 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  ocr: Kreuzberg::OcrConfig.new(
+    tesseract_config: Kreuzberg::TesseractConfig.new(
+      preprocessing: Kreuzberg::ImagePreprocessingConfig.new(
+        target_dpi: 300,
+        denoise: true,
+        deskew: true,
+        contrast_enhance: true,
+        binarization_method: 'otsu'
+      )
+    )
+  )
+)
+```
--- a/docs/snippets/ruby/ocr/ocr_easyocr.md
+++ b/docs/snippets/ruby/ocr/ocr_easyocr.md
@@ -0,0 +1,14 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  ocr: Kreuzberg::OcrConfig.new(
+    backend: 'easyocr',
+    language: 'eng'
+  )
+)
+
+result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
+puts result.content[0..100]
+puts "Total length: #{result.content.length}"
+```
--- a/docs/snippets/ruby/ocr/ocr_elements.md
+++ b/docs/snippets/ruby/ocr/ocr_elements.md
@@ -0,0 +1,22 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  ocr: Kreuzberg::OcrConfig.new(
+    backend: 'paddleocr',
+    language: 'eng'
+  )
+)
+
+result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
+
+result.ocr_elements&.each do |element|
+  puts "Text: #{element.text}"
+  puts "Confidence: #{format('%.2f', element.confidence.recognition)}"
+  puts "Geometry: #{element.geometry}"
+  if element.rotation
+    puts "Rotation: #{element.rotation.angle}°"
+  end
+  puts
+end
+```
--- a/docs/snippets/ruby/ocr/ocr_extraction.md
+++ b/docs/snippets/ruby/ocr/ocr_extraction.md
@@ -0,0 +1,12 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+ocr_config = Kreuzberg::OcrConfig.new(
+  backend: 'tesseract',
+  language: 'eng'
+)
+
+config = Kreuzberg::ExtractionConfig.new(ocr: ocr_config)
+result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
+puts result.content
+```
--- a/docs/snippets/ruby/ocr/ocr_force_all_pages.md
+++ b/docs/snippets/ruby/ocr/ocr_force_all_pages.md
@@ -0,0 +1,11 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  ocr: Kreuzberg::OcrConfig.new(backend: 'tesseract'),
+  force_ocr: true
+)
+
+result = Kreuzberg.extract_file_sync('document.pdf', config: config)
+puts result.content
+```
--- a/docs/snippets/ruby/ocr/ocr_multi_language.md
+++ b/docs/snippets/ruby/ocr/ocr_multi_language.md
@@ -0,0 +1,13 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  ocr: Kreuzberg::OcrConfig.new(
+    backend: 'tesseract',
+    language: 'eng+deu+fra'
+  )
+)
+
+result = Kreuzberg.extract_file_sync('multilingual.pdf', config: config)
+puts result.content
+```
--- a/docs/snippets/ruby/ocr/ocr_paddleocr.md
+++ b/docs/snippets/ruby/ocr/ocr_paddleocr.md
@@ -0,0 +1,15 @@
+```ruby title="Ruby"
+require 'kreuzberg'
+
+config = Kreuzberg::ExtractionConfig.new(
+  ocr: Kreuzberg::OcrConfig.new(
+    backend: 'paddleocr',
+    language: 'eng'
+    # model_tier: 'server' # for max accuracy
+  )
+)
+
+result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
+puts result.content[0..100]
+puts "Total length: #{result.content.length}"
+```