This commit is contained in:
32
docs/snippets/ruby/ocr/cloud_ocr_backend.md
Normal file
32
docs/snippets/ruby/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
require 'json'
require 'net/http'
|
||||
|
||||
# Example OCR backend that forwards images to a remote HTTP service.
#
# Exposes the three methods used by this snippet's backend object:
# #name, #supported_languages and #process_image.
class CloudOcrBackend
  # URL of the remote OCR service that receives the image payload.
  ENDPOINT = 'https://api.example.com/ocr'

  # @return [String] the identifier of this backend
  def name
    'cloud-ocr'
  end

  # @return [Array<String>] the language codes this backend advertises
  def supported_languages
    %w[eng fra deu]
  end

  # POSTs the image to the OCR service and wraps the recognised text.
  #
  # Errors are deliberately not rescued and re-wrapped: network, JSON and
  # configuration failures reach the caller with their original class and
  # backtrace (all of them are StandardError subclasses).
  #
  # @param image_data [String] sent unchanged as the request body
  #   (presumably the raw image bytes -- confirm against the caller)
  # @param language [String] accepted for interface compatibility; it is not
  #   forwarded to the service
  # @return [Hash] `{ content: ... }` holding the "text" field of the JSON response
  # @raise [KeyError] if the OCR_API_KEY environment variable is not set
  # @raise [StandardError] carrying the HTTP status message on a non-success response
  def process_image(image_data, language)
    uri = URI(ENDPOINT)

    request = Net::HTTP::Post.new(uri)
    # ENV.fetch fails fast instead of sending an "Authorization: Bearer " header with no token.
    request['Authorization'] = "Bearer #{ENV.fetch('OCR_API_KEY')}"
    request.body = image_data

    response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { |http| http.request(request) }
    raise StandardError, response.message unless response.is_a?(Net::HTTPSuccess)

    { content: JSON.parse(response.body)['text'] }
  end
end
|
||||
|
||||
# Register the custom backend, then request it by name through OcrConfig.
cloud_backend = CloudOcrBackend.new
Kreuzberg.register_ocr_backend(cloud_backend)

ocr_settings = Kreuzberg::OcrConfig.new(backend: 'cloud-ocr')
config = Kreuzberg::ExtractionConfig.new(ocr: ocr_settings)
Kreuzberg.extract_file_sync('doc.pdf', config: config)
|
||||
```
|
||||
13
docs/snippets/ruby/ocr/image_extraction.md
Normal file
13
docs/snippets/ruby/ocr/image_extraction.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Build the image-extraction settings first, then attach them to the extraction config.
image_settings = Kreuzberg::ImageExtractionConfig.new(
  extract_images: true,
  target_dpi: 200,
  max_image_dimension: 2048,
  inject_placeholders: true, # set to false to extract images without markdown references
  auto_adjust_dpi: true
)

config = Kreuzberg::ExtractionConfig.new(images: image_settings)
|
||||
```
|
||||
17
docs/snippets/ruby/ocr/image_preprocessing.md
Normal file
17
docs/snippets/ruby/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Each configuration layer is built on its own line, innermost first,
# instead of nesting four constructor calls.
preprocessing = Kreuzberg::ImagePreprocessingConfig.new(
  target_dpi: 300,
  denoise: true,
  deskew: true,
  contrast_enhance: true,
  binarization_method: 'otsu'
)
tesseract_settings = Kreuzberg::TesseractConfig.new(preprocessing: preprocessing)
ocr_settings = Kreuzberg::OcrConfig.new(tesseract_config: tesseract_settings)

config = Kreuzberg::ExtractionConfig.new(ocr: ocr_settings)
|
||||
```
|
||||
14
docs/snippets/ruby/ocr/ocr_easyocr.md
Normal file
14
docs/snippets/ruby/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Select the EasyOCR backend for English text.
ocr_settings = Kreuzberg::OcrConfig.new(backend: 'easyocr', language: 'eng')
config = Kreuzberg::ExtractionConfig.new(ocr: ocr_settings)

result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)

# Print a short preview (characters 0 through 100) followed by the full length.
puts result.content[0..100]
puts "Total length: #{result.content.length}"
|
||||
```
|
||||
22
docs/snippets/ruby/ocr/ocr_elements.md
Normal file
22
docs/snippets/ruby/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Run OCR with the PaddleOCR backend, then print details for each recognised element.
ocr_settings = Kreuzberg::OcrConfig.new(backend: 'paddleocr', language: 'eng')
config = Kreuzberg::ExtractionConfig.new(ocr: ocr_settings)

result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)

# Safe navigation: nothing is printed when ocr_elements is nil.
result.ocr_elements&.each do |item|
  puts "Text: #{item.text}"
  puts "Confidence: #{format('%.2f', item.confidence.recognition)}"
  puts "Geometry: #{item.geometry}"
  # Rotation is only reported when the element carries one.
  puts "Rotation: #{item.rotation.angle}°" if item.rotation
  puts
end
|
||||
```
|
||||
12
docs/snippets/ruby/ocr/ocr_extraction.md
Normal file
12
docs/snippets/ruby/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Tesseract backend, English language.
ocr_config = Kreuzberg::OcrConfig.new(backend: 'tesseract', language: 'eng')

# Wrap the OCR settings in an extraction config and print the extracted text.
config = Kreuzberg::ExtractionConfig.new(
  ocr: ocr_config
)
result = Kreuzberg.extract_file_sync(
  'scanned.pdf',
  config: config
)
puts result.content
|
||||
```
|
||||
11
docs/snippets/ruby/ocr/ocr_force_all_pages.md
Normal file
11
docs/snippets/ruby/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Tesseract OCR settings; `force_ocr: true` is set on the extraction config itself.
tesseract_ocr = Kreuzberg::OcrConfig.new(backend: 'tesseract')
config = Kreuzberg::ExtractionConfig.new(ocr: tesseract_ocr, force_ocr: true)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)
puts result.content
|
||||
```
|
||||
13
docs/snippets/ruby/ocr/ocr_multi_language.md
Normal file
13
docs/snippets/ruby/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Several languages are passed as one '+'-joined string.
ocr_settings = Kreuzberg::OcrConfig.new(backend: 'tesseract', language: 'eng+deu+fra')
config = Kreuzberg::ExtractionConfig.new(ocr: ocr_settings)

result = Kreuzberg.extract_file_sync('multilingual.pdf', config: config)
puts result.content
|
||||
```
|
||||
15
docs/snippets/ruby/ocr/ocr_paddleocr.md
Normal file
15
docs/snippets/ruby/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Select the PaddleOCR backend for English text.
ocr_settings = Kreuzberg::OcrConfig.new(
  backend: 'paddleocr',
  language: 'eng'
  # model_tier: 'server' # for max accuracy
)
config = Kreuzberg::ExtractionConfig.new(ocr: ocr_settings)

result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)

# Print a short preview (characters 0 through 100) followed by the full length.
puts result.content[0..100]
puts "Total length: #{result.content.length}"
|
||||
```
|
||||
Reference in New Issue
Block a user