Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
```ruby title="Ruby"
require 'kreuzberg'
# Chunk the document, then print a short preview of every chunk together with
# the page (or page range) recorded in its metadata.
chunking = Kreuzberg::ChunkingConfig.new(max_characters: 500, overlap: 50)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

result.chunks&.each do |chunk|
  first = chunk.metadata.first_page
  last = chunk.metadata.last_page
  # Chunks that carry no first page are skipped.
  next if first.nil?

  page_range =
    if first == last
      "Page #{first}"
    else
      "Pages #{first}-#{last}"
    end
  puts "Chunk: #{chunk.content[0..50]}... (#{page_range})"
end
```

View File

@@ -0,0 +1,31 @@
```ruby title="Ruby"
require 'kreuzberg'
# Chunking configuration with an embedding config attached, built step by step
# from the innermost object outwards.
embedding_model = Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'all-minilm-l6-v2')
embedding = Kreuzberg::EmbeddingConfig.new(model: embedding_model, normalize: true, batch_size: 32)
chunking = Kreuzberg::ChunkingConfig.new(max_characters: 1000, overlap: 200, embedding: embedding)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
```
```ruby title="Ruby - Prepend Heading Context"
require 'kreuzberg'
# Markdown chunker with `prepend_heading_context` switched on.
chunking = Kreuzberg::ChunkingConfig.new(
  chunker_type: "markdown",
  max_characters: 500,
  overlap: 50,
  prepend_heading_context: true
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
```

View File

@@ -0,0 +1,45 @@
```ruby title="Ruby"
require 'kreuzberg'
# The helpers are defined before the script body: a top-level Ruby script runs
# top to bottom, so calling them ahead of their `def` raises NoMethodError.

# Builds a minimal in-memory "vector store": one hash per chunk holding its
# position, text and embedding. `similarity` starts at 0.0 as a placeholder.
#
# @param chunks [Array] objects responding to `content` and `embedding`
# @return [Array<Hash>] one entry per chunk, in input order
def build_vector_store(chunks)
  chunks.map.with_index do |chunk, idx|
    {
      id: idx,
      content: chunk.content,
      embedding: chunk.embedding,
      similarity: 0.0
    }
  end
end

# Returns the store entries ordered by descending `:similarity`.
# Placeholder implementation: `query` is not used here, so the order only
# reflects whatever similarity values are already stored on the entries.
#
# @param store [Array<Hash>] entries produced by build_vector_store
# @param query [String] search text (currently unused)
# @return [Array<Hash>]
def search_vector_store(store, query)
  store.sort_by { |entry| entry[:similarity] }.reverse
end

config = Kreuzberg::ExtractionConfig.new(
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 500,
    overlap: 50,
    embedding: Kreuzberg::EmbeddingConfig.new(
      model: Kreuzberg::EmbeddingModelType.new(
        type: 'preset',
        name: 'all-mpnet-base-v2'
      ),
      normalize: true,
      batch_size: 16
    )
  )
)
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)

# Fall back to an empty list when no chunks were produced.
vector_store = build_vector_store(result.chunks || [])
query = 'machine learning optimization'
relevant_chunks = search_vector_store(vector_store, query)

puts "Found #{relevant_chunks.length} relevant chunks"
relevant_chunks.take(3).each do |chunk|
  puts "Content: #{chunk[:content][0..80]}..."
  puts "Similarity: #{chunk[:similarity]&.round(3)}\n"
end
```

View File

@@ -0,0 +1,31 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract with chunk embeddings enabled and print, per chunk, a short content
# preview plus the embedding length when an embedding is present.
model = Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'balanced')
embedding = Kreuzberg::EmbeddingConfig.new(
  model: model,
  normalize: true,
  batch_size: 32,
  show_download_progress: false
)
chunking = Kreuzberg::ChunkingConfig.new(max_characters: 512, overlap: 50, embedding: embedding)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# A missing chunk list is treated as empty.
(result.chunks || []).each.with_index do |chunk, position|
  puts "Chunk doc_chunk_#{position}: #{chunk.content[0...50]}"
  puts " Embedding dimensions: #{chunk.embedding.length}" if chunk.embedding
end
```

View File

@@ -0,0 +1,13 @@
```ruby title="Ruby"
require 'kreuzberg'
# Keyword extraction settings using the YAKE algorithm constant.
keywords = Kreuzberg::KeywordConfig.new(
  algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
  max_keywords: 10,
  min_score: 0.3,
  ngram_range: [1, 3],
  language: 'en'
)
config = Kreuzberg::ExtractionConfig.new(keywords: keywords)
```

View File

@@ -0,0 +1,20 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract keywords and print each one with its score rounded to 3 decimals.
keyword_config = Kreuzberg::KeywordConfig.new(
  algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
  max_keywords: 10,
  min_score: 0.3
)
config = Kreuzberg::ExtractionConfig.new(keywords: keyword_config)
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)

# Keywords are read from the 'keywords' metadata entry; absent means none.
keywords = result.metadata&.dig('keywords') || []
keywords.each { |kw| puts "#{kw['text']}: #{kw['score'].round(3)}" }
```

View File

@@ -0,0 +1,21 @@
```ruby title="Ruby"
require 'kreuzberg'
# Detect a single language and report the first entry of detected_languages.
detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.8,
  detect_multiple: false
)
config = Kreuzberg::ExtractionConfig.new(language_detection: detection)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

language_line =
  if result.detected_languages&.any?
    "Detected Language: #{result.detected_languages.first}"
  else
    "No language detected"
  end
puts language_line
puts "Content length: #{result.content.length} characters"
```

View File

@@ -0,0 +1,24 @@
```ruby title="Ruby"
require 'kreuzberg'
# Detect multiple languages and list them alongside basic result facts.
detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.8,
  detect_multiple: true
)
config = Kreuzberg::ExtractionConfig.new(language_detection: detection)
result = Kreuzberg.extract_file_sync('multilingual_document.pdf', config: config)

# A missing language list is treated as empty.
languages = result.detected_languages || []
summary =
  if languages.any?
    "Detected #{languages.length} language(s): #{languages.join(', ')}"
  else
    "No languages detected"
  end
puts summary
puts "Total content: #{result.content.length} characters"
puts "MIME type: #{result.mime_type}"
```

View File

@@ -0,0 +1,7 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extraction config with quality processing switched on.
config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
```

View File

@@ -0,0 +1,18 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract with quality processing and warn when the score is below 0.5.
config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
result = Kreuzberg.extract_file_sync('scanned_document.pdf', config: config)

# A missing score is treated as 0.0.
quality_score = result.quality_score || 0.0
low_quality = quality_score < 0.5

if low_quality
  puts "Warning: Low quality extraction (#{quality_score.round(2)})"
  puts "Consider re-scanning with higher DPI or adjusting OCR settings"
else
  puts "Quality score: #{quality_score.round(2)}"
end
```

View File

@@ -0,0 +1,12 @@
```ruby title="Ruby"
require 'kreuzberg'
# Token reduction settings, built separately and then attached to the config.
token_reduction = Kreuzberg::TokenReductionConfig.new(
  mode: 'moderate',
  preserve_markdown: true,
  preserve_code: true,
  language_hint: 'eng'
)
config = Kreuzberg::ExtractionConfig.new(token_reduction: token_reduction)
```

View File

@@ -0,0 +1,19 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract with token reduction and print the token counts found in metadata.
token_reduction = Kreuzberg::TokenReductionConfig.new(mode: 'moderate', preserve_markdown: true)
config = Kreuzberg::ExtractionConfig.new(token_reduction: token_reduction)
result = Kreuzberg.extract_file_sync('verbose_document.pdf', config: config)

# Each figure falls back to zero when metadata or the key is missing.
metadata = result.metadata
original_tokens = metadata&.dig('original_token_count') || 0
reduced_tokens = metadata&.dig('token_count') || 0
reduction_ratio = metadata&.dig('token_reduction_ratio') || 0.0

puts "Reduced from #{original_tokens} to #{reduced_tokens} tokens"
puts "Reduction: #{(reduction_ratio * 100).round(1)}%"
```

View File

@@ -0,0 +1,53 @@
```ruby title="Ruby"
require 'kreuzberg'
# Turns a document into records shaped for a vector database: one record per
# chunk, carrying the chunk text, its embedding and some bookkeeping metadata.
class VectorDatabaseIntegration
  VectorRecord = Struct.new(:id, :embedding, :content, :metadata, keyword_init: true)

  # Extracts `document_path`, builds one VectorRecord per chunk, hands the
  # records to the storage step and returns them.
  #
  # @param document_path [String] file to extract
  # @param document_id [String] prefix used for record ids and stored in metadata
  # @return [Array<VectorRecord>] records in chunk order (empty when no chunks)
  def extract_and_vectorize(document_path, document_id)
    result = Kreuzberg.extract_file_sync(document_path, config: extraction_config)

    vector_records = (result.chunks || []).each_with_index.map do |chunk, position|
      VectorRecord.new(
        id: "#{document_id}_chunk_#{position}",
        content: chunk.content,
        embedding: chunk.embedding,
        metadata: {
          document_id: document_id,
          chunk_index: position,
          content_length: chunk.content.length
        }
      )
    end

    store_in_vector_database(vector_records)
    vector_records
  end

  private

  # Chunking plus embedding settings used for every extraction.
  def extraction_config
    model = Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'balanced')
    embedding = Kreuzberg::EmbeddingConfig.new(model: model, normalize: true, batch_size: 32)
    chunking = Kreuzberg::ChunkingConfig.new(max_characters: 512, overlap: 50, embedding: embedding)
    Kreuzberg::ExtractionConfig.new(chunking: chunking)
  end

  # Stand-in for a real database write: prints one line per record that has a
  # non-empty embedding; other records are passed over silently.
  def store_in_vector_database(records)
    records.each do |record|
      next unless record.embedding&.any?

      puts "Storing #{record.id}: #{record.content.length} chars, #{record.embedding.length} dims"
    end
  end
end
```

View File

@@ -0,0 +1,24 @@
```ruby title="Ruby"
require 'kreuzberg'
# Batch-extract three in-memory documents.
# The files are binary, so they are read with File.binread: text-mode
# File.read tags the data with the default external encoding and, on Windows,
# translates line endings, which can corrupt the bytes.
items = [
  Kreuzberg::BatchBytesItem.new(
    content: File.binread('doc1.pdf'),
    mime_type: 'application/pdf'
  ),
  Kreuzberg::BatchBytesItem.new(
    content: File.binread('doc2.docx'),
    mime_type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  ),
  Kreuzberg::BatchBytesItem.new(
    content: File.binread('doc3.xlsx'),
    mime_type: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  )
]
config = Kreuzberg::ExtractionConfig.new(use_cache: true)
results = Kreuzberg.batch_extract_bytes_sync(items, config: config)
results.each { |result| puts "Extracted: #{result.content.length} chars" }
```

View File

@@ -0,0 +1,20 @@
```ruby title="Ruby"
require 'kreuzberg'
# Batch-extract three files by path and print a short report for each result.
paths = %w[doc1.pdf doc2.docx doc3.pptx]
items = paths.map { |path| Kreuzberg::BatchFileItem.new(path: path) }
config = Kreuzberg::ExtractionConfig.new(use_cache: true)
results = Kreuzberg.batch_extract_files_sync(items, config: config)

# Documents are numbered from 1 in the report.
results.each.with_index(1) do |result, number|
  puts "Document #{number}:"
  puts " Extracted: #{result.content.length} characters"
  puts " Quality: #{result.quality_score}"
  puts " MIME: #{result.mime_type}"
end
```

View File

@@ -0,0 +1,36 @@
```ruby title="Ruby"
require 'net/http'
require 'json'
# POST a JSON payload to the local /chunk endpoint and print a preview of each
# chunk in the response, or the HTTP status when the call did not succeed.
endpoint = URI('http://localhost:8000/chunk')
chunk_options = { max_characters: 1000, overlap: 50, trim: true }
payload = {
  text: 'Your long text content here...',
  chunker_type: 'text',
  config: chunk_options
}

request = Net::HTTP::Post.new(endpoint)
request['Content-Type'] = 'application/json'
request.body = JSON.generate(payload)
response = Net::HTTP.new(endpoint.host, endpoint.port).request(request)

case response
when Net::HTTPSuccess
  result = JSON.parse(response.body)
  puts "Created #{result['chunk_count']} chunks"
  result['chunks'].each do |chunk|
    # First 50 characters of the chunk text.
    puts "Chunk #{chunk['chunk_index']}: #{chunk['content'][0, 50]}..."
  end
else
  puts "Error: #{response.code} #{response.message}"
end
```

View File

@@ -0,0 +1,24 @@
```ruby title="Ruby"
require 'net/http'
require 'json'
# POST the raw bytes of document.pdf to the local /extract endpoint and
# pretty-print the JSON response, or the HTTP status on failure.
endpoint = URI('http://localhost:8000/extract')
connection = Net::HTTP.new(endpoint.host, endpoint.port)

request = Net::HTTP::Post.new(endpoint)
request['Content-Type'] = 'application/octet-stream'
request.body = File.binread('document.pdf')
response = connection.request(request)

case response
when Net::HTTPSuccess
  puts JSON.pretty_generate(JSON.parse(response.body))
else
  puts "Error: #{response.code} #{response.message}"
end
```

View File

@@ -0,0 +1,45 @@
```ruby title="Ruby"
require 'kreuzberg'
# One configuration enabling quality processing, language detection, token
# reduction, chunking with embeddings and keyword extraction, followed by a
# summary of what the extraction returned.
#
# NOTE(review): sibling snippets use Kreuzberg::TokenReductionConfig, the
# preset name 'all-minilm-l6-v2' and Kreuzberg::KeywordAlgorithm::YAKE —
# confirm the class, preset and algorithm spellings used below.
language_detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  detect_multiple: true,
  min_confidence: 0.8
)
token_reduction = Kreuzberg::TokenReductionOptions.new(
  mode: 'moderate',
  preserve_important_words: true
)
embedding = Kreuzberg::EmbeddingConfig.new(
  model: { type: 'preset', name: 'text-embedding-all-minilm-l6-v2' }
)
chunking = Kreuzberg::ChunkingConfig.new(max_characters: 512, overlap: 50, embedding: embedding)
keywords = Kreuzberg::KeywordConfig.new(algorithm: 'yake', max_keywords: 10)

config = Kreuzberg::ExtractionConfig.new(
  enable_quality_processing: true,
  language_detection: language_detection,
  token_reduction: token_reduction,
  chunking: chunking,
  keywords: keywords
)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

puts "Content length: #{result.content.length} characters"
puts "Quality score: #{result.quality_score}"
puts "Detected languages: #{result.detected_languages&.join(', ')}"
puts "Total chunks: #{result.chunks&.length || 0}"
puts "Keywords: #{result.extracted_keywords&.map(&:text)&.join(', ')}"

# Details of the first chunk, only when at least one chunk exists.
chunks = result.chunks
if chunks && chunks.length > 0
  first_chunk = chunks[0]
  puts "First chunk size: #{first_chunk.content.length} chars"
  puts "Embedding dims: #{first_chunk.embedding&.length || 0}"
end
```

View File

@@ -0,0 +1,21 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract a file and classify failures by inspecting the error message.
begin
  result = Kreuzberg.extract_file_sync('missing.pdf')
  puts result.content
rescue RuntimeError => e
  # All extraction errors are raised as RuntimeError
  # Check error message for specific error details
  case e.message
  when /validation/i
    # Validation problems are reported but not re-raised.
    puts "Validation error: #{e.message}"
  when %r{\bi/?o\b|not found}i
    # "IO" / "I/O" must match as a whole word: a bare /io/ also matches
    # "extraction", "configuration", "permission", ... and would label
    # unrelated failures as IO errors.
    puts "IO error: #{e.message}"
    raise
  else
    puts "Extraction failed: #{e.message}"
    raise
  end
end
```

View File

@@ -0,0 +1,24 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract from in-memory bytes and report failures by category, based on the
# error message. Errors are printed, not re-raised.
begin
  # Binary read: text-mode File.read can alter or mis-tag PDF bytes.
  pdf_bytes = File.binread('document.pdf')
  config = Kreuzberg::ExtractionConfig.new
  result = Kreuzberg.extract_bytes_sync(pdf_bytes, 'application/pdf', config: config)
  puts "Extracted #{result.content.length} characters"
rescue RuntimeError => e
  # All extraction errors are raised as RuntimeError
  # Check error message for details
  case e.message
  when /parse|parsing/i
    puts "Failed to parse document: #{e.message}"
  when /ocr/i
    puts "OCR processing failed: #{e.message}"
  when /validation|invalid/i
    puts "Invalid configuration: #{e.message}"
  else
    puts "Extraction error: #{e.message}"
  end
end
```

View File

@@ -0,0 +1,18 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract from in-memory PDF bytes with the async entry point and print a
# preview of the result.
# Binary read: text-mode File.read can alter or mis-tag PDF bytes.
pdf_bytes = File.binread('document.pdf')
config = Kreuzberg::ExtractionConfig.new(
  enable_quality_processing: true
)
result = Kreuzberg.extract_bytes_async(
  pdf_bytes,
  'application/pdf',
  config: config
)
puts "Async bytes extraction done"
puts "Content preview: #{result.content[0..100]}"
puts "Quality score: #{result.quality_score}"
```

View File

@@ -0,0 +1,17 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract from in-memory PDF bytes and print the size and MIME type.
# Binary read: text-mode File.read can alter or mis-tag PDF bytes.
pdf_bytes = File.binread('document.pdf')
config = Kreuzberg::ExtractionConfig.new(
  use_cache: true
)
result = Kreuzberg.extract_bytes_sync(
  pdf_bytes,
  'application/pdf',
  config: config
)
puts "Extracted #{result.content.length} characters"
puts "Detected MIME: #{result.mime_type}"
```

View File

@@ -0,0 +1,14 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract a file through the async entry point, with caching off and quality
# processing on, then print the content size and quality score.
config = Kreuzberg::ExtractionConfig.new(use_cache: false, enable_quality_processing: true)
result = Kreuzberg.extract_file_async('document.pdf', config: config)

puts "Async extraction complete"
puts "Extracted #{result.content.length} characters"
puts "Quality: #{result.quality_score}"
```

View File

@@ -0,0 +1,14 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract a file synchronously with caching and quality processing enabled,
# then print the content size, MIME type and quality score.
config = Kreuzberg::ExtractionConfig.new(use_cache: true, enable_quality_processing: true)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

puts "Extracted #{result.content.length} characters"
puts "MIME type: #{result.mime_type}"
puts "Quality score: #{result.quality_score}"
```

View File

@@ -0,0 +1,46 @@
```ruby title="simple_benchmark.rb"
require 'kreuzberg'
require 'benchmark'
# Times three scenarios against the same file: sequential extraction,
# thread-per-run extraction, and a cold-versus-warm cache comparison.
uncached_client = Kreuzberg::Client.new(Kreuzberg::ExtractionConfig.new(use_cache: false))
file_path = 'document.pdf'
num_runs = 10

# 1. Sequential runs.
puts "Sync extraction (#{num_runs} runs):"
sync_time = Benchmark.realtime do
  num_runs.times { uncached_client.extract_file(file_path) }
end
puts " - Total time: #{sync_time.round(3)}s"
puts " - Average: #{(sync_time / num_runs).round(3)}s per extraction"

# 2. One thread per run; the timer stops once every thread has been joined.
puts "\nAsync extraction (#{num_runs} parallel runs):"
async_time = Benchmark.realtime do
  workers = Array.new(num_runs) do
    Thread.new { uncached_client.extract_file(file_path) }
  end
  workers.each(&:join)
end
puts " - Total time: #{async_time.round(3)}s"
puts " - Average: #{(async_time / num_runs).round(3)}s per extraction"
puts " - Speedup: #{(sync_time / async_time).round(1)}x"

# 3. Same file twice through a caching client.
cached_client = Kreuzberg::Client.new(Kreuzberg::ExtractionConfig.new(use_cache: true))
puts "\nFirst extraction (populates cache)..."
first_time = Benchmark.realtime { cached_client.extract_file(file_path) }
puts " - Time: #{first_time.round(3)}s"
puts "Second extraction (from cache)..."
cached_time = Benchmark.realtime { cached_client.extract_file(file_path) }
puts " - Time: #{cached_time.round(3)}s"
puts " - Cache speedup: #{(first_time / cached_time).round(1)}x"
```

37
docs/snippets/ruby/cache/disk_cache.rb vendored Normal file
View File

@@ -0,0 +1,37 @@
```ruby title="disk_cache.rb"
require 'kreuzberg'
require 'fileutils'
# Extract the same file twice through a client configured with an on-disk
# cache directory, then print the client's cache statistics.
cache_dir = File.expand_path('~/.cache/kreuzberg')
FileUtils.mkdir_p(cache_dir)

cache_settings = Kreuzberg::CacheConfig.new(
  cache_path: cache_dir,
  max_cache_size: 500 * 1024 * 1024,
  cache_ttl_seconds: 7 * 86400,
  enable_compression: true
)
config = Kreuzberg::ExtractionConfig.new(use_cache: true, cache_config: cache_settings)
kreuzberg = Kreuzberg::Client.new(config)

# Prints the two facts reported after each extraction.
describe = lambda do |extraction|
  puts " - Content length: #{extraction.content.length}"
  puts " - Cached: #{extraction.metadata['was_cached']}"
end

puts "First extraction (will be cached)..."
result1 = kreuzberg.extract_file('document.pdf')
describe.call(result1)

puts "\nSecond extraction (from cache)..."
result2 = kreuzberg.extract_file('document.pdf')
describe.call(result2)

puts "\nResults are identical: #{result1.content == result2.content}"

cache_stats = kreuzberg.get_cache_stats
size_mb = (cache_stats['cache_size_bytes'] / 1024.0 / 1024.0).round(1)
hit_rate_percent = (cache_stats['hit_rate'] * 100).round(1)
puts "\nCache Statistics:"
puts " - Total entries: #{cache_stats['total_entries']}"
puts " - Cache size: #{size_mb} MB"
puts " - Hit rate: #{hit_rate_percent}%"
```

View File

@@ -0,0 +1,26 @@
```ruby title="basic_cli.rb"
require 'json'
require 'open3'
# Runs `kreuzberg extract` on a file and returns its standard output.
#
# @param file_path [String] document to extract
# @param output_format [String] value passed to --format ('text' by default)
# @return [String, Object] raw stdout, or the parsed JSON when the format is 'json'
# Exits the process with status 1 (after printing stderr) when the command fails.
def extract_with_cli(file_path, output_format = 'text')
  command = ['kreuzberg', 'extract', file_path, '--format', output_format]
  out, err, process_status = Open3.capture3(*command)

  if process_status.success?
    output_format == 'json' ? JSON.parse(out) : out
  else
    warn "Error: #{err}"
    exit 1
  end
end
# Run the same document through the CLI twice: plain text first, then JSON.
document = 'document.pdf'
puts "Extracted: #{extract_with_cli(document, 'text').length} characters"
puts "Format: #{extract_with_cli(document, 'json')['format']}"
```

View File

@@ -0,0 +1,27 @@
```ruby title="cli_with_config.rb"
require 'json'
require 'open3'
# Runs `kreuzberg extract` with a config file and JSON output.
#
# @param file_path [String] document to extract
# @param config_path [String] value passed to --config
# @return [Object] the parsed JSON printed by the command
# Exits the process with status 1 (after printing stderr) when the command fails.
def extract_with_config(file_path, config_path)
  command = ['kreuzberg', 'extract', file_path, '--config', config_path, '--format', 'json']
  out, err, process_status = Open3.capture3(*command)

  if process_status.success?
    JSON.parse(out)
  else
    warn "Error: #{err}"
    exit 1
  end
end
# Extract one document with a config file and summarise the JSON result.
config_file = 'kreuzberg.toml'
document = 'document.pdf'
puts "Extracting #{document} with config #{config_file}"
result = extract_with_config(document, config_file)

# 'content' and 'languages' are read defensively: a missing key would
# otherwise raise NoMethodError on nil instead of printing an empty value.
puts "Content length: #{result['content'].to_s.length}"
puts "Format: #{result['format']}"
puts "Languages: #{Array(result['languages']).join(', ')}"
```

View File

@@ -0,0 +1,22 @@
```ruby title="Ruby"
require 'kreuzberg'
# Combine OCR, chunking and language detection in one extraction, then print
# a preview of every chunk and the detected languages.
config = Kreuzberg::ExtractionConfig.new(
  ocr: Kreuzberg::OcrConfig.new(
    backend: 'tesseract',
    language: 'eng+deu'
  ),
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 1000,
    overlap: 100
  ),
  language_detection: Kreuzberg::LanguageDetectionConfig.new,
  use_cache: true,
  enable_quality_processing: true
)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# The chunk text lives in `content`, as in the other chunking snippets;
# the preview is sliced from that text rather than from the chunk object.
result.chunks&.each { |chunk| puts chunk.content[0..100] }
puts "Languages: #{result.detected_languages.inspect}"
```

View File

@@ -0,0 +1,55 @@
```ruby title="Ruby"
require 'kreuzberg'
# Plain chunking configuration: size limit and overlap only.
chunking = Kreuzberg::ChunkingConfig.new(max_characters: 1000, overlap: 200)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
```
```ruby title="Ruby - Markdown with Heading Context"
require 'kreuzberg'
# Chunk a Markdown document with tokenizer-based sizing and print the heading
# context recorded on each chunk, when there is one.
config = Kreuzberg::ExtractionConfig.new(
  chunking: Kreuzberg::ChunkingConfig.new(
    chunker_type: "markdown",
    max_characters: 500,
    overlap: 50,
    sizing_type: "tokenizer",
    sizing_model: "Xenova/gpt-4o"
  )
)
# The configuration is passed with the `config:` keyword, as in the other
# extraction snippets. NOTE(review): confirm against the gem's signature.
result = Kreuzberg.extract_file("document.md", config: config)

# `&.` keeps the loop safe when no chunks were produced.
result.chunks&.each do |chunk|
  heading_context = chunk.metadata.heading_context
  next unless heading_context

  puts "Headings:"
  heading_context.headings.each do |heading|
    # Indent by two spaces per heading level below the first.
    puts " #{' ' * (heading.level - 1) * 2}Level #{heading.level}: #{heading.text}"
  end
end
```
```ruby title="Ruby - Prepend Heading Context"
require 'kreuzberg'
# Chunk a Markdown document with `prepend_heading_context` enabled and print
# the first 100 characters of every chunk.
config = Kreuzberg::ExtractionConfig.new(
  chunking: Kreuzberg::ChunkingConfig.new(
    chunker_type: "markdown",
    max_characters: 500,
    overlap: 50,
    prepend_heading_context: true
  )
)
# The configuration is passed with the `config:` keyword, as in the other
# extraction snippets. NOTE(review): confirm against the gem's signature.
result = Kreuzberg.extract_file("document.md", config: config)

# `&.` keeps the loop safe when no chunks were produced.
result.chunks&.each do |chunk|
  # Each chunk's content is prefixed with its heading breadcrumb
  puts chunk.content[0, 100]
end
```

View File

@@ -0,0 +1,10 @@
```ruby title="Ruby"
require 'kreuzberg'
# Minimal extraction: caching and quality processing both enabled.
config = Kreuzberg::ExtractionConfig.new(use_cache: true, enable_quality_processing: true)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
```

View File

@@ -0,0 +1,6 @@
```ruby title="Ruby"
require 'kreuzberg'
# Obtain the configuration from ExtractionConfig.discover instead of building
# it by hand, then extract with it.
discovered_config = Kreuzberg::ExtractionConfig.discover
result = Kreuzberg.extract_file_sync('document.pdf', config: discovered_config)
```

View File

@@ -0,0 +1,11 @@
```ruby title="Ruby"
require 'kreuzberg'
# OCR through the tesseract backend for 'eng+fra', with psm set to 3.
tesseract = Kreuzberg::TesseractConfig.new(psm: 3)
ocr = Kreuzberg::OcrConfig.new(backend: 'tesseract', language: 'eng+fra', tesseract_config: tesseract)
config = Kreuzberg::ExtractionConfig.new(ocr: ocr)
```

View File

@@ -0,0 +1,19 @@
```ruby title="Ruby"
require "kreuzberg"
# Extraction with caching, tesseract OCR and chunking, printing the length of
# the extracted content.
#
# Two call forms are aligned with the other OCR snippets: the Tesseract
# options go in `tesseract_config:` and the extraction config is passed with
# the `config:` keyword. NOTE(review): confirm against the gem's signatures.
config = Kreuzberg::ExtractionConfig.new(
  use_cache: true,
  ocr: Kreuzberg::OcrConfig.new(
    backend: "tesseract",
    language: "eng+deu",
    tesseract_config: Kreuzberg::TesseractConfig.new(psm: 6)
  ),
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 1000,
    overlap: 200
  )
)
result = Kreuzberg.extract_file_sync("document.pdf", config: config)
puts "Content length: #{result.content.length}"
```

View File

@@ -0,0 +1,15 @@
```ruby title="Document Structure Config (Ruby)"
require 'kreuzberg'
# Request the document structure and print one line per node: its node type
# and the first 80 characters of its text.
config = Kreuzberg::ExtractionConfig.new(include_document_structure: true)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Nothing is printed when the result carries no document structure.
nodes = result.document ? result.document['nodes'] : []
nodes.each do |node|
  node_type = node['content']['node_type']
  # Nodes without text are printed with an empty preview.
  preview = (node['content']['text'] || '')[0...80]
  puts "[#{node_type}] #{preview}"
end
```

View File

@@ -0,0 +1,31 @@
```ruby title="Element-Based Output (Ruby)"
require 'kreuzberg'
# Ask for element-based output and extract the document.
config = Kreuzberg::ExtractionConfig.new(output_format: 'element_based')
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Dump every element: type, text preview, and page/coordinates when present.
result.elements.each do |element|
  puts "Type: #{element.element_type}"
  puts "Text: #{element.text[0...100]}"
  puts "Page: #{element.metadata.page_number}" if element.metadata.page_number
  if element.metadata.coordinates
    box = element.metadata.coordinates
    puts "Coords: (#{box.left}, #{box.top}) - (#{box.right}, #{box.bottom})"
  end
  puts "---"
end

# List title elements only, with the 'level' found in their additional metadata.
result.elements.select { |element| element.element_type == 'title' }.each do |title|
  level = title.metadata.additional['level'] || 'unknown'
  puts "[#{level}] #{title.text}"
end
```

View File

@@ -0,0 +1,18 @@
```ruby title="Ruby"
require 'kreuzberg'
# Chunking with an embedding config based on the 'all-mpnet-base-v2' preset.
model = Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'all-mpnet-base-v2')
embedding = Kreuzberg::EmbeddingConfig.new(
  model: model,
  batch_size: 16,
  normalize: true,
  show_download_progress: true
)
chunking = Kreuzberg::ChunkingConfig.new(max_characters: 1000, embedding: embedding)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
```

View File

@@ -0,0 +1,89 @@
require 'kreuzberg'
# Example 1: Preset model (recommended)
# Fast, balanced, or quality preset configurations optimized for common use cases.
embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: { type: :preset, name: "balanced" },
  batch_size: 32,
  normalize: true,
  show_download_progress: true,
  # Ruby does not expand "~" inside a string literal, so the home-relative
  # path is resolved explicitly before it is handed to the library.
  cache_dir: File.expand_path("~/.cache/kreuzberg/embeddings")
)
# Available presets:
# - "fast" (384 dims): Quick prototyping, development, resource-constrained
# - "balanced" (768 dims): Production, general-purpose RAG, English documents
# - "quality" (1024 dims): Complex documents, maximum accuracy
# - "multilingual" (768 dims): International documents, 100+ languages
# Example 2: Custom ONNX model (requires embeddings feature)
# Selects a specific HuggingFace ONNX embedding model and states its dimensions.
custom_model = { type: :custom, model_id: "BAAI/bge-small-en-v1.5", dimensions: 384 }
embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: custom_model,
  batch_size: 32,
  normalize: true,
  show_download_progress: true,
  cache_dir: nil # Uses default: .kreuzberg/embeddings/
)
# Popular ONNX-compatible models:
# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
# Example 3: Alternative Custom ONNX Model
# Same shape as the previous example, with a different model and cache directory.
mpnet_model = {
  type: :custom,
  model_id: "sentence-transformers/all-mpnet-base-v2",
  dimensions: 768
}
embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: mpnet_model,
  batch_size: 16, # Larger model requires smaller batch size
  normalize: true,
  show_download_progress: true,
  cache_dir: "/var/cache/embeddings"
)
# Integration with ChunkingConfig
# The embedding config is passed to ChunkingConfig through `embedding:`, and
# the chunking config to ExtractionConfig through `chunking:`.
chunk_embedding = Kreuzberg::EmbeddingConfig.new(
  model: { type: :preset, name: "balanced" },
  batch_size: 32,
  normalize: true
)
chunking_config = Kreuzberg::ChunkingConfig.new(
  max_characters: 1024,
  overlap: 100,
  preset: "balanced",
  embedding: chunk_embedding
)
extraction_config = Kreuzberg::ExtractionConfig.new(chunking: chunking_config)
# Key parameter explanations:
#
# batch_size: Number of texts to embed at once (32-128 typical)
# - Larger batches are faster but use more memory
# - Smaller batches for resource-constrained environments
#
# normalize: Whether to normalize vectors (L2 norm)
# - true (recommended): Enables cosine similarity in vector DBs
# - false: Raw embedding values
#
# cache_dir: Where to store downloaded models
# - nil: Uses .kreuzberg/embeddings/ in current directory
# - String: Custom directory for model storage
#
# show_download_progress: Display download progress bar
# - Useful for monitoring large model downloads

View File

@@ -0,0 +1,87 @@
require 'kreuzberg'
# Example 1: Basic hierarchy extraction
# Hierarchy enabled with k_clusters: 6 and bounding boxes included; no OCR
# coverage threshold is set.
hierarchy_config_basic = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 6,              # Default: creates 6 font size clusters (H1-H6 structure)
  include_bbox: true,         # Include bounding box coordinates
  ocr_coverage_threshold: nil # No OCR coverage threshold
)
pdf_config_basic = Kreuzberg::PdfConfig.new(hierarchy: hierarchy_config_basic)
extraction_config_basic = Kreuzberg::ExtractionConfig.new(pdf_options: pdf_config_basic)
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_basic)
# Example 2: Custom k_clusters for minimal structure
# Three clusters give a simpler hierarchy, for when only major section
# divisions (Main, Subsection, Detail) are needed.
hierarchy_config_minimal = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 3, # Minimal clustering: just 3 levels
  include_bbox: true,
  ocr_coverage_threshold: nil
)
pdf_config_minimal = Kreuzberg::PdfConfig.new(hierarchy: hierarchy_config_minimal)
extraction_config_minimal = Kreuzberg::ExtractionConfig.new(pdf_options: pdf_config_minimal)
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_minimal)
# Example 3: With OCR coverage threshold
# Sets ocr_coverage_threshold to 0.5, intended for documents that mix digital
# and scanned content.
hierarchy_config_ocr = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 6,
  include_bbox: true,
  ocr_coverage_threshold: 0.5 # Trigger OCR if text coverage < 50%
)
pdf_config_ocr = Kreuzberg::PdfConfig.new(hierarchy: hierarchy_config_ocr)
extraction_config_ocr = Kreuzberg::ExtractionConfig.new(pdf_options: pdf_config_ocr)
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_ocr)
# Field descriptions:
#
# enabled: boolean (default: true)
# - Enable or disable hierarchy extraction
# - When false, hierarchy structure is not analyzed
#
# k_clusters: integer (default: 6, valid: 1-7)
# - Number of font size clusters for hierarchy levels
# - 6 provides H1-H6 heading levels with body text
# - Higher values create more fine-grained hierarchy
# - Lower values create simpler structure
#
# include_bbox: boolean (default: true)
# - Include bounding box coordinates in hierarchy blocks
# - Required for spatial layout awareness and document structure
# - Set to false only if space optimization is critical
#
# ocr_coverage_threshold: float | nil (default: nil)
# - Range: 0.0 to 1.0
# - Triggers OCR when text block coverage falls below this fraction
# - Example: 0.5 means "run OCR if less than 50% of page has text data"
# - nil means no OCR coverage-based triggering

View File

@@ -0,0 +1,14 @@
```ruby title="Ruby"
require 'kreuzberg'
# Request HTML output with the 'git_hub' theme and embedded CSS.
html_output = Kreuzberg::HtmlOutputConfig.new(theme: 'git_hub', embed_css: true)
config = Kreuzberg::ExtractionConfig.new(output_format: 'html', html_output: html_output)

# NOTE(review): this call passes (path, nil, config) positionally, whereas the
# other snippets pass `config:` as a keyword — confirm which form is current.
result = Kreuzberg.extract_file_sync('document.pdf', nil, config)
puts result.content # HTML with kb-* classes
```

View File

@@ -0,0 +1,65 @@
require 'kreuzberg'
# Example 1: Basic YAKE configuration
# Extracts document.pdf with the :yake algorithm, no algorithm-specific
# parameters, and prints the keywords found on the result.
def basic_yake
  keyword_config = Kreuzberg::KeywordConfig.new(
    algorithm: :yake,
    max_keywords: 10,
    min_score: 0.0,
    ngram_range: [1, 3],
    language: 'en',
    yake_params: nil,
    rake_params: nil
  )
  config = Kreuzberg::ExtractionConfig.new(keywords: keyword_config)
  extraction = Kreuzberg.extract_file('document.pdf', config: config)
  puts "Keywords: #{extraction.keywords}"
end
# Example 2: Advanced YAKE with custom parameters
# Same flow as the basic example, but passes YakeParams with window_size: 1.
def advanced_yake
  yake_params = Kreuzberg::YakeParams.new(window_size: 1)
  keyword_config = Kreuzberg::KeywordConfig.new(
    algorithm: :yake,
    max_keywords: 15,
    min_score: 0.1,
    ngram_range: [1, 2],
    language: 'en',
    yake_params: yake_params,
    rake_params: nil
  )
  config = Kreuzberg::ExtractionConfig.new(keywords: keyword_config)
  extraction = Kreuzberg.extract_file('document.pdf', config: config)
  puts "Keywords: #{extraction.keywords}"
end
# Example 3: RAKE configuration
# Uses the :rake algorithm with RakeParams limiting word length and phrase size.
def rake_config
  rake_params = Kreuzberg::RakeParams.new(min_word_length: 1, max_words_per_phrase: 3)
  keyword_config = Kreuzberg::KeywordConfig.new(
    algorithm: :rake,
    max_keywords: 10,
    min_score: 5.0,
    ngram_range: [1, 3],
    language: 'en',
    yake_params: nil,
    rake_params: rake_params
  )
  config = Kreuzberg::ExtractionConfig.new(keywords: keyword_config)
  extraction = Kreuzberg.extract_file('document.pdf', config: config)
  puts "Keywords: #{extraction.keywords}"
end
# Run the basic example only when this file is executed directly.
basic_yake if $PROGRAM_NAME == __FILE__

View File

@@ -0,0 +1,13 @@
```ruby title="Ruby"
require 'kreuzberg'
# Keyword extraction settings using the YAKE algorithm constant.
keywords = Kreuzberg::KeywordConfig.new(
  algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
  max_keywords: 10,
  min_score: 0.3,
  ngram_range: [1, 3],
  language: 'en'
)
config = Kreuzberg::ExtractionConfig.new(keywords: keywords)
```

View File

@@ -0,0 +1,11 @@
```ruby title="Ruby"
require 'kreuzberg'
# Language detection enabled, with min_confidence 0.8 and detect_multiple off.
detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.8,
  detect_multiple: false
)
config = Kreuzberg::ExtractionConfig.new(language_detection: detection)
```

View File

@@ -0,0 +1,10 @@
```ruby title="Ruby"
require 'kreuzberg'
# OCR a scanned PDF with the tesseract backend.
# The PDF settings are passed as `pdf_options:`, the keyword the other
# PdfConfig snippets use. NOTE(review): confirm the keyword and the `dpi`
# option against the gem's API.
config = Kreuzberg::ExtractionConfig.new(
  ocr: Kreuzberg::OcrConfig.new(backend: 'tesseract'),
  pdf_options: Kreuzberg::PdfConfig.new(dpi: 300)
)
result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
```

View File

@@ -0,0 +1,16 @@
```ruby title="Ruby"
require 'kreuzberg'
# PDF options: image and metadata extraction, candidate passwords, and a
# hierarchy config.
hierarchy = Kreuzberg::HierarchyConfig.new(enabled: true, k_clusters: 6, include_bbox: true)
pdf_options = Kreuzberg::PdfConfig.new(
  extract_images: true,
  extract_metadata: true,
  passwords: ['password1', 'password2'],
  hierarchy: hierarchy
)
config = Kreuzberg::ExtractionConfig.new(pdf_options: pdf_options)
```

View File

@@ -0,0 +1,29 @@
```ruby title="Ruby"
require 'kreuzberg'
# Variant 1: the hierarchy settings are given as a HierarchyConfig object.
hierarchy_object = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 6,
  include_bbox: true,
  ocr_coverage_threshold: 0.8
)
config = Kreuzberg::ExtractionConfig.new(
  pdf_options: Kreuzberg::PdfConfig.new(extract_images: true, hierarchy: hierarchy_object)
)

# Variant 2: the same settings given as a plain hash.
hierarchy_hash = {
  enabled: true,
  k_clusters: 6,
  include_bbox: true,
  ocr_coverage_threshold: 0.8
}
config = Kreuzberg::ExtractionConfig.new(
  pdf_options: Kreuzberg::PdfConfig.new(extract_images: true, hierarchy: hierarchy_hash)
)
```

View File

@@ -0,0 +1,11 @@
```ruby title="Ruby"
require 'kreuzberg'
# Post-processing enabled, with explicit lists of enabled and disabled processors.
postprocessor = Kreuzberg::PostProcessorConfig.new(
  enabled: true,
  enabled_processors: ['deduplication', 'whitespace_normalization'],
  disabled_processors: ['mojibake_fix']
)
config = Kreuzberg::ExtractionConfig.new(postprocessor: postprocessor)
```

View File

@@ -0,0 +1,7 @@
```ruby title="Ruby"
require 'kreuzberg'
# Turn on quality processing for the extraction.
config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
```

View File

@@ -0,0 +1,16 @@
```ruby title="Ruby"
require 'kreuzberg'
# Tesseract-specific tuning, wrapped in the OCR config.
tesseract = Kreuzberg::TesseractConfig.new(
  psm: 6,
  oem: 1,
  min_confidence: 0.8,
  tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?',
  enable_table_detection: true
)
ocr = Kreuzberg::OcrConfig.new(language: 'eng+fra+deu', tesseract_config: tesseract)
config = Kreuzberg::ExtractionConfig.new(ocr: ocr)
```

View File

@@ -0,0 +1,12 @@
```ruby title="Ruby"
require 'kreuzberg'
# Token reduction in 'moderate' mode, keeping markdown and code.
reduction = Kreuzberg::TokenReductionConfig.new(
  mode: 'moderate',
  preserve_markdown: true,
  preserve_code: true,
  language_hint: 'eng'
)
config = Kreuzberg::ExtractionConfig.new(token_reduction: reduction)
```

View File

@@ -0,0 +1,63 @@
```ruby title="usage.rb"
require 'open3'
require 'net/http'
require 'json'
require 'securerandom' # SecureRandom.hex builds the multipart boundary below

# Small client that runs the Kreuzberg API in a Docker container and posts
# files to its extraction endpoint over HTTP.
class DockerKreuzbergClient
  # @param container_name [String] name given to the Docker container
  # @param api_port [Integer] host port mapped to container port 8000
  def initialize(container_name = 'kreuzberg-api', api_port = 8000)
    @container_name = container_name
    @api_port = api_port
    @api_url = "http://localhost:#{api_port}/api/extract"
  end

  # Starts the container in detached mode.
  #
  # @param image [String] Docker image to run
  # @raise [RuntimeError] when `docker run` exits unsuccessfully
  def start_container(image = 'kreuzberg:latest')
    puts 'Starting Kreuzberg Docker container...'
    # Argument-vector form: no shell is involved, so the container name and
    # image are passed verbatim instead of being re-parsed as shell text.
    _stdout, stderr, status = Open3.capture3(
      'docker', 'run', '-d', '--name', @container_name, '-p', "#{@api_port}:8000", image
    )
    raise "Failed to start container: #{stderr}" unless status.success?

    puts "Container started on http://localhost:#{@api_port}"
  end

  # Uploads a file as multipart/form-data and returns the "content" field of
  # the JSON response.
  #
  # @param file_path [String] path of the file to upload
  # @return [Object] value stored under "content" in the response JSON
  # @raise [RuntimeError] when the server answers with a non-2xx status
  def extract_file(file_path)
    boundary = "----WebKitFormBoundary#{SecureRandom.hex(16)}"
    uri = URI(@api_url)
    request = Net::HTTP::Post.new(uri.path)
    request['Content-Type'] = "multipart/form-data; boundary=#{boundary}"
    request.body = multipart_body(file_path, boundary)
    response = Net::HTTP.new(uri.host, uri.port).request(request)
    unless response.is_a?(Net::HTTPSuccess)
      raise "Extraction request failed: HTTP #{response.code} #{response.message}"
    end

    JSON.parse(response.body)['content']
  end

  # Stops and removes the container.
  def stop_container
    puts 'Stopping Kreuzberg Docker container...'
    system('docker', 'stop', @container_name)
    system('docker', 'rm', @container_name)
    puts 'Container stopped and removed'
  end

  private

  # Builds the multipart payload as a binary string. Every part is converted
  # to binary before appending so that a non-ASCII file name combined with
  # arbitrary file bytes cannot raise Encoding::CompatibilityError.
  def multipart_body(file_path, boundary)
    body = String.new(encoding: Encoding::BINARY)
    body << "--#{boundary}\r\n".b
    body << "Content-Disposition: form-data; name=\"file\"; filename=\"#{File.basename(file_path)}\"\r\n".b
    body << "Content-Type: application/octet-stream\r\n\r\n".b
    body << File.binread(file_path)
    body << "\r\n--#{boundary}--\r\n".b
  end
end
# Run one extraction against a throwaway container. The ensure clause tears
# the container down whether or not the extraction succeeds.
client = DockerKreuzbergClient.new
begin
  client.start_container
  sleep 2 # presumably gives the API inside the container time to start
  extracted = client.extract_file('document.pdf')
  puts "Extracted content:\n#{extracted}"
ensure
  client.stop_container
end
```

View File

@@ -0,0 +1,15 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract a document, then print its text, two metadata fields and the
# number of tables and images found.
document = Kreuzberg.extract_file_sync('document.pdf')
puts 'Content:'
puts document.content
puts "\nMetadata:"
{ 'Title' => 'title', 'Author' => 'author' }.each do |label, key|
  puts "#{label}: #{document.metadata&.dig(key)}"
end
puts "\nTables found: #{document.tables.length}"
puts "Images found: #{document.images.length}"
```

View File

@@ -0,0 +1,14 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract with caching and quality processing enabled, then report size,
# quality score and processing time.
options = { use_cache: true, enable_quality_processing: true }
config = Kreuzberg::ExtractionConfig.new(**options)
contract = Kreuzberg.extract_file_sync('contract.pdf', config: config)
puts "Extracted #{contract.content.length} characters"
puts "Quality score: #{contract.quality_score}"
puts "Processing time: #{contract.metadata&.dig('processing_time')}ms"
```

View File

@@ -0,0 +1,15 @@
```ruby title="Ruby"
require 'kreuzberg'
# OCR a scanned PDF with Tesseract (English) and print the recognised text.
config = Kreuzberg::ExtractionConfig.new(
  ocr: Kreuzberg::OcrConfig.new(backend: 'tesseract', language: 'eng')
)
scanned = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
puts 'Extracted text from scanned document:'
puts scanned.content
puts 'Used OCR backend: tesseract'
```

View File

@@ -0,0 +1,7 @@
```ruby title="Ruby"
require 'kreuzberg'
# Print the first 200 characters of the extracted text.
document = Kreuzberg.extract_file_sync('document.pdf')
preview = document.content[0...200]
puts 'Extracted content:'
puts preview
```

View File

@@ -0,0 +1,9 @@
```ruby title="Ruby"
require 'kreuzberg'
# Installation smoke test: print the gem version, then run one extraction.
puts "Kreuzberg version: #{Kreuzberg::VERSION}"
puts 'FFI bindings loaded successfully'
sample = Kreuzberg.extract_file_sync('sample.pdf')
character_count = sample.content.length
puts "Installation verified! Extracted #{character_count} characters"
```

View File

@@ -0,0 +1,15 @@
```ruby title="Ruby"
require 'kreuzberg'
# Read the main parts of an extraction result and summarise each one.
result = Kreuzberg.extract_file_sync('document.pdf')
content, tables, images, metadata =
  result.content, result.tables, result.images, result.metadata
puts "Content: #{content.length} characters"
puts "Tables: #{tables.length}"
puts "Images: #{images.length}"
metadata_keys = metadata&.keys&.join(', ')
puts "Metadata keys: #{metadata_keys}"
```

View File

@@ -0,0 +1,27 @@
```ruby title="Ruby"
require 'kreuzberg'
require 'json'
# JSON Schema describing the fields to extract; it is serialised to a JSON
# string before being handed to the structured-extraction config.
properties = {
  title: { type: 'string' },
  authors: { type: 'array', items: { type: 'string' } },
  date: { type: 'string' }
}
schema = {
  type: 'object',
  properties: properties,
  required: ['title', 'authors', 'date'],
  additionalProperties: false
}
structured = Kreuzberg::StructuredExtractionConfig.new(
  schema: JSON.generate(schema),
  schema_name: 'PaperMetadata',
  strict: true,
  llm: Kreuzberg::LlmConfig.new(model: 'openai/gpt-4o-mini')
)
config = Kreuzberg::ExtractionConfig.new(structured_extraction: structured)
result = Kreuzberg.extract_file_sync('paper.pdf', config: config)
puts result.structured_output
```

View File

@@ -0,0 +1,21 @@
```ruby title="Ruby"
require 'json'
require 'open3'
# Send one `tools/call` request to the `kreuzberg mcp` process over stdio and
# pretty-print the first line it answers with.
Open3.popen3('kreuzberg', 'mcp') do |stdin, stdout, _stderr, _wait_thr|
  # JSON-RPC 2.0 requires the "jsonrpc" version member, and a request
  # without an "id" is a notification, which a server does not answer.
  request = {
    jsonrpc: '2.0',
    id: 1,
    method: 'tools/call',
    params: {
      name: 'extract_file',
      arguments: { path: 'document.pdf', async: true }
    }
  }
  stdin.puts JSON.generate(request)
  stdin.close_write
  response = stdout.gets
  # `gets` returns nil at end of stream; fail with a clear message instead of
  # handing nil to JSON.parse.
  raise 'MCP server closed the connection without responding' if response.nil?

  puts JSON.pretty_generate(JSON.parse(response))
end
```

View File

@@ -0,0 +1,12 @@
```ruby title="Ruby"
require 'open3'
# Launch `kreuzberg mcp`, echo everything it writes to stdout, and wait for
# the process to finish. Any StandardError is reported instead of raised.
begin
  Open3.popen3('kreuzberg', 'mcp') do |_stdin, server_out, _stderr, waiter|
    puts server_out.read
    waiter.join
  end
rescue StandardError => error
  puts "Failed to start MCP server: #{error.message}"
end
```

View File

@@ -0,0 +1,11 @@
```ruby title="Ruby"
require 'kreuzberg'
# Language detection: enabled, 0.9 minimum confidence, multiple languages.
detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.9,
  detect_multiple: true
)
config = Kreuzberg::ExtractionConfig.new(language_detection: detection)
```

View File

@@ -0,0 +1,16 @@
```ruby title="Ruby"
require 'kreuzberg'
# Detect every language in a multilingual document and print the result.
detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.8,
  detect_multiple: true
)
config = Kreuzberg::ExtractionConfig.new(language_detection: detection)
result = Kreuzberg.extract_file_sync('multilingual_document.pdf', config: config)
languages = result.detected_languages
puts "Detected languages: #{languages}"
# Output: ['eng', 'fra', 'deu']
```

View File

@@ -0,0 +1,74 @@
```ruby title="Ruby"
require 'kreuzberg'
result = Kreuzberg.extract_file_sync('document.pdf')

# Metadata is flat — format-specific fields are at the top level
metadata = result.metadata
puts "Pages: #{metadata['page_count']}" if metadata['page_count']
puts "Title: #{metadata['title']}" if metadata['title']
puts "Authors: #{metadata['authors'].join(', ')}" if metadata['authors']

# Access HTML metadata
html_result = Kreuzberg.extract_file_sync('page.html')
metadata = html_result.metadata
puts "Title: #{metadata['title']}" if metadata['title']
puts "Description: #{metadata['description']}" if metadata['description']

# Access keywords as array
puts "Keywords: #{metadata['keywords'].join(', ')}" if metadata['keywords']

# Access canonical URL (renamed from canonical)
if metadata['canonical_url']
  puts "Canonical URL: #{metadata['canonical_url']}"
end

# Access Open Graph fields from map
open_graph = metadata['open_graph'] || {}
{ 'image' => 'Image', 'title' => 'Title', 'type' => 'Type' }.each do |key, label|
  puts "Open Graph #{label}: #{open_graph[key]}" if open_graph[key]
end

# Access Twitter Card fields from map
twitter_card = metadata['twitter_card'] || {}
if twitter_card['card']
  puts "Twitter Card Type: #{twitter_card['card']}"
end
if twitter_card['creator']
  puts "Twitter Creator: #{twitter_card['creator']}"
end

# Access new fields
if metadata['language']
  puts "Language: #{metadata['language']}"
end
if metadata['text_direction']
  puts "Text Direction: #{metadata['text_direction']}"
end

# Access headers
puts "Headers: #{metadata['headers'].map { |h| h['text'] }.join(', ')}" if metadata['headers']

# Access links
(metadata['links'] || []).each do |link|
  puts "Link: #{link['href']} (#{link['text']})"
end

# Access images
(metadata['images'] || []).each do |image|
  puts "Image: #{image['src']}"
end

# Access structured data
puts "Structured data items: #{metadata['structured_data'].length}" if metadata['structured_data']
```

View File

@@ -0,0 +1,18 @@
```ruby title="Ruby"
require 'kreuzberg'
# Print the byte range and a short preview for the first three pages, using
# the page boundaries reported in the metadata (skipped when absent).
result = Kreuzberg.extract_file_sync('document.pdf')
boundaries = result.metadata.pages&.boundaries
if boundaries
  boundaries.take(3).each do |boundary|
    # Boundaries are byte offsets, so slice by bytes. byteslice does this
    # directly, without materialising an Integer array for the whole content
    # and re-packing it.
    page_text = result.content
                      .byteslice(boundary.byte_start...boundary.byte_end)
                      .force_encoding('UTF-8')
    puts "Page #{boundary.page_number}:"
    puts "  Byte range: #{boundary.byte_start}-#{boundary.byte_end}"
    puts "  Preview: #{page_text[0..100]}..."
  end
end
```

View File

@@ -0,0 +1,16 @@
```ruby title="Ruby"
require 'kreuzberg'

# Enable per-page extraction, then summarise each page of the result.
config = Kreuzberg::ExtractionConfig.new(
  pages: Kreuzberg::PageConfig.new(
    extract_pages: true
  )
)

result = Kreuzberg.extract_file_sync("document.pdf", config: config)

result.pages&.each do |page|
  puts "Page #{page.page_number}:"
  puts "  Content: #{page.content.length} chars"
  puts "  Tables: #{page.tables.length}"
  puts "  Images: #{page.images.length}"
end
```

View File

@@ -0,0 +1,16 @@
```ruby title="Ruby"
require 'kreuzberg'
document = Kreuzberg.extract_file_sync('document.pdf')

# Iterate over tables
document.tables.each do |table|
  cells = table['cells']
  puts "Table with #{cells.length} rows"
  puts table['markdown'] # Markdown representation

  # Access cells
  cells.each { |row| puts row }
end
```

View File

@@ -0,0 +1,26 @@
```ruby title="Ruby"
require 'kreuzberg'
# Chunk the document, embed each chunk with the 'balanced' preset, and report
# the embedding size of every chunk that has one.
embedding = Kreuzberg::EmbeddingConfig.new(
  model: Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'balanced'),
  normalize: true
)
config = Kreuzberg::ExtractionConfig.new(
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 512,
    overlap: 50,
    embedding: embedding
  )
)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Treat a missing chunk list as empty so the loop cannot fail on nil.
(result.chunks || []).each_with_index do |chunk, i|
  next unless chunk.embedding

  puts "Chunk #{i}: #{chunk.embedding.length} dimensions"
  # Store in vector database
end
```

View File

@@ -0,0 +1,32 @@
```ruby title="Ruby"
require 'kreuzberg'
require 'net/http'
require 'json' # JSON.parse is called in CloudOcrBackend#process_image

# Example OCR backend that forwards image bytes to a remote HTTP OCR service
# and returns the recognised text.
class CloudOcrBackend
  # @return [String] name under which the backend is selected in OcrConfig
  def name
    'cloud-ocr'
  end

  # @return [Array<String>] language codes this backend advertises
  def supported_languages
    %w[eng fra deu]
  end

  # Posts the image to the OCR endpoint.
  #
  # @param image_data [String] raw image bytes, sent as the request body
  # @param language [String] requested language (not forwarded by this example)
  # @return [Hash] `{ content: <text> }` taken from the "text" field of the JSON response
  # @raise [KeyError] when the OCR_API_KEY environment variable is not set
  # @raise [StandardError] when the service answers with a non-2xx status
  # @raise [JSON::ParserError] when the response body is not valid JSON
  def process_image(image_data, language)
    uri = URI('https://api.example.com/ocr')
    req = Net::HTTP::Post.new(uri)
    # fetch fails fast on a missing key instead of sending an empty bearer token.
    req['Authorization'] = "Bearer #{ENV.fetch('OCR_API_KEY')}"
    req.body = image_data
    res = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { |http| http.request(req) }
    unless res.is_a?(Net::HTTPSuccess)
      raise StandardError, "OCR request failed: #{res.code} #{res.message}"
    end

    # Errors propagate unchanged so callers keep the original class and backtrace.
    { content: JSON.parse(res.body)['text'] }
  end
end
# Register the backend, then select it by the name it reports ('cloud-ocr').
backend = CloudOcrBackend.new
Kreuzberg.register_ocr_backend(backend)
ocr_settings = Kreuzberg::OcrConfig.new(backend: 'cloud-ocr')
config = Kreuzberg::ExtractionConfig.new(ocr: ocr_settings)
Kreuzberg.extract_file_sync('doc.pdf', config: config)
```

View File

@@ -0,0 +1,13 @@
```ruby title="Ruby"
require 'kreuzberg'
# Image extraction settings, attached under the `images:` option.
image_settings = Kreuzberg::ImageExtractionConfig.new(
  extract_images: true,
  target_dpi: 200,
  max_image_dimension: 2048,
  inject_placeholders: true, # set to false to extract images without markdown references
  auto_adjust_dpi: true
)
config = Kreuzberg::ExtractionConfig.new(images: image_settings)
```

View File

@@ -0,0 +1,17 @@
```ruby title="Ruby"
require 'kreuzberg'
# Image preprocessing is nested: preprocessing -> Tesseract -> OCR -> extraction.
preprocessing = Kreuzberg::ImagePreprocessingConfig.new(
  target_dpi: 300,
  denoise: true,
  deskew: true,
  contrast_enhance: true,
  binarization_method: 'otsu'
)
tesseract = Kreuzberg::TesseractConfig.new(preprocessing: preprocessing)
config = Kreuzberg::ExtractionConfig.new(
  ocr: Kreuzberg::OcrConfig.new(tesseract_config: tesseract)
)
```

View File

@@ -0,0 +1,14 @@
```ruby title="Ruby"
require 'kreuzberg'
# OCR a scanned PDF with the 'easyocr' backend; print a preview and the length.
ocr = Kreuzberg::OcrConfig.new(backend: 'easyocr', language: 'eng')
config = Kreuzberg::ExtractionConfig.new(ocr: ocr)
scanned = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
puts scanned.content[0..100]
puts "Total length: #{scanned.content.length}"
```

View File

@@ -0,0 +1,22 @@
```ruby title="Ruby"
require 'kreuzberg'
# OCR with the 'paddleocr' backend and print each recognised element.
ocr = Kreuzberg::OcrConfig.new(backend: 'paddleocr', language: 'eng')
config = Kreuzberg::ExtractionConfig.new(ocr: ocr)
result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)

result.ocr_elements&.each do |element|
  puts "Text: #{element.text}"
  puts "Confidence: #{format('%.2f', element.confidence.recognition)}"
  puts "Geometry: #{element.geometry}"
  # Rotation is only printed when the element carries one.
  puts "Rotation: #{element.rotation.angle}°" if element.rotation
  puts
end
```

View File

@@ -0,0 +1,12 @@
```ruby title="Ruby"
require 'kreuzberg'
# OCR a scanned PDF with Tesseract (English) and print the text.
config = Kreuzberg::ExtractionConfig.new(
  ocr: Kreuzberg::OcrConfig.new(backend: 'tesseract', language: 'eng')
)
scanned = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
puts scanned.content
```

View File

@@ -0,0 +1,11 @@
```ruby title="Ruby"
require 'kreuzberg'
# Set `force_ocr: true` together with the Tesseract backend.
tesseract_ocr = Kreuzberg::OcrConfig.new(backend: 'tesseract')
config = Kreuzberg::ExtractionConfig.new(ocr: tesseract_ocr, force_ocr: true)
document = Kreuzberg.extract_file_sync('document.pdf', config: config)
puts document.content
```

View File

@@ -0,0 +1,13 @@
```ruby title="Ruby"
require 'kreuzberg'
# Tesseract with several languages joined by '+'.
ocr = Kreuzberg::OcrConfig.new(backend: 'tesseract', language: 'eng+deu+fra')
config = Kreuzberg::ExtractionConfig.new(ocr: ocr)
document = Kreuzberg.extract_file_sync('multilingual.pdf', config: config)
puts document.content
```

View File

@@ -0,0 +1,15 @@
```ruby title="Ruby"
require 'kreuzberg'
# OCR a scanned PDF with the 'paddleocr' backend; print a preview and the length.
ocr = Kreuzberg::OcrConfig.new(
  backend: 'paddleocr',
  language: 'eng'
  # model_tier: 'server' # for max accuracy
)
config = Kreuzberg::ExtractionConfig.new(ocr: ocr)
scanned = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
puts scanned.content[0..100]
puts "Total length: #{scanned.content.length}"
```

View File

@@ -0,0 +1,6 @@
```ruby title="Ruby"
require 'kreuzberg'
# Clear the registered post-processors, then the registered validators
# (as the method names indicate).
Kreuzberg.clear_post_processors
Kreuzberg.clear_validators
```

View File

@@ -0,0 +1,44 @@
```ruby title="Ruby"
require 'kreuzberg'
# Wrap an already-loaded embedder so kreuzberg can call back into it during
# chunking and standalone embed requests. The Ruby object must respond to
# `dimensions` and `embed`; `version`, `initialize_plugin`, and `shutdown` are
# optional lifecycle hooks.
# Example embedding backend that returns zero vectors of a fixed size.
class MyEmbedder
  VECTOR_SIZE = 768

  def version
    '1.0.0'
  end

  # Optional warm-up; runs once at registration.
  def initialize_plugin; end

  # Optional cleanup.
  def shutdown; end

  # Captured once at registration; the dispatcher uses this for shape validation.
  def dimensions
    VECTOR_SIZE
  end

  # Delegate to the already-loaded host model. This example returns one
  # zero-filled vector per input text.
  def embed(texts)
    texts.map { |_text| Array.new(VECTOR_SIZE, 0.0) }
  end
end
# Register once at startup. The second argument is the plugin name used to
# reference the backend from EmbeddingConfig.
embedder = MyEmbedder.new
Kreuzberg.register_embedding_backend(embedder, 'my-embedder')

plugin_model = { type: 'plugin', name: 'my-embedder' }
config = Kreuzberg::EmbeddingConfig.new(
  model: plugin_model,
  # Optional: bound the wait on a hung backend (default 60s; nil disables).
  max_embed_duration_secs: 30
)

texts = ['Hello, world!', 'Second text']
vectors = Kreuzberg.embed_texts(texts, config: config)
puts "Generated #{vectors.length} vectors"
```

View File

@@ -0,0 +1,15 @@
```ruby title="Ruby"
require 'kreuzberg'
require 'json' # JSON.parse is called inside the extractor lambda

# Register custom extractor with priority 50
Kreuzberg.register_document_extractor(
  name: "custom-json-extractor",
  # Only the content is used; the MIME type and config arguments are ignored.
  extractor: ->(content, _mime_type, _config) { JSON.parse(content.to_s) },
  priority: 50
)
result = Kreuzberg.extract_file("document.json")
puts "Extracted content length: #{result.content.length}"
```

View File

@@ -0,0 +1,11 @@
```ruby title="Ruby"
require 'kreuzberg'
# Print the registered plugins of each kind.
{
  'Post-processors' => Kreuzberg.list_post_processors,
  'Validators' => Kreuzberg.list_validators,
  'OCR backends' => Kreuzberg.list_ocr_backends
}.each do |label, names|
  puts "#{label}: #{names.inspect}"
end
```

View File

@@ -0,0 +1,32 @@
```ruby title="Ruby"
require "kreuzberg"
# Custom validator ensuring extracted content meets minimum length requirement
class MinLengthValidator
  # @param min_length [Integer] smallest accepted content length
  def initialize(min_length: 100)
    @min_length = min_length
  end

  # Raises StandardError when result["content"] is shorter than the minimum;
  # returns nil otherwise.
  def call(result)
    actual = result["content"].length
    return unless actual < @min_length

    raise StandardError, "Content too short: #{actual} < #{@min_length}"
  end
end
# Register the validator with priority
validator = MinLengthValidator.new(min_length: 100)
Kreuzberg.register_validator("min_length_validator", validator, 100)

# Usage in extraction
config = Kreuzberg::ExtractionConfig.new
begin
  result = Kreuzberg.extract_file_sync("document.pdf", config: config)
  # The extraction result is read through its `content` method.
  puts "Extraction successful: #{result.content.length} characters"
rescue StandardError => e
  puts "Validation failed: #{e.message}"
end
```

View File

@@ -0,0 +1,27 @@
```ruby title="Ruby"
require 'kreuzberg'
# Post-processor that numbers PDF results in the order they are seen and
# stores the number under metadata['pdf_order'].
class PdfMetadataExtractor
  def initialize
    @count = 0
  end

  # Returns the same result object; non-PDF results pass through untouched.
  def call(result)
    if result['mime_type'] == 'application/pdf'
      @count += 1
      metadata = (result['metadata'] ||= {})
      metadata['pdf_order'] = @count
    end
    result
  end
end
# Register the processor as 'pdf_metadata' and run an extraction with
# post-processing enabled.
Kreuzberg.register_post_processor('pdf_metadata', PdfMetadataExtractor.new)
postprocessing = { enabled: true }
config = Kreuzberg::ExtractionConfig.new(postprocessor: postprocessing)
report = Kreuzberg.extract_file_sync('report.pdf', config: config)
puts "Metadata: #{report.metadata.inspect}"
```

View File

@@ -0,0 +1,22 @@
```ruby title="Ruby"
require 'kreuzberg'
# Post-processor that flags PDF results with metadata['pdf_processed'] = true.
class PdfOnlyProcessor
  # Returns the same result object; non-PDF results pass through untouched.
  def call(result)
    if result['mime_type'] == 'application/pdf'
      metadata = (result['metadata'] ||= {})
      metadata['pdf_processed'] = true
    end
    result
  end
end
# Register the processor as 'pdf_only' and run an extraction with
# post-processing enabled.
Kreuzberg.register_post_processor('pdf_only', PdfOnlyProcessor.new)
postprocessing = { enabled: true }
config = Kreuzberg::ExtractionConfig.new(postprocessor: postprocessing)
document = Kreuzberg.extract_file_sync('document.pdf', config: config)
puts "Metadata: #{document.metadata.inspect}"
```

View File

@@ -0,0 +1,23 @@
```ruby title="Ruby"
require 'kreuzberg'
# Post-processor that records its own name under metadata['processed_by'].
class CustomPostProcessor
  # Returns the same result object after tagging its metadata.
  def call(result)
    metadata = (result['metadata'] ||= {})
    metadata['processed_by'] = 'CustomPostProcessor'
    result
  end
end
# Validator that rejects results whose content is empty.
class CustomValidator
  # Raises StandardError('Empty') for empty content; returns nil otherwise.
  def call(result)
    return unless result['content'].empty?

    raise StandardError, 'Empty'
  end
end
# Register both plugins under the name 'custom'.
Kreuzberg.register_post_processor('custom', CustomPostProcessor.new)
Kreuzberg.register_validator('custom', CustomValidator.new)
```

View File

@@ -0,0 +1,29 @@
```ruby title="Ruby"
require 'kreuzberg'
require 'logger'
# Logger that writes to standard output; used for the final status message.
logger = Logger.new($stdout)
# Post-processor that prints the MIME type and content length of each result.
class LoggingPostProcessor
  # Returns the result unchanged after printing two lines.
  def call(result)
    mime_line = "Processing: #{result['mime_type']}"
    puts mime_line
    size_line = "Content: #{result['content'].length} bytes"
    puts size_line
    result
  end
end
# Validator that prints the content length and rejects content under 50.
class LoggingValidator
  # Raises StandardError('Too short') below 50; returns nil otherwise.
  def call(result)
    puts "Validating: #{result['content'].length} bytes"
    return unless result['content'].length < 50

    raise StandardError, 'Too short'
  end
end
# Register both logging plugins, then log that registration finished.
Kreuzberg.register_post_processor('logging-proc', LoggingPostProcessor.new)
Kreuzberg.register_validator('logging-val', LoggingValidator.new)
logger.info('Plugins registered')
```

View File

@@ -0,0 +1,22 @@
```ruby title="Ruby"
require 'kreuzberg'
require 'rspec'
# Each example registers a plugin, checks it is listed, and unregisters it.
# The unregister call sits in an `ensure` clause so a failed expectation
# cannot leave the plugin registered for later examples.
describe 'Plugin Testing' do
  it 'registers and calls post-processor' do
    processor = lambda do |result|
      result['metadata'] ||= {}
      result
    end
    Kreuzberg.register_post_processor('test', processor)
    expect(Kreuzberg.list_post_processors).to include('test')
  ensure
    Kreuzberg.unregister_post_processor('test')
  end

  it 'registers and validates' do
    validator = lambda do |result|
      raise StandardError, 'Too short' if result['content'].length < 10
    end
    Kreuzberg.register_validator('test-val', validator)
    expect(Kreuzberg.list_validators).to include('test-val')
  ensure
    Kreuzberg.unregister_validator('test-val')
  end
end
```

View File

@@ -0,0 +1,14 @@
```ruby title="Ruby"
require "kreuzberg"
# Register a lambda validator, run one extraction through it, then remove it.
too_short = ->(result) { raise StandardError, "Content too short" if result.content.length < 50 }
Kreuzberg.register_validator("min_length", too_short, priority: 10)

document = Kreuzberg.extract_file_sync("document.pdf")
puts "Validated content length: #{document.content.length}"

Kreuzberg.unregister_validator("min_length")
```

View File

@@ -0,0 +1,35 @@
```ruby title="Ruby"
require "kreuzberg"
# Custom validator checking document quality score
class QualityScoreValidator
  # @param min_score [Float] lowest accepted quality score
  def initialize(min_score: 0.5)
    @min_score = min_score
  end

  # A missing score counts as 0.0. Raises StandardError when the score is
  # below the minimum; returns nil otherwise.
  def call(result)
    score = result.quality_score || 0.0
    return unless score < @min_score

    message = format("Quality score too low: %.2f < %.2f", score, @min_score)
    raise StandardError, message
  end
end
# Register with default minimum score of 0.5
Kreuzberg.register_validator("quality_score_check", QualityScoreValidator.new(min_score: 0.5))

# Usage with quality processing enabled
config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
begin
  document = Kreuzberg.extract_file_sync("document.pdf", config: config)
  puts "Document quality verified: #{document.quality_score}"
rescue StandardError => error
  puts "Quality check failed: #{error.message}"
end
```

View File

@@ -0,0 +1,29 @@
```ruby title="Ruby"
require 'kreuzberg'
# Post-processor that counts how many results it has seen. Both the
# increment and the read go through the same Mutex.
class StatefulPlugin
  def initialize
    @count = 0
    @lock = Mutex.new
  end

  # Increments the counter and returns the result unchanged.
  def call(result)
    @lock.synchronize do
      @count += 1
    end
    result
  end

  # @return [Integer] number of results processed so far
  def count
    @lock.synchronize { @count }
  end
end
# Register the counter, run one extraction with post-processing enabled,
# then print how many results the plugin saw.
plugin = StatefulPlugin.new
Kreuzberg.register_post_processor('counter', plugin)
postprocessing = { enabled: true }
config = Kreuzberg::ExtractionConfig.new(postprocessor: postprocessing)
Kreuzberg.extract_file_sync('document.pdf', config: config)
puts "Processed: #{plugin.count}"
```

View File

@@ -0,0 +1,7 @@
```ruby title="Ruby"
require 'kreuzberg'
# Unregister one plugin of each kind by name: a post-processor, a validator
# and an OCR backend.
Kreuzberg.unregister_post_processor('word_count')
Kreuzberg.unregister_validator('min_length_validator')
Kreuzberg.unregister_ocr_backend('cloud-ocr')
```

View File

@@ -0,0 +1,23 @@
```ruby title="Ruby"
require 'kreuzberg'
# Post-processor that stores the whitespace-separated word count of the
# content under metadata['word_count'].
class WordCountProcessor
  # Returns the same result object; empty content is passed through untouched.
  def call(result)
    unless result['content'].empty?
      words = result['content'].split
      metadata = (result['metadata'] ||= {})
      metadata['word_count'] = words.length
    end
    result
  end
end
# Register the processor as 'word_count' with 10 as the third argument, then
# read the count back from the result metadata.
Kreuzberg.register_post_processor('word_count', WordCountProcessor.new, 10)
postprocessing = { enabled: true }
config = Kreuzberg::ExtractionConfig.new(postprocessor: postprocessing)
document = Kreuzberg.extract_file_sync('document.pdf', config: config)
puts "Words: #{document.metadata['word_count']}"
```

View File

@@ -0,0 +1,16 @@
```ruby title="Ruby"
require 'kreuzberg'
# Chunking with an embedding model selected by preset name.
model = Kreuzberg::EmbeddingModelType.new(
  type: 'preset',
  name: 'text-embedding-all-minilm-l6-v2'
)
chunking = Kreuzberg::ChunkingConfig.new(
  max_characters: 1500,
  overlap: 200,
  embedding: Kreuzberg::EmbeddingConfig.new(model: model)
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
```

View File

@@ -0,0 +1,26 @@
```ruby title="Ruby"
require 'kreuzberg'
# Chunk a document with embeddings and print position, preview and embedding
# size for every chunk.
embedding = Kreuzberg::EmbeddingConfig.new(
  model: Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'balanced'),
  normalize: true
)
config = Kreuzberg::ExtractionConfig.new(
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 500,
    overlap: 50,
    embedding: embedding
  )
)
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)

# Treat a missing chunk list as empty so neither the loop nor the total
# can fail on nil.
chunks = result.chunks || []
chunks.each_with_index do |chunk, i|
  puts "Chunk #{i + 1}/#{chunks.length}"
  puts "Position: #{chunk.metadata[:byte_start]}-#{chunk.metadata[:byte_end]}"
  puts "Content: #{chunk.content[0..99]}..."
  puts "Embedding: #{chunk.embedding.length} dimensions" if chunk.embedding
end
```

View File

@@ -0,0 +1,19 @@
```ruby title="Ruby"
require 'kreuzberg'
# Chunking with a fully specified embedding config.
embedding = Kreuzberg::EmbeddingConfig.new(
  model: Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'balanced'),
  normalize: true,
  batch_size: 32,
  show_download_progress: false
)
chunking = Kreuzberg::ChunkingConfig.new(
  max_characters: 1024,
  overlap: 100,
  embedding: embedding
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
```

View File

@@ -0,0 +1,18 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract keywords with YAKE and print each one with its rounded score.
keyword_settings = Kreuzberg::KeywordConfig.new(
  algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
  max_keywords: 10,
  min_score: 0.3
)
config = Kreuzberg::ExtractionConfig.new(keywords: keyword_settings)
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)
keywords = result.extracted_keywords
keywords.each { |keyword| puts "#{keyword['text']}: #{keyword['score'].round(3)}" }
```

View File

@@ -0,0 +1,17 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract with quality processing enabled and report the quality score,
# warning when it falls below 0.5.
config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
result = Kreuzberg.extract_file_sync('scanned_document.pdf', config: config)
quality_score = result.quality_score

if quality_score.nil?
  # Comparing nil with a number raises NoMethodError, so handle it first.
  puts 'Quality score unavailable'
elsif quality_score < 0.5
  puts "Warning: Low quality extraction (#{quality_score.round(2)})"
  puts "Consider re-scanning with higher DPI or adjusting OCR settings"
else
  puts "Quality score: #{quality_score.round(2)}"
end
```

View File

@@ -0,0 +1,15 @@
```ruby title="Ruby"
require "kreuzberg"
config = {
  model: { type: "preset", name: "balanced" },
  normalize: true
}
texts = ["Hello, world!", "Kreuzberg is fast"]

# Synchronous
embeddings = Kreuzberg.embed_sync(texts: texts, config: config)
puts embeddings.length # 2
puts embeddings[0].length # 768

# Async variant (uses same thread, returns when done)
embeddings = Kreuzberg.embed(texts: texts, config: config)
first_vector = embeddings[0]
puts first_vector.length # 768
```

View File

@@ -0,0 +1,10 @@
```ruby title="Ruby"
require 'kreuzberg'
# Token reduction in 'moderate' mode, preserving important words.
reduction = Kreuzberg::TokenReductionConfig.new(
  mode: 'moderate',
  preserve_important_words: true
)
config = Kreuzberg::ExtractionConfig.new(token_reduction: reduction)
```

View File

@@ -0,0 +1,20 @@
```ruby title="Ruby"
require 'kreuzberg'
# Run token reduction and print the statistics found in the result metadata.
config = Kreuzberg::ExtractionConfig.new(
  token_reduction: Kreuzberg::TokenReductionConfig.new(
    mode: 'moderate',
    preserve_markdown: true
  )
)
result = Kreuzberg.extract_file_sync('verbose_document.pdf', config: config)

# Check reduction statistics in metadata
# Missing metadata is treated as empty so the lookups below cannot fail on nil.
metadata = result.metadata || {}
original_tokens = metadata['original_token_count']
reduced_tokens = metadata['token_count']
reduction_ratio = metadata['token_reduction_ratio']
puts "Reduced from #{original_tokens} to #{reduced_tokens} tokens"
# Skip the percentage when no ratio is present; round to avoid float noise
# such as 33.300000000000004.
puts "Reduction: #{(reduction_ratio * 100).round(1)}%" if reduction_ratio
```