Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,24 @@
```ruby title="Ruby"
require 'kreuzberg'
# Build one in-memory batch item per document. File.binread is used instead
# of File.read so the bytes are loaded in binary mode: no newline/EOF
# translation on Windows and no text encoding attached to binary data.
items = [
  Kreuzberg::BatchBytesItem.new(
    content: File.binread('doc1.pdf'),
    mime_type: 'application/pdf'
  ),
  Kreuzberg::BatchBytesItem.new(
    content: File.binread('doc2.docx'),
    mime_type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  ),
  Kreuzberg::BatchBytesItem.new(
    content: File.binread('doc3.xlsx'),
    mime_type: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  )
]

# Extract all items in one call, with caching enabled.
config = Kreuzberg::ExtractionConfig.new(use_cache: true)
results = Kreuzberg.batch_extract_bytes_sync(items, config: config)

# Report how much text was extracted from each result.
results.each { |result| puts "Extracted: #{result.content.length} chars" }
```

View File

@@ -0,0 +1,20 @@
```ruby title="Ruby"
require 'kreuzberg'
# Wrap each input path in a batch item.
paths = %w[doc1.pdf doc2.docx doc3.pptx]
items = paths.map { |path| Kreuzberg::BatchFileItem.new(path: path) }

# Extract every file in a single batch call, with caching enabled.
config = Kreuzberg::ExtractionConfig.new(use_cache: true)
results = Kreuzberg.batch_extract_files_sync(items, config: config)

# Print a short, 1-based numbered summary for each result.
results.each_with_index do |extraction, index|
  puts "Document #{index + 1}:"
  puts " Extracted: #{extraction.content.length} characters"
  puts " Quality: #{extraction.quality_score}"
  puts " MIME: #{extraction.mime_type}"
end
```

View File

@@ -0,0 +1,36 @@
```ruby title="Ruby"
require 'net/http'
require 'json'
# POST a JSON chunking request to the local server and print the chunks.
endpoint = URI('http://localhost:8000/chunk')

payload = {
  text: 'Your long text content here...',
  chunker_type: 'text',
  config: { max_characters: 1000, overlap: 50, trim: true }
}

post = Net::HTTP::Post.new(endpoint)
post['Content-Type'] = 'application/json'
post.body = JSON.generate(payload)

response = Net::HTTP.new(endpoint.host, endpoint.port).request(post)

case response
when Net::HTTPSuccess
  parsed = JSON.parse(response.body)
  puts "Created #{parsed['chunk_count']} chunks"
  parsed['chunks'].each do |chunk|
    # Show only the first 50 characters of each chunk.
    puts "Chunk #{chunk['chunk_index']}: #{chunk['content'][0, 50]}..."
  end
else
  puts "Error: #{response.code} #{response.message}"
end
```

View File

@@ -0,0 +1,24 @@
```ruby title="Ruby"
require 'net/http'
require 'json'
# POST the raw bytes of a PDF to the local server and pretty-print the JSON
# reply.
# NOTE(review): confirm the server accepts an application/octet-stream body
# on /extract; this snippet does not show the server-side contract.
endpoint = URI('http://localhost:8000/extract')

upload = Net::HTTP::Post.new(endpoint)
upload['Content-Type'] = 'application/octet-stream'
upload.body = File.binread('document.pdf')

response = Net::HTTP.new(endpoint.host, endpoint.port).request(upload)

if response.is_a?(Net::HTTPSuccess)
  puts JSON.pretty_generate(JSON.parse(response.body))
else
  puts "Error: #{response.code} #{response.message}"
end
```

View File

@@ -0,0 +1,45 @@
```ruby title="Ruby"
require 'kreuzberg'
# Build each sub-configuration on its own, then assemble them into a single
# ExtractionConfig.
language_detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  detect_multiple: true,
  min_confidence: 0.8
)
token_reduction = Kreuzberg::TokenReductionOptions.new(
  mode: 'moderate',
  preserve_important_words: true
)
embedding = Kreuzberg::EmbeddingConfig.new(
  model: { type: 'preset', name: 'text-embedding-all-minilm-l6-v2' }
)
chunking = Kreuzberg::ChunkingConfig.new(
  max_characters: 512,
  overlap: 50,
  embedding: embedding
)
keywords = Kreuzberg::KeywordConfig.new(algorithm: 'yake', max_keywords: 10)

config = Kreuzberg::ExtractionConfig.new(
  enable_quality_processing: true,
  language_detection: language_detection,
  token_reduction: token_reduction,
  chunking: chunking,
  keywords: keywords
)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Optional fields are read with safe navigation because they may be nil.
puts "Content length: #{result.content.length} characters"
puts "Quality score: #{result.quality_score}"
puts "Detected languages: #{result.detected_languages&.join(', ')}"

chunks = result.chunks
puts "Total chunks: #{chunks&.length || 0}"
puts "Keywords: #{result.extracted_keywords&.map(&:text)&.join(', ')}"

# Inspect the first chunk only when at least one chunk was produced.
if chunks && !chunks.empty?
  first_chunk = chunks.first
  puts "First chunk size: #{first_chunk.content.length} chars"
  puts "Embedding dims: #{first_chunk.embedding&.length || 0}"
end
```

View File

@@ -0,0 +1,21 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract a file and classify failures by inspecting the error message.
# Validation errors are reported and swallowed; every other error is
# reported and then re-raised to the caller.
begin
  result = Kreuzberg.extract_file_sync('missing.pdf')
  puts result.content
rescue RuntimeError => e
  # All extraction errors are raised as RuntimeError
  # Check error message for specific error details
  case e.message
  when /validation/i
    puts "Validation error: #{e.message}"
  when /\bio\b|not found/i
    # "io" is matched as a whole word only; a bare /io/ would also match
    # inside unrelated words such as "application" or "configuration".
    puts "IO error: #{e.message}"
    raise
  else
    puts "Extraction failed: #{e.message}"
    raise
  end
end
```

View File

@@ -0,0 +1,24 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract text from in-memory PDF bytes and report failures by category.
begin
  # Read in binary mode so the PDF bytes reach the extractor unmodified
  # (File.read would open the file in text mode).
  pdf_bytes = File.binread('document.pdf')
  config = Kreuzberg::ExtractionConfig.new
  result = Kreuzberg.extract_bytes_sync(pdf_bytes, 'application/pdf', config: config)
  puts "Extracted #{result.content.length} characters"
rescue RuntimeError => e
  # All extraction errors are raised as RuntimeError
  # Check error message for details
  # Note: a missing file raises Errno::ENOENT, which is not a RuntimeError
  # and is therefore not handled here.
  case e.message
  when /parse|parsing/i
    puts "Failed to parse document: #{e.message}"
  when /ocr/i
    puts "OCR processing failed: #{e.message}"
  when /validation|invalid/i
    puts "Invalid configuration: #{e.message}"
  else
    puts "Extraction error: #{e.message}"
  end
end
```

View File

@@ -0,0 +1,18 @@
```ruby title="Ruby"
require 'kreuzberg'
# Read the PDF in binary mode so its bytes are passed through unmodified
# (File.read would open the file in text mode).
pdf_bytes = File.binread('document.pdf')

config = Kreuzberg::ExtractionConfig.new(
  enable_quality_processing: true
)

# Extract from the in-memory bytes, declaring the MIME type explicitly.
result = Kreuzberg.extract_bytes_async(
  pdf_bytes,
  'application/pdf',
  config: config
)

puts "Async bytes extraction done"
# [0..100] is an inclusive range: up to the first 101 characters.
puts "Content preview: #{result.content[0..100]}"
puts "Quality score: #{result.quality_score}"
```

View File

@@ -0,0 +1,17 @@
```ruby title="Ruby"
require 'kreuzberg'
# Read the PDF in binary mode so its bytes are passed through unmodified
# (File.read would open the file in text mode).
pdf_bytes = File.binread('document.pdf')

config = Kreuzberg::ExtractionConfig.new(
  use_cache: true
)

# Extract from the in-memory bytes, declaring the MIME type explicitly.
result = Kreuzberg.extract_bytes_sync(
  pdf_bytes,
  'application/pdf',
  config: config
)

puts "Extracted #{result.content.length} characters"
puts "Detected MIME: #{result.mime_type}"
```

View File

@@ -0,0 +1,14 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract a document through the async entry point with caching disabled and
# quality processing enabled, then print a short summary.
options = Kreuzberg::ExtractionConfig.new(use_cache: false, enable_quality_processing: true)
extraction = Kreuzberg.extract_file_async('document.pdf', config: options)

puts "Async extraction complete"
puts "Extracted #{extraction.content.length} characters"
puts "Quality: #{extraction.quality_score}"
```

View File

@@ -0,0 +1,14 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract a document synchronously with caching and quality processing
# enabled, then print a short summary.
options = Kreuzberg::ExtractionConfig.new(use_cache: true, enable_quality_processing: true)
extraction = Kreuzberg.extract_file_sync('document.pdf', config: options)

puts "Extracted #{extraction.content.length} characters"
puts "MIME type: #{extraction.mime_type}"
puts "Quality score: #{extraction.quality_score}"
```