Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
```ruby title="Ruby"
require 'kreuzberg'

# Language-detection options, built on their own so the extraction config
# below stays a single readable line.
language_detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.9,
  detect_multiple: true
)

# Extraction config that carries the language-detection options.
config = Kreuzberg::ExtractionConfig.new(language_detection: language_detection)
```

View File

@@ -0,0 +1,16 @@
```ruby title="Ruby"
require 'kreuzberg'

# Build the language-detection options first, then wrap them in the
# extraction config.
detection_options = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.8,
  detect_multiple: true
)
config = Kreuzberg::ExtractionConfig.new(language_detection: detection_options)

# Synchronous extraction using the config above.
result = Kreuzberg.extract_file_sync('multilingual_document.pdf', config: config)

puts "Detected languages: #{result.detected_languages}"
# Illustrative output from the original example: ['eng', 'fra', 'deu']
# NOTE(review): exact formatting depends on what `detected_languages` returns — confirm.
```

View File

@@ -0,0 +1,74 @@
```ruby title="Ruby"
require 'kreuzberg'

result = Kreuzberg.extract_file_sync('document.pdf')

# Metadata is flat: format-specific fields sit at the top level.
# Every field is optional, so each line prints only when its key is present.
metadata = result.metadata
puts "Pages: #{metadata['page_count']}" if metadata['page_count']
puts "Title: #{metadata['title']}" if metadata['title']
puts "Authors: #{metadata['authors'].join(', ')}" if metadata['authors']

# HTML metadata (note: `metadata` is rebound to the HTML result from here on)
html_result = Kreuzberg.extract_file_sync('page.html')
metadata = html_result.metadata
puts "Title: #{metadata['title']}" if metadata['title']
puts "Description: #{metadata['description']}" if metadata['description']

# Keywords are joined as an array
puts "Keywords: #{metadata['keywords'].join(', ')}" if metadata['keywords']

# Canonical URL (renamed from canonical)
if metadata['canonical_url']
  puts "Canonical URL: #{metadata['canonical_url']}"
end

# Open Graph fields live in a map; fall back to an empty hash when it is absent
open_graph = metadata['open_graph'] || {}
puts "Open Graph Image: #{open_graph['image']}" if open_graph['image']
puts "Open Graph Title: #{open_graph['title']}" if open_graph['title']
puts "Open Graph Type: #{open_graph['type']}" if open_graph['type']

# Twitter Card fields follow the same map-with-fallback pattern
twitter_card = metadata['twitter_card'] || {}
puts "Twitter Card Type: #{twitter_card['card']}" if twitter_card['card']
puts "Twitter Creator: #{twitter_card['creator']}" if twitter_card['creator']

# Newer fields
if metadata['language']
  puts "Language: #{metadata['language']}"
end
if metadata['text_direction']
  puts "Text Direction: #{metadata['text_direction']}"
end

# Headers: print the text of each header on one line
puts "Headers: #{metadata['headers'].map { |h| h['text'] }.join(', ')}" if metadata['headers']

# Links: a missing list is treated as empty, so nothing is printed
(metadata['links'] || []).each do |link|
  puts "Link: #{link['href']} (#{link['text']})"
end

# Images: same fallback as links
(metadata['images'] || []).each do |image|
  puts "Image: #{image['src']}"
end

# Structured data: report only how many items there are
puts "Structured data items: #{metadata['structured_data'].length}" if metadata['structured_data']
```

View File

@@ -0,0 +1,18 @@
```ruby title="Ruby"
require 'kreuzberg'

result = Kreuzberg.extract_file_sync('document.pdf')

# `pages` may be nil (hence `&.`); without boundaries there is nothing to show.
boundaries = result.metadata.pages&.boundaries
if boundaries
  content = result.content

  # Preview only the first three page boundaries.
  boundaries.take(3).each do |boundary|
    # The boundary fields are used as byte offsets, so slice by bytes rather
    # than by characters. `byteslice` copies just the requested range instead
    # of turning the whole document into an Integer array and re-packing it.
    page_text = content
                .byteslice(boundary.byte_start...boundary.byte_end)
                .force_encoding('UTF-8')

    puts "Page #{boundary.page_number}:"
    puts " Byte range: #{boundary.byte_start}-#{boundary.byte_end}"
    puts " Preview: #{page_text[0..100]}..."
  end
end
```

View File

@@ -0,0 +1,16 @@
require 'kreuzberg'

# Turn on per-page extraction via the page config.
config = Kreuzberg::ExtractionConfig.new(
  pages: Kreuzberg::PageConfig.new(
    extract_pages: true
  )
)

result = Kreuzberg.extract_file_sync("document.pdf", config: config)

# `pages` may be nil, so safe navigation skips the loop in that case.
result.pages&.each do |page|
  puts "Page #{page.page_number}:"
  puts " Content: #{page.content.length} chars"
  puts " Tables: #{page.tables.length}"
  puts " Images: #{page.images.length}"
end

View File

@@ -0,0 +1,16 @@
```ruby title="Ruby"
require 'kreuzberg'

result = Kreuzberg.extract_file_sync('document.pdf')

# Walk every extracted table.
result.tables.each do |table|
  puts "Table with #{table['cells'].length} rows"

  # Markdown representation of the table
  puts table['markdown']

  # Print the cells one row at a time
  table['cells'].each { |row| puts row }
end
```

View File

@@ -0,0 +1,26 @@
```ruby title="Ruby"
require 'kreuzberg'

# Build the configuration from the inside out: model -> embedding -> chunking
# -> extraction.
embedding_model = Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'balanced')
embedding = Kreuzberg::EmbeddingConfig.new(model: embedding_model, normalize: true)
chunking = Kreuzberg::ChunkingConfig.new(
  max_characters: 512,
  overlap: 50,
  embedding: embedding
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)

result.chunks.each_with_index do |chunk, i|
  # Chunks without an embedding are skipped.
  next unless chunk.embedding

  puts "Chunk #{i}: #{chunk.embedding.length} dimensions"
  # Store in vector database
end
```