This commit is contained in:
11
docs/snippets/ruby/metadata/language_detection.md
Normal file
11
docs/snippets/ruby/metadata/language_detection.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
language_detection: Kreuzberg::LanguageDetectionConfig.new(
|
||||
enabled: true,
|
||||
min_confidence: 0.9,
|
||||
detect_multiple: true
|
||||
)
|
||||
)
|
||||
```
|
||||
@@ -0,0 +1,16 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
language_detection: Kreuzberg::LanguageDetectionConfig.new(
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: true
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('multilingual_document.pdf', config: config)
|
||||
|
||||
puts "Detected languages: #{result.detected_languages}"
|
||||
# Output: Detected languages: ["eng", "fra", "deu"]
|
||||
```
|
||||
74
docs/snippets/ruby/metadata/metadata.md
Normal file
74
docs/snippets/ruby/metadata/metadata.md
Normal file
@@ -0,0 +1,74 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf')
|
||||
|
||||
# Metadata is flat — format-specific fields are at the top level
|
||||
metadata = result.metadata
|
||||
if metadata['page_count']
|
||||
puts "Pages: #{metadata['page_count']}"
|
||||
end
|
||||
if metadata['title']
|
||||
puts "Title: #{metadata['title']}"
|
||||
end
|
||||
if metadata['authors']
|
||||
puts "Authors: #{metadata['authors'].join(', ')}"
|
||||
end
|
||||
|
||||
# Access HTML metadata
|
||||
html_result = Kreuzberg.extract_file_sync('page.html')
|
||||
metadata = html_result.metadata
|
||||
if metadata['title']
|
||||
puts "Title: #{metadata['title']}"
|
||||
end
|
||||
if metadata['description']
|
||||
puts "Description: #{metadata['description']}"
|
||||
end
|
||||
|
||||
# Access keywords as array
|
||||
if metadata['keywords']
|
||||
puts "Keywords: #{metadata['keywords'].join(', ')}"
|
||||
end
|
||||
|
||||
# Access canonical URL (renamed from canonical)
|
||||
puts "Canonical URL: #{metadata['canonical_url']}" if metadata['canonical_url']
|
||||
|
||||
# Access Open Graph fields from map
|
||||
open_graph = metadata['open_graph'] || {}
|
||||
puts "Open Graph Image: #{open_graph['image']}" if open_graph['image']
|
||||
puts "Open Graph Title: #{open_graph['title']}" if open_graph['title']
|
||||
puts "Open Graph Type: #{open_graph['type']}" if open_graph['type']
|
||||
|
||||
# Access Twitter Card fields from map
|
||||
twitter_card = metadata['twitter_card'] || {}
|
||||
puts "Twitter Card Type: #{twitter_card['card']}" if twitter_card['card']
|
||||
puts "Twitter Creator: #{twitter_card['creator']}" if twitter_card['creator']
|
||||
|
||||
# Access new fields
|
||||
puts "Language: #{metadata['language']}" if metadata['language']
|
||||
puts "Text Direction: #{metadata['text_direction']}" if metadata['text_direction']
|
||||
|
||||
# Access headers
|
||||
if metadata['headers']
|
||||
puts "Headers: #{metadata['headers'].map { |h| h['text'] }.join(', ')}"
|
||||
end
|
||||
|
||||
# Access links
|
||||
if metadata['links']
|
||||
metadata['links'].each do |link|
|
||||
puts "Link: #{link['href']} (#{link['text']})"
|
||||
end
|
||||
end
|
||||
|
||||
# Access images
|
||||
if metadata['images']
|
||||
metadata['images'].each do |image|
|
||||
puts "Image: #{image['src']}"
|
||||
end
|
||||
end
|
||||
|
||||
# Access structured data
|
||||
if metadata['structured_data']
|
||||
puts "Structured data items: #{metadata['structured_data'].length}"
|
||||
end
|
||||
```
|
||||
18
docs/snippets/ruby/metadata/page_boundaries.md
Normal file
18
docs/snippets/ruby/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf')
|
||||
|
||||
if result.metadata.pages&.boundaries
|
||||
content_bytes = result.content.bytes
|
||||
|
||||
result.metadata.pages.boundaries.take(3).each do |boundary|
|
||||
page_bytes = content_bytes[boundary.byte_start...boundary.byte_end]
|
||||
page_text = page_bytes.pack('C*').force_encoding('UTF-8')
|
||||
|
||||
puts "Page #{boundary.page_number}:"
|
||||
puts " Byte range: #{boundary.byte_start}-#{boundary.byte_end}"
|
||||
puts " Preview: #{page_text[0..100]}..."
|
||||
end
|
||||
end
|
||||
```
|
||||
16
docs/snippets/ruby/metadata/page_tracking_basic.md
Normal file
16
docs/snippets/ruby/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,16 @@
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
pages: Kreuzberg::PageConfig.new(
|
||||
extract_pages: true
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync("document.pdf", config: config)
|
||||
|
||||
result.pages&.each do |page|
|
||||
puts "Page #{page.page_number}:"
|
||||
puts " Content: #{page.content.length} chars"
|
||||
puts " Tables: #{page.tables.length}"
|
||||
puts " Images: #{page.images.length}"
|
||||
end
|
||||
16
docs/snippets/ruby/metadata/tables.md
Normal file
16
docs/snippets/ruby/metadata/tables.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf')
|
||||
|
||||
# Iterate over tables
|
||||
result.tables.each do |table|
|
||||
puts "Table with #{table['cells'].length} rows"
|
||||
puts table['markdown'] # Markdown representation
|
||||
|
||||
# Access cells
|
||||
table['cells'].each do |row|
|
||||
puts row
|
||||
end
|
||||
end
|
||||
```
|
||||
26
docs/snippets/ruby/metadata/vector_database_integration.md
Normal file
26
docs/snippets/ruby/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 512,
|
||||
overlap: 50,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: Kreuzberg::EmbeddingModelType.new(
|
||||
type: 'preset',
|
||||
name: 'balanced'
|
||||
),
|
||||
normalize: true
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
|
||||
result.chunks.each_with_index do |chunk, i|
|
||||
if chunk.embedding
|
||||
puts "Chunk #{i}: #{chunk.embedding.length} dimensions"
|
||||
# Store in vector database
|
||||
end
|
||||
end
|
||||
```
|
||||
Reference in New Issue
Block a user