This commit is contained in:
18
docs/snippets/ruby/advanced/chunk_page_mapping.md
Normal file
18
docs/snippets/ruby/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(max_characters: 500, overlap: 50),
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
|
||||
result.chunks&.each do |chunk|
|
||||
first = chunk.metadata.first_page
|
||||
last = chunk.metadata.last_page
|
||||
next if first.nil?
|
||||
|
||||
page_range = first == last ? "Page #{first}" : "Pages #{first}-#{last}"
|
||||
puts "Chunk: #{chunk.content[0..50]}... (#{page_range})"
|
||||
end
|
||||
```
|
||||
31
docs/snippets/ruby/advanced/chunking_config.md
Normal file
31
docs/snippets/ruby/advanced/chunking_config.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 1000,
|
||||
overlap: 200,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: Kreuzberg::EmbeddingModelType.new(
|
||||
type: 'preset',
|
||||
name: 'all-minilm-l6-v2'
|
||||
),
|
||||
normalize: true,
|
||||
batch_size: 32
|
||||
)
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
```ruby title="Ruby - Prepend Heading Context"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
chunker_type: "markdown",
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
prepend_heading_context: true
|
||||
)
|
||||
)
|
||||
```
|
||||
45
docs/snippets/ruby/advanced/chunking_rag.md
Normal file
45
docs/snippets/ruby/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: Kreuzberg::EmbeddingModelType.new(
|
||||
type: 'preset',
|
||||
name: 'all-mpnet-base-v2'
|
||||
),
|
||||
normalize: true,
|
||||
batch_size: 16
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# The helpers are defined before they are used: Ruby registers a top-level
# `def` only when execution reaches it, so calling a helper above its
# definition raises NoMethodError.

# Builds a minimal in-memory "vector store": one hash per chunk holding its
# index, text, embedding and a placeholder similarity score.
def build_vector_store(chunks)
  # Array() turns a nil chunk list into an empty store instead of raising.
  Array(chunks).map.with_index do |chunk, idx|
    {
      id: idx,
      content: chunk.content,
      embedding: chunk.embedding,
      similarity: 0.0
    }
  end
end

# Returns the store's entries ordered by descending :similarity.
# NOTE: `query` is not used; this is a placeholder ranking. A real
# implementation would embed the query and score every entry against it.
def search_vector_store(store, query)
  store.sort_by { |entry| entry[:similarity] }.reverse
end

result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)

vector_store = build_vector_store(result.chunks)
query = 'machine learning optimization'
relevant_chunks = search_vector_store(vector_store, query)

puts "Found #{relevant_chunks.length} relevant chunks"
relevant_chunks.take(3).each do |chunk|
  puts "Content: #{chunk[:content][0..80]}..."
  puts "Similarity: #{chunk[:similarity]&.round(3)}\n"
end
|
||||
```
|
||||
31
docs/snippets/ruby/advanced/embedding_with_chunking.md
Normal file
31
docs/snippets/ruby/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 512,
|
||||
overlap: 50,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: Kreuzberg::EmbeddingModelType.new(
|
||||
type: 'preset',
|
||||
name: 'balanced'
|
||||
),
|
||||
normalize: true,
|
||||
batch_size: 32,
|
||||
show_download_progress: false
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
|
||||
chunks = result.chunks || []
|
||||
chunks.each_with_index do |chunk, idx|
|
||||
chunk_id = "doc_chunk_#{idx}"
|
||||
puts "Chunk #{chunk_id}: #{chunk.content[0...50]}"
|
||||
|
||||
if chunk.embedding
|
||||
puts " Embedding dimensions: #{chunk.embedding.length}"
|
||||
end
|
||||
end
|
||||
```
|
||||
13
docs/snippets/ruby/advanced/keyword_extraction_config.md
Normal file
13
docs/snippets/ruby/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
keywords: Kreuzberg::KeywordConfig.new(
|
||||
algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
|
||||
max_keywords: 10,
|
||||
min_score: 0.3,
|
||||
ngram_range: [1, 3],
|
||||
language: 'en'
|
||||
)
|
||||
)
|
||||
```
|
||||
20
docs/snippets/ruby/advanced/keyword_extraction_example.md
Normal file
20
docs/snippets/ruby/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
keywords: Kreuzberg::KeywordConfig.new(
|
||||
algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
|
||||
max_keywords: 10,
|
||||
min_score: 0.3
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)
|
||||
|
||||
keywords = result.metadata&.dig('keywords') || []
|
||||
keywords.each do |kw|
|
||||
text = kw['text']
|
||||
score = kw['score']
|
||||
puts "#{text}: #{score.round(3)}"
|
||||
end
|
||||
```
|
||||
21
docs/snippets/ruby/advanced/language_detection_config.md
Normal file
21
docs/snippets/ruby/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
language_detection: Kreuzberg::LanguageDetectionConfig.new(
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: false
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
|
||||
if result.detected_languages&.any?
|
||||
puts "Detected Language: #{result.detected_languages.first}"
|
||||
else
|
||||
puts "No language detected"
|
||||
end
|
||||
|
||||
puts "Content length: #{result.content.length} characters"
|
||||
```
|
||||
@@ -0,0 +1,24 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
language_detection: Kreuzberg::LanguageDetectionConfig.new(
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: true
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('multilingual_document.pdf', config: config)
|
||||
|
||||
languages = result.detected_languages || []
|
||||
|
||||
if languages.any?
|
||||
puts "Detected #{languages.length} language(s): #{languages.join(', ')}"
|
||||
else
|
||||
puts "No languages detected"
|
||||
end
|
||||
|
||||
puts "Total content: #{result.content.length} characters"
|
||||
puts "MIME type: #{result.mime_type}"
|
||||
```
|
||||
7
docs/snippets/ruby/advanced/quality_processing_config.md
Normal file
7
docs/snippets/ruby/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
enable_quality_processing: true
|
||||
)
|
||||
```
|
||||
18
docs/snippets/ruby/advanced/quality_processing_example.md
Normal file
18
docs/snippets/ruby/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
enable_quality_processing: true
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('scanned_document.pdf', config: config)
|
||||
|
||||
quality_score = result.quality_score || 0.0
|
||||
|
||||
if quality_score < 0.5
|
||||
puts "Warning: Low quality extraction (#{quality_score.round(2)})"
|
||||
puts "Consider re-scanning with higher DPI or adjusting OCR settings"
|
||||
else
|
||||
puts "Quality score: #{quality_score.round(2)}"
|
||||
end
|
||||
```
|
||||
12
docs/snippets/ruby/advanced/token_reduction_config.md
Normal file
12
docs/snippets/ruby/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
token_reduction: Kreuzberg::TokenReductionConfig.new(
|
||||
mode: 'moderate',
|
||||
preserve_markdown: true,
|
||||
preserve_code: true,
|
||||
language_hint: 'eng'
|
||||
)
|
||||
)
|
||||
```
|
||||
19
docs/snippets/ruby/advanced/token_reduction_example.md
Normal file
19
docs/snippets/ruby/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
token_reduction: Kreuzberg::TokenReductionConfig.new(
|
||||
mode: 'moderate',
|
||||
preserve_markdown: true
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('verbose_document.pdf', config: config)
|
||||
|
||||
original_tokens = result.metadata&.dig('original_token_count') || 0
|
||||
reduced_tokens = result.metadata&.dig('token_count') || 0
|
||||
reduction_ratio = result.metadata&.dig('token_reduction_ratio') || 0.0
|
||||
|
||||
puts "Reduced from #{original_tokens} to #{reduced_tokens} tokens"
|
||||
puts "Reduction: #{(reduction_ratio * 100).round(1)}%"
|
||||
```
|
||||
53
docs/snippets/ruby/advanced/vector_database_integration.md
Normal file
53
docs/snippets/ruby/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Extracts a document with Kreuzberg (chunking + embeddings enabled) and turns
# every chunk into a record that can be written to a vector database.
class VectorDatabaseIntegration
  VectorRecord = Struct.new(:id, :embedding, :content, :metadata, keyword_init: true)

  # Extracts +document_path+ and returns one VectorRecord per chunk.
  # Record ids have the form "<document_id>_chunk_<index>".
  def extract_and_vectorize(document_path, document_id)
    config = embedding_extraction_config
    extraction = Kreuzberg.extract_file_sync(document_path, config: config)

    records = (extraction.chunks || []).each_with_index.map do |chunk, position|
      VectorRecord.new(
        id: "#{document_id}_chunk_#{position}",
        content: chunk.content,
        embedding: chunk.embedding,
        metadata: {
          document_id: document_id,
          chunk_index: position,
          content_length: chunk.content.length
        }
      )
    end

    store_in_vector_database(records)
    records
  end

  private

  # Extraction config: 512-character chunks with 50 characters of overlap,
  # embedded with the 'balanced' preset.
  def embedding_extraction_config
    preset = Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'balanced')
    embedding = Kreuzberg::EmbeddingConfig.new(model: preset, normalize: true, batch_size: 32)
    chunking = Kreuzberg::ChunkingConfig.new(max_characters: 512, overlap: 50, embedding: embedding)

    Kreuzberg::ExtractionConfig.new(chunking: chunking)
  end

  # Stand-in for a real database write: logs every record that carries a
  # non-empty embedding and skips the rest.
  def store_in_vector_database(records)
    records.each do |record|
      next unless record.embedding&.any?

      puts "Storing #{record.id}: #{record.content.length} chars, #{record.embedding.length} dims"
    end
  end
end
|
||||
```
|
||||
24
docs/snippets/ruby/api/batch_extract_bytes_sync.md
Normal file
24
docs/snippets/ruby/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Each document is read with File.binread so the raw bytes are handed over
# unchanged: no newline translation and no text-encoding tag on the string.
items = [
  Kreuzberg::BatchBytesItem.new(
    content: File.binread('doc1.pdf'),
    mime_type: 'application/pdf'
  ),
  Kreuzberg::BatchBytesItem.new(
    content: File.binread('doc2.docx'),
    mime_type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  ),
  Kreuzberg::BatchBytesItem.new(
    content: File.binread('doc3.xlsx'),
    mime_type: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  )
]
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(use_cache: true)
|
||||
|
||||
results = Kreuzberg.batch_extract_bytes_sync(items, config: config)
|
||||
|
||||
results.each { |result| puts "Extracted: #{result.content.length} chars" }
|
||||
```
|
||||
20
docs/snippets/ruby/api/batch_extract_files_sync.md
Normal file
20
docs/snippets/ruby/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
items = [
|
||||
Kreuzberg::BatchFileItem.new(path: 'doc1.pdf'),
|
||||
Kreuzberg::BatchFileItem.new(path: 'doc2.docx'),
|
||||
Kreuzberg::BatchFileItem.new(path: 'doc3.pptx')
|
||||
]
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(use_cache: true)
|
||||
|
||||
results = Kreuzberg.batch_extract_files_sync(items, config: config)
|
||||
|
||||
results.each_with_index do |result, idx|
|
||||
puts "Document #{idx + 1}:"
|
||||
puts " Extracted: #{result.content.length} characters"
|
||||
puts " Quality: #{result.quality_score}"
|
||||
puts " MIME: #{result.mime_type}"
|
||||
end
|
||||
```
|
||||
36
docs/snippets/ruby/api/client_chunk_text.md
Normal file
36
docs/snippets/ruby/api/client_chunk_text.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```ruby title="Ruby"
|
||||
require 'net/http'
|
||||
require 'json'
|
||||
|
||||
uri = URI('http://localhost:8000/chunk')
|
||||
http = Net::HTTP.new(uri.host, uri.port)
|
||||
|
||||
request = Net::HTTP::Post.new(uri)
|
||||
request['Content-Type'] = 'application/json'
|
||||
|
||||
payload = {
|
||||
text: 'Your long text content here...',
|
||||
chunker_type: 'text',
|
||||
config: {
|
||||
max_characters: 1000,
|
||||
overlap: 50,
|
||||
trim: true
|
||||
}
|
||||
}
|
||||
|
||||
request.body = JSON.generate(payload)
|
||||
|
||||
response = http.request(request)
|
||||
|
||||
if response.is_a?(Net::HTTPSuccess)
|
||||
result = JSON.parse(response.body)
|
||||
puts "Created #{result['chunk_count']} chunks"
|
||||
|
||||
result['chunks'].each do |chunk|
|
||||
preview = chunk['content'][0..49]
|
||||
puts "Chunk #{chunk['chunk_index']}: #{preview}..."
|
||||
end
|
||||
else
|
||||
puts "Error: #{response.code} #{response.message}"
|
||||
end
|
||||
```
|
||||
24
docs/snippets/ruby/api/client_extract_single_file.md
Normal file
24
docs/snippets/ruby/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```ruby title="Ruby"
|
||||
require 'net/http'
|
||||
require 'json'
|
||||
|
||||
uri = URI('http://localhost:8000/extract')
|
||||
http = Net::HTTP.new(uri.host, uri.port)
|
||||
|
||||
request = Net::HTTP::Post.new(uri)
|
||||
|
||||
File.open('document.pdf', 'rb') do |file|
|
||||
body = file.read
|
||||
request['Content-Type'] = 'application/octet-stream'
|
||||
request.body = body
|
||||
|
||||
response = http.request(request)
|
||||
|
||||
if response.is_a?(Net::HTTPSuccess)
|
||||
data = JSON.parse(response.body)
|
||||
puts JSON.pretty_generate(data)
|
||||
else
|
||||
puts "Error: #{response.code} #{response.message}"
|
||||
end
|
||||
end
|
||||
```
|
||||
45
docs/snippets/ruby/api/combining_all_features.md
Normal file
45
docs/snippets/ruby/api/combining_all_features.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
enable_quality_processing: true,
|
||||
|
||||
language_detection: Kreuzberg::LanguageDetectionConfig.new(
|
||||
enabled: true,
|
||||
detect_multiple: true,
|
||||
min_confidence: 0.8
|
||||
),
|
||||
|
||||
token_reduction: Kreuzberg::TokenReductionOptions.new(
|
||||
mode: 'moderate',
|
||||
preserve_important_words: true
|
||||
),
|
||||
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 512,
|
||||
overlap: 50,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: { type: 'preset', name: 'text-embedding-all-minilm-l6-v2' }
|
||||
)
|
||||
),
|
||||
|
||||
keywords: Kreuzberg::KeywordConfig.new(
|
||||
algorithm: 'yake',
|
||||
max_keywords: 10
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
|
||||
puts "Content length: #{result.content.length} characters"
|
||||
puts "Quality score: #{result.quality_score}"
|
||||
puts "Detected languages: #{result.detected_languages&.join(', ')}"
|
||||
puts "Total chunks: #{result.chunks&.length || 0}"
|
||||
puts "Keywords: #{result.extracted_keywords&.map(&:text)&.join(', ')}"
|
||||
|
||||
if result.chunks && result.chunks.length > 0
|
||||
first_chunk = result.chunks[0]
|
||||
puts "First chunk size: #{first_chunk.content.length} chars"
|
||||
puts "Embedding dims: #{first_chunk.embedding&.length || 0}"
|
||||
end
|
||||
```
|
||||
21
docs/snippets/ruby/api/error_handling.md
Normal file
21
docs/snippets/ruby/api/error_handling.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
begin
  result = Kreuzberg.extract_file_sync('missing.pdf')
  puts result.content
rescue RuntimeError => e
  # All extraction errors are raised as RuntimeError.
  # Inspect the message to tell the failure categories apart.
  case e.message
  when /validation/i
    puts "Validation error: #{e.message}"
  # "io" must be a standalone token ("IO", "I/O", "IoError"). A bare /io/
  # also matches words such as "extraction" or "configuration" and would
  # report almost every failure as an IO error.
  when /\bi\/?o(?:error)?\b|not found/i
    puts "IO error: #{e.message}"
    raise
  else
    puts "Extraction failed: #{e.message}"
    raise
  end
end
|
||||
```
|
||||
24
docs/snippets/ruby/api/error_handling_extract.md
Normal file
24
docs/snippets/ruby/api/error_handling_extract.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
begin
  # Binary read: a PDF is not text, so avoid newline translation and
  # text-encoding tagging of the bytes.
  # A missing file raises Errno::ENOENT (a SystemCallError), which the
  # RuntimeError rescue below does not catch.
  pdf_bytes = File.binread('document.pdf')
  config = Kreuzberg::ExtractionConfig.new

  result = Kreuzberg.extract_bytes_sync(pdf_bytes, 'application/pdf', config: config)
  puts "Extracted #{result.content.length} characters"
rescue RuntimeError => e
  # All extraction errors are raised as RuntimeError.
  # Inspect the message for details.
  case e.message
  when /parse|parsing/i
    puts "Failed to parse document: #{e.message}"
  when /ocr/i
    puts "OCR processing failed: #{e.message}"
  when /validation|invalid/i
    puts "Invalid configuration: #{e.message}"
  else
    puts "Extraction error: #{e.message}"
  end
end
|
||||
```
|
||||
18
docs/snippets/ruby/api/extract_bytes_async.md
Normal file
18
docs/snippets/ruby/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Read in binary mode so the PDF bytes are passed through unmodified.
pdf_bytes = File.binread('document.pdf')
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
enable_quality_processing: true
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_bytes_async(
|
||||
pdf_bytes,
|
||||
'application/pdf',
|
||||
config: config
|
||||
)
|
||||
|
||||
puts "Async bytes extraction done"
|
||||
puts "Content preview: #{result.content[0..100]}"
|
||||
puts "Quality score: #{result.quality_score}"
|
||||
```
|
||||
17
docs/snippets/ruby/api/extract_bytes_sync.md
Normal file
17
docs/snippets/ruby/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Read in binary mode so the PDF bytes are passed through unmodified.
pdf_bytes = File.binread('document.pdf')
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
use_cache: true
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_bytes_sync(
|
||||
pdf_bytes,
|
||||
'application/pdf',
|
||||
config: config
|
||||
)
|
||||
|
||||
puts "Extracted #{result.content.length} characters"
|
||||
puts "Detected MIME: #{result.mime_type}"
|
||||
```
|
||||
14
docs/snippets/ruby/api/extract_file_async.md
Normal file
14
docs/snippets/ruby/api/extract_file_async.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
use_cache: false,
|
||||
enable_quality_processing: true
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_async('document.pdf', config: config)
|
||||
|
||||
puts "Async extraction complete"
|
||||
puts "Extracted #{result.content.length} characters"
|
||||
puts "Quality: #{result.quality_score}"
|
||||
```
|
||||
14
docs/snippets/ruby/api/extract_file_sync.md
Normal file
14
docs/snippets/ruby/api/extract_file_sync.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
use_cache: true,
|
||||
enable_quality_processing: true
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
|
||||
puts "Extracted #{result.content.length} characters"
|
||||
puts "MIME type: #{result.mime_type}"
|
||||
puts "Quality score: #{result.quality_score}"
|
||||
```
|
||||
46
docs/snippets/ruby/benchmarking/simple_benchmark.rb
Normal file
46
docs/snippets/ruby/benchmarking/simple_benchmark.rb
Normal file
@@ -0,0 +1,46 @@
|
||||
```ruby title="simple_benchmark.rb"
|
||||
require 'kreuzberg'
|
||||
require 'benchmark'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(use_cache: false)
|
||||
kreuzberg = Kreuzberg::Client.new(config)
|
||||
file_path = 'document.pdf'
|
||||
num_runs = 10
|
||||
|
||||
puts "Sync extraction (#{num_runs} runs):"
|
||||
sync_time = Benchmark.realtime do
|
||||
num_runs.times do
|
||||
kreuzberg.extract_file(file_path)
|
||||
end
|
||||
end
|
||||
avg_sync = sync_time / num_runs
|
||||
puts " - Total time: #{sync_time.round(3)}s"
|
||||
puts " - Average: #{avg_sync.round(3)}s per extraction"
|
||||
|
||||
puts "\nAsync extraction (#{num_runs} parallel runs):"
|
||||
async_time = Benchmark.realtime do
|
||||
threads = num_runs.times.map do
|
||||
Thread.new { kreuzberg.extract_file(file_path) }
|
||||
end
|
||||
threads.map(&:join)
|
||||
end
|
||||
puts " - Total time: #{async_time.round(3)}s"
|
||||
puts " - Average: #{(async_time / num_runs).round(3)}s per extraction"
|
||||
puts " - Speedup: #{(sync_time / async_time).round(1)}x"
|
||||
|
||||
cache_config = Kreuzberg::ExtractionConfig.new(use_cache: true)
|
||||
kreuzberg_cached = Kreuzberg::Client.new(cache_config)
|
||||
|
||||
puts "\nFirst extraction (populates cache)..."
|
||||
first_time = Benchmark.realtime do
|
||||
kreuzberg_cached.extract_file(file_path)
|
||||
end
|
||||
puts " - Time: #{first_time.round(3)}s"
|
||||
|
||||
puts "Second extraction (from cache)..."
|
||||
cached_time = Benchmark.realtime do
|
||||
kreuzberg_cached.extract_file(file_path)
|
||||
end
|
||||
puts " - Time: #{cached_time.round(3)}s"
|
||||
puts " - Cache speedup: #{(first_time / cached_time).round(1)}x"
|
||||
```
|
||||
37
docs/snippets/ruby/cache/disk_cache.rb
vendored
Normal file
37
docs/snippets/ruby/cache/disk_cache.rb
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
```ruby title="disk_cache.rb"
|
||||
require 'kreuzberg'
|
||||
require 'fileutils'
|
||||
|
||||
cache_dir = File.expand_path('~/.cache/kreuzberg')
|
||||
FileUtils.mkdir_p(cache_dir)
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
use_cache: true,
|
||||
cache_config: Kreuzberg::CacheConfig.new(
|
||||
cache_path: cache_dir,
|
||||
max_cache_size: 500 * 1024 * 1024,
|
||||
cache_ttl_seconds: 7 * 86400,
|
||||
enable_compression: true,
|
||||
)
|
||||
)
|
||||
|
||||
kreuzberg = Kreuzberg::Client.new(config)
|
||||
|
||||
puts "First extraction (will be cached)..."
|
||||
result1 = kreuzberg.extract_file('document.pdf')
|
||||
puts " - Content length: #{result1.content.length}"
|
||||
puts " - Cached: #{result1.metadata['was_cached']}"
|
||||
|
||||
puts "\nSecond extraction (from cache)..."
|
||||
result2 = kreuzberg.extract_file('document.pdf')
|
||||
puts " - Content length: #{result2.content.length}"
|
||||
puts " - Cached: #{result2.metadata['was_cached']}"
|
||||
|
||||
puts "\nResults are identical: #{result1.content == result2.content}"
|
||||
|
||||
cache_stats = kreuzberg.get_cache_stats
|
||||
puts "\nCache Statistics:"
|
||||
puts " - Total entries: #{cache_stats['total_entries']}"
|
||||
puts " - Cache size: #{(cache_stats['cache_size_bytes'] / 1024.0 / 1024.0).round(1)} MB"
|
||||
puts " - Hit rate: #{(cache_stats['hit_rate'] * 100).round(1)}%"
|
||||
```
|
||||
26
docs/snippets/ruby/cli/basic_cli.rb
Normal file
26
docs/snippets/ruby/cli/basic_cli.rb
Normal file
@@ -0,0 +1,26 @@
|
||||
```ruby title="basic_cli.rb"
|
||||
require 'json'
|
||||
require 'open3'
|
||||
|
||||
# Runs `kreuzberg extract <file_path> --format <output_format>` and returns
# its standard output: parsed JSON when +output_format+ is 'json', the raw
# string otherwise. On a non-zero exit status the CLI's stderr is reported
# and the process exits with status 1.
def extract_with_cli(file_path, output_format = 'text')
  command = ['kreuzberg', 'extract', file_path, '--format', output_format]
  output, errors, process_status = Open3.capture3(*command)

  if process_status.success?
    output_format == 'json' ? JSON.parse(output) : output
  else
    warn "Error: #{errors}"
    exit 1
  end
end
|
||||
|
||||
document = 'document.pdf'
|
||||
|
||||
text_output = extract_with_cli(document, 'text')
|
||||
puts "Extracted: #{text_output.length} characters"
|
||||
|
||||
json_output = extract_with_cli(document, 'json')
|
||||
puts "Format: #{json_output['format']}"
|
||||
```
|
||||
27
docs/snippets/ruby/cli/cli_with_config.rb
Normal file
27
docs/snippets/ruby/cli/cli_with_config.rb
Normal file
@@ -0,0 +1,27 @@
|
||||
```ruby title="cli_with_config.rb"
|
||||
require 'json'
|
||||
require 'open3'
|
||||
|
||||
# Runs `kreuzberg extract` on +file_path+ with the configuration file at
# +config_path+, requesting JSON output, and returns the parsed result.
# On a non-zero exit status the CLI's stderr is reported and the process
# exits with status 1.
def extract_with_config(file_path, config_path)
  command = ['kreuzberg', 'extract', file_path, '--config', config_path, '--format', 'json']
  output, errors, process_status = Open3.capture3(*command)
  return JSON.parse(output) if process_status.success?

  warn "Error: #{errors}"
  exit 1
end
|
||||
|
||||
config_file = 'kreuzberg.toml'
|
||||
document = 'document.pdf'
|
||||
|
||||
puts "Extracting #{document} with config #{config_file}"
|
||||
result = extract_with_config(document, config_file)
|
||||
|
||||
puts "Content length: #{result['content'].length}"
|
||||
puts "Format: #{result['format']}"
|
||||
puts "Languages: #{result['languages'].join(', ')}"
|
||||
```
|
||||
22
docs/snippets/ruby/config/advanced_config.md
Normal file
22
docs/snippets/ruby/config/advanced_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
ocr: Kreuzberg::OcrConfig.new(
|
||||
backend: 'tesseract',
|
||||
language: 'eng+deu'
|
||||
),
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 1000,
|
||||
overlap: 100
|
||||
),
|
||||
language_detection: Kreuzberg::LanguageDetectionConfig.new,
|
||||
use_cache: true,
|
||||
enable_quality_processing: true
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Print a preview of each chunk's text via `chunk.content`.
# NOTE(review): assumes chunks are objects exposing `content`, not plain
# strings — confirm against the gem's chunk type.
result.chunks&.each { |chunk| puts chunk.content[0..100] }
puts "Languages: #{result.detected_languages.inspect}"
|
||||
```
|
||||
55
docs/snippets/ruby/config/chunking_config.md
Normal file
55
docs/snippets/ruby/config/chunking_config.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 1000,
|
||||
overlap: 200
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
```ruby title="Ruby - Markdown with Heading Context"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
chunker_type: "markdown",
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
sizing_type: "tokenizer",
|
||||
sizing_model: "Xenova/gpt-4o"
|
||||
)
|
||||
)
|
||||
|
||||
# NOTE(review): uses the synchronous entry point with the `config:` keyword;
# confirm the method name and signature against the gem.
result = Kreuzberg.extract_file_sync("document.md", config: config)

# `chunks` may be nil when no chunks were produced, hence the safe navigation.
result.chunks&.each do |chunk|
  heading_context = chunk.metadata.heading_context
  next unless heading_context

  puts "Headings:"
  heading_context.headings.each do |heading|
    # Indent two extra spaces per heading level below the first.
    puts " #{' ' * (heading.level - 1) * 2}Level #{heading.level}: #{heading.text}"
  end
end
|
||||
```
|
||||
|
||||
```ruby title="Ruby - Prepend Heading Context"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
chunker_type: "markdown",
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
prepend_heading_context: true
|
||||
)
|
||||
)
|
||||
|
||||
# NOTE(review): uses the synchronous entry point with the `config:` keyword;
# confirm the method name and signature against the gem.
result = Kreuzberg.extract_file_sync("document.md", config: config)

# `chunks` may be nil when no chunks were produced, hence the safe navigation.
result.chunks&.each do |chunk|
  # Each chunk's content is prefixed with its heading breadcrumb
  puts chunk.content[0, 100]
end
|
||||
```
|
||||
10
docs/snippets/ruby/config/config_basic.md
Normal file
10
docs/snippets/ruby/config/config_basic.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
use_cache: true,
|
||||
enable_quality_processing: true
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
```
|
||||
6
docs/snippets/ruby/config/config_discover.md
Normal file
6
docs/snippets/ruby/config/config_discover.md
Normal file
@@ -0,0 +1,6 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.discover
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
```
|
||||
11
docs/snippets/ruby/config/config_ocr.md
Normal file
11
docs/snippets/ruby/config/config_ocr.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
ocr: Kreuzberg::OcrConfig.new(
|
||||
backend: 'tesseract',
|
||||
language: 'eng+fra',
|
||||
tesseract_config: Kreuzberg::TesseractConfig.new(psm: 3)
|
||||
)
|
||||
)
|
||||
```
|
||||
19
docs/snippets/ruby/config/config_programmatic.md
Normal file
19
docs/snippets/ruby/config/config_programmatic.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```ruby title="Ruby"
|
||||
require "kreuzberg"
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
use_cache: true,
|
||||
ocr: Kreuzberg::OcrConfig.new(
|
||||
backend: "tesseract",
|
||||
language: "eng+deu",
|
||||
tesseract: Kreuzberg::TesseractConfig.new(psm: 6)
|
||||
),
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 1000,
|
||||
overlap: 200
|
||||
)
|
||||
)
|
||||
|
||||
# NOTE(review): the configuration is passed via the `config:` keyword; a bare
# positional argument would bind to a different parameter — confirm against
# the gem's signature.
result = Kreuzberg.extract_file_sync("document.pdf", config: config)
puts "Content length: #{result.content.length}"
|
||||
```
|
||||
15
docs/snippets/ruby/config/document_structure_config.md
Normal file
15
docs/snippets/ruby/config/document_structure_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```ruby title="Document Structure Config (Ruby)"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(include_document_structure: true)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
|
||||
if result.document
|
||||
result.document['nodes'].each do |node|
|
||||
node_type = node['content']['node_type']
|
||||
text = node['content']['text'] || ''
|
||||
puts "[#{node_type}] #{text[0...80]}"
|
||||
end
|
||||
end
|
||||
```
|
||||
31
docs/snippets/ruby/config/element_based_output.md
Normal file
31
docs/snippets/ruby/config/element_based_output.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```ruby title="Element-Based Output (Ruby)"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Configure element-based output
|
||||
config = Kreuzberg::ExtractionConfig.new(output_format: 'element_based')
|
||||
|
||||
# Extract document
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
|
||||
# Access elements
|
||||
result.elements.each do |element|
|
||||
puts "Type: #{element.element_type}"
|
||||
puts "Text: #{element.text[0...100]}"
|
||||
|
||||
puts "Page: #{element.metadata.page_number}" if element.metadata.page_number
|
||||
|
||||
if element.metadata.coordinates
|
||||
coords = element.metadata.coordinates
|
||||
puts "Coords: (#{coords.left}, #{coords.top}) - (#{coords.right}, #{coords.bottom})"
|
||||
end
|
||||
|
||||
puts "---"
|
||||
end
|
||||
|
||||
# Filter by element type
|
||||
titles = result.elements.select { |e| e.element_type == 'title' }
|
||||
titles.each do |title|
|
||||
level = title.metadata.additional['level'] || 'unknown'
|
||||
puts "[#{level}] #{title.text}"
|
||||
end
|
||||
```
|
||||
18
docs/snippets/ruby/config/embedding_config.md
Normal file
18
docs/snippets/ruby/config/embedding_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 1000,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: Kreuzberg::EmbeddingModelType.new(
|
||||
type: 'preset',
|
||||
name: 'all-mpnet-base-v2'
|
||||
),
|
||||
batch_size: 16,
|
||||
normalize: true,
|
||||
show_download_progress: true
|
||||
)
|
||||
)
|
||||
)
|
||||
```
|
||||
89
docs/snippets/ruby/config/embedding_config.rb
Normal file
89
docs/snippets/ruby/config/embedding_config.rb
Normal file
@@ -0,0 +1,89 @@
|
||||
require 'kreuzberg'
|
||||
|
||||
# Example 1: Preset model (recommended)
|
||||
# Fast, balanced, or quality preset configurations optimized for common use cases.
|
||||
embedding_config = Kreuzberg::EmbeddingConfig.new(
|
||||
model: { type: :preset, name: "balanced" },
|
||||
batch_size: 32,
|
||||
normalize: true,
|
||||
show_download_progress: true,
|
||||
cache_dir: "~/.cache/kreuzberg/embeddings"
|
||||
)
|
||||
|
||||
# Available presets:
|
||||
# - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||||
# - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||||
# - "quality" (1024 dims): Complex documents, maximum accuracy
|
||||
# - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
|
||||
# Example 2: Custom ONNX model (requires embeddings feature)
|
||||
# Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
|
||||
embedding_config = Kreuzberg::EmbeddingConfig.new(
|
||||
model: {
|
||||
type: :custom,
|
||||
model_id: "BAAI/bge-small-en-v1.5",
|
||||
dimensions: 384
|
||||
},
|
||||
batch_size: 32,
|
||||
normalize: true,
|
||||
show_download_progress: true,
|
||||
cache_dir: nil # Uses default: .kreuzberg/embeddings/
|
||||
)
|
||||
|
||||
# Popular ONNX-compatible models:
|
||||
# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||||
# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||||
# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||||
# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
|
||||
# Example 3: Alternative Custom ONNX Model
|
||||
# For advanced users wanting different ONNX embedding models.
|
||||
embedding_config = Kreuzberg::EmbeddingConfig.new(
|
||||
model: {
|
||||
type: :custom,
|
||||
model_id: "sentence-transformers/all-mpnet-base-v2",
|
||||
dimensions: 768
|
||||
},
|
||||
batch_size: 16, # Larger model requires smaller batch size
|
||||
normalize: true,
|
||||
show_download_progress: true,
|
||||
cache_dir: "/var/cache/embeddings"
|
||||
)
|
||||
|
||||
|
||||
# Integration with ChunkingConfig
|
||||
# Add embeddings to your chunking configuration:
|
||||
chunking_config = Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 1024,
|
||||
overlap: 100,
|
||||
preset: "balanced",
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: { type: :preset, name: "balanced" },
|
||||
batch_size: 32,
|
||||
normalize: true
|
||||
)
|
||||
)
|
||||
|
||||
extraction_config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: chunking_config
|
||||
)
|
||||
|
||||
|
||||
# Key parameter explanations:
|
||||
#
|
||||
# batch_size: Number of texts to embed at once (32-128 typical)
|
||||
# - Larger batches are faster but use more memory
|
||||
# - Smaller batches for resource-constrained environments
|
||||
#
|
||||
# normalize: Whether to normalize vectors (L2 norm)
|
||||
# - true (recommended): Enables cosine similarity in vector DBs
|
||||
# - false: Raw embedding values
|
||||
#
|
||||
# cache_dir: Where to store downloaded models
|
||||
# - nil: Uses .kreuzberg/embeddings/ in current directory
|
||||
# - String: Custom directory for model storage
|
||||
#
|
||||
# show_download_progress: Display download progress bar
|
||||
# - Useful for monitoring large model downloads
|
||||
87
docs/snippets/ruby/config/hierarchy_config.rb
Normal file
87
docs/snippets/ruby/config/hierarchy_config.rb
Normal file
@@ -0,0 +1,87 @@
|
||||
require 'kreuzberg'
|
||||
|
||||
# Example 1: Basic hierarchy extraction
|
||||
# Enabled with default k_clusters=6 for standard H1-H6 heading hierarchy.
|
||||
# Extract bounding box information for spatial layout awareness.
|
||||
hierarchy_config_basic = Kreuzberg::HierarchyConfig.new(
|
||||
enabled: true,
|
||||
k_clusters: 6, # Default: creates 6 font size clusters (H1-H6 structure)
|
||||
include_bbox: true, # Include bounding box coordinates
|
||||
ocr_coverage_threshold: nil # No OCR coverage threshold
|
||||
)
|
||||
|
||||
pdf_config_basic = Kreuzberg::PdfConfig.new(
|
||||
hierarchy: hierarchy_config_basic
|
||||
)
|
||||
|
||||
extraction_config_basic = Kreuzberg::ExtractionConfig.new(
|
||||
pdf_options: pdf_config_basic
|
||||
)
|
||||
|
||||
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_basic)
|
||||
|
||||
|
||||
# Example 2: Custom k_clusters for minimal structure
|
||||
# Use 3 clusters for simpler hierarchy with minimal structure.
|
||||
# Useful when you only need major section divisions (Main, Subsection, Detail).
|
||||
hierarchy_config_minimal = Kreuzberg::HierarchyConfig.new(
|
||||
enabled: true,
|
||||
k_clusters: 3, # Minimal clustering: just 3 levels
|
||||
include_bbox: true,
|
||||
ocr_coverage_threshold: nil
|
||||
)
|
||||
|
||||
pdf_config_minimal = Kreuzberg::PdfConfig.new(
|
||||
hierarchy: hierarchy_config_minimal
|
||||
)
|
||||
|
||||
extraction_config_minimal = Kreuzberg::ExtractionConfig.new(
|
||||
pdf_options: pdf_config_minimal
|
||||
)
|
||||
|
||||
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_minimal)
|
||||
|
||||
|
||||
# Example 3: With OCR coverage threshold
|
||||
# Trigger OCR if less than 50% of text has font data.
|
||||
# Useful for documents with mixed digital and scanned content.
|
||||
hierarchy_config_ocr = Kreuzberg::HierarchyConfig.new(
|
||||
enabled: true,
|
||||
k_clusters: 6,
|
||||
include_bbox: true,
|
||||
ocr_coverage_threshold: 0.5 # Trigger OCR if text coverage < 50%
|
||||
)
|
||||
|
||||
pdf_config_ocr = Kreuzberg::PdfConfig.new(
|
||||
hierarchy: hierarchy_config_ocr
|
||||
)
|
||||
|
||||
extraction_config_ocr = Kreuzberg::ExtractionConfig.new(
|
||||
pdf_options: pdf_config_ocr
|
||||
)
|
||||
|
||||
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_ocr)
|
||||
|
||||
|
||||
# Field descriptions:
|
||||
#
|
||||
# enabled: boolean (default: true)
|
||||
# - Enable or disable hierarchy extraction
|
||||
# - When false, hierarchy structure is not analyzed
|
||||
#
|
||||
# k_clusters: integer (default: 6, valid: 1-7)
|
||||
# - Number of font size clusters for hierarchy levels
|
||||
# - 6 provides H1-H6 heading levels with body text
|
||||
# - Higher values create more fine-grained hierarchy
|
||||
# - Lower values create simpler structure
|
||||
#
|
||||
# include_bbox: boolean (default: true)
|
||||
# - Include bounding box coordinates in hierarchy blocks
|
||||
# - Required for spatial layout awareness and document structure
|
||||
# - Set to false only if space optimization is critical
|
||||
#
|
||||
# ocr_coverage_threshold: float | nil (default: nil)
|
||||
# - Range: 0.0 to 1.0
|
||||
# - Triggers OCR when text block coverage falls below this fraction
|
||||
# - Example: 0.5 means "run OCR if less than 50% of page has text data"
|
||||
# - nil means no OCR coverage-based triggering
|
||||
14
docs/snippets/ruby/config/html_output.md
Normal file
14
docs/snippets/ruby/config/html_output.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
output_format: 'html',
|
||||
html_output: Kreuzberg::HtmlOutputConfig.new(
|
||||
theme: 'git_hub',
|
||||
embed_css: true
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
puts result.content # HTML with kb-* classes
|
||||
```
|
||||
65
docs/snippets/ruby/config/keyword_config.rb
Normal file
65
docs/snippets/ruby/config/keyword_config.rb
Normal file
@@ -0,0 +1,65 @@
|
||||
require 'kreuzberg'
|
||||
|
||||
# Example 1: Basic YAKE configuration
|
||||
# Uses YAKE algorithm with default parameters and English stopword filtering
|
||||
def basic_yake
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
keywords: Kreuzberg::KeywordConfig.new(
|
||||
algorithm: :yake,
|
||||
max_keywords: 10,
|
||||
min_score: 0.0,
|
||||
ngram_range: [1, 3],
|
||||
language: 'en',
|
||||
yake_params: nil,
|
||||
rake_params: nil
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file('document.pdf', config: config)
|
||||
puts "Keywords: #{result.keywords}"
|
||||
end
|
||||
|
||||
# Example 2: Advanced YAKE with custom parameters
|
||||
# Fine-tunes YAKE with custom window size for co-occurrence analysis
|
||||
def advanced_yake
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
keywords: Kreuzberg::KeywordConfig.new(
|
||||
algorithm: :yake,
|
||||
max_keywords: 15,
|
||||
min_score: 0.1,
|
||||
ngram_range: [1, 2],
|
||||
language: 'en',
|
||||
yake_params: Kreuzberg::YakeParams.new(
|
||||
window_size: 1
|
||||
),
|
||||
rake_params: nil
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file('document.pdf', config: config)
|
||||
puts "Keywords: #{result.keywords}"
|
||||
end
|
||||
|
||||
# Example 3: RAKE configuration
|
||||
# Uses RAKE algorithm for rapid keyword extraction with phrase constraints
|
||||
def rake_config
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
keywords: Kreuzberg::KeywordConfig.new(
|
||||
algorithm: :rake,
|
||||
max_keywords: 10,
|
||||
min_score: 5.0,
|
||||
ngram_range: [1, 3],
|
||||
language: 'en',
|
||||
yake_params: nil,
|
||||
rake_params: Kreuzberg::RakeParams.new(
|
||||
min_word_length: 1,
|
||||
max_words_per_phrase: 3
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file('document.pdf', config: config)
|
||||
puts "Keywords: #{result.keywords}"
|
||||
end
|
||||
|
||||
basic_yake if __FILE__ == $0
|
||||
13
docs/snippets/ruby/config/keyword_extraction_config.md
Normal file
13
docs/snippets/ruby/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
keywords: Kreuzberg::KeywordConfig.new(
|
||||
algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
|
||||
max_keywords: 10,
|
||||
min_score: 0.3,
|
||||
ngram_range: [1, 3],
|
||||
language: 'en'
|
||||
)
|
||||
)
|
||||
```
|
||||
11
docs/snippets/ruby/config/language_detection_config.md
Normal file
11
docs/snippets/ruby/config/language_detection_config.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
language_detection: Kreuzberg::LanguageDetectionConfig.new(
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: false
|
||||
)
|
||||
)
|
||||
```
|
||||
10
docs/snippets/ruby/config/ocr_dpi_config.md
Normal file
10
docs/snippets/ruby/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
ocr: Kreuzberg::OcrConfig.new(backend: 'tesseract'),
|
||||
pdf: Kreuzberg::PdfConfig.new(dpi: 300)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
|
||||
```
|
||||
16
docs/snippets/ruby/config/pdf_config.md
Normal file
16
docs/snippets/ruby/config/pdf_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
pdf_options: Kreuzberg::PdfConfig.new(
|
||||
extract_images: true,
|
||||
extract_metadata: true,
|
||||
passwords: ['password1', 'password2'],
|
||||
hierarchy: Kreuzberg::HierarchyConfig.new(
|
||||
enabled: true,
|
||||
k_clusters: 6,
|
||||
include_bbox: true
|
||||
)
|
||||
)
|
||||
)
|
||||
```
|
||||
29
docs/snippets/ruby/config/pdf_hierarchy_config.md
Normal file
29
docs/snippets/ruby/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Using keyword arguments with defaults
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
pdf_options: Kreuzberg::PdfConfig.new(
|
||||
extract_images: true,
|
||||
hierarchy: Kreuzberg::HierarchyConfig.new(
|
||||
enabled: true,
|
||||
k_clusters: 6,
|
||||
include_bbox: true,
|
||||
ocr_coverage_threshold: 0.8
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Using hash syntax alternative
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
pdf_options: Kreuzberg::PdfConfig.new(
|
||||
extract_images: true,
|
||||
hierarchy: {
|
||||
enabled: true,
|
||||
k_clusters: 6,
|
||||
include_bbox: true,
|
||||
ocr_coverage_threshold: 0.8
|
||||
}
|
||||
)
|
||||
)
|
||||
```
|
||||
11
docs/snippets/ruby/config/postprocessor_config.md
Normal file
11
docs/snippets/ruby/config/postprocessor_config.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
postprocessor: Kreuzberg::PostProcessorConfig.new(
|
||||
enabled: true,
|
||||
enabled_processors: ['deduplication', 'whitespace_normalization'],
|
||||
disabled_processors: ['mojibake_fix']
|
||||
)
|
||||
)
|
||||
```
|
||||
7
docs/snippets/ruby/config/quality_processing_config.md
Normal file
7
docs/snippets/ruby/config/quality_processing_config.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
enable_quality_processing: true
|
||||
)
|
||||
```
|
||||
16
docs/snippets/ruby/config/tesseract_config.md
Normal file
16
docs/snippets/ruby/config/tesseract_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
ocr: Kreuzberg::OcrConfig.new(
|
||||
language: 'eng+fra+deu',
|
||||
tesseract_config: Kreuzberg::TesseractConfig.new(
|
||||
psm: 6,
|
||||
oem: 1,
|
||||
min_confidence: 0.8,
|
||||
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?',
|
||||
enable_table_detection: true
|
||||
)
|
||||
)
|
||||
)
|
||||
```
|
||||
12
docs/snippets/ruby/config/token_reduction_config.md
Normal file
12
docs/snippets/ruby/config/token_reduction_config.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
token_reduction: Kreuzberg::TokenReductionConfig.new(
|
||||
mode: 'moderate',
|
||||
preserve_markdown: true,
|
||||
preserve_code: true,
|
||||
language_hint: 'eng'
|
||||
)
|
||||
)
|
||||
```
|
||||
63
docs/snippets/ruby/docker/usage.rb
Normal file
63
docs/snippets/ruby/docker/usage.rb
Normal file
@@ -0,0 +1,63 @@
|
||||
```ruby title="usage.rb"
|
||||
require 'json'
require 'net/http'
require 'open3'
require 'securerandom'
|
||||
|
||||
# Minimal client that runs the Kreuzberg API server in a Docker container and
# uploads files to its HTTP extraction endpoint.
class DockerKreuzbergClient
  # @param container_name [String] name given to the Docker container
  # @param api_port [Integer] host port mapped to the container's port 8000
  def initialize(container_name = 'kreuzberg-api', api_port = 8000)
    @container_name = container_name
    @api_port = api_port
    @api_url = "http://localhost:#{api_port}/api/extract"
  end

  # Starts the container in detached mode.
  #
  # @param image [String] Docker image to run
  # @raise [RuntimeError] if `docker run` cannot be executed or exits non-zero
  def start_container(image = 'kreuzberg:latest')
    puts 'Starting Kreuzberg Docker container...'
    # Argument-list form bypasses the shell, so the container name and image
    # are passed verbatim instead of being word-split or interpreted.
    _stdout, stderr, status = Open3.capture3(
      'docker', 'run', '-d', '--name', @container_name, '-p', "#{@api_port}:8000", image
    )

    raise "Failed to start container: #{stderr}" unless status.success?

    puts "Container started on http://localhost:#{@api_port}"
  rescue Errno::ENOENT => e
    # Without a shell, a missing `docker` binary surfaces as ENOENT.
    raise "Failed to start container: #{e.message}"
  end

  # Uploads a file as multipart/form-data and returns the extracted text.
  #
  # @param file_path [String] path of the file to upload
  # @return [Object] value of the `content` key in the JSON response
  # @raise [RuntimeError] if the server answers with a non-2xx status
  # @raise [JSON::ParserError] if the response body is not valid JSON
  def extract_file(file_path)
    boundary = "----WebKitFormBoundary#{SecureRandom.hex(16)}"
    uri = URI(@api_url)

    request = Net::HTTP::Post.new(uri.path)
    request['Content-Type'] = "multipart/form-data; boundary=#{boundary}"
    request.body = multipart_body(file_path, boundary)

    response = Net::HTTP.new(uri.host, uri.port).request(request)
    unless response.is_a?(Net::HTTPSuccess)
      raise "Extraction request failed: #{response.code} #{response.message}"
    end

    JSON.parse(response.body)['content']
  end

  # Stops and removes the container; failures of either command are ignored.
  def stop_container
    puts 'Stopping Kreuzberg Docker container...'
    system('docker', 'stop', @container_name)
    system('docker', 'rm', @container_name)
    puts 'Container stopped and removed'
  end

  private

  # Builds the single-part multipart body as a binary string.
  def multipart_body(file_path, boundary)
    header = "--#{boundary}\r\n" \
             "Content-Disposition: form-data; name=\"file\"; filename=\"#{File.basename(file_path)}\"\r\n" \
             "Content-Type: application/octet-stream\r\n\r\n"
    # Switch to binary before appending the file bytes: a UTF-8 header with a
    # non-ASCII filename cannot be concatenated with arbitrary binary data.
    header.b << File.binread(file_path) << "\r\n--#{boundary}--\r\n"
  end
end
|
||||
|
||||
docker_client = DockerKreuzbergClient.new
|
||||
|
||||
begin
|
||||
docker_client.start_container
|
||||
sleep(2)
|
||||
|
||||
content = docker_client.extract_file('document.pdf')
|
||||
puts "Extracted content:\n#{content}"
|
||||
ensure
|
||||
docker_client.stop_container
|
||||
end
|
||||
```
|
||||
15
docs/snippets/ruby/getting-started/basic_usage.md
Normal file
15
docs/snippets/ruby/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf')
|
||||
|
||||
puts "Content:"
|
||||
puts result.content
|
||||
|
||||
puts "\nMetadata:"
|
||||
puts "Title: #{result.metadata&.dig('title')}"
|
||||
puts "Author: #{result.metadata&.dig('author')}"
|
||||
|
||||
puts "\nTables found: #{result.tables.length}"
|
||||
puts "Images found: #{result.images.length}"
|
||||
```
|
||||
14
docs/snippets/ruby/getting-started/extract_file.md
Normal file
14
docs/snippets/ruby/getting-started/extract_file.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
use_cache: true,
|
||||
enable_quality_processing: true
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
|
||||
|
||||
puts "Extracted #{result.content.length} characters"
|
||||
puts "Quality score: #{result.quality_score}"
|
||||
puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
||||
```
|
||||
15
docs/snippets/ruby/getting-started/extract_with_ocr.md
Normal file
15
docs/snippets/ruby/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
ocr_config = Kreuzberg::OcrConfig.new(
|
||||
backend: 'tesseract',
|
||||
language: 'eng'
|
||||
)
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(ocr: ocr_config)
|
||||
result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
|
||||
|
||||
puts "Extracted text from scanned document:"
|
||||
puts result.content
|
||||
puts "Used OCR backend: tesseract"
|
||||
```
|
||||
7
docs/snippets/ruby/getting-started/hello_world.md
Normal file
7
docs/snippets/ruby/getting-started/hello_world.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf')
|
||||
puts "Extracted content:"
|
||||
puts result.content[0...200]
|
||||
```
|
||||
9
docs/snippets/ruby/getting-started/install_verify.md
Normal file
9
docs/snippets/ruby/getting-started/install_verify.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
puts "Kreuzberg version: #{Kreuzberg::VERSION}"
|
||||
puts "FFI bindings loaded successfully"
|
||||
|
||||
result = Kreuzberg.extract_file_sync('sample.pdf')
|
||||
puts "Installation verified! Extracted #{result.content.length} characters"
|
||||
```
|
||||
15
docs/snippets/ruby/getting-started/read_content.md
Normal file
15
docs/snippets/ruby/getting-started/read_content.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf')
|
||||
|
||||
content = result.content
|
||||
tables = result.tables
|
||||
images = result.images
|
||||
metadata = result.metadata
|
||||
|
||||
puts "Content: #{content.length} characters"
|
||||
puts "Tables: #{tables.length}"
|
||||
puts "Images: #{images.length}"
|
||||
puts "Metadata keys: #{metadata&.keys&.join(', ')}"
|
||||
```
|
||||
27
docs/snippets/ruby/llm/structured_extraction.md
Normal file
27
docs/snippets/ruby/llm/structured_extraction.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
require 'json'
|
||||
|
||||
schema = {
|
||||
type: 'object',
|
||||
properties: {
|
||||
title: { type: 'string' },
|
||||
authors: { type: 'array', items: { type: 'string' } },
|
||||
date: { type: 'string' }
|
||||
},
|
||||
required: %w[title authors date],
|
||||
additionalProperties: false
|
||||
}
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
structured_extraction: Kreuzberg::StructuredExtractionConfig.new(
|
||||
schema: JSON.generate(schema),
|
||||
schema_name: 'PaperMetadata',
|
||||
strict: true,
|
||||
llm: Kreuzberg::LlmConfig.new(model: 'openai/gpt-4o-mini')
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('paper.pdf', config: config)
|
||||
puts result.structured_output
|
||||
```
|
||||
21
docs/snippets/ruby/mcp/mcp_custom_client.md
Normal file
21
docs/snippets/ruby/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```ruby title="Ruby"
|
||||
require 'json'
require 'open3'

# Spawns `kreuzberg mcp` and sends one `tools/call` request over its stdio.
Open3.popen3('kreuzberg', 'mcp') do |stdin, stdout, _stderr, _wait_thr|
  # JSON-RPC 2.0 requires the `jsonrpc` version marker, and a message without
  # an `id` is a notification that the server never answers.
  # NOTE(review): MCP servers normally expect an `initialize` handshake before
  # `tools/call` — confirm whether this server requires it.
  request = {
    jsonrpc: '2.0',
    id: 1,
    method: 'tools/call',
    params: {
      name: 'extract_file',
      arguments: { path: 'document.pdf', async: true }
    }
  }

  stdin.puts JSON.generate(request)
  stdin.close_write

  # `gets` returns nil when the process closes stdout without writing a line.
  response = stdout.gets
  raise 'No response received from the MCP server' if response.nil?

  puts JSON.pretty_generate(JSON.parse(response))
end
|
||||
```
|
||||
12
docs/snippets/ruby/mcp/mcp_server_start.md
Normal file
12
docs/snippets/ruby/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```ruby title="Ruby"
|
||||
require 'open3'
|
||||
|
||||
begin
|
||||
Open3.popen3('kreuzberg', 'mcp') do |stdin, stdout, stderr, wait_thr|
|
||||
puts stdout.read
|
||||
wait_thr.join
|
||||
end
|
||||
rescue => e
|
||||
puts "Failed to start MCP server: #{e.message}"
|
||||
end
|
||||
```
|
||||
11
docs/snippets/ruby/metadata/language_detection.md
Normal file
11
docs/snippets/ruby/metadata/language_detection.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
language_detection: Kreuzberg::LanguageDetectionConfig.new(
|
||||
enabled: true,
|
||||
min_confidence: 0.9,
|
||||
detect_multiple: true
|
||||
)
|
||||
)
|
||||
```
|
||||
@@ -0,0 +1,16 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
language_detection: Kreuzberg::LanguageDetectionConfig.new(
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: true
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('multilingual_document.pdf', config: config)
|
||||
|
||||
puts "Detected languages: #{result.detected_languages}"
|
||||
# Output: ['eng', 'fra', 'deu']
|
||||
```
|
||||
74
docs/snippets/ruby/metadata/metadata.md
Normal file
74
docs/snippets/ruby/metadata/metadata.md
Normal file
@@ -0,0 +1,74 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf')
|
||||
|
||||
# Metadata is flat — format-specific fields are at the top level
|
||||
metadata = result.metadata
|
||||
if metadata['page_count']
|
||||
puts "Pages: #{metadata['page_count']}"
|
||||
end
|
||||
if metadata['title']
|
||||
puts "Title: #{metadata['title']}"
|
||||
end
|
||||
if metadata['authors']
|
||||
puts "Authors: #{metadata['authors'].join(', ')}"
|
||||
end
|
||||
|
||||
# Access HTML metadata
|
||||
html_result = Kreuzberg.extract_file_sync('page.html')
|
||||
metadata = html_result.metadata
|
||||
if metadata['title']
|
||||
puts "Title: #{metadata['title']}"
|
||||
end
|
||||
if metadata['description']
|
||||
puts "Description: #{metadata['description']}"
|
||||
end
|
||||
|
||||
# Access keywords as array
|
||||
if metadata['keywords']
|
||||
puts "Keywords: #{metadata['keywords'].join(', ')}"
|
||||
end
|
||||
|
||||
# Access canonical URL (renamed from canonical)
|
||||
puts "Canonical URL: #{metadata['canonical_url']}" if metadata['canonical_url']
|
||||
|
||||
# Access Open Graph fields from map
|
||||
open_graph = metadata['open_graph'] || {}
|
||||
puts "Open Graph Image: #{open_graph['image']}" if open_graph['image']
|
||||
puts "Open Graph Title: #{open_graph['title']}" if open_graph['title']
|
||||
puts "Open Graph Type: #{open_graph['type']}" if open_graph['type']
|
||||
|
||||
# Access Twitter Card fields from map
|
||||
twitter_card = metadata['twitter_card'] || {}
|
||||
puts "Twitter Card Type: #{twitter_card['card']}" if twitter_card['card']
|
||||
puts "Twitter Creator: #{twitter_card['creator']}" if twitter_card['creator']
|
||||
|
||||
# Access new fields
|
||||
puts "Language: #{metadata['language']}" if metadata['language']
|
||||
puts "Text Direction: #{metadata['text_direction']}" if metadata['text_direction']
|
||||
|
||||
# Access headers
|
||||
if metadata['headers']
|
||||
puts "Headers: #{metadata['headers'].map { |h| h['text'] }.join(', ')}"
|
||||
end
|
||||
|
||||
# Access links
|
||||
if metadata['links']
|
||||
metadata['links'].each do |link|
|
||||
puts "Link: #{link['href']} (#{link['text']})"
|
||||
end
|
||||
end
|
||||
|
||||
# Access images
|
||||
if metadata['images']
|
||||
metadata['images'].each do |image|
|
||||
puts "Image: #{image['src']}"
|
||||
end
|
||||
end
|
||||
|
||||
# Access structured data
|
||||
if metadata['structured_data']
|
||||
puts "Structured data items: #{metadata['structured_data'].length}"
|
||||
end
|
||||
```
|
||||
18
docs/snippets/ruby/metadata/page_boundaries.md
Normal file
18
docs/snippets/ruby/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'

result = Kreuzberg.extract_file_sync('document.pdf')

# Boundaries are byte offsets into the content, not character indices, so
# slice with String#byteslice instead of String#[].
if result.metadata.pages&.boundaries
  result.metadata.pages.boundaries.take(3).each do |boundary|
    page_text = result.content.byteslice(boundary.byte_start...boundary.byte_end)

    puts "Page #{boundary.page_number}:"
    puts " Byte range: #{boundary.byte_start}-#{boundary.byte_end}"
    puts " Preview: #{page_text[0..100]}..."
  end
end
|
||||
```
|
||||
16
docs/snippets/ruby/metadata/page_tracking_basic.md
Normal file
16
docs/snippets/ruby/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```ruby title="Ruby"
require 'kreuzberg'

config = Kreuzberg::ExtractionConfig.new(
  pages: Kreuzberg::PageConfig.new(
    extract_pages: true
  )
)

result = Kreuzberg.extract_file_sync("document.pdf", config: config)

result.pages&.each do |page|
  puts "Page #{page.page_number}:"
  puts " Content: #{page.content.length} chars"
  puts " Tables: #{page.tables.length}"
  puts " Images: #{page.images.length}"
end
```
|
||||
16
docs/snippets/ruby/metadata/tables.md
Normal file
16
docs/snippets/ruby/metadata/tables.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf')
|
||||
|
||||
# Iterate over tables
|
||||
result.tables.each do |table|
|
||||
puts "Table with #{table['cells'].length} rows"
|
||||
puts table['markdown'] # Markdown representation
|
||||
|
||||
# Access cells
|
||||
table['cells'].each do |row|
|
||||
puts row
|
||||
end
|
||||
end
|
||||
```
|
||||
26
docs/snippets/ruby/metadata/vector_database_integration.md
Normal file
26
docs/snippets/ruby/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 512,
|
||||
overlap: 50,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: Kreuzberg::EmbeddingModelType.new(
|
||||
type: 'preset',
|
||||
name: 'balanced'
|
||||
),
|
||||
normalize: true
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
|
||||
result.chunks.each_with_index do |chunk, i|
|
||||
if chunk.embedding
|
||||
puts "Chunk #{i}: #{chunk.embedding.length} dimensions"
|
||||
# Store in vector database
|
||||
end
|
||||
end
|
||||
```
|
||||
32
docs/snippets/ruby/ocr/cloud_ocr_backend.md
Normal file
32
docs/snippets/ruby/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```ruby title="Ruby"
|
||||
require 'json'
require 'kreuzberg'
require 'net/http'

# Example custom OCR backend that forwards image bytes to a remote HTTP service.
class CloudOcrBackend
  # Name under which the backend is selected via OcrConfig's `backend:` option.
  def name
    'cloud-ocr'
  end

  # Language codes this backend advertises.
  def supported_languages
    %w[eng fra deu]
  end

  # POSTs the image bytes to the OCR service and returns the recognised text.
  #
  # @param image_data [String] image bytes sent as the request body
  # @param language [String] requested language (not forwarded in this example)
  # @return [Hash] `{ content: ... }` built from the response's `text` field
  # @raise [KeyError] if the OCR_API_KEY environment variable is not set
  # @raise [StandardError] if the service answers with a non-2xx status
  def process_image(image_data, language)
    uri = URI('https://api.example.com/ocr')
    req = Net::HTTP::Post.new(uri)
    # fetch fails fast instead of sending an empty bearer token.
    req['Authorization'] = "Bearer #{ENV.fetch('OCR_API_KEY')}"
    req.body = image_data
    res = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { |h| h.request(req) }
    raise StandardError, res.message unless res.is_a?(Net::HTTPSuccess)

    # Network and parse errors propagate with their original class and backtrace.
    { content: JSON.parse(res.body)['text'] }
  end
end

Kreuzberg.register_ocr_backend(CloudOcrBackend.new)
config = Kreuzberg::ExtractionConfig.new(
  ocr: Kreuzberg::OcrConfig.new(backend: 'cloud-ocr')
)
Kreuzberg.extract_file_sync('doc.pdf', config: config)
|
||||
```
|
||||
13
docs/snippets/ruby/ocr/image_extraction.md
Normal file
13
docs/snippets/ruby/ocr/image_extraction.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
images: Kreuzberg::ImageExtractionConfig.new(
|
||||
extract_images: true,
|
||||
target_dpi: 200,
|
||||
max_image_dimension: 2048,
|
||||
inject_placeholders: true, # set to false to extract images without markdown references
|
||||
auto_adjust_dpi: true
|
||||
)
|
||||
)
|
||||
```
|
||||
17
docs/snippets/ruby/ocr/image_preprocessing.md
Normal file
17
docs/snippets/ruby/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
ocr: Kreuzberg::OcrConfig.new(
|
||||
tesseract_config: Kreuzberg::TesseractConfig.new(
|
||||
preprocessing: Kreuzberg::ImagePreprocessingConfig.new(
|
||||
target_dpi: 300,
|
||||
denoise: true,
|
||||
deskew: true,
|
||||
contrast_enhance: true,
|
||||
binarization_method: 'otsu'
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
```
|
||||
14
docs/snippets/ruby/ocr/ocr_easyocr.md
Normal file
14
docs/snippets/ruby/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
ocr: Kreuzberg::OcrConfig.new(
|
||||
backend: 'easyocr',
|
||||
language: 'eng'
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
|
||||
puts result.content[0..100]
|
||||
puts "Total length: #{result.content.length}"
|
||||
```
|
||||
22
docs/snippets/ruby/ocr/ocr_elements.md
Normal file
22
docs/snippets/ruby/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
ocr: Kreuzberg::OcrConfig.new(
|
||||
backend: 'paddleocr',
|
||||
language: 'eng'
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
|
||||
|
||||
result.ocr_elements&.each do |element|
|
||||
puts "Text: #{element.text}"
|
||||
puts "Confidence: #{format('%.2f', element.confidence.recognition)}"
|
||||
puts "Geometry: #{element.geometry}"
|
||||
if element.rotation
|
||||
puts "Rotation: #{element.rotation.angle}°"
|
||||
end
|
||||
puts
|
||||
end
|
||||
```
|
||||
12
docs/snippets/ruby/ocr/ocr_extraction.md
Normal file
12
docs/snippets/ruby/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
ocr_config = Kreuzberg::OcrConfig.new(
|
||||
backend: 'tesseract',
|
||||
language: 'eng'
|
||||
)
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(ocr: ocr_config)
|
||||
result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
|
||||
puts result.content
|
||||
```
|
||||
11
docs/snippets/ruby/ocr/ocr_force_all_pages.md
Normal file
11
docs/snippets/ruby/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
ocr: Kreuzberg::OcrConfig.new(backend: 'tesseract'),
|
||||
force_ocr: true
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
puts result.content
|
||||
```
|
||||
13
docs/snippets/ruby/ocr/ocr_multi_language.md
Normal file
13
docs/snippets/ruby/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
ocr: Kreuzberg::OcrConfig.new(
|
||||
backend: 'tesseract',
|
||||
language: 'eng+deu+fra'
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('multilingual.pdf', config: config)
|
||||
puts result.content
|
||||
```
|
||||
15
docs/snippets/ruby/ocr/ocr_paddleocr.md
Normal file
15
docs/snippets/ruby/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
ocr: Kreuzberg::OcrConfig.new(
|
||||
backend: 'paddleocr',
|
||||
language: 'eng'
|
||||
# model_tier: 'server' # for max accuracy
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
|
||||
puts result.content[0..100]
|
||||
puts "Total length: #{result.content.length}"
|
||||
```
|
||||
6
docs/snippets/ruby/plugins/clear_plugins.md
Normal file
6
docs/snippets/ruby/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,6 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
Kreuzberg.clear_post_processors
|
||||
Kreuzberg.clear_validators
|
||||
```
|
||||
44
docs/snippets/ruby/plugins/embedding_backend.md
Normal file
44
docs/snippets/ruby/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Wrap an already-loaded embedder so kreuzberg can call back into it during
|
||||
# chunking and standalone embed requests. The Ruby object must respond to
|
||||
# `dimensions` and `embed`; `version`, `initialize_plugin`, and `shutdown` are
|
||||
# optional lifecycle hooks.
|
||||
# Example embedding backend that wraps an already-loaded host model.
#
# Only `dimensions` and `embed` carry behaviour here; `version`,
# `initialize_plugin`, and `shutdown` are lifecycle hooks with trivial bodies.
class MyEmbedder
  # @return [String] version string reported for this backend
  def version
    '1.0.0'
  end

  # Optional warm-up; runs once at registration.
  def initialize_plugin
  end

  # Optional cleanup.
  def shutdown
  end

  # Captured once at registration; the dispatcher uses this for shape validation.
  #
  # @return [Integer] length of every vector produced by #embed
  def dimensions
    768
  end

  # Produce one vector per input text.
  #
  # @param texts [Array<String>] texts to embed
  # @return [Array<Array<Float>>] one `dimensions`-long vector per text
  def embed(texts)
    # Delegate to the already-loaded host model.
    # Size the placeholder vectors from #dimensions so the advertised width
    # and the produced width cannot drift apart.
    width = dimensions
    texts.map { Array.new(width, 0.0) }
  end
end
|
||||
|
||||
# Register once at startup. The second argument is the plugin name used to
|
||||
# reference the backend from EmbeddingConfig.
|
||||
Kreuzberg.register_embedding_backend(MyEmbedder.new, 'my-embedder')
|
||||
|
||||
config = Kreuzberg::EmbeddingConfig.new(
|
||||
model: { type: 'plugin', name: 'my-embedder' },
|
||||
# Optional: bound the wait on a hung backend (default 60s; nil disables).
|
||||
max_embed_duration_secs: 30
|
||||
)
|
||||
|
||||
vectors = Kreuzberg.embed_texts(['Hello, world!', 'Second text'], config: config)
|
||||
puts "Generated #{vectors.length} vectors"
|
||||
```
|
||||
15
docs/snippets/ruby/plugins/extractor_registration.md
Normal file
15
docs/snippets/ruby/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Register custom extractor with priority 50
|
||||
Kreuzberg.register_document_extractor(
|
||||
name: "custom-json-extractor",
|
||||
extractor: ->(content, mime_type, config) {
|
||||
JSON.parse(content.to_s)
|
||||
},
|
||||
priority: 50
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file("document.json")
|
||||
puts "Extracted content length: #{result.content.length}"
|
||||
```
|
||||
11
docs/snippets/ruby/plugins/list_plugins.md
Normal file
11
docs/snippets/ruby/plugins/list_plugins.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
processors = Kreuzberg.list_post_processors
|
||||
validators = Kreuzberg.list_validators
|
||||
backends = Kreuzberg.list_ocr_backends
|
||||
|
||||
puts "Post-processors: #{processors.inspect}"
|
||||
puts "Validators: #{validators.inspect}"
|
||||
puts "OCR backends: #{backends.inspect}"
|
||||
```
|
||||
32
docs/snippets/ruby/plugins/min_length_validator.md
Normal file
32
docs/snippets/ruby/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```ruby title="Ruby"
|
||||
require "kreuzberg"
|
||||
|
||||
# Custom validator ensuring extracted content meets minimum length requirement
|
||||
# Validator that rejects extraction results whose content is shorter than a
# configured number of characters.
class MinLengthValidator
  # @param min_length [Integer] minimum acceptable content length
  def initialize(min_length: 100)
    @min_length = min_length
  end

  # Check the "content" entry of the result against the minimum length.
  #
  # @param result [Hash] extraction result; only the "content" key is read
  # @return [nil] when the content is long enough
  # @raise [StandardError] when the content is shorter than the minimum
  def call(result)
    # `to_s` makes missing (nil) content count as length 0, so it fails with
    # the validation message below rather than a NoMethodError.
    content_length = result["content"].to_s.length
    return if content_length >= @min_length

    raise StandardError,
          "Content too short: #{content_length} < #{@min_length}"
  end
end
|
||||
|
||||
# Register the validator with priority
|
||||
validator = MinLengthValidator.new(min_length: 100)
|
||||
Kreuzberg.register_validator("min_length_validator", validator, 100)
|
||||
|
||||
# Usage in extraction
|
||||
config = Kreuzberg::ExtractionConfig.new
|
||||
|
||||
# Run an extraction; a registered validator that raises surfaces here as a
# StandardError.
begin
  result = Kreuzberg.extract_file_sync("document.pdf", config: config)
  # Read the result through its `content` accessor, the form used for
  # extraction results elsewhere in these snippets.
  puts "Extraction successful: #{result.content.length} characters"
rescue StandardError => e
  puts "Validation failed: #{e.message}"
end
|
||||
```
|
||||
27
docs/snippets/ruby/plugins/pdf_metadata_extractor.md
Normal file
27
docs/snippets/ruby/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
class PdfMetadataExtractor
|
||||
def initialize
|
||||
@count = 0
|
||||
end
|
||||
|
||||
def call(result)
|
||||
return result unless result['mime_type'] == 'application/pdf'
|
||||
@count += 1
|
||||
result['metadata'] ||= {}
|
||||
result['metadata']['pdf_order'] = @count
|
||||
result
|
||||
end
|
||||
end
|
||||
|
||||
extractor = PdfMetadataExtractor.new
|
||||
Kreuzberg.register_post_processor('pdf_metadata', extractor)
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
postprocessor: { enabled: true }
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('report.pdf', config: config)
|
||||
puts "Metadata: #{result.metadata.inspect}"
|
||||
```
|
||||
22
docs/snippets/ruby/plugins/pdf_only_processor.md
Normal file
22
docs/snippets/ruby/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
class PdfOnlyProcessor
|
||||
def call(result)
|
||||
return result unless result['mime_type'] == 'application/pdf'
|
||||
result['metadata'] ||= {}
|
||||
result['metadata']['pdf_processed'] = true
|
||||
result
|
||||
end
|
||||
end
|
||||
|
||||
processor = PdfOnlyProcessor.new
|
||||
Kreuzberg.register_post_processor('pdf_only', processor)
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
postprocessor: { enabled: true }
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
puts "Metadata: #{result.metadata.inspect}"
|
||||
```
|
||||
23
docs/snippets/ruby/plugins/plugin_extractor.md
Normal file
23
docs/snippets/ruby/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
class CustomPostProcessor
|
||||
def call(result)
|
||||
result['metadata'] ||= {}
|
||||
result['metadata']['processed_by'] = 'CustomPostProcessor'
|
||||
result
|
||||
end
|
||||
end
|
||||
|
||||
class CustomValidator
|
||||
def call(result)
|
||||
raise StandardError, 'Empty' if result['content'].empty?
|
||||
end
|
||||
end
|
||||
|
||||
processor = CustomPostProcessor.new
|
||||
validator = CustomValidator.new
|
||||
|
||||
Kreuzberg.register_post_processor('custom', processor)
|
||||
Kreuzberg.register_validator('custom', validator)
|
||||
```
|
||||
29
docs/snippets/ruby/plugins/plugin_logging.md
Normal file
29
docs/snippets/ruby/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
require 'logger'
|
||||
|
||||
logger = Logger.new($stdout)
|
||||
|
||||
class LoggingPostProcessor
|
||||
def call(result)
|
||||
puts "Processing: #{result['mime_type']}"
|
||||
puts "Content: #{result['content'].length} bytes"
|
||||
result
|
||||
end
|
||||
end
|
||||
|
||||
class LoggingValidator
|
||||
def call(result)
|
||||
puts "Validating: #{result['content'].length} bytes"
|
||||
raise StandardError, 'Too short' if result['content'].length < 50
|
||||
end
|
||||
end
|
||||
|
||||
processor = LoggingPostProcessor.new
|
||||
validator = LoggingValidator.new
|
||||
|
||||
Kreuzberg.register_post_processor('logging-proc', processor)
|
||||
Kreuzberg.register_validator('logging-val', validator)
|
||||
|
||||
logger.info('Plugins registered')
|
||||
```
|
||||
22
docs/snippets/ruby/plugins/plugin_testing.md
Normal file
22
docs/snippets/ruby/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
require 'rspec'
|
||||
|
||||
describe 'Plugin Testing' do
|
||||
it 'registers and calls post-processor' do
|
||||
processor = ->(result) { result['metadata'] ||= {}; result }
|
||||
Kreuzberg.register_post_processor('test', processor)
|
||||
expect(Kreuzberg.list_post_processors).to include('test')
|
||||
Kreuzberg.unregister_post_processor('test')
|
||||
end
|
||||
|
||||
it 'registers and validates' do
|
||||
validator = ->(result) do
|
||||
raise StandardError, 'Too short' if result['content'].length < 10
|
||||
end
|
||||
Kreuzberg.register_validator('test-val', validator)
|
||||
expect(Kreuzberg.list_validators).to include('test-val')
|
||||
Kreuzberg.unregister_validator('test-val')
|
||||
end
|
||||
end
|
||||
```
|
||||
14
docs/snippets/ruby/plugins/plugin_validator.md
Normal file
14
docs/snippets/ruby/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```ruby title="Ruby"
|
||||
require "kreuzberg"
|
||||
|
||||
validator = lambda do |result|
|
||||
raise StandardError, "Content too short" if result.content.length < 50
|
||||
end
|
||||
|
||||
Kreuzberg.register_validator("min_length", validator, priority: 10)
|
||||
|
||||
result = Kreuzberg.extract_file_sync("document.pdf")
|
||||
puts "Validated content length: #{result.content.length}"
|
||||
|
||||
Kreuzberg.unregister_validator("min_length")
|
||||
```
|
||||
35
docs/snippets/ruby/plugins/quality_score_validator.md
Normal file
35
docs/snippets/ruby/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```ruby title="Ruby"
|
||||
require "kreuzberg"
|
||||
|
||||
# Custom validator checking document quality score
|
||||
class QualityScoreValidator
|
||||
def initialize(min_score: 0.5)
|
||||
@min_score = min_score
|
||||
end
|
||||
|
||||
def call(result)
|
||||
quality_score = result.quality_score || 0.0
|
||||
|
||||
if quality_score < @min_score
|
||||
raise StandardError,
|
||||
format("Quality score too low: %.2f < %.2f", quality_score, @min_score)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Register with default minimum score of 0.5
|
||||
validator = QualityScoreValidator.new(min_score: 0.5)
|
||||
Kreuzberg.register_validator("quality_score_check", validator)
|
||||
|
||||
# Usage with quality processing enabled
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
enable_quality_processing: true
|
||||
)
|
||||
|
||||
begin
|
||||
result = Kreuzberg.extract_file_sync("document.pdf", config: config)
|
||||
puts "Document quality verified: #{result.quality_score}"
|
||||
rescue StandardError => e
|
||||
puts "Quality check failed: #{e.message}"
|
||||
end
|
||||
```
|
||||
29
docs/snippets/ruby/plugins/stateful_plugin.md
Normal file
29
docs/snippets/ruby/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
class StatefulPlugin
|
||||
def initialize
|
||||
@lock = Mutex.new
|
||||
@count = 0
|
||||
end
|
||||
|
||||
def call(result)
|
||||
@lock.synchronize { @count += 1 }
|
||||
result
|
||||
end
|
||||
|
||||
def count
|
||||
@lock.synchronize { @count }
|
||||
end
|
||||
end
|
||||
|
||||
plugin = StatefulPlugin.new
|
||||
Kreuzberg.register_post_processor('counter', plugin)
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
postprocessor: { enabled: true }
|
||||
)
|
||||
|
||||
Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
puts "Processed: #{plugin.count}"
|
||||
```
|
||||
7
docs/snippets/ruby/plugins/unregister_plugins.md
Normal file
7
docs/snippets/ruby/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
Kreuzberg.unregister_post_processor('word_count')
|
||||
Kreuzberg.unregister_validator('min_length_validator')
|
||||
Kreuzberg.unregister_ocr_backend('cloud-ocr')
|
||||
```
|
||||
23
docs/snippets/ruby/plugins/word_count_processor.md
Normal file
23
docs/snippets/ruby/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Post-processor that records the number of whitespace-separated words of the
# extracted content under result['metadata']['word_count'].
class WordCountProcessor
  # @param result [Hash] extraction result; reads 'content', writes 'metadata'
  # @return [Hash] the same hash, with the word count added when content is non-empty
  def call(result)
    # `to_s` lets a missing (nil) content pass through untouched instead of
    # raising NoMethodError on `empty?`.
    content = result['content'].to_s
    return result if content.empty?

    # Keep any metadata already present; only add the word count.
    result['metadata'] ||= {}
    result['metadata']['word_count'] = content.split.length
    result
  end
end
|
||||
|
||||
processor = WordCountProcessor.new
|
||||
Kreuzberg.register_post_processor('word_count', processor, 10)
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
postprocessor: { enabled: true }
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
puts "Words: #{result.metadata['word_count']}"
|
||||
```
|
||||
16
docs/snippets/ruby/utils/chunking.md
Normal file
16
docs/snippets/ruby/utils/chunking.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 1500,
|
||||
overlap: 200,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: Kreuzberg::EmbeddingModelType.new(
|
||||
type: 'preset',
|
||||
name: 'text-embedding-all-minilm-l6-v2'
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
```
|
||||
26
docs/snippets/ruby/utils/chunking_rag.md
Normal file
26
docs/snippets/ruby/utils/chunking_rag.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: Kreuzberg::EmbeddingModelType.new(
|
||||
type: 'preset',
|
||||
name: 'balanced'
|
||||
),
|
||||
normalize: true
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)
|
||||
|
||||
result.chunks.each_with_index do |chunk, i|
|
||||
puts "Chunk #{i + 1}/#{result.chunks.length}"
|
||||
puts "Position: #{chunk.metadata[:byte_start]}-#{chunk.metadata[:byte_end]}"
|
||||
puts "Content: #{chunk.content[0..99]}..."
|
||||
puts "Embedding: #{chunk.embedding.length} dimensions" if chunk.embedding
|
||||
end
|
||||
```
|
||||
19
docs/snippets/ruby/utils/embedding_with_chunking.md
Normal file
19
docs/snippets/ruby/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 1024,
|
||||
overlap: 100,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: Kreuzberg::EmbeddingModelType.new(
|
||||
type: 'preset',
|
||||
name: 'balanced'
|
||||
),
|
||||
normalize: true,
|
||||
batch_size: 32,
|
||||
show_download_progress: false
|
||||
)
|
||||
)
|
||||
)
|
||||
```
|
||||
18
docs/snippets/ruby/utils/keyword_extraction_example.md
Normal file
18
docs/snippets/ruby/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
keywords: Kreuzberg::KeywordConfig.new(
|
||||
algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
|
||||
max_keywords: 10,
|
||||
min_score: 0.3
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)
|
||||
|
||||
keywords = result.extracted_keywords
|
||||
keywords.each do |kw|
|
||||
puts "#{kw['text']}: #{kw['score'].round(3)}"
|
||||
end
|
||||
```
|
||||
17
docs/snippets/ruby/utils/quality_processing_example.md
Normal file
17
docs/snippets/ruby/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
enable_quality_processing: true
|
||||
)
|
||||
result = Kreuzberg.extract_file_sync('scanned_document.pdf', config: config)
|
||||
|
||||
# Treat a missing score as 0.0 so the comparison and `round` below cannot
# fail on nil.
quality_score = result.quality_score || 0.0

if quality_score < 0.5
  puts "Warning: Low quality extraction (#{quality_score.round(2)})"
  puts "Consider re-scanning with higher DPI or adjusting OCR settings"
else
  puts "Quality score: #{quality_score.round(2)}"
end
|
||||
```
|
||||
15
docs/snippets/ruby/utils/standalone_embed.md
Normal file
15
docs/snippets/ruby/utils/standalone_embed.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```ruby title="Ruby"
|
||||
require "kreuzberg"
|
||||
|
||||
config = { model: { type: "preset", name: "balanced" }, normalize: true }
|
||||
texts = ["Hello, world!", "Kreuzberg is fast"]
|
||||
|
||||
# Synchronous
|
||||
embeddings = Kreuzberg.embed_sync(texts: texts, config: config)
|
||||
puts embeddings.length # 2
|
||||
puts embeddings[0].length # 768
|
||||
|
||||
# Async variant (uses same thread, returns when done)
|
||||
embeddings = Kreuzberg.embed(texts: texts, config: config)
|
||||
puts embeddings[0].length # 768
|
||||
```
|
||||
10
docs/snippets/ruby/utils/token_reduction.md
Normal file
10
docs/snippets/ruby/utils/token_reduction.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
token_reduction: Kreuzberg::TokenReductionConfig.new(
|
||||
mode: 'moderate',
|
||||
preserve_important_words: true
|
||||
)
|
||||
)
|
||||
```
|
||||
20
docs/snippets/ruby/utils/token_reduction_example.md
Normal file
20
docs/snippets/ruby/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
token_reduction: Kreuzberg::TokenReductionConfig.new(
|
||||
mode: 'moderate',
|
||||
preserve_markdown: true
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('verbose_document.pdf', config: config)
|
||||
|
||||
# Check reduction statistics in metadata
|
||||
original_tokens = result.metadata['original_token_count']
|
||||
reduced_tokens = result.metadata['token_count']
|
||||
reduction_ratio = result.metadata['token_reduction_ratio']
|
||||
|
||||
puts "Reduced from #{original_tokens} to #{reduced_tokens} tokens"
|
||||
puts "Reduction: #{reduction_ratio * 100}%"
|
||||
```
|
||||
Reference in New Issue
Block a user