This commit is contained in:
18
docs/snippets/ruby/advanced/chunk_page_mapping.md
Normal file
18
docs/snippets/ruby/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Chunking settings are built first, then wrapped in the extraction config.
chunking = Kreuzberg::ChunkingConfig.new(max_characters: 500, overlap: 50)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Print a short preview of every chunk together with the page(s) it covers.
result.chunks&.each do |chunk|
  start_page = chunk.metadata.first_page
  end_page = chunk.metadata.last_page
  # Chunks without a first page are skipped.
  next if start_page.nil?

  label =
    if start_page == end_page
      "Page #{start_page}"
    else
      "Pages #{start_page}-#{end_page}"
    end
  preview = chunk.content[0..50]
  puts "Chunk: #{preview}... (#{label})"
end
|
||||
```
|
||||
31
docs/snippets/ruby/advanced/chunking_config.md
Normal file
31
docs/snippets/ruby/advanced/chunking_config.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Built inside-out: model -> embedding -> chunking -> extraction config.
embedding_model = Kreuzberg::EmbeddingModelType.new(
  type: 'preset',
  name: 'all-minilm-l6-v2'
)
embedding = Kreuzberg::EmbeddingConfig.new(
  model: embedding_model,
  normalize: true,
  batch_size: 32
)
chunking = Kreuzberg::ChunkingConfig.new(
  max_characters: 1000,
  overlap: 200,
  embedding: embedding
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
|
||||
```
|
||||
|
||||
```ruby title="Ruby - Prepend Heading Context"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Markdown chunker with prepend_heading_context switched on.
chunking = Kreuzberg::ChunkingConfig.new(
  chunker_type: 'markdown',
  max_characters: 500,
  overlap: 50,
  prepend_heading_context: true
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
|
||||
```
|
||||
45
docs/snippets/ruby/advanced/chunking_rag.md
Normal file
45
docs/snippets/ruby/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# NOTE: the helper methods are defined before the script body on purpose.
# Ruby evaluates a file top to bottom, so calling a method before its `def`
# has run raises NoMethodError.

# Builds a minimal in-memory "vector store": one hash per chunk.
#
# @param chunks [Array] objects responding to #content and #embedding
# @return [Array<Hash>] entries with :id (index), :content, :embedding and a
#   :similarity slot initialised to 0.0
def build_vector_store(chunks)
  chunks.map.with_index do |chunk, idx|
    {
      id: idx,
      content: chunk.content,
      embedding: chunk.embedding,
      similarity: 0.0
    }
  end
end

# Placeholder search: returns the entries ordered by their stored
# :similarity, highest first.
#
# `query` is accepted but not used here. A real implementation would embed
# the query and score it against each entry's :embedding before sorting.
#
# @param store [Array<Hash>] entries as produced by build_vector_store
# @param query [String] search text (currently ignored)
# @return [Array<Hash>] a new array sorted by descending :similarity
def search_vector_store(store, query)
  store.sort_by { |entry| entry[:similarity] }.reverse
end

config = Kreuzberg::ExtractionConfig.new(
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 500,
    overlap: 50,
    embedding: Kreuzberg::EmbeddingConfig.new(
      model: Kreuzberg::EmbeddingModelType.new(
        type: 'preset',
        name: 'all-mpnet-base-v2'
      ),
      normalize: true,
      batch_size: 16
    )
  )
)

result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)

# The other snippets treat result.chunks as possibly nil, so fall back to an
# empty list instead of crashing inside build_vector_store.
vector_store = build_vector_store(result.chunks || [])
query = 'machine learning optimization'
relevant_chunks = search_vector_store(vector_store, query)

puts "Found #{relevant_chunks.length} relevant chunks"
relevant_chunks.take(3).each do |chunk|
  puts "Content: #{chunk[:content][0..80]}..."
  puts "Similarity: #{chunk[:similarity]&.round(3)}\n"
end
|
||||
```
|
||||
31
docs/snippets/ruby/advanced/embedding_with_chunking.md
Normal file
31
docs/snippets/ruby/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Built inside-out: model -> embedding -> chunking -> extraction config.
embedding_model = Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'balanced')
embedding = Kreuzberg::EmbeddingConfig.new(
  model: embedding_model,
  normalize: true,
  batch_size: 32,
  show_download_progress: false
)
chunking = Kreuzberg::ChunkingConfig.new(
  max_characters: 512,
  overlap: 50,
  embedding: embedding
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# A nil chunk list is treated as "no chunks".
(result.chunks || []).each.with_index do |chunk, position|
  puts "Chunk doc_chunk_#{position}: #{chunk.content[0...50]}"

  # Only chunks that carry an embedding report its length.
  vector = chunk.embedding
  puts " Embedding dimensions: #{vector.length}" if vector
end
|
||||
```
|
||||
13
docs/snippets/ruby/advanced/keyword_extraction_config.md
Normal file
13
docs/snippets/ruby/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Keyword settings are built first, then attached to the extraction config.
keyword_settings = Kreuzberg::KeywordConfig.new(
  algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
  max_keywords: 10,
  min_score: 0.3,
  ngram_range: [1, 3],
  language: 'en'
)
config = Kreuzberg::ExtractionConfig.new(keywords: keyword_settings)
|
||||
```
|
||||
20
docs/snippets/ruby/advanced/keyword_extraction_example.md
Normal file
20
docs/snippets/ruby/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
keyword_settings = Kreuzberg::KeywordConfig.new(
  algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
  max_keywords: 10,
  min_score: 0.3
)
config = Kreuzberg::ExtractionConfig.new(keywords: keyword_settings)

result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)

# Missing metadata or a missing 'keywords' entry both yield an empty list.
extracted = result.metadata&.dig('keywords') || []
extracted.each do |entry|
  puts "#{entry['text']}: #{entry['score'].round(3)}"
end
|
||||
```
|
||||
21
docs/snippets/ruby/advanced/language_detection_config.md
Normal file
21
docs/snippets/ruby/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.8,
  detect_multiple: false
)
config = Kreuzberg::ExtractionConfig.new(language_detection: detection)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Report the first detected language, or say so when none came back.
detected = result.detected_languages
message =
  if detected&.any?
    "Detected Language: #{detected.first}"
  else
    'No language detected'
  end
puts message

puts "Content length: #{result.content.length} characters"
|
||||
```
|
||||
@@ -0,0 +1,24 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.8,
  detect_multiple: true
)
config = Kreuzberg::ExtractionConfig.new(language_detection: detection)

result = Kreuzberg.extract_file_sync('multilingual_document.pdf', config: config)

# A nil language list is treated as empty.
found = result.detected_languages || []

summary =
  if found.any?
    "Detected #{found.length} language(s): #{found.join(', ')}"
  else
    'No languages detected'
  end
puts summary

puts "Total content: #{result.content.length} characters"
puts "MIME type: #{result.mime_type}"
|
||||
```
|
||||
7
docs/snippets/ruby/advanced/quality_processing_config.md
Normal file
7
docs/snippets/ruby/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Extraction config with the enable_quality_processing flag set.
config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
|
||||
```
|
||||
18
docs/snippets/ruby/advanced/quality_processing_example.md
Normal file
18
docs/snippets/ruby/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)

result = Kreuzberg.extract_file_sync('scanned_document.pdf', config: config)

# A missing score is treated as 0.0, which lands in the warning branch below.
score = result.quality_score || 0.0
rounded = score.round(2)

if score < 0.5
  puts "Warning: Low quality extraction (#{rounded})",
       'Consider re-scanning with higher DPI or adjusting OCR settings'
else
  puts "Quality score: #{rounded}"
end
|
||||
```
|
||||
12
docs/snippets/ruby/advanced/token_reduction_config.md
Normal file
12
docs/snippets/ruby/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Token-reduction settings are built first, then attached to the extraction config.
reduction = Kreuzberg::TokenReductionConfig.new(
  mode: 'moderate',
  preserve_markdown: true,
  preserve_code: true,
  language_hint: 'eng'
)
config = Kreuzberg::ExtractionConfig.new(token_reduction: reduction)
|
||||
```
|
||||
19
docs/snippets/ruby/advanced/token_reduction_example.md
Normal file
19
docs/snippets/ruby/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
reduction = Kreuzberg::TokenReductionConfig.new(
  mode: 'moderate',
  preserve_markdown: true
)
config = Kreuzberg::ExtractionConfig.new(token_reduction: reduction)

result = Kreuzberg.extract_file_sync('verbose_document.pdf', config: config)

# Each statistic falls back to zero when metadata or the key is missing.
stats = result.metadata
tokens_before = stats&.dig('original_token_count') || 0
tokens_after = stats&.dig('token_count') || 0
ratio = stats&.dig('token_reduction_ratio') || 0.0

puts "Reduced from #{tokens_before} to #{tokens_after} tokens"
puts "Reduction: #{(ratio * 100).round(1)}%"
|
||||
```
|
||||
53
docs/snippets/ruby/advanced/vector_database_integration.md
Normal file
53
docs/snippets/ruby/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Extracts a document with Kreuzberg, turns every chunk into a VectorRecord
# and hands the records to a storage step (here just a stub that prints).
class VectorDatabaseIntegration
  # One storable row: identifier, embedding, chunk text and a metadata hash.
  VectorRecord = Struct.new(:id, :embedding, :content, :metadata, keyword_init: true)

  # Runs extraction for one document and stores the resulting records.
  #
  # @param document_path [String] path handed to Kreuzberg.extract_file_sync
  # @param document_id [String] prefix for record ids; also copied into metadata
  # @return [Array<VectorRecord>] one record per chunk, empty when the result
  #   has no chunks
  def extract_and_vectorize(document_path, document_id)
    result = Kreuzberg.extract_file_sync(document_path, config: extraction_config)
    vector_records = build_records(result.chunks || [], document_id)

    store_in_vector_database(vector_records)
    vector_records
  end

  private

  # Builds the chunking + embedding configuration used for every extraction.
  def extraction_config
    Kreuzberg::ExtractionConfig.new(
      chunking: Kreuzberg::ChunkingConfig.new(
        max_characters: 512,
        overlap: 50,
        embedding: Kreuzberg::EmbeddingConfig.new(
          model: Kreuzberg::EmbeddingModelType.new(
            type: 'preset',
            name: 'balanced'
          ),
          normalize: true,
          batch_size: 32
        )
      )
    )
  end

  # Maps chunks to VectorRecords with ids of the form "<document_id>_chunk_<index>".
  def build_records(chunks, document_id)
    chunks.map.with_index do |chunk, idx|
      VectorRecord.new(
        id: "#{document_id}_chunk_#{idx}",
        content: chunk.content,
        embedding: chunk.embedding,
        metadata: {
          document_id: document_id,
          chunk_index: idx,
          content_length: chunk.content.length
        }
      )
    end
  end

  # Stand-in for a real database write: prints one line per record that has a
  # non-empty embedding and silently skips the rest.
  def store_in_vector_database(records)
    records.each do |record|
      next unless record.embedding&.any?

      puts "Storing #{record.id}: #{record.content.length} chars, #{record.embedding.length} dims"
    end
  end
end
|
||||
```
|
||||
Reference in New Issue
Block a user