This commit is contained in:
16
docs/snippets/ruby/utils/chunking.md
Normal file
16
docs/snippets/ruby/utils/chunking.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 1500,
|
||||
overlap: 200,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: Kreuzberg::EmbeddingModelType.new(
|
||||
type: 'preset',
|
||||
name: 'text-embedding-all-minilm-l6-v2'
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
```
|
||||
26
docs/snippets/ruby/utils/chunking_rag.md
Normal file
26
docs/snippets/ruby/utils/chunking_rag.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 500,
|
||||
overlap: 50,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: Kreuzberg::EmbeddingModelType.new(
|
||||
type: 'preset',
|
||||
name: 'balanced'
|
||||
),
|
||||
normalize: true
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)
|
||||
|
||||
result.chunks.each_with_index do |chunk, i|
|
||||
puts "Chunk #{i + 1}/#{result.chunks.length}"
|
||||
puts "Position: #{chunk.metadata[:byte_start]}-#{chunk.metadata[:byte_end]}"
|
||||
puts "Content: #{chunk.content[0..99]}..."
|
||||
puts "Embedding: #{chunk.embedding.length} dimensions" if chunk.embedding
|
||||
end
|
||||
```
|
||||
19
docs/snippets/ruby/utils/embedding_with_chunking.md
Normal file
19
docs/snippets/ruby/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
chunking: Kreuzberg::ChunkingConfig.new(
|
||||
max_characters: 1024,
|
||||
overlap: 100,
|
||||
embedding: Kreuzberg::EmbeddingConfig.new(
|
||||
model: Kreuzberg::EmbeddingModelType.new(
|
||||
type: 'preset',
|
||||
name: 'balanced'
|
||||
),
|
||||
normalize: true,
|
||||
batch_size: 32,
|
||||
show_download_progress: false
|
||||
)
|
||||
)
|
||||
)
|
||||
```
|
||||
18
docs/snippets/ruby/utils/keyword_extraction_example.md
Normal file
18
docs/snippets/ruby/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
keywords: Kreuzberg::KeywordConfig.new(
|
||||
algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
|
||||
max_keywords: 10,
|
||||
min_score: 0.3
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)
|
||||
|
||||
keywords = result.extracted_keywords
|
||||
keywords.each do |kw|
|
||||
puts "#{kw['text']}: #{kw['score'].round(3)}"
|
||||
end
|
||||
```
|
||||
17
docs/snippets/ruby/utils/quality_processing_example.md
Normal file
17
docs/snippets/ruby/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
enable_quality_processing: true
|
||||
)
|
||||
result = Kreuzberg.extract_file_sync('scanned_document.pdf', config: config)
|
||||
|
||||
quality_score = result.quality_score
|
||||
|
||||
if quality_score < 0.5
|
||||
puts "Warning: Low quality extraction (#{quality_score.round(2)})"
|
||||
puts "Consider re-scanning with higher DPI or adjusting OCR settings"
|
||||
else
|
||||
puts "Quality score: #{quality_score.round(2)}"
|
||||
end
|
||||
```
|
||||
15
docs/snippets/ruby/utils/standalone_embed.md
Normal file
15
docs/snippets/ruby/utils/standalone_embed.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```ruby title="Ruby"
|
||||
require "kreuzberg"
|
||||
|
||||
config = { model: { type: "preset", name: "balanced" }, normalize: true }
|
||||
texts = ["Hello, world!", "Kreuzberg is fast"]
|
||||
|
||||
# Synchronous variant
|
||||
embeddings = Kreuzberg.embed_sync(texts: texts, config: config)
|
||||
puts embeddings.length # 2
|
||||
puts embeddings[0].length # 768
|
||||
|
||||
# Async variant: runs on the calling thread and returns once the embeddings are ready
|
||||
embeddings = Kreuzberg.embed(texts: texts, config: config)
|
||||
puts embeddings[0].length # 768
|
||||
```
|
||||
10
docs/snippets/ruby/utils/token_reduction.md
Normal file
10
docs/snippets/ruby/utils/token_reduction.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
token_reduction: Kreuzberg::TokenReductionConfig.new(
|
||||
mode: 'moderate',
|
||||
preserve_important_words: true
|
||||
)
|
||||
)
|
||||
```
|
||||
20
docs/snippets/ruby/utils/token_reduction_example.md
Normal file
20
docs/snippets/ruby/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
config = Kreuzberg::ExtractionConfig.new(
|
||||
token_reduction: Kreuzberg::TokenReductionConfig.new(
|
||||
mode: 'moderate',
|
||||
preserve_markdown: true
|
||||
)
|
||||
)
|
||||
|
||||
result = Kreuzberg.extract_file_sync('verbose_document.pdf', config: config)
|
||||
|
||||
# Read the token reduction statistics from the result metadata
|
||||
original_tokens = result.metadata['original_token_count']
|
||||
reduced_tokens = result.metadata['token_count']
|
||||
reduction_ratio = result.metadata['token_reduction_ratio']
|
||||
|
||||
puts "Reduced from #{original_tokens} to #{reduced_tokens} tokens"
|
||||
puts "Reduction: #{reduction_ratio * 100}%"
|
||||
```
|
||||
Reference in New Issue
Block a user