Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
```ruby title="Ruby"
require 'kreuzberg'
# Embedding model selection, passed as a preset.
# NOTE(review): the other snippets in this set use the preset name 'balanced';
# confirm that 'text-embedding-all-minilm-l6-v2' is a name the library accepts.
embedding_model = Kreuzberg::EmbeddingModelType.new(
  type: 'preset',
  name: 'text-embedding-all-minilm-l6-v2'
)

# Chunking options (max_characters / overlap) plus the embedding settings
# that wrap the model chosen above.
chunking = Kreuzberg::ChunkingConfig.new(
  max_characters: 1500,
  overlap: 200,
  embedding: Kreuzberg::EmbeddingConfig.new(model: embedding_model)
)

# Top-level extraction configuration carrying the chunking options.
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
```

View File

@@ -0,0 +1,26 @@
```ruby title="Ruby"
require 'kreuzberg'
# Build the configuration bottom-up: model -> embedding -> chunking -> extraction.
model = Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'balanced')
embedding = Kreuzberg::EmbeddingConfig.new(model: model, normalize: true)
chunking = Kreuzberg::ChunkingConfig.new(
  max_characters: 500,
  overlap: 50,
  embedding: embedding
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)

result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)

# Print a short report for every chunk: 1-based position in the list, the
# byte_start/byte_end values from its metadata, the first 100 characters of
# its content, and the embedding length when an embedding is present.
chunks = result.chunks
total = chunks.length
chunks.each.with_index(1) do |chunk, number|
  puts "Chunk #{number}/#{total}"
  span = chunk.metadata
  puts "Position: #{span[:byte_start]}-#{span[:byte_end]}"
  puts "Content: #{chunk.content[0, 100]}..."
  vector = chunk.embedding
  puts "Embedding: #{vector.length} dimensions" if vector
end
```

View File

@@ -0,0 +1,19 @@
```ruby title="Ruby"
require 'kreuzberg'
# The 'balanced' preset is the embedding model for this configuration.
model = Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'balanced')

# Embedding options: normalization on, batch_size 32, download progress hidden.
embedding = Kreuzberg::EmbeddingConfig.new(
  model: model,
  normalize: true,
  batch_size: 32,
  show_download_progress: false
)

# Chunking options that carry the embedding settings.
chunking = Kreuzberg::ChunkingConfig.new(
  max_characters: 1024,
  overlap: 100,
  embedding: embedding
)

config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
```

View File

@@ -0,0 +1,18 @@
```ruby title="Ruby"
require 'kreuzberg'
# Keyword extraction options: YAKE algorithm, max_keywords 10, min_score 0.3.
keyword_options = Kreuzberg::KeywordConfig.new(
  algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
  max_keywords: 10,
  min_score: 0.3
)
config = Kreuzberg::ExtractionConfig.new(keywords: keyword_options)

result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)

# Print one "text: score" line per extracted keyword, score rounded to 3 decimals.
# Each keyword is read with string keys ('text', 'score').
keywords = result.extracted_keywords
keywords.each do |keyword|
  label = keyword['text']
  score = keyword['score'].round(3)
  puts "#{label}: #{score}"
end
```

View File

@@ -0,0 +1,17 @@
```ruby title="Ruby"
require 'kreuzberg'
config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
result = Kreuzberg.extract_file_sync('scanned_document.pdf', config: config)

# NOTE(review): presumably quality_score is always populated when
# enable_quality_processing is true; a nil here would raise on the comparison
# below — confirm against the library.
score = result.quality_score

# Scores below 0.5 get a two-line warning; anything else is reported as-is.
# The score is shown rounded to 2 decimals in both cases.
report =
  if score < 0.5
    [
      "Warning: Low quality extraction (#{score.round(2)})",
      "Consider re-scanning with higher DPI or adjusting OCR settings"
    ]
  else
    ["Quality score: #{score.round(2)}"]
  end

# puts prints each array element on its own line.
puts report
```

View File

@@ -0,0 +1,15 @@
```ruby title="Ruby"
require "kreuzberg"
# Embedding options as a plain hash: the "balanced" preset with normalize enabled.
config = {
  model: { type: "preset", name: "balanced" },
  normalize: true
}
texts = ["Hello, world!", "Kreuzberg is fast"]

# Synchronous call.
sync_vectors = Kreuzberg.embed_sync(texts: texts, config: config)
puts sync_vectors.length    # 2 (two input texts)
puts sync_vectors[0].length # 768

# Async variant: same arguments; described upstream as running on the calling
# thread and returning once the work is done.
async_vectors = Kreuzberg.embed(texts: texts, config: config)
puts async_vectors[0].length # 768
```

View File

@@ -0,0 +1,10 @@
```ruby title="Ruby"
require 'kreuzberg'
# Token reduction options: 'moderate' mode with preserve_important_words enabled.
reduction = Kreuzberg::TokenReductionConfig.new(
  mode: 'moderate',
  preserve_important_words: true
)

# Extraction configuration carrying the token reduction options.
config = Kreuzberg::ExtractionConfig.new(token_reduction: reduction)
```

View File

@@ -0,0 +1,20 @@
```ruby title="Ruby"
require 'kreuzberg'
# Token reduction in 'moderate' mode with preserve_markdown enabled.
config = Kreuzberg::ExtractionConfig.new(
  token_reduction: Kreuzberg::TokenReductionConfig.new(
    mode: 'moderate',
    preserve_markdown: true
  )
)
result = Kreuzberg.extract_file_sync('verbose_document.pdf', config: config)

# Reduction statistics are read from the result metadata (string keys).
original_tokens = result.metadata['original_token_count']
reduced_tokens = result.metadata['token_count']
reduction_ratio = result.metadata['token_reduction_ratio']

# Guard against missing keys: without it a nil ratio raises NoMethodError on
# `* 100`, and nil counts would be printed as blanks.
if original_tokens && reduced_tokens && reduction_ratio
  puts "Reduced from #{original_tokens} to #{reduced_tokens} tokens"
  # Round the percentage: raw float multiplication leaks artefacts into the
  # output (0.3 * 100 => 30.000000000000004).
  puts "Reduction: #{(reduction_ratio * 100).round(1)}%"
else
  puts 'Token reduction statistics are not available in the result metadata'
end
```