Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract 'document.pdf' with chunking enabled and print, for every chunk that
# carries page information, a short preview plus the page(s) it covers.
chunking = Kreuzberg::ChunkingConfig.new(max_characters: 500, overlap: 50)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Safe navigation: nothing is iterated when `chunks` is nil.
result.chunks&.each do |chunk|
  start_page = chunk.metadata.first_page
  end_page = chunk.metadata.last_page
  # Chunks without a first page are skipped entirely.
  next if start_page.nil?

  label = if start_page == end_page
            "Page #{start_page}"
          else
            "Pages #{start_page}-#{end_page}"
          end
  # The preview is the first 51 characters of the chunk (inclusive range 0..50).
  puts "Chunk: #{chunk.content[0..50]}... (#{label})"
end
```

View File

@@ -0,0 +1,31 @@
```ruby title="Ruby"
require 'kreuzberg'
# Build the configuration inside-out: embedding model -> embedding settings ->
# chunking settings -> extraction config.
embedding_model = Kreuzberg::EmbeddingModelType.new(
  type: 'preset',
  name: 'all-minilm-l6-v2'
)
embedding = Kreuzberg::EmbeddingConfig.new(
  model: embedding_model,
  normalize: true,
  batch_size: 32
)
chunking = Kreuzberg::ChunkingConfig.new(
  max_characters: 1000,
  overlap: 200,
  embedding: embedding
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
```
```ruby title="Ruby - Prepend Heading Context"
require 'kreuzberg'
# Chunking settings that select the "markdown" chunker and turn on
# `prepend_heading_context`.
chunking = Kreuzberg::ChunkingConfig.new(
  chunker_type: "markdown",
  max_characters: 500,
  overlap: 50,
  prepend_heading_context: true
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
```

View File

@@ -0,0 +1,45 @@
```ruby title="Ruby"
require 'kreuzberg'
# The helpers are defined first: a top-level `def` only takes effect once it
# has been executed, so they must appear before the script code that calls them.

# Turns extracted chunks into in-memory records for a toy vector store.
#
# @param chunks [Array] objects responding to `content` and `embedding`
# @return [Array<Hash>] one hash per chunk with :id (the chunk's index),
#   :content, :embedding and a :similarity score initialised to 0.0
def build_vector_store(chunks)
  chunks.map.with_index do |chunk, idx|
    {
      id: idx,
      content: chunk.content,
      embedding: chunk.embedding,
      similarity: 0.0
    }
  end
end

# Returns the store's entries ordered by their :similarity value, highest first.
#
# NOTE(review): `query` is accepted but never used here, and every entry starts
# with similarity 0.0 -- this looks like a placeholder for a real similarity
# search against the query; confirm before relying on the ordering.
#
# @param store [Array<Hash>] entries as produced by build_vector_store
# @param query [String] search text (currently ignored)
# @return [Array<Hash>] a new array sorted by descending :similarity
def search_vector_store(store, query)
  store.sort_by { |entry| entry[:similarity] }.reverse
end

config = Kreuzberg::ExtractionConfig.new(
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 500,
    overlap: 50,
    embedding: Kreuzberg::EmbeddingConfig.new(
      model: Kreuzberg::EmbeddingModelType.new(
        type: 'preset',
        name: 'all-mpnet-base-v2'
      ),
      normalize: true,
      batch_size: 16
    )
  )
)
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)

# Fall back to an empty list when no chunks were produced, so the helper never
# receives nil.
vector_store = build_vector_store(result.chunks || [])
query = 'machine learning optimization'
relevant_chunks = search_vector_store(vector_store, query)

puts "Found #{relevant_chunks.length} relevant chunks"
# Show a preview (first 81 characters) and the score of the top three entries.
relevant_chunks.take(3).each do |chunk|
  puts "Content: #{chunk[:content][0..80]}..."
  puts "Similarity: #{chunk[:similarity]&.round(3)}\n"
end
```

View File

@@ -0,0 +1,31 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract 'document.pdf' with chunk embeddings enabled, then list every chunk
# with a short preview and, when an embedding is present, its length.
embedding = Kreuzberg::EmbeddingConfig.new(
  model: Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'balanced'),
  normalize: true,
  batch_size: 32,
  show_download_progress: false
)
chunking = Kreuzberg::ChunkingConfig.new(
  max_characters: 512,
  overlap: 50,
  embedding: embedding
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# `chunks` may be nil; iterate over an empty list in that case.
(result.chunks || []).each_with_index do |chunk, position|
  label = "doc_chunk_#{position}"
  # Exclusive range: the preview is at most the first 50 characters.
  puts "Chunk #{label}: #{chunk.content[0...50]}"
  puts " Embedding dimensions: #{chunk.embedding.length}" if chunk.embedding
end
```

View File

@@ -0,0 +1,13 @@
```ruby title="Ruby"
require 'kreuzberg'
# Keyword extraction settings using the YAKE algorithm constant.
keyword_settings = Kreuzberg::KeywordConfig.new(
  algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
  max_keywords: 10,
  min_score: 0.3,
  ngram_range: [1, 3],
  language: 'en'
)
config = Kreuzberg::ExtractionConfig.new(keywords: keyword_settings)
```

View File

@@ -0,0 +1,20 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract 'research_paper.pdf' with keyword extraction enabled and print each
# keyword with its score rounded to three decimals.
keyword_settings = Kreuzberg::KeywordConfig.new(
  algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
  max_keywords: 10,
  min_score: 0.3
)
config = Kreuzberg::ExtractionConfig.new(keywords: keyword_settings)
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)

# Keywords are read from the 'keywords' metadata entry; a missing metadata
# object or missing entry yields an empty list.
extracted = result.metadata&.dig('keywords') || []
extracted.each do |entry|
  puts "#{entry['text']}: #{entry['score'].round(3)}"
end
```

View File

@@ -0,0 +1,21 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract 'document.pdf' with language detection enabled (detect_multiple off)
# and report the first detected language, if there is one.
detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.8,
  detect_multiple: false
)
config = Kreuzberg::ExtractionConfig.new(language_detection: detection)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# `detected_languages` may be nil, hence the safe navigation before `any?`.
summary = if result.detected_languages&.any?
            "Detected Language: #{result.detected_languages.first}"
          else
            "No language detected"
          end
puts summary
puts "Content length: #{result.content.length} characters"
```

View File

@@ -0,0 +1,24 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract 'multilingual_document.pdf' with detect_multiple turned on and list
# every detected language, followed by the content length and MIME type.
detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.8,
  detect_multiple: true
)
config = Kreuzberg::ExtractionConfig.new(language_detection: detection)
result = Kreuzberg.extract_file_sync('multilingual_document.pdf', config: config)

# Treat a nil result as "no languages".
found = result.detected_languages || []
summary = if found.any?
            "Detected #{found.length} language(s): #{found.join(', ')}"
          else
            "No languages detected"
          end
puts summary
puts "Total content: #{result.content.length} characters"
puts "MIME type: #{result.mime_type}"
```

View File

@@ -0,0 +1,7 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extraction config with the `enable_quality_processing` flag switched on.
config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
```

View File

@@ -0,0 +1,18 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract 'scanned_document.pdf' with quality processing enabled and warn when
# the reported quality score is below 0.5.
config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
result = Kreuzberg.extract_file_sync('scanned_document.pdf', config: config)

# A missing score is treated as 0.0, which lands in the warning branch.
score = result.quality_score || 0.0
if score < 0.5
  puts "Warning: Low quality extraction (#{score.round(2)})",
       "Consider re-scanning with higher DPI or adjusting OCR settings"
else
  puts "Quality score: #{score.round(2)}"
end
```

View File

@@ -0,0 +1,12 @@
```ruby title="Ruby"
require 'kreuzberg'
# Token reduction settings: 'moderate' mode with the markdown and code
# preservation flags enabled and a language hint of 'eng'.
reduction = Kreuzberg::TokenReductionConfig.new(
  mode: 'moderate',
  preserve_markdown: true,
  preserve_code: true,
  language_hint: 'eng'
)
config = Kreuzberg::ExtractionConfig.new(token_reduction: reduction)
```

View File

@@ -0,0 +1,19 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extract 'verbose_document.pdf' with token reduction enabled and print the
# token counts and reduction ratio found in the result metadata.
reduction = Kreuzberg::TokenReductionConfig.new(
  mode: 'moderate',
  preserve_markdown: true
)
config = Kreuzberg::ExtractionConfig.new(token_reduction: reduction)
result = Kreuzberg.extract_file_sync('verbose_document.pdf', config: config)

# Reads one metadata entry, substituting `fallback` when the metadata object
# or the entry itself is missing.
read_stat = ->(key, fallback) { result.metadata&.dig(key) || fallback }

tokens_before = read_stat.call('original_token_count', 0)
tokens_after = read_stat.call('token_count', 0)
ratio = read_stat.call('token_reduction_ratio', 0.0)

puts "Reduced from #{tokens_before} to #{tokens_after} tokens"
# The ratio is scaled by 100 and shown as a percentage with one decimal.
percent = (ratio * 100).round(1)
puts "Reduction: #{percent}%"
```

View File

@@ -0,0 +1,53 @@
```ruby title="Ruby"
require 'kreuzberg'
# Example integration: extracts a document into embedded chunks and turns each
# chunk into a record that is handed to a (simulated) vector database.
class VectorDatabaseIntegration
  # One storable entry: an id, the chunk's embedding and text, and a metadata hash.
  VectorRecord = Struct.new(:id, :embedding, :content, :metadata, keyword_init: true)

  # Extracts the document at `document_path`, builds one VectorRecord per
  # chunk, passes the records to the store step and returns them.
  #
  # @param document_path [String] path handed to Kreuzberg.extract_file_sync
  # @param document_id [Object] identifier used as the prefix of every record id
  # @return [Array<VectorRecord>] records in chunk order (empty when the
  #   extraction result has no chunks)
  def extract_and_vectorize(document_path, document_id)
    extraction = Kreuzberg.extract_file_sync(document_path, config: extraction_config)
    # `chunks` may be nil; fall back to an empty list.
    records = (extraction.chunks || []).each_with_index.map do |chunk, position|
      build_record(chunk, position, document_id)
    end
    store_in_vector_database(records)
    records
  end

  private

  # Extraction config: chunking (512 max characters, overlap 50) with the
  # 'balanced' preset embedding model, normalization on, batch size 32.
  def extraction_config
    model = Kreuzberg::EmbeddingModelType.new(type: 'preset', name: 'balanced')
    embedding = Kreuzberg::EmbeddingConfig.new(model: model, normalize: true, batch_size: 32)
    chunking = Kreuzberg::ChunkingConfig.new(max_characters: 512, overlap: 50, embedding: embedding)
    Kreuzberg::ExtractionConfig.new(chunking: chunking)
  end

  # Wraps a single chunk in a VectorRecord whose id is "<document_id>_chunk_<position>".
  def build_record(chunk, position, document_id)
    VectorRecord.new(
      id: "#{document_id}_chunk_#{position}",
      content: chunk.content,
      embedding: chunk.embedding,
      metadata: {
        document_id: document_id,
        chunk_index: position,
        content_length: chunk.content.length
      }
    )
  end

  # Stand-in for a real database write: prints one line per record that has a
  # non-empty embedding; records without one are skipped silently.
  def store_in_vector_database(records)
    records.each do |record|
      next unless record.embedding&.any?

      puts "Storing #{record.id}: #{record.content.length} chars, #{record.embedding.length} dims"
    end
  end
end
```