Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
```ruby title="Ruby"
require 'kreuzberg'
# Combine OCR, chunking, language detection, caching and quality processing
# in one extraction config, then print a preview of every chunk.
config = Kreuzberg::ExtractionConfig.new(
  ocr: Kreuzberg::OcrConfig.new(
    backend: 'tesseract',
    language: 'eng+deu'
  ),
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 1000,
    overlap: 100
  ),
  language_detection: Kreuzberg::LanguageDetectionConfig.new,
  use_cache: true,
  enable_quality_processing: true
)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
# Chunks are objects rather than strings, so slice the text through #content.
# `&.` skips the loop entirely when `chunks` is nil.
result.chunks&.each { |chunk| puts chunk.content[0..100] }
puts "Languages: #{result.detected_languages.inspect}"
```

View File

@@ -0,0 +1,55 @@
```ruby title="Ruby"
require 'kreuzberg'
# Chunking settings: max_characters 1000, overlap 200.
chunking = Kreuzberg::ChunkingConfig.new(max_characters: 1000, overlap: 200)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
```
```ruby title="Ruby - Markdown with Heading Context"
require 'kreuzberg'
# Markdown chunker sized through a tokenizer model; afterwards list the
# heading context attached to each chunk.
chunking = Kreuzberg::ChunkingConfig.new(
  chunker_type: "markdown",
  max_characters: 500,
  overlap: 50,
  sizing_type: "tokenizer",
  sizing_model: "Xenova/gpt-4o"
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
# NOTE(review): most snippets pass the config as `config: config`; confirm
# which call form this gem version expects.
result = Kreuzberg.extract_file("document.md", config)
result.chunks.each do |chunk|
  # Chunks without heading context have nothing to print.
  next unless chunk.metadata.heading_context

  puts "Headings:"
  chunk.metadata.heading_context.headings.each do |heading|
    # Two spaces of indentation per heading level below the first.
    indent = ' ' * (heading.level - 1) * 2
    puts " #{indent}Level #{heading.level}: #{heading.text}"
  end
end
```
```ruby title="Ruby - Prepend Heading Context"
require 'kreuzberg'
# Markdown chunker with prepend_heading_context switched on.
chunking = Kreuzberg::ChunkingConfig.new(
  chunker_type: "markdown",
  max_characters: 500,
  overlap: 50,
  prepend_heading_context: true
)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
# NOTE(review): most snippets pass the config as `config: config`; confirm
# which call form this gem version expects.
result = Kreuzberg.extract_file("document.md", config)
# Each chunk's content is prefixed with its heading breadcrumb;
# print the first 100 characters of every chunk.
result.chunks.each { |chunk| puts chunk.content[0, 100] }
```

View File

@@ -0,0 +1,10 @@
```ruby title="Ruby"
require 'kreuzberg'
# Turn on caching and quality processing, then extract a PDF with that config.
options = { use_cache: true, enable_quality_processing: true }
config = Kreuzberg::ExtractionConfig.new(**options)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
```

View File

@@ -0,0 +1,6 @@
```ruby title="Ruby"
require 'kreuzberg'
# Obtain the config from ExtractionConfig.discover instead of building it
# by hand, then run the extraction with it.
document_path = 'document.pdf'
config = Kreuzberg::ExtractionConfig.discover
result = Kreuzberg.extract_file_sync(document_path, config: config)
```

View File

@@ -0,0 +1,11 @@
```ruby title="Ruby"
require 'kreuzberg'
# Tesseract OCR for 'eng+fra' with psm set to 3.
tesseract = Kreuzberg::TesseractConfig.new(psm: 3)
ocr = Kreuzberg::OcrConfig.new(
  backend: 'tesseract',
  language: 'eng+fra',
  tesseract_config: tesseract
)
config = Kreuzberg::ExtractionConfig.new(ocr: ocr)
```

View File

@@ -0,0 +1,19 @@
```ruby title="Ruby"
require "kreuzberg"
# Cached extraction with Tesseract OCR (psm 6) and chunking; prints the
# length of the extracted content.
config = Kreuzberg::ExtractionConfig.new(
  use_cache: true,
  ocr: Kreuzberg::OcrConfig.new(
    backend: "tesseract",
    language: "eng+deu",
    # Tesseract options are passed under `tesseract_config:`.
    tesseract_config: Kreuzberg::TesseractConfig.new(psm: 6)
  ),
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 1000,
    overlap: 200
  )
)
# NOTE(review): most snippets pass the config as `config: config`; confirm
# which call form this gem version expects.
result = Kreuzberg.extract_file_sync("document.pdf", config)
puts "Content length: #{result.content.length}"
```

View File

@@ -0,0 +1,15 @@
```ruby title="Document Structure Config (Ruby)"
require 'kreuzberg'
# Request the document structure and print every node's type together with
# the first 80 characters of its text.
config = Kreuzberg::ExtractionConfig.new(include_document_structure: true)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
# No document on the result means there is nothing to walk.
nodes = result.document ? result.document['nodes'] : []
nodes.each do |node|
  content = node['content']
  node_type = content['node_type']
  text = content['text'] || '' # nodes without text print an empty preview
  puts "[#{node_type}] #{text[0...80]}"
end
```

View File

@@ -0,0 +1,31 @@
```ruby title="Element-Based Output (Ruby)"
require 'kreuzberg'
# Ask for element-based output
config = Kreuzberg::ExtractionConfig.new(output_format: 'element_based')
# Run the extraction
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
# Walk all elements: type, text preview, and page/coordinates when present
result.elements.each do |element|
  meta = element.metadata
  puts "Type: #{element.element_type}"
  puts "Text: #{element.text[0...100]}"
  puts "Page: #{meta.page_number}" if meta.page_number
  if (coords = meta.coordinates)
    puts "Coords: (#{coords.left}, #{coords.top}) - (#{coords.right}, #{coords.bottom})"
  end
  puts "---"
end
# Keep only title elements and print them with their level
titles = result.elements.select { |e| e.element_type == 'title' }
titles.each do |title|
  level = title.metadata.additional['level'] || 'unknown'
  puts "[#{level}] #{title.text}"
end
```

View File

@@ -0,0 +1,18 @@
```ruby title="Ruby"
require 'kreuzberg'
# Chunking with embeddings generated from a preset model.
config = Kreuzberg::ExtractionConfig.new(
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 1000,
    embedding: Kreuzberg::EmbeddingConfig.new(
      model: Kreuzberg::EmbeddingModelType.new(
        type: 'preset',
        # A preset is selected by preset name ("fast", "balanced", "quality",
        # "multilingual"), not by a HuggingFace model id.
        # NOTE(review): confirm the preset list against the installed gem.
        name: 'balanced'
      ),
      batch_size: 16,
      normalize: true,
      show_download_progress: true
    )
  )
)
```

View File

@@ -0,0 +1,89 @@
require 'kreuzberg'
# Example 1: Preset model (recommended)
# Fast, balanced, or quality preset configurations optimized for common use cases.
embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: { type: :preset, name: "balanced" },
  batch_size: 32,
  normalize: true,
  show_download_progress: true,
  # Ruby does not expand "~" inside a string, so resolve the home directory
  # explicitly before handing the path over.
  # NOTE(review): confirm the library does not already expand "~" itself.
  cache_dir: File.expand_path("~/.cache/kreuzberg/embeddings")
)
# Available presets:
# - "fast" (384 dims): Quick prototyping, development, resource-constrained
# - "balanced" (768 dims): Production, general-purpose RAG, English documents
# - "quality" (1024 dims): Complex documents, maximum accuracy
# - "multilingual" (768 dims): International documents, 100+ languages
# Example 2: Custom ONNX model (requires embeddings feature)
# Picks a specific ONNX embedding model from HuggingFace and states its dimensions.
bge_small_model = {
  type: :custom,
  model_id: "BAAI/bge-small-en-v1.5",
  dimensions: 384
}
embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: bge_small_model,
  batch_size: 32,
  normalize: true,
  show_download_progress: true,
  cache_dir: nil # Uses default: .kreuzberg/embeddings/
)
# Popular ONNX-compatible models:
# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
# Example 3: Alternative Custom ONNX Model
# For advanced users who want a different ONNX embedding model.
mpnet_model = {
  type: :custom,
  model_id: "sentence-transformers/all-mpnet-base-v2",
  dimensions: 768
}
embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: mpnet_model,
  batch_size: 16, # Larger model requires smaller batch size
  normalize: true,
  show_download_progress: true,
  cache_dir: "/var/cache/embeddings"
)
# Integration with ChunkingConfig
# The embedding config is handed to the chunking config, which in turn is
# handed to the extraction config.
chunk_embedding = Kreuzberg::EmbeddingConfig.new(
  model: { type: :preset, name: "balanced" },
  batch_size: 32,
  normalize: true
)
chunking_config = Kreuzberg::ChunkingConfig.new(
  max_characters: 1024,
  overlap: 100,
  preset: "balanced",
  embedding: chunk_embedding
)
extraction_config = Kreuzberg::ExtractionConfig.new(chunking: chunking_config)
# Key parameter explanations:
#
# batch_size: Number of texts to embed at once (32-128 typical)
# - Larger batches are faster but use more memory
# - Smaller batches for resource-constrained environments
#
# normalize: Whether to normalize vectors (L2 norm)
# - true (recommended): Enables cosine similarity in vector DBs
# - false: Raw embedding values
#
# cache_dir: Where to store downloaded models
# - nil: Uses .kreuzberg/embeddings/ in current directory
# - String: Custom directory for model storage
#
# show_download_progress: Display download progress bar
# - Useful for monitoring large model downloads

View File

@@ -0,0 +1,87 @@
require 'kreuzberg'
# Example 1: Basic hierarchy extraction
# Enabled with default k_clusters=6 for standard H1-H6 heading hierarchy.
# Bounding boxes are included for spatial layout awareness.
#   k_clusters: 6                -> default, 6 font size clusters (H1-H6 structure)
#   include_bbox: true           -> include bounding box coordinates
#   ocr_coverage_threshold: nil  -> no OCR coverage threshold
hierarchy_config_basic = Kreuzberg::HierarchyConfig.new(
  enabled: true, k_clusters: 6, include_bbox: true, ocr_coverage_threshold: nil
)
pdf_config_basic = Kreuzberg::PdfConfig.new(hierarchy: hierarchy_config_basic)
extraction_config_basic = Kreuzberg::ExtractionConfig.new(pdf_options: pdf_config_basic)
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_basic)
# Example 2: Custom k_clusters for minimal structure
# Three clusters give a simpler hierarchy, useful when only the major
# section divisions matter (Main, Subsection, Detail).
#   k_clusters: 3 -> minimal clustering: just 3 levels
hierarchy_config_minimal = Kreuzberg::HierarchyConfig.new(
  enabled: true, k_clusters: 3, include_bbox: true, ocr_coverage_threshold: nil
)
pdf_config_minimal = Kreuzberg::PdfConfig.new(hierarchy: hierarchy_config_minimal)
extraction_config_minimal = Kreuzberg::ExtractionConfig.new(pdf_options: pdf_config_minimal)
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_minimal)
# Example 3: With OCR coverage threshold
# Trigger OCR if less than 50% of text has font data.
# Useful for documents that mix digital and scanned content.
#   ocr_coverage_threshold: 0.5 -> trigger OCR if text coverage < 50%
hierarchy_config_ocr = Kreuzberg::HierarchyConfig.new(
  enabled: true, k_clusters: 6, include_bbox: true, ocr_coverage_threshold: 0.5
)
pdf_config_ocr = Kreuzberg::PdfConfig.new(hierarchy: hierarchy_config_ocr)
extraction_config_ocr = Kreuzberg::ExtractionConfig.new(pdf_options: pdf_config_ocr)
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_ocr)
# Field descriptions:
#
# enabled: boolean (default: true)
# - Enable or disable hierarchy extraction
# - When false, hierarchy structure is not analyzed
#
# k_clusters: integer (default: 6, valid: 1-7)
# - Number of font size clusters for hierarchy levels
# - 6 provides H1-H6 heading levels with body text
# - Higher values create more fine-grained hierarchy
# - Lower values create simpler structure
#
# include_bbox: boolean (default: true)
# - Include bounding box coordinates in hierarchy blocks
# - Required for spatial layout awareness and document structure
# - Set to false only if space optimization is critical
#
# ocr_coverage_threshold: float | nil (default: nil)
# - Range: 0.0 to 1.0
# - Triggers OCR when text block coverage falls below this fraction
# - Example: 0.5 means "run OCR if less than 50% of page has text data"
# - nil means no OCR coverage-based triggering

View File

@@ -0,0 +1,14 @@
```ruby title="Ruby"
require 'kreuzberg'
# HTML output with the 'git_hub' theme and CSS embedded into the result.
html_output = Kreuzberg::HtmlOutputConfig.new(theme: 'git_hub', embed_css: true)
config = Kreuzberg::ExtractionConfig.new(output_format: 'html', html_output: html_output)
# NOTE(review): most snippets pass the config as `config: config`; confirm
# which call form this gem version expects.
result = Kreuzberg.extract_file_sync('document.pdf', nil, config)
puts result.content # HTML with kb-* classes
```

View File

@@ -0,0 +1,65 @@
require 'kreuzberg'
# Example 1: Basic YAKE configuration
# Selects the YAKE algorithm, leaves both algorithm-specific parameter sets
# unset (nil), and prints the keywords extracted from 'document.pdf'.
def basic_yake
  keyword_settings = Kreuzberg::KeywordConfig.new(
    algorithm: :yake,
    max_keywords: 10,
    min_score: 0.0,
    ngram_range: [1, 3],
    language: 'en',
    yake_params: nil,
    rake_params: nil
  )
  extraction = Kreuzberg::ExtractionConfig.new(keywords: keyword_settings)
  keywords = Kreuzberg.extract_file('document.pdf', config: extraction).keywords
  puts "Keywords: #{keywords}"
end
# Example 2: Advanced YAKE with custom parameters
# Same flow as the basic example, but passes a YakeParams object with a
# custom window_size and tighter keyword limits.
def advanced_yake
  yake_tuning = Kreuzberg::YakeParams.new(window_size: 1)
  keyword_settings = Kreuzberg::KeywordConfig.new(
    algorithm: :yake,
    max_keywords: 15,
    min_score: 0.1,
    ngram_range: [1, 2],
    language: 'en',
    yake_params: yake_tuning,
    rake_params: nil
  )
  extraction = Kreuzberg::ExtractionConfig.new(keywords: keyword_settings)
  keywords = Kreuzberg.extract_file('document.pdf', config: extraction).keywords
  puts "Keywords: #{keywords}"
end
# Example 3: RAKE configuration
# Selects the RAKE algorithm and constrains phrases through RakeParams,
# then prints the keywords extracted from 'document.pdf'.
def rake_config
  rake_tuning = Kreuzberg::RakeParams.new(
    min_word_length: 1,
    max_words_per_phrase: 3
  )
  keyword_settings = Kreuzberg::KeywordConfig.new(
    algorithm: :rake,
    max_keywords: 10,
    min_score: 5.0,
    ngram_range: [1, 3],
    language: 'en',
    yake_params: nil,
    rake_params: rake_tuning
  )
  extraction = Kreuzberg::ExtractionConfig.new(keywords: keyword_settings)
  keywords = Kreuzberg.extract_file('document.pdf', config: extraction).keywords
  puts "Keywords: #{keywords}"
end
# Run the basic example only when this file is executed directly.
basic_yake if __FILE__ == $PROGRAM_NAME

View File

@@ -0,0 +1,13 @@
```ruby title="Ruby"
require 'kreuzberg'
# Keyword extraction using the YAKE algorithm constant.
keyword_settings = Kreuzberg::KeywordConfig.new(
  algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
  max_keywords: 10,
  min_score: 0.3,
  ngram_range: [1, 3],
  language: 'en'
)
config = Kreuzberg::ExtractionConfig.new(keywords: keyword_settings)
```

View File

@@ -0,0 +1,11 @@
```ruby title="Ruby"
require 'kreuzberg'
# Language detection: enabled, min_confidence 0.8, detect_multiple off.
detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.8,
  detect_multiple: false
)
config = Kreuzberg::ExtractionConfig.new(language_detection: detection)
```

View File

@@ -0,0 +1,10 @@
```ruby title="Ruby"
require 'kreuzberg'
# Tesseract OCR for a scanned PDF, with PDF-specific settings.
config = Kreuzberg::ExtractionConfig.new(
  ocr: Kreuzberg::OcrConfig.new(backend: 'tesseract'),
  # PDF settings are passed under `pdf_options:`.
  # NOTE(review): confirm that PdfConfig accepts `dpi:`.
  pdf_options: Kreuzberg::PdfConfig.new(dpi: 300)
)
result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
```

View File

@@ -0,0 +1,16 @@
```ruby title="Ruby"
require 'kreuzberg'
# PDF options: image and metadata extraction, candidate passwords, and
# hierarchy extraction with 6 clusters and bounding boxes.
hierarchy = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 6,
  include_bbox: true
)
pdf_options = Kreuzberg::PdfConfig.new(
  extract_images: true,
  extract_metadata: true,
  passwords: ['password1', 'password2'],
  hierarchy: hierarchy
)
config = Kreuzberg::ExtractionConfig.new(pdf_options: pdf_options)
```

View File

@@ -0,0 +1,29 @@
```ruby title="Ruby"
require 'kreuzberg'
# Variant A: hierarchy given as a HierarchyConfig built from keyword arguments
pdf_with_object = Kreuzberg::PdfConfig.new(
  extract_images: true,
  hierarchy: Kreuzberg::HierarchyConfig.new(
    enabled: true,
    k_clusters: 6,
    include_bbox: true,
    ocr_coverage_threshold: 0.8
  )
)
config = Kreuzberg::ExtractionConfig.new(pdf_options: pdf_with_object)
# Variant B: the same hierarchy settings given as a plain hash
pdf_with_hash = Kreuzberg::PdfConfig.new(
  extract_images: true,
  hierarchy: {
    enabled: true,
    k_clusters: 6,
    include_bbox: true,
    ocr_coverage_threshold: 0.8
  }
)
config = Kreuzberg::ExtractionConfig.new(pdf_options: pdf_with_hash)
```

View File

@@ -0,0 +1,11 @@
```ruby title="Ruby"
require 'kreuzberg'
# Post-processing: enabled, with explicit lists of enabled and disabled processors.
postprocessing = Kreuzberg::PostProcessorConfig.new(
  enabled: true,
  enabled_processors: ['deduplication', 'whitespace_normalization'],
  disabled_processors: ['mojibake_fix']
)
config = Kreuzberg::ExtractionConfig.new(postprocessor: postprocessing)
```

View File

@@ -0,0 +1,7 @@
```ruby title="Ruby"
require 'kreuzberg'
# Extraction config with quality processing switched on.
config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
```

View File

@@ -0,0 +1,16 @@
```ruby title="Ruby"
require 'kreuzberg'
# Detailed Tesseract settings (psm, oem, confidence floor, character
# whitelist, table detection) for 'eng+fra+deu' OCR.
tesseract = Kreuzberg::TesseractConfig.new(
  psm: 6,
  oem: 1,
  min_confidence: 0.8,
  tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?',
  enable_table_detection: true
)
ocr = Kreuzberg::OcrConfig.new(language: 'eng+fra+deu', tesseract_config: tesseract)
config = Kreuzberg::ExtractionConfig.new(ocr: ocr)
```

View File

@@ -0,0 +1,12 @@
```ruby title="Ruby"
require 'kreuzberg'
# Token reduction in 'moderate' mode, preserving markdown and code, with an
# 'eng' language hint.
reduction = Kreuzberg::TokenReductionConfig.new(
  mode: 'moderate',
  preserve_markdown: true,
  preserve_code: true,
  language_hint: 'eng'
)
config = Kreuzberg::ExtractionConfig.new(token_reduction: reduction)
```