Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
```ruby title="Ruby"
require 'kreuzberg'
# Clear the post-processor registrations first, then the validator ones.
# NOTE(review): presumably each call removes every plugin of that kind
# (inferred from the method names) — confirm against the API reference.
Kreuzberg.clear_post_processors
Kreuzberg.clear_validators
```

View File

@@ -0,0 +1,44 @@
```ruby title="Ruby"
require 'kreuzberg'
# Wrap an already-loaded embedder so kreuzberg can call back into it during
# chunking and standalone embed requests. The Ruby object must respond to
# `dimensions` and `embed`; `version`, `initialize_plugin`, and `shutdown` are
# optional lifecycle hooks.
class MyEmbedder
  # @return [String] version string reported for this backend
  def version
    '1.0.0'
  end

  # Optional warm-up; runs once at registration.
  #
  # @return [nil]
  def initialize_plugin
    # Nothing to warm up in this example.
  end

  # Optional cleanup.
  #
  # @return [nil]
  def shutdown
    # Nothing to release in this example.
  end

  # Captured once at registration; the dispatcher uses this for shape validation.
  #
  # @return [Integer] length of every vector produced by {#embed}
  def dimensions
    768
  end

  # Produce one vector per input text.
  #
  # @param texts [Array<String>] texts to embed
  # @return [Array<Array<Float>>] one `dimensions`-long vector per text
  def embed(texts)
    # Delegate to the already-loaded host model. The placeholder vector is
    # sized from `dimensions` so the two can never drift apart.
    texts.map { Array.new(dimensions, 0.0) }
  end
end
# One-time startup registration. The second argument is the plugin name that
# EmbeddingConfig later uses to select this backend.
backend_name = 'my-embedder'
Kreuzberg.register_embedding_backend(MyEmbedder.new, backend_name)

embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: { type: 'plugin', name: backend_name },
  # Optional: bound the wait on a hung backend (default 60s; nil disables).
  max_embed_duration_secs: 30
)

# Embed two sample texts through the registered backend and report the count.
sample_texts = ['Hello, world!', 'Second text']
embedded = Kreuzberg.embed_texts(sample_texts, config: embedding_config)
puts "Generated #{embedded.length} vectors"
```

View File

@@ -0,0 +1,15 @@
```ruby title="Ruby"
# JSON.parse below comes from the stdlib json library; load it explicitly
# instead of relying on another library having required it.
require 'json'
require 'kreuzberg'

# Register custom extractor with priority 50
Kreuzberg.register_document_extractor(
  name: "custom-json-extractor",
  # The extractor is called with (content, mime_type, config); only the content
  # is used here, so the other two parameters are underscore-prefixed.
  extractor: ->(content, _mime_type, _config) { JSON.parse(content.to_s) },
  priority: 50
)

result = Kreuzberg.extract_file("document.json")
puts "Extracted content length: #{result.content.length}"
```

View File

@@ -0,0 +1,11 @@
```ruby title="Ruby"
require 'kreuzberg'
# Query each registry first, then print the three results in the same order.
post_processor_names = Kreuzberg.list_post_processors
validator_names = Kreuzberg.list_validators
ocr_backend_names = Kreuzberg.list_ocr_backends

# `puts` with several arguments writes each one on its own line.
puts(
  "Post-processors: #{post_processor_names.inspect}",
  "Validators: #{validator_names.inspect}",
  "OCR backends: #{ocr_backend_names.inspect}"
)
```

View File

@@ -0,0 +1,32 @@
```ruby title="Ruby"
require "kreuzberg"
# Validator that rejects results whose "content" is shorter than a configured
# minimum length.
class MinLengthValidator
  # @param min_length [Integer] smallest acceptable content length
  def initialize(min_length: 100)
    @min_length = min_length
  end

  # @param result [#[]] object whose "content" entry responds to `length`
  # @return [nil] when the content is long enough
  # @raise [StandardError] when the content is shorter than the minimum
  def call(result)
    actual = result["content"].length
    return unless actual < @min_length

    raise StandardError, "Content too short: #{actual} < #{@min_length}"
  end
end
# Register the validator with priority
validator = MinLengthValidator.new(min_length: 100)
Kreuzberg.register_validator("min_length_validator", validator, 100)

# Usage in extraction
config = Kreuzberg::ExtractionConfig.new
begin
  result = Kreuzberg.extract_file_sync("document.pdf", config: config)
  # Read the extracted text through the `content` reader, matching how the
  # other snippets for this API access extraction results.
  puts "Extraction successful: #{result.content.length} characters"
rescue StandardError => e
  # NOTE(review): this rescues every StandardError raised by the extraction
  # call, not only the validator's error.
  puts "Validation failed: #{e.message}"
end
```

View File

@@ -0,0 +1,27 @@
```ruby title="Ruby"
require 'kreuzberg'
# Post-processor that stamps each PDF result with a running sequence number
# under metadata['pdf_order']. Results of any other MIME type pass through
# untouched and do not advance the counter.
class PdfMetadataExtractor
  def initialize
    @count = 0
  end

  # @param result [#[]] result with 'mime_type' and optional 'metadata' entries
  # @return [Object] the same result object that was passed in
  def call(result)
    if result['mime_type'] == 'application/pdf'
      @count += 1
      # Create the metadata container on first use, then record the order.
      metadata = (result['metadata'] ||= {})
      metadata['pdf_order'] = @count
    end
    result
  end
end
# Register a single extractor instance under the name 'pdf_metadata'.
pdf_metadata_plugin = PdfMetadataExtractor.new
Kreuzberg.register_post_processor('pdf_metadata', pdf_metadata_plugin)

# Run an extraction with post-processing enabled and show the metadata.
extraction_config = Kreuzberg::ExtractionConfig.new(postprocessor: { enabled: true })
extracted = Kreuzberg.extract_file_sync('report.pdf', config: extraction_config)
puts "Metadata: #{extracted.metadata.inspect}"
```

View File

@@ -0,0 +1,22 @@
```ruby title="Ruby"
require 'kreuzberg'
# Post-processor that flags PDF results with metadata['pdf_processed'] = true
# and leaves results of every other MIME type unchanged.
class PdfOnlyProcessor
  # @param result [#[]] result with 'mime_type' and optional 'metadata' entries
  # @return [Object] the same result object that was passed in
  def call(result)
    if result['mime_type'] == 'application/pdf'
      # Create the metadata container on first use before setting the flag.
      metadata = (result['metadata'] ||= {})
      metadata['pdf_processed'] = true
    end
    result
  end
end
# Register one processor instance under the name 'pdf_only'.
pdf_only_plugin = PdfOnlyProcessor.new
Kreuzberg.register_post_processor('pdf_only', pdf_only_plugin)

# Run an extraction with post-processing enabled and show the metadata.
extraction_config = Kreuzberg::ExtractionConfig.new(postprocessor: { enabled: true })
extracted = Kreuzberg.extract_file_sync('document.pdf', config: extraction_config)
puts "Metadata: #{extracted.metadata.inspect}"
```

View File

@@ -0,0 +1,23 @@
```ruby title="Ruby"
require 'kreuzberg'
# Post-processor that records its own name under metadata['processed_by'].
class CustomPostProcessor
  # @param result [#[]] result with an optional 'metadata' entry
  # @return [Object] the same result object that was passed in
  def call(result)
    # Create the metadata container on first use before tagging it.
    metadata = (result['metadata'] ||= {})
    metadata['processed_by'] = 'CustomPostProcessor'
    result
  end
end
# Validator that rejects results whose 'content' is empty.
class CustomValidator
  # @param result [#[]] result whose 'content' entry responds to `empty?`
  # @return [nil] when the content is not empty
  # @raise [StandardError] with message 'Empty' when the content is empty
  def call(result)
    return unless result['content'].empty?

    raise StandardError, 'Empty'
  end
end
# Build both plugin instances first, then register each under the name 'custom'
# (once as a post-processor, once as a validator).
custom_processor = CustomPostProcessor.new
custom_validator = CustomValidator.new

Kreuzberg.register_post_processor('custom', custom_processor)
Kreuzberg.register_validator('custom', custom_validator)
```

View File

@@ -0,0 +1,29 @@
```ruby title="Ruby"
require 'kreuzberg'
require 'logger'
# Logger writing to standard output; this script uses it at the end to report
# that plugin registration finished.
logger = Logger.new($stdout)
# Post-processor that prints the MIME type and content size of each result to
# standard output and returns the result unchanged.
class LoggingPostProcessor
  # @param result [#[]] result with 'mime_type' and 'content' entries
  # @return [Object] the same result object that was passed in
  def call(result)
    puts "Processing: #{result['mime_type']}"
    # String#length counts characters, not bytes, so the unit is labelled
    # accordingly (the two differ for multibyte text).
    puts "Content: #{result['content'].length} characters"
    result
  end
end
# Validator that prints the content size to standard output and rejects
# results whose content is shorter than 50 characters.
class LoggingValidator
  # @param result [#[]] result whose 'content' entry responds to `length`
  # @return [nil] when the content is at least 50 characters long
  # @raise [StandardError] with message 'Too short' otherwise
  def call(result)
    # Report the same quantity that is checked below: String#length counts
    # characters, not bytes.
    puts "Validating: #{result['content'].length} characters"
    raise StandardError, 'Too short' if result['content'].length < 50
  end
end
# Build both logging plugins, register each under its own name, then log that
# registration finished.
logging_processor = LoggingPostProcessor.new
logging_validator = LoggingValidator.new

Kreuzberg.register_post_processor('logging-proc', logging_processor)
Kreuzberg.register_validator('logging-val', logging_validator)

logger.info('Plugins registered')
```

View File

@@ -0,0 +1,22 @@
```ruby title="Ruby"
require 'kreuzberg'
require 'rspec'
# Specs showing that a plugin appears in the corresponding list after being
# registered. Each example unregisters its plugin in an `ensure` clause so a
# failed expectation cannot leave it registered for later examples.
# `RSpec.describe` is used so the file also loads when RSpec's global
# monkey-patching is disabled.
RSpec.describe 'Plugin Testing' do
  it 'registers and calls post-processor' do
    processor = lambda do |result|
      result['metadata'] ||= {}
      result
    end
    Kreuzberg.register_post_processor('test', processor)
    begin
      expect(Kreuzberg.list_post_processors).to include('test')
    ensure
      Kreuzberg.unregister_post_processor('test')
    end
  end

  it 'registers and validates' do
    validator = lambda do |result|
      raise StandardError, 'Too short' if result['content'].length < 10
    end
    Kreuzberg.register_validator('test-val', validator)
    begin
      expect(Kreuzberg.list_validators).to include('test-val')
    ensure
      Kreuzberg.unregister_validator('test-val')
    end
  end
end
```

View File

@@ -0,0 +1,14 @@
```ruby title="Ruby"
require "kreuzberg"
# Lambda validator: rejects results whose content is shorter than 50.
min_length_check = lambda do |result|
  next unless result.content.length < 50

  raise StandardError, "Content too short"
end

# Register the validator, run one extraction with it active, then remove it.
Kreuzberg.register_validator("min_length", min_length_check, priority: 10)
extracted = Kreuzberg.extract_file_sync("document.pdf")
puts "Validated content length: #{extracted.content.length}"
Kreuzberg.unregister_validator("min_length")
```

View File

@@ -0,0 +1,35 @@
```ruby title="Ruby"
require "kreuzberg"
# Validator that rejects results whose quality score is below a configured
# minimum. A missing (nil) score is treated as 0.0.
class QualityScoreValidator
  # @param min_score [Float] lowest acceptable quality score
  def initialize(min_score: 0.5)
    @min_score = min_score
  end

  # @param result [#quality_score] object exposing a numeric or nil score
  # @return [nil] when the score meets the minimum
  # @raise [StandardError] when the score is below the minimum
  def call(result)
    score = result.quality_score || 0.0
    return unless score < @min_score

    message = format("Quality score too low: %.2f < %.2f", score, @min_score)
    raise StandardError, message
  end
end
# Register with default minimum score of 0.5
quality_gate = QualityScoreValidator.new(min_score: 0.5)
Kreuzberg.register_validator("quality_score_check", quality_gate)

# Usage with quality processing enabled
quality_config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)

begin
  extracted = Kreuzberg.extract_file_sync("document.pdf", config: quality_config)
  puts "Document quality verified: #{extracted.quality_score}"
rescue StandardError => error
  puts "Quality check failed: #{error.message}"
end
```

View File

@@ -0,0 +1,29 @@
```ruby title="Ruby"
require 'kreuzberg'
# Post-processor that counts how many times it has been called. Every read
# and write of the counter happens inside the same Mutex.
class StatefulPlugin
  def initialize
    @count = 0
    @lock = Mutex.new
  end

  # @param result [Object] the result being processed
  # @return [Object] the same result object, unchanged
  def call(result)
    record_call
    result
  end

  # @return [Integer] number of calls recorded so far
  def count
    @lock.synchronize do
      @count
    end
  end

  private

  # Increment the counter while holding the lock.
  def record_call
    @lock.synchronize do
      @count += 1
    end
  end
end
# Keep a reference to the plugin so its counter can be read after extraction.
counter_plugin = StatefulPlugin.new
Kreuzberg.register_post_processor('counter', counter_plugin)

# Run one extraction with post-processing enabled, then report the call count.
extraction_config = Kreuzberg::ExtractionConfig.new(postprocessor: { enabled: true })
Kreuzberg.extract_file_sync('document.pdf', config: extraction_config)
puts "Processed: #{counter_plugin.count}"
```

View File

@@ -0,0 +1,7 @@
```ruby title="Ruby"
require 'kreuzberg'
# Remove one plugin of each kind, identified by name.
# NOTE(review): presumably the name must match the one used at registration,
# and behaviour for an unknown name is not shown here — confirm.
Kreuzberg.unregister_post_processor('word_count')
Kreuzberg.unregister_validator('min_length_validator')
Kreuzberg.unregister_ocr_backend('cloud-ocr')
```

View File

@@ -0,0 +1,23 @@
```ruby title="Ruby"
require 'kreuzberg'
# Post-processor that stores the number of whitespace-separated words of the
# content under metadata['word_count']. Results with empty content are
# returned untouched.
class WordCountProcessor
  # @param result [#[]] result with 'content' and optional 'metadata' entries
  # @return [Object] the same result object that was passed in
  def call(result)
    text = result['content']
    unless text.empty?
      words = text.split
      # Create the metadata container on first use before storing the count.
      metadata = (result['metadata'] ||= {})
      metadata['word_count'] = words.length
    end
    result
  end
end
# Register one processor instance under the name 'word_count'; the third
# argument (10) is passed positionally.
word_counter = WordCountProcessor.new
Kreuzberg.register_post_processor('word_count', word_counter, 10)

# Run an extraction with post-processing enabled and print the stored count.
extraction_config = Kreuzberg::ExtractionConfig.new(postprocessor: { enabled: true })
extracted = Kreuzberg.extract_file_sync('document.pdf', config: extraction_config)
puts "Words: #{extracted.metadata['word_count']}"
```