This commit is contained in:
6
docs/snippets/ruby/plugins/clear_plugins.md
Normal file
6
docs/snippets/ruby/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,6 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Clear the post-processor registry, then the validator registry.
# NOTE(review): exact semantics (e.g. whether built-in plugins are also removed) are not shown here — confirm against the Kreuzberg API reference.
Kreuzberg.clear_post_processors
|
||||
Kreuzberg.clear_validators
|
||||
```
|
||||
44
docs/snippets/ruby/plugins/embedding_backend.md
Normal file
44
docs/snippets/ruby/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Wrap an already-loaded embedder so kreuzberg can call back into it during
|
||||
# chunking and standalone embed requests. The Ruby object must respond to
|
||||
# `dimensions` and `embed`; `version`, `initialize_plugin`, and `shutdown` are
|
||||
# optional lifecycle hooks.
|
||||
# Minimal embedding backend used by the registration example.
#
# `dimensions` reports the vector width and `embed` returns one vector of
# exactly that width per input text. `version`, `initialize_plugin`, and
# `shutdown` are lifecycle hooks with placeholder bodies.
class MyEmbedder
  # @return [String] version string reported for this backend
  def version
    '1.0.0'
  end

  # Optional warm-up; runs once at registration.
  #
  # @return [nil]
  def initialize_plugin; end

  # Optional cleanup.
  #
  # @return [nil]
  def shutdown; end

  # Captured once at registration; the dispatcher uses this for shape validation.
  #
  # @return [Integer] width of every vector produced by #embed
  def dimensions
    768
  end

  # Produces one embedding per input text.
  #
  # @param texts [Array<String>] texts to embed
  # @return [Array<Array<Float>>] one vector of `dimensions` floats per text
  def embed(texts)
    # Delegate to the already-loaded host model. The placeholder vectors are
    # sized from `dimensions` (not a second literal) so the advertised width
    # and the produced width cannot drift apart.
    texts.map { Array.new(dimensions, 0.0) }
  end
end
|
||||
|
||||
# One-time startup registration: the backend instance is bound to the plugin
# name that EmbeddingConfig refers to below.
backend = MyEmbedder.new
Kreuzberg.register_embedding_backend(backend, 'my-embedder')

# Select the registered plugin as the embedding model.
plugin_model = { type: 'plugin', name: 'my-embedder' }
embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: plugin_model,
  # Optional cap on how long a hung backend is waited for
  # (default 60s; nil disables).
  max_embed_duration_secs: 30
)

texts = ['Hello, world!', 'Second text']
embeddings = Kreuzberg.embed_texts(texts, config: embedding_config)
puts "Generated #{embeddings.length} vectors"
|
||||
```
|
||||
15
docs/snippets/ruby/plugins/extractor_registration.md
Normal file
15
docs/snippets/ruby/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```ruby title="Ruby"
|
||||
require 'json' # JSON.parse below comes from the json standard library
require 'kreuzberg'

# Register custom extractor with priority 50
Kreuzberg.register_document_extractor(
  name: "custom-json-extractor",
  # Only `content` is used; the MIME type and config arguments are accepted
  # and ignored, hence the underscore prefixes.
  extractor: ->(content, _mime_type, _config) { JSON.parse(content.to_s) },
  priority: 50
)

result = Kreuzberg.extract_file("document.json")
puts "Extracted content length: #{result.content.length}"
|
||||
```
|
||||
11
docs/snippets/ruby/plugins/list_plugins.md
Normal file
11
docs/snippets/ruby/plugins/list_plugins.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Query each plugin registry for what is currently registered.
# NOTE(review): presumably each call returns an array of registered names — confirm against the Kreuzberg API reference.
processors = Kreuzberg.list_post_processors
|
||||
validators = Kreuzberg.list_validators
|
||||
backends = Kreuzberg.list_ocr_backends
|
||||
|
||||
# Print each list via `inspect` so the raw Ruby value is visible.
puts "Post-processors: #{processors.inspect}"
|
||||
puts "Validators: #{validators.inspect}"
|
||||
puts "OCR backends: #{backends.inspect}"
|
||||
```
|
||||
32
docs/snippets/ruby/plugins/min_length_validator.md
Normal file
32
docs/snippets/ruby/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```ruby title="Ruby"
|
||||
require "kreuzberg"
|
||||
|
||||
# Custom validator ensuring extracted content meets minimum length requirement
class MinLengthValidator
  # @param min_length [Integer] smallest acceptable content length, in characters
  def initialize(min_length: 100)
    @min_length = min_length
  end

  # Checks the "content" entry of +result+ against the configured minimum.
  #
  # @param result [#[]] extraction result indexed by the string key "content"
  # @return [nil] when the content is long enough
  # @raise [StandardError] when the content is shorter than the minimum
  def call(result)
    # Missing (nil) content counts as empty, so the caller gets the intended
    # "too short" error rather than a NoMethodError on nil.
    content_length = result["content"].to_s.length
    return unless content_length < @min_length

    raise StandardError, "Content too short: #{content_length} < #{@min_length}"
  end
end
|
||||
|
||||
# Register the validator with priority
validator = MinLengthValidator.new(min_length: 100)
Kreuzberg.register_validator("min_length_validator", validator, 100)

# Usage in extraction
config = Kreuzberg::ExtractionConfig.new

begin
  result = Kreuzberg.extract_file_sync("document.pdf", config: config)
  # The returned result is read through its `content` accessor.
  # NOTE(review): assumes the extraction result exposes accessor methods rather than Hash-style `[]` — confirm against the Kreuzberg Ruby API.
  puts "Extraction successful: #{result.content.length} characters"
rescue StandardError => e
  puts "Validation failed: #{e.message}"
end
|
||||
```
|
||||
27
docs/snippets/ruby/plugins/pdf_metadata_extractor.md
Normal file
27
docs/snippets/ruby/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Callable that numbers PDF results in the order they are seen.
class PdfMetadataExtractor
  # Starts the running PDF counter at zero.
  def initialize
    @count = 0
  end

  # Stamps metadata['pdf_order'] with a 1-based running count on PDF results;
  # results with any other MIME type are returned unchanged.
  #
  # @param result [Hash] string-keyed result with 'mime_type' and optional 'metadata'
  # @return [Hash] the same result object
  def call(result)
    if result['mime_type'] == 'application/pdf'
      @count += 1
      metadata = (result['metadata'] ||= {})
      metadata['pdf_order'] = @count
    end

    result
  end
end
|
||||
|
||||
# Register the callable under the name 'pdf_metadata'.
Kreuzberg.register_post_processor('pdf_metadata', PdfMetadataExtractor.new)

# Extraction config with the post-processor stage enabled.
postprocessor_options = { enabled: true }
extraction_config = Kreuzberg::ExtractionConfig.new(postprocessor: postprocessor_options)

extraction = Kreuzberg.extract_file_sync('report.pdf', config: extraction_config)
metadata_dump = extraction.metadata.inspect
puts "Metadata: #{metadata_dump}"
|
||||
```
|
||||
22
docs/snippets/ruby/plugins/pdf_only_processor.md
Normal file
22
docs/snippets/ruby/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Callable that flags PDF results and leaves every other result alone.
class PdfOnlyProcessor
  # Sets metadata['pdf_processed'] = true when the MIME type is
  # 'application/pdf'; creates the metadata hash if it is absent.
  #
  # @param result [Hash] string-keyed result with 'mime_type' and optional 'metadata'
  # @return [Hash] the same result object
  def call(result)
    if result['mime_type'] == 'application/pdf'
      (result['metadata'] ||= {})['pdf_processed'] = true
    end

    result
  end
end
|
||||
|
||||
# Register the processor under the name 'pdf_only'.
Kreuzberg.register_post_processor('pdf_only', PdfOnlyProcessor.new)

# Extraction config with the post-processor stage enabled.
postprocessor_options = { enabled: true }
extraction_config = Kreuzberg::ExtractionConfig.new(postprocessor: postprocessor_options)

extraction = Kreuzberg.extract_file_sync('document.pdf', config: extraction_config)
metadata_dump = extraction.metadata.inspect
puts "Metadata: #{metadata_dump}"
|
||||
```
|
||||
23
docs/snippets/ruby/plugins/plugin_extractor.md
Normal file
23
docs/snippets/ruby/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Callable that records its own name in the result metadata.
class CustomPostProcessor
  # Writes metadata['processed_by'] = 'CustomPostProcessor', creating the
  # metadata hash when it is absent.
  #
  # @param result [Hash] string-keyed result with optional 'metadata'
  # @return [Hash] the same result object
  def call(result)
    result.tap do |doc|
      metadata = (doc['metadata'] ||= {})
      metadata['processed_by'] = 'CustomPostProcessor'
    end
  end
end
|
||||
|
||||
# Callable that rejects results whose content is empty.
class CustomValidator
  # @param result [Hash] string-keyed result with a 'content' entry
  # @return [nil] when the content is non-empty
  # @raise [StandardError] 'Empty' when result['content'] is empty
  def call(result)
    return unless result['content'].empty?

    raise StandardError, 'Empty'
  end
end
|
||||
|
||||
# Both plugins are registered under the same name, 'custom', one as a
# post-processor and one as a validator.
Kreuzberg.register_post_processor('custom', CustomPostProcessor.new)
Kreuzberg.register_validator('custom', CustomValidator.new)
|
||||
```
|
||||
29
docs/snippets/ruby/plugins/plugin_logging.md
Normal file
29
docs/snippets/ruby/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
require 'logger'
|
||||
|
||||
logger = Logger.new($stdout)
|
||||
|
||||
# Callable that prints the MIME type and content size of each result.
class LoggingPostProcessor
  # Prints two lines to standard output and returns the result unchanged.
  #
  # @param result [Hash] string-keyed result with 'mime_type' and 'content'
  # @return [Hash] the same result object
  def call(result)
    puts "Processing: #{result['mime_type']}"
    # bytesize, not length: length counts characters, which would
    # under-report multibyte text under the "bytes" label.
    puts "Content: #{result['content'].bytesize} bytes"
    result
  end
end
|
||||
|
||||
# Callable that prints the content size and rejects short content.
class LoggingValidator
  # @param result [Hash] string-keyed result with a 'content' entry
  # @return [nil] when the content has at least 50 characters
  # @raise [StandardError] 'Too short' when the content has fewer than 50 characters
  def call(result)
    content = result['content']
    # bytesize so the printed figure matches its "bytes" label for multibyte
    # text; the threshold below still counts characters.
    puts "Validating: #{content.bytesize} bytes"
    raise StandardError, 'Too short' if content.length < 50
  end
end
|
||||
|
||||
# Register the printing post-processor and validator under their own names.
Kreuzberg.register_post_processor('logging-proc', LoggingPostProcessor.new)
Kreuzberg.register_validator('logging-val', LoggingValidator.new)

# Record through the Logger created earlier that registration finished.
logger.info('Plugins registered')
|
||||
```
|
||||
22
docs/snippets/ruby/plugins/plugin_testing.md
Normal file
22
docs/snippets/ruby/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
require 'rspec'
|
||||
|
||||
# Registration round-trip specs. Each example unregisters its plugin in an
# `ensure` so a failing expectation cannot leave the plugin registered for
# later examples.
describe 'Plugin Testing' do
  it 'registers and calls post-processor' do
    processor = ->(result) { result['metadata'] ||= {}; result }
    Kreuzberg.register_post_processor('test', processor)

    begin
      expect(Kreuzberg.list_post_processors).to include('test')
    ensure
      Kreuzberg.unregister_post_processor('test')
    end
  end

  it 'registers and validates' do
    validator = ->(result) do
      raise StandardError, 'Too short' if result['content'].length < 10
    end
    Kreuzberg.register_validator('test-val', validator)

    begin
      expect(Kreuzberg.list_validators).to include('test-val')
    ensure
      Kreuzberg.unregister_validator('test-val')
    end
  end
end
|
||||
```
|
||||
14
docs/snippets/ruby/plugins/plugin_validator.md
Normal file
14
docs/snippets/ruby/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```ruby title="Ruby"
|
||||
require "kreuzberg"
|
||||
|
||||
# Validator callable: raises when the content has fewer than 50 characters.
# NOTE(review): assumes the validator is handed a string-keyed result (read via ["content"]) — confirm against the Kreuzberg Ruby API.
validator = lambda do |result|
  raise StandardError, "Content too short" if result["content"].length < 50
end

# Priority is passed as the third positional argument.
# NOTE(review): confirm that register_validator takes priority positionally rather than as a `priority:` keyword.
Kreuzberg.register_validator("min_length", validator, 10)

begin
  result = Kreuzberg.extract_file_sync("document.pdf")
  puts "Validated content length: #{result.content.length}"
ensure
  # Unregister even when extraction or validation raises.
  Kreuzberg.unregister_validator("min_length")
end
|
||||
```
|
||||
35
docs/snippets/ruby/plugins/quality_score_validator.md
Normal file
35
docs/snippets/ruby/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```ruby title="Ruby"
|
||||
require "kreuzberg"
|
||||
|
||||
# Custom validator checking document quality score
class QualityScoreValidator
  # @param min_score [Float] lowest acceptable quality score
  def initialize(min_score: 0.5)
    @min_score = min_score
  end

  # Raises when the result's quality score is below the configured minimum.
  # A missing (nil) score is treated as 0.0.
  #
  # @param result [#quality_score, #[]] object exposing `quality_score`, or a
  #   container indexed by the string key 'quality_score'
  # @return [nil] when the score is acceptable
  # @raise [StandardError] when the score is below the minimum
  def call(result)
    quality_score = read_quality_score(result) || 0.0
    return unless quality_score < @min_score

    raise StandardError,
          format("Quality score too low: %.2f < %.2f", quality_score, @min_score)
  end

  private

  # Reads the score from either result shape: accessor-style objects are
  # asked directly, everything else is indexed by string key.
  def read_quality_score(result)
    return result.quality_score if result.respond_to?(:quality_score)

    result['quality_score']
  end
end
|
||||
|
||||
# Register a validator with a minimum quality score of 0.5.
Kreuzberg.register_validator("quality_score_check", QualityScoreValidator.new(min_score: 0.5))

# Extraction config with quality processing turned on.
quality_config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)

begin
  extraction = Kreuzberg.extract_file_sync("document.pdf", config: quality_config)
  puts "Document quality verified: #{extraction.quality_score}"
rescue StandardError => error
  puts "Quality check failed: #{error.message}"
end
|
||||
```
|
||||
29
docs/snippets/ruby/plugins/stateful_plugin.md
Normal file
29
docs/snippets/ruby/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Callable that counts how many results it has been invoked with; the counter
# is only touched while holding a Mutex.
class StatefulPlugin
  # Starts with a zero counter and a fresh lock guarding it.
  def initialize
    @count = 0
    @lock = Mutex.new
  end

  # Increments the counter under the lock and hands the result back untouched.
  #
  # @param result [Object] the result being processed
  # @return [Object] the same result object
  def call(result)
    @lock.synchronize do
      @count += 1
    end

    result
  end

  # @return [Integer] number of #call invocations so far, read under the lock
  def count
    current = nil
    @lock.synchronize { current = @count }
    current
  end
end
|
||||
|
||||
# Keep a handle on the plugin so its counter can be read after extraction.
counter = StatefulPlugin.new
Kreuzberg.register_post_processor('counter', counter)

# Extraction config with the post-processor stage enabled.
postprocessor_options = { enabled: true }
extraction_config = Kreuzberg::ExtractionConfig.new(postprocessor: postprocessor_options)

Kreuzberg.extract_file_sync('document.pdf', config: extraction_config)
processed_total = counter.count
puts "Processed: #{processed_total}"
|
||||
```
|
||||
7
docs/snippets/ruby/plugins/unregister_plugins.md
Normal file
7
docs/snippets/ruby/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Remove individual plugins by name: one post-processor, one validator, and
# one OCR backend.
# NOTE(review): how the API treats a name that is not currently registered is not shown here — confirm before relying on it.
Kreuzberg.unregister_post_processor('word_count')
|
||||
Kreuzberg.unregister_validator('min_length_validator')
|
||||
Kreuzberg.unregister_ocr_backend('cloud-ocr')
|
||||
```
|
||||
23
docs/snippets/ruby/plugins/word_count_processor.md
Normal file
23
docs/snippets/ruby/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Callable that stores a whitespace-delimited word count in the metadata.
class WordCountProcessor
  # Writes metadata['word_count'] for results with non-empty content; results
  # with empty content are returned without any change.
  #
  # @param result [Hash] string-keyed result with 'content' and optional 'metadata'
  # @return [Hash] the same result object
  def call(result)
    text = result['content']

    unless text.empty?
      total = text.split.length
      metadata = (result['metadata'] ||= {})
      metadata['word_count'] = total
    end

    result
  end
end
|
||||
|
||||
# Register the processor as 'word_count'; the third argument is 10.
Kreuzberg.register_post_processor('word_count', WordCountProcessor.new, 10)

# Extraction config with the post-processor stage enabled.
postprocessor_options = { enabled: true }
extraction_config = Kreuzberg::ExtractionConfig.new(postprocessor: postprocessor_options)

extraction = Kreuzberg.extract_file_sync('document.pdf', config: extraction_config)
word_total = extraction.metadata['word_count']
puts "Words: #{word_total}"
|
||||
```
|
||||
Reference in New Issue
Block a user