This commit is contained in:
22
docs/snippets/ruby/config/advanced_config.md
Normal file
22
docs/snippets/ruby/config/advanced_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Combine OCR, chunking, language detection, caching and quality processing
# in a single extraction config.
config = Kreuzberg::ExtractionConfig.new(
  ocr: Kreuzberg::OcrConfig.new(
    backend: 'tesseract',
    language: 'eng+deu'
  ),
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 1000,
    overlap: 100
  ),
  language_detection: Kreuzberg::LanguageDetectionConfig.new,
  use_cache: true,
  enable_quality_processing: true
)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Each chunk is an object whose text lives in `content`, so slice that rather
# than the chunk itself. `&.` skips the loop when `chunks` is nil.
result.chunks&.each { |chunk| puts chunk.content[0..100] }
puts "Languages: #{result.detected_languages.inspect}"
|
||||
```
|
||||
55
docs/snippets/ruby/config/chunking_config.md
Normal file
55
docs/snippets/ruby/config/chunking_config.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Chunking settings: max_characters of 1000 with an overlap of 200.
chunking = Kreuzberg::ChunkingConfig.new(max_characters: 1000, overlap: 200)
config = Kreuzberg::ExtractionConfig.new(chunking: chunking)
|
||||
```
|
||||
|
||||
```ruby title="Ruby - Markdown with Heading Context"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Markdown-aware chunking sized with a tokenizer model.
config = Kreuzberg::ExtractionConfig.new(
  chunking: Kreuzberg::ChunkingConfig.new(
    chunker_type: "markdown",
    max_characters: 500,
    overlap: 50,
    sizing_type: "tokenizer",
    sizing_model: "Xenova/gpt-4o"
  )
)

# The config is passed through the `config:` keyword.
result = Kreuzberg.extract_file("document.md", config: config)

# Print the heading trail of every chunk that carries heading context,
# indenting each heading according to its level.
result.chunks.each do |chunk|
  heading_context = chunk.metadata.heading_context
  next unless heading_context

  puts "Headings:"
  heading_context.headings.each do |heading|
    puts " #{' ' * (heading.level - 1) * 2}Level #{heading.level}: #{heading.text}"
  end
end
|
||||
```
|
||||
|
||||
```ruby title="Ruby - Prepend Heading Context"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Markdown chunking with `prepend_heading_context` switched on.
config = Kreuzberg::ExtractionConfig.new(
  chunking: Kreuzberg::ChunkingConfig.new(
    chunker_type: "markdown",
    max_characters: 500,
    overlap: 50,
    prepend_heading_context: true
  )
)

# The config is passed through the `config:` keyword.
result = Kreuzberg.extract_file("document.md", config: config)

result.chunks.each do |chunk|
  # Each chunk's content is prefixed with its heading breadcrumb
  puts chunk.content[0, 100]
end
|
||||
```
|
||||
10
docs/snippets/ruby/config/config_basic.md
Normal file
10
docs/snippets/ruby/config/config_basic.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Turn on caching and quality processing, then run a synchronous extraction.
extraction_options = { use_cache: true, enable_quality_processing: true }
config = Kreuzberg::ExtractionConfig.new(**extraction_options)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
```
|
||||
6
docs/snippets/ruby/config/config_discover.md
Normal file
6
docs/snippets/ruby/config/config_discover.md
Normal file
@@ -0,0 +1,6 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Obtain the config from `ExtractionConfig.discover` instead of building it by hand,
# then use it for a synchronous extraction.
config = Kreuzberg::ExtractionConfig.discover
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
|
||||
```
|
||||
11
docs/snippets/ruby/config/config_ocr.md
Normal file
11
docs/snippets/ruby/config/config_ocr.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Tesseract-specific options (psm 3) are built first, then attached to the OCR config.
tesseract = Kreuzberg::TesseractConfig.new(psm: 3)
ocr = Kreuzberg::OcrConfig.new(
  backend: 'tesseract',
  language: 'eng+fra',
  tesseract_config: tesseract
)
config = Kreuzberg::ExtractionConfig.new(ocr: ocr)
|
||||
```
|
||||
19
docs/snippets/ruby/config/config_programmatic.md
Normal file
19
docs/snippets/ruby/config/config_programmatic.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```ruby title="Ruby"
|
||||
require "kreuzberg"
|
||||
|
||||
# Build the whole configuration in code: caching, OCR (with Tesseract options)
# and chunking.
config = Kreuzberg::ExtractionConfig.new(
  use_cache: true,
  ocr: Kreuzberg::OcrConfig.new(
    backend: "tesseract",
    language: "eng+deu",
    # Tesseract options are attached under the `tesseract_config:` key.
    tesseract_config: Kreuzberg::TesseractConfig.new(psm: 6)
  ),
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 1000,
    overlap: 200
  )
)

# The config is passed through the `config:` keyword.
result = Kreuzberg.extract_file_sync("document.pdf", config: config)
puts "Content length: #{result.content.length}"
|
||||
```
|
||||
15
docs/snippets/ruby/config/document_structure_config.md
Normal file
15
docs/snippets/ruby/config/document_structure_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```ruby title="Document Structure Config (Ruby)"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Ask for the document structure to be included in the result.
config = Kreuzberg::ExtractionConfig.new(include_document_structure: true)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Print "[node_type] text-preview" for every node, if a document structure came back.
document = result.document
if document
  document['nodes'].each do |node|
    content = node['content']
    node_type = content['node_type']
    preview = (content['text'] || '')[0...80]
    puts "[#{node_type}] #{preview}"
  end
end
|
||||
```
|
||||
31
docs/snippets/ruby/config/element_based_output.md
Normal file
31
docs/snippets/ruby/config/element_based_output.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```ruby title="Element-Based Output (Ruby)"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Configure element-based output
|
||||
# Request element-based output and extract the document.
config = Kreuzberg::ExtractionConfig.new(output_format: 'element_based')
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Walk all elements: type, a text preview, and page/coordinates when present.
result.elements.each do |element|
  puts "Type: #{element.element_type}"
  puts "Text: #{element.text[0...100]}"

  page_number = element.metadata.page_number
  puts "Page: #{page_number}" if page_number

  coords = element.metadata.coordinates
  if coords
    puts "Coords: (#{coords.left}, #{coords.top}) - (#{coords.right}, #{coords.bottom})"
  end

  puts "---"
end

# Titles only: print each title with its level, or 'unknown' when none is recorded.
result.elements.each do |element|
  next unless element.element_type == 'title'

  level = element.metadata.additional['level'] || 'unknown'
  puts "[#{level}] #{element.text}"
end
|
||||
```
|
||||
18
docs/snippets/ruby/config/embedding_config.md
Normal file
18
docs/snippets/ruby/config/embedding_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Chunking with embeddings generated from a preset model.
config = Kreuzberg::ExtractionConfig.new(
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 1000,
    embedding: Kreuzberg::EmbeddingConfig.new(
      model: Kreuzberg::EmbeddingModelType.new(
        type: 'preset',
        # Preset names are 'fast', 'balanced', 'quality' or 'multilingual';
        # a specific HuggingFace model id belongs to the custom model type instead.
        name: 'balanced'
      ),
      batch_size: 16,
      normalize: true,
      show_download_progress: true
    )
  )
)
|
||||
```
|
||||
89
docs/snippets/ruby/config/embedding_config.rb
Normal file
89
docs/snippets/ruby/config/embedding_config.rb
Normal file
@@ -0,0 +1,89 @@
|
||||
require 'kreuzberg'
|
||||
|
||||
# Example 1: Preset model (recommended)
# Fast, balanced, or quality preset configurations optimized for common use cases.
embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: { type: :preset, name: "balanced" },
  batch_size: 32,
  normalize: true,
  show_download_progress: true,
  # Expand "~" explicitly: a tilde inside a path string is only turned into the
  # home directory by File.expand_path (or by a shell), never automatically.
  cache_dir: File.expand_path("~/.cache/kreuzberg/embeddings")
)
|
||||
|
||||
# Available presets:
|
||||
# - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||||
# - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||||
# - "quality" (1024 dims): Complex documents, maximum accuracy
|
||||
# - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
|
||||
# Example 2: Custom ONNX model (requires embeddings feature)
# Picks a specific HuggingFace ONNX model by id and declares its dimensions.
bge_small_model = {
  type: :custom,
  model_id: "BAAI/bge-small-en-v1.5",
  dimensions: 384
}

embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: bge_small_model,
  batch_size: 32,
  normalize: true,
  show_download_progress: true,
  cache_dir: nil # nil falls back to the default: .kreuzberg/embeddings/
)
|
||||
|
||||
# Popular ONNX-compatible models:
|
||||
# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||||
# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||||
# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||||
# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
|
||||
# Example 3: Alternative Custom ONNX Model
# Same shape as a custom model config, pointing at a different (768-dim) model.
mpnet_model = {
  type: :custom,
  model_id: "sentence-transformers/all-mpnet-base-v2",
  dimensions: 768
}

embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: mpnet_model,
  batch_size: 16, # smaller batch size chosen for the larger model
  normalize: true,
  show_download_progress: true,
  cache_dir: "/var/cache/embeddings"
)
|
||||
|
||||
|
||||
# Integration with ChunkingConfig
# The embedding config is nested inside the chunking config, which in turn
# is handed to the extraction config.
balanced_embedding = Kreuzberg::EmbeddingConfig.new(
  model: { type: :preset, name: "balanced" },
  batch_size: 32,
  normalize: true
)

chunking_config = Kreuzberg::ChunkingConfig.new(
  max_characters: 1024,
  overlap: 100,
  preset: "balanced",
  embedding: balanced_embedding
)

extraction_config = Kreuzberg::ExtractionConfig.new(chunking: chunking_config)
|
||||
|
||||
|
||||
# Key parameter explanations:
|
||||
#
|
||||
# batch_size: Number of texts to embed at once (32-128 typical)
|
||||
# - Larger batches are faster but use more memory
|
||||
# - Smaller batches for resource-constrained environments
|
||||
#
|
||||
# normalize: Whether to normalize vectors (L2 norm)
|
||||
# - true (recommended): Enables cosine similarity in vector DBs
|
||||
# - false: Raw embedding values
|
||||
#
|
||||
# cache_dir: Where to store downloaded models
|
||||
# - nil: Uses .kreuzberg/embeddings/ in current directory
|
||||
# - String: Custom directory for model storage
|
||||
#
|
||||
# show_download_progress: Display download progress bar
|
||||
# - Useful for monitoring large model downloads
|
||||
87
docs/snippets/ruby/config/hierarchy_config.rb
Normal file
87
docs/snippets/ruby/config/hierarchy_config.rb
Normal file
@@ -0,0 +1,87 @@
|
||||
require 'kreuzberg'
|
||||
|
||||
# Example 1: Basic hierarchy extraction
# Hierarchy enabled with k_clusters = 6 (the H1-H6 style heading hierarchy) and
# bounding boxes included; no OCR coverage threshold is set.
extraction_config_basic = Kreuzberg::ExtractionConfig.new(
  pdf_options: Kreuzberg::PdfConfig.new(
    hierarchy: Kreuzberg::HierarchyConfig.new(
      enabled: true,
      k_clusters: 6,              # six font size clusters (H1-H6 structure)
      include_bbox: true,         # keep bounding box coordinates
      ocr_coverage_threshold: nil # no OCR coverage threshold
    )
  )
)
|
||||
|
||||
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_basic)
|
||||
|
||||
|
||||
# Example 2: Custom k_clusters for minimal structure
# Three clusters give a simpler hierarchy for when only the major section
# divisions (Main, Subsection, Detail) are needed.
extraction_config_minimal = Kreuzberg::ExtractionConfig.new(
  pdf_options: Kreuzberg::PdfConfig.new(
    hierarchy: Kreuzberg::HierarchyConfig.new(
      enabled: true,
      k_clusters: 3, # just 3 levels
      include_bbox: true,
      ocr_coverage_threshold: nil
    )
  )
)
|
||||
|
||||
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_minimal)
|
||||
|
||||
|
||||
# Example 3: With OCR coverage threshold
# Sets ocr_coverage_threshold to 0.5 so that OCR is triggered when text
# coverage is below 50%. Useful for documents mixing digital and scanned content.
extraction_config_ocr = Kreuzberg::ExtractionConfig.new(
  pdf_options: Kreuzberg::PdfConfig.new(
    hierarchy: Kreuzberg::HierarchyConfig.new(
      enabled: true,
      k_clusters: 6,
      include_bbox: true,
      ocr_coverage_threshold: 0.5 # trigger OCR if text coverage < 50%
    )
  )
)
|
||||
|
||||
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_ocr)
|
||||
|
||||
|
||||
# Field descriptions:
|
||||
#
|
||||
# enabled: boolean (default: true)
|
||||
# - Enable or disable hierarchy extraction
|
||||
# - When false, hierarchy structure is not analyzed
|
||||
#
|
||||
# k_clusters: integer (default: 6, valid: 1-7)
|
||||
# - Number of font size clusters for hierarchy levels
|
||||
# - 6 provides H1-H6 heading levels with body text
|
||||
# - Higher values create more fine-grained hierarchy
|
||||
# - Lower values create simpler structure
|
||||
#
|
||||
# include_bbox: boolean (default: true)
|
||||
# - Include bounding box coordinates in hierarchy blocks
|
||||
# - Required for spatial layout awareness and document structure
|
||||
# - Set to false only if space optimization is critical
|
||||
#
|
||||
# ocr_coverage_threshold: float | nil (default: nil)
|
||||
# - Range: 0.0 to 1.0
|
||||
# - Triggers OCR when text block coverage falls below this fraction
|
||||
# - Example: 0.5 means "run OCR if less than 50% of page has text data"
|
||||
# - nil means no OCR coverage-based triggering
|
||||
14
docs/snippets/ruby/config/html_output.md
Normal file
14
docs/snippets/ruby/config/html_output.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Produce HTML output with the 'git_hub' theme and the CSS embedded.
config = Kreuzberg::ExtractionConfig.new(
  output_format: 'html',
  html_output: Kreuzberg::HtmlOutputConfig.new(
    theme: 'git_hub',
    embed_css: true
  )
)

# The config is passed through the `config:` keyword; no MIME type is given.
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
puts result.content # HTML with kb-* classes
|
||||
```
|
||||
65
docs/snippets/ruby/config/keyword_config.rb
Normal file
65
docs/snippets/ruby/config/keyword_config.rb
Normal file
@@ -0,0 +1,65 @@
|
||||
require 'kreuzberg'
|
||||
|
||||
# Example 1: Basic YAKE configuration
# Selects the :yake algorithm with language 'en' and leaves both the
# YAKE- and RAKE-specific parameter objects unset (nil).
# Extracts 'document.pdf' and prints the resulting keywords.
def basic_yake
  keyword_config = Kreuzberg::KeywordConfig.new(
    algorithm: :yake,
    max_keywords: 10,
    min_score: 0.0,
    ngram_range: [1, 3],
    language: 'en',
    yake_params: nil,
    rake_params: nil
  )
  config = Kreuzberg::ExtractionConfig.new(keywords: keyword_config)

  result = Kreuzberg.extract_file('document.pdf', config: config)
  puts "Keywords: #{result.keywords}"
end
|
||||
|
||||
# Example 2: Advanced YAKE with custom parameters
# Same flow as the basic example, but passes a YakeParams object with a
# custom window_size and tighter max_keywords / min_score / ngram_range values.
# Extracts 'document.pdf' and prints the resulting keywords.
def advanced_yake
  yake_params = Kreuzberg::YakeParams.new(window_size: 1)
  keyword_config = Kreuzberg::KeywordConfig.new(
    algorithm: :yake,
    max_keywords: 15,
    min_score: 0.1,
    ngram_range: [1, 2],
    language: 'en',
    yake_params: yake_params,
    rake_params: nil
  )
  config = Kreuzberg::ExtractionConfig.new(keywords: keyword_config)

  result = Kreuzberg.extract_file('document.pdf', config: config)
  puts "Keywords: #{result.keywords}"
end
|
||||
|
||||
# Example 3: RAKE configuration
# Selects the :rake algorithm and passes a RakeParams object that sets
# min_word_length and max_words_per_phrase.
# Extracts 'document.pdf' and prints the resulting keywords.
def rake_config
  rake_params = Kreuzberg::RakeParams.new(
    min_word_length: 1,
    max_words_per_phrase: 3
  )
  keyword_config = Kreuzberg::KeywordConfig.new(
    algorithm: :rake,
    max_keywords: 10,
    min_score: 5.0,
    ngram_range: [1, 3],
    language: 'en',
    yake_params: nil,
    rake_params: rake_params
  )
  config = Kreuzberg::ExtractionConfig.new(keywords: keyword_config)

  result = Kreuzberg.extract_file('document.pdf', config: config)
  puts "Keywords: #{result.keywords}"
end
|
||||
|
||||
# Run the basic example only when this file is executed directly, not when required.
basic_yake if $PROGRAM_NAME == __FILE__
|
||||
13
docs/snippets/ruby/config/keyword_extraction_config.md
Normal file
13
docs/snippets/ruby/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Keyword extraction using the YAKE algorithm constant.
keywords = Kreuzberg::KeywordConfig.new(
  algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
  max_keywords: 10,
  min_score: 0.3,
  ngram_range: [1, 3],
  language: 'en'
)
config = Kreuzberg::ExtractionConfig.new(keywords: keywords)
|
||||
```
|
||||
11
docs/snippets/ruby/config/language_detection_config.md
Normal file
11
docs/snippets/ruby/config/language_detection_config.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Language detection enabled, with min_confidence 0.8 and detect_multiple off.
language_detection = Kreuzberg::LanguageDetectionConfig.new(
  enabled: true,
  min_confidence: 0.8,
  detect_multiple: false
)
config = Kreuzberg::ExtractionConfig.new(language_detection: language_detection)
|
||||
```
|
||||
10
docs/snippets/ruby/config/ocr_dpi_config.md
Normal file
10
docs/snippets/ruby/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Tesseract OCR combined with a PDF config that sets dpi to 300.
config = Kreuzberg::ExtractionConfig.new(
  ocr: Kreuzberg::OcrConfig.new(backend: 'tesseract'),
  # PDF settings are attached to the extraction config under `pdf_options:`.
  pdf_options: Kreuzberg::PdfConfig.new(dpi: 300)
)

result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
|
||||
```
|
||||
16
docs/snippets/ruby/config/pdf_config.md
Normal file
16
docs/snippets/ruby/config/pdf_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# PDF options: image and metadata extraction, candidate passwords, and
# hierarchy extraction with bounding boxes.
hierarchy = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 6,
  include_bbox: true
)
pdf_options = Kreuzberg::PdfConfig.new(
  extract_images: true,
  extract_metadata: true,
  passwords: %w[password1 password2],
  hierarchy: hierarchy
)
config = Kreuzberg::ExtractionConfig.new(pdf_options: pdf_options)
|
||||
```
|
||||
29
docs/snippets/ruby/config/pdf_hierarchy_config.md
Normal file
29
docs/snippets/ruby/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Option 1: pass the hierarchy settings as a HierarchyConfig object.
hierarchy_object = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 6,
  include_bbox: true,
  ocr_coverage_threshold: 0.8
)
config = Kreuzberg::ExtractionConfig.new(
  pdf_options: Kreuzberg::PdfConfig.new(
    extract_images: true,
    hierarchy: hierarchy_object
  )
)

# Option 2: pass the same hierarchy settings as a plain Hash.
hierarchy_hash = {
  enabled: true,
  k_clusters: 6,
  include_bbox: true,
  ocr_coverage_threshold: 0.8
}
config = Kreuzberg::ExtractionConfig.new(
  pdf_options: Kreuzberg::PdfConfig.new(
    extract_images: true,
    hierarchy: hierarchy_hash
  )
)
|
||||
```
|
||||
11
docs/snippets/ruby/config/postprocessor_config.md
Normal file
11
docs/snippets/ruby/config/postprocessor_config.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Post-processing enabled, with explicit lists of enabled and disabled processors.
postprocessor = Kreuzberg::PostProcessorConfig.new(
  enabled: true,
  enabled_processors: %w[deduplication whitespace_normalization],
  disabled_processors: %w[mojibake_fix]
)
config = Kreuzberg::ExtractionConfig.new(postprocessor: postprocessor)
|
||||
```
|
||||
7
docs/snippets/ruby/config/quality_processing_config.md
Normal file
7
docs/snippets/ruby/config/quality_processing_config.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Extraction config with quality processing switched on.
config = Kreuzberg::ExtractionConfig.new(enable_quality_processing: true)
|
||||
```
|
||||
16
docs/snippets/ruby/config/tesseract_config.md
Normal file
16
docs/snippets/ruby/config/tesseract_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Detailed Tesseract options: psm/oem, a minimum confidence, a character
# whitelist, and table detection.
tesseract = Kreuzberg::TesseractConfig.new(
  psm: 6,
  oem: 1,
  min_confidence: 0.8,
  tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?',
  enable_table_detection: true
)
ocr = Kreuzberg::OcrConfig.new(
  language: 'eng+fra+deu',
  tesseract_config: tesseract
)
config = Kreuzberg::ExtractionConfig.new(ocr: ocr)
|
||||
```
|
||||
12
docs/snippets/ruby/config/token_reduction_config.md
Normal file
12
docs/snippets/ruby/config/token_reduction_config.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```ruby title="Ruby"
|
||||
require 'kreuzberg'
|
||||
|
||||
# Token reduction in 'moderate' mode with markdown and code preservation enabled.
token_reduction = Kreuzberg::TokenReductionConfig.new(
  mode: 'moderate',
  preserve_markdown: true,
  preserve_code: true,
  language_hint: 'eng'
)
config = Kreuzberg::ExtractionConfig.new(token_reduction: token_reduction)
|
||||
```
|
||||
Reference in New Issue
Block a user