90 lines
2.8 KiB
Ruby
90 lines
2.8 KiB
Ruby
|
|
require 'kreuzberg'
|
||
|
|
|
||
|
|
# Example 1: Preset model (recommended)
|
||
|
|
# Fast, balanced, or quality preset configurations optimized for common use cases.
|
||
|
|
embedding_config = Kreuzberg::EmbeddingConfig.new(
|
||
|
|
model: { type: :preset, name: "balanced" },
|
||
|
|
batch_size: 32,
|
||
|
|
normalize: true,
|
||
|
|
show_download_progress: true,
|
||
|
|
cache_dir: "~/.cache/kreuzberg/embeddings"
|
||
|
|
)
|
||
|
|
|
||
|
|
# Available presets:
|
||
|
|
# - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||
|
|
# - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||
|
|
# - "quality" (1024 dims): Complex documents, maximum accuracy
|
||
|
|
# - "multilingual" (768 dims): International documents, 100+ languages
|
||
|
|
|
||
|
|
|
||
|
|
# Example 2: Custom ONNX model (requires embeddings feature)
|
||
|
|
# Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
|
||
|
|
embedding_config = Kreuzberg::EmbeddingConfig.new(
|
||
|
|
model: {
|
||
|
|
type: :custom,
|
||
|
|
model_id: "BAAI/bge-small-en-v1.5",
|
||
|
|
dimensions: 384
|
||
|
|
},
|
||
|
|
batch_size: 32,
|
||
|
|
normalize: true,
|
||
|
|
show_download_progress: true,
|
||
|
|
cache_dir: nil # Uses default: .kreuzberg/embeddings/
|
||
|
|
)
|
||
|
|
|
||
|
|
# Popular ONNX-compatible models:
|
||
|
|
# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||
|
|
# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||
|
|
# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||
|
|
# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||
|
|
|
||
|
|
|
||
|
|
# Example 3: Alternative Custom ONNX Model
|
||
|
|
# For advanced users wanting different ONNX embedding models.
|
||
|
|
embedding_config = Kreuzberg::EmbeddingConfig.new(
|
||
|
|
model: {
|
||
|
|
type: :custom,
|
||
|
|
model_id: "sentence-transformers/all-mpnet-base-v2",
|
||
|
|
dimensions: 768
|
||
|
|
},
|
||
|
|
batch_size: 16, # Larger model requires smaller batch size
|
||
|
|
normalize: true,
|
||
|
|
show_download_progress: true,
|
||
|
|
cache_dir: "/var/cache/embeddings"
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
# Integration with ChunkingConfig
|
||
|
|
# Add embeddings to your chunking configuration:
|
||
|
|
chunking_config = Kreuzberg::ChunkingConfig.new(
|
||
|
|
max_characters: 1024,
|
||
|
|
overlap: 100,
|
||
|
|
preset: "balanced",
|
||
|
|
embedding: Kreuzberg::EmbeddingConfig.new(
|
||
|
|
model: { type: :preset, name: "balanced" },
|
||
|
|
batch_size: 32,
|
||
|
|
normalize: true
|
||
|
|
)
|
||
|
|
)
|
||
|
|
|
||
|
|
extraction_config = Kreuzberg::ExtractionConfig.new(
|
||
|
|
chunking: chunking_config
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
# Key parameter explanations:
|
||
|
|
#
|
||
|
|
# batch_size: Number of texts to embed at once (32-128 typical)
|
||
|
|
# - Larger batches are faster but use more memory
|
||
|
|
# - Smaller batches for resource-constrained environments
|
||
|
|
#
|
||
|
|
# normalize: Whether to normalize vectors (L2 norm)
|
||
|
|
# - true (recommended): Enables cosine similarity in vector DBs
|
||
|
|
# - false: Raw embedding values
|
||
|
|
#
|
||
|
|
# cache_dir: Where to store downloaded models
|
||
|
|
# - nil: Uses .kreuzberg/embeddings/ in current directory
|
||
|
|
# - String: Custom directory for model storage
|
||
|
|
#
|
||
|
|
# show_download_progress: Display download progress bar
|
||
|
|
# - Useful for monitoring large model downloads
|