fil/docs/snippets/ruby/config/embedding_config.rb

require 'kreuzberg'

# Example 1: Preset model (recommended)
# Fast, balanced, or quality preset configurations optimized for common use cases.
embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: { type: :preset, name: "balanced" },
  batch_size: 32,
  normalize: true,
  show_download_progress: true,
  cache_dir: "~/.cache/kreuzberg/embeddings"
)

# Available presets:
# - "fast" (384 dims): Quick prototyping, development, resource-constrained
# - "balanced" (768 dims): Production, general-purpose RAG, English documents
# - "quality" (1024 dims): Complex documents, maximum accuracy
# - "multilingual" (768 dims): International documents, 100+ languages


# Example 2: Custom ONNX model (requires embeddings feature)
# Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: {
    type: :custom,
    model_id: "BAAI/bge-small-en-v1.5",
    dimensions: 384
  },
  batch_size: 32,
  normalize: true,
  show_download_progress: true,
  cache_dir: nil  # Uses default: .kreuzberg/embeddings/
)

# Popular ONNX-compatible models:
# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support


# Example 3: Alternative Custom ONNX Model
# For advanced users wanting different ONNX embedding models.
embedding_config = Kreuzberg::EmbeddingConfig.new(
  model: {
    type: :custom,
    model_id: "sentence-transformers/all-mpnet-base-v2",
    dimensions: 768
  },
  batch_size: 16,  # Larger model requires smaller batch size
  normalize: true,
  show_download_progress: true,
  cache_dir: "/var/cache/embeddings"
)


# Integration with ChunkingConfig
# Add embeddings to your chunking configuration:
chunking_config = Kreuzberg::ChunkingConfig.new(
  max_characters: 1024,
  overlap: 100,
  preset: "balanced",
  embedding: Kreuzberg::EmbeddingConfig.new(
    model: { type: :preset, name: "balanced" },
    batch_size: 32,
    normalize: true
  )
)

extraction_config = Kreuzberg::ExtractionConfig.new(
  chunking: chunking_config
)


# Key parameter explanations:
#
# batch_size: Number of texts to embed at once (32-128 typical)
#   - Larger batches are faster but use more memory
#   - Smaller batches for resource-constrained environments
#
# normalize: Whether to normalize vectors (L2 norm)
#   - true (recommended): Enables cosine similarity in vector DBs
#   - false: Raw embedding values
#
# cache_dir: Where to store downloaded models
#   - nil: Uses .kreuzberg/embeddings/ in current directory
#   - String: Custom directory for model storage
#
# show_download_progress: Display download progress bar
#   - Useful for monitoring large model downloads