Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/ruby/config/embedding_config.rb
+++ b/docs/snippets/ruby/config/embedding_config.rb
@@ -0,0 +1,89 @@
+require 'kreuzberg'
+
+# Example 1: Preset model (recommended)
+# Fast, balanced, or quality preset configurations optimized for common use cases.
+embedding_config = Kreuzberg::EmbeddingConfig.new(
+  model: { type: :preset, name: "balanced" },
+  batch_size: 32,
+  normalize: true,
+  show_download_progress: true,
+  cache_dir: "~/.cache/kreuzberg/embeddings"
+)
+
+# Available presets:
+# - "fast" (384 dims): Quick prototyping, development, resource-constrained
+# - "balanced" (768 dims): Production, general-purpose RAG, English documents
+# - "quality" (1024 dims): Complex documents, maximum accuracy
+# - "multilingual" (768 dims): International documents, 100+ languages
+
+
+# Example 2: Custom ONNX model (requires embeddings feature)
+# Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
+embedding_config = Kreuzberg::EmbeddingConfig.new(
+  model: {
+    type: :custom,
+    model_id: "BAAI/bge-small-en-v1.5",
+    dimensions: 384
+  },
+  batch_size: 32,
+  normalize: true,
+  show_download_progress: true,
+  cache_dir: nil  # Uses default: .kreuzberg/embeddings/
+)
+
+# Popular ONNX-compatible models:
+# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
+# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
+# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
+# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
+
+
+# Example 3: Alternative Custom ONNX Model
+# For advanced users wanting different ONNX embedding models.
+embedding_config = Kreuzberg::EmbeddingConfig.new(
+  model: {
+    type: :custom,
+    model_id: "sentence-transformers/all-mpnet-base-v2",
+    dimensions: 768
+  },
+  batch_size: 16,  # Larger model requires smaller batch size
+  normalize: true,
+  show_download_progress: true,
+  cache_dir: "/var/cache/embeddings"
+)
+
+
+# Integration with ChunkingConfig
+# Add embeddings to your chunking configuration:
+chunking_config = Kreuzberg::ChunkingConfig.new(
+  max_characters: 1024,
+  overlap: 100,
+  preset: "balanced",
+  embedding: Kreuzberg::EmbeddingConfig.new(
+    model: { type: :preset, name: "balanced" },
+    batch_size: 32,
+    normalize: true
+  )
+)
+
+extraction_config = Kreuzberg::ExtractionConfig.new(
+  chunking: chunking_config
+)
+
+
+# Key parameter explanations:
+#
+# batch_size: Number of texts to embed at once (32-128 typical)
+#   - Larger batches are faster but use more memory
+#   - Smaller batches for resource-constrained environments
+#
+# normalize: Whether to normalize vectors (L2 norm)
+#   - true (recommended): Enables cosine similarity in vector DBs
+#   - false: Raw embedding values
+#
+# cache_dir: Where to store downloaded models
+#   - nil: Uses .kreuzberg/embeddings/ in current directory
+#   - String: Custom directory for model storage
+#
+# show_download_progress: Display download progress bar
+#   - Useful for monitoring large model downloads