Files
fil/docs/snippets/csharp/config/embedding_config.cs

107 lines
3.6 KiB
C#
Raw Normal View History

2026-06-01 23:40:55 +02:00
using Kreuzberg.Config;
/// <summary>
/// Documentation snippet showing the different ways an <c>EmbeddingConfig</c> can be
/// populated (preset model, custom ONNX model) and how it is attached to a
/// <c>ChunkingConfig</c> / <c>ExtractionConfig</c>.
/// </summary>
public class EmbeddingConfigExample
{
    /// <summary>
    /// Builds each example configuration in turn. The objects are only constructed here;
    /// nothing is extracted or embedded by this method.
    /// </summary>
    public static void Main()
    {
        // "~" is a shell convention and is not expanded by .NET path handling, so a literal
        // "~/.cache/..." would be treated as a path relative to the working directory.
        // Resolve the user's home directory explicitly to get an absolute cache path.
        var homeCacheDir = System.IO.Path.Combine(
            System.Environment.GetFolderPath(System.Environment.SpecialFolder.UserProfile),
            ".cache",
            "kreuzberg",
            "embeddings");

        // Example 1: Preset model (recommended)
        // Fast, balanced, or quality preset configurations optimized for common use cases.
        var presetEmbedding = new EmbeddingConfig
        {
            Model = new EmbeddingModelType.Preset
            {
                Name = "balanced"
            },
            BatchSize = 32,
            Normalize = true,
            ShowDownloadProgress = true,
            CacheDir = homeCacheDir
        };
        // Available presets:
        // - "fast" (384 dims): Quick prototyping, development, resource-constrained
        // - "balanced" (768 dims): Production, general-purpose RAG, English documents
        // - "quality" (1024 dims): Complex documents, maximum accuracy
        // - "multilingual" (768 dims): International documents, 100+ languages

        // Example 2: Custom ONNX model (requires embeddings feature)
        // Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
        var customEmbedding = new EmbeddingConfig
        {
            Model = new EmbeddingModelType.Custom
            {
                ModelId = "BAAI/bge-small-en-v1.5",
                Dimensions = 384
            },
            BatchSize = 32,
            Normalize = true,
            ShowDownloadProgress = true,
            CacheDir = null // Uses default: .kreuzberg/embeddings/
        };
        // Popular ONNX-compatible models:
        // - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
        // - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
        // - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
        // - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support

        // Example 3: Alternative Custom ONNX Model
        // For advanced users wanting different ONNX embedding models.
        var alternativeEmbedding = new EmbeddingConfig
        {
            Model = new EmbeddingModelType.Custom
            {
                ModelId = "sentence-transformers/all-mpnet-base-v2",
                Dimensions = 768
            },
            BatchSize = 16, // Larger model requires smaller batch size
            Normalize = true,
            ShowDownloadProgress = true,
            CacheDir = "/var/cache/embeddings"
        };

        // Integration with ChunkingConfig
        // Add embeddings to your chunking configuration:
        var chunkingConfig = new ChunkingConfig
        {
            MaxChars = 1024,
            MaxOverlap = 100,
            Preset = "balanced",
            Embedding = new EmbeddingConfig
            {
                Model = new EmbeddingModelType.Preset
                {
                    Name = "balanced"
                },
                BatchSize = 32,
                Normalize = true
            }
        };

        // The chunking configuration (and its embedding settings) is attached to the
        // extraction configuration through the Chunking property.
        var extractionConfig = new ExtractionConfig
        {
            Chunking = chunkingConfig
        };
    }
}
// Key parameter explanations:
//
// BatchSize: Number of texts to embed at once (32-128 typical)
// - Larger batches are faster but use more memory
// - Smaller batches for resource-constrained environments
//
// Normalize: Whether to normalize vectors (L2 norm)
// - true (recommended): Unit-length vectors, so a dot product equals cosine similarity in vector DBs
// - false: Raw embedding values
//
// CacheDir: Where to store downloaded models
// - null: Uses .kreuzberg/embeddings/ in current directory
// - String path: Custom directory for model storage
//
// ShowDownloadProgress: Display download progress bar
// - Useful for monitoring large model downloads