88 lines
3.6 KiB
Java
88 lines
3.6 KiB
Java
|
|
import kreuzberg.config.EmbeddingConfig;
|
||
|
|
import kreuzberg.config.EmbeddingModelType;
|
||
|
|
import kreuzberg.config.ChunkingConfig;
|
||
|
|
import kreuzberg.config.ExtractionConfig;
|
||
|
|
|
||
|
|
public class EmbeddingConfigExample {
|
||
|
|
public static void main(String[] args) {
|
||
|
|
// Example 1: Preset model (recommended)
|
||
|
|
// Fast, balanced, or quality preset configurations optimized for common use cases.
|
||
|
|
EmbeddingConfig embeddingConfig = EmbeddingConfig.builder()
|
||
|
|
.model(EmbeddingModelType.preset("balanced"))
|
||
|
|
.batchSize(32)
|
||
|
|
.normalize(true)
|
||
|
|
.showDownloadProgress(true)
|
||
|
|
.cacheDir("~/.cache/kreuzberg/embeddings")
|
||
|
|
.build();
|
||
|
|
|
||
|
|
// Available presets:
|
||
|
|
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||
|
|
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||
|
|
// - "quality" (1024 dims): Complex documents, maximum accuracy
|
||
|
|
// - "multilingual" (768 dims): International documents, 100+ languages
|
||
|
|
|
||
|
|
|
||
|
|
// Example 2: Custom ONNX model (requires embeddings feature)
|
||
|
|
// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
|
||
|
|
embeddingConfig = EmbeddingConfig.builder()
|
||
|
|
.model(EmbeddingModelType.custom("BAAI/bge-small-en-v1.5", 384))
|
||
|
|
.batchSize(32)
|
||
|
|
.normalize(true)
|
||
|
|
.showDownloadProgress(true)
|
||
|
|
.cacheDir(null) // Uses default: .kreuzberg/embeddings/
|
||
|
|
.build();
|
||
|
|
|
||
|
|
// Popular ONNX-compatible models:
|
||
|
|
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||
|
|
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||
|
|
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||
|
|
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||
|
|
|
||
|
|
|
||
|
|
// Example 3: Alternative Custom ONNX Model
|
||
|
|
// For advanced users wanting different ONNX embedding models.
|
||
|
|
embeddingConfig = EmbeddingConfig.builder()
|
||
|
|
.model(EmbeddingModelType.custom("sentence-transformers/all-mpnet-base-v2", 768))
|
||
|
|
.batchSize(16) // Larger model requires smaller batch size
|
||
|
|
.normalize(true)
|
||
|
|
.showDownloadProgress(true)
|
||
|
|
.cacheDir("/var/cache/embeddings")
|
||
|
|
.build();
|
||
|
|
|
||
|
|
|
||
|
|
// Integration with ChunkingConfig
|
||
|
|
// Add embeddings to your chunking configuration:
|
||
|
|
ChunkingConfig chunkingConfig = ChunkingConfig.builder()
|
||
|
|
.maxChars(1024)
|
||
|
|
.maxOverlap(100)
|
||
|
|
.preset("balanced")
|
||
|
|
.embedding(EmbeddingConfig.builder()
|
||
|
|
.model(EmbeddingModelType.preset("balanced"))
|
||
|
|
.batchSize(32)
|
||
|
|
.normalize(true)
|
||
|
|
.build())
|
||
|
|
.build();
|
||
|
|
|
||
|
|
ExtractionConfig extractionConfig = ExtractionConfig.builder()
|
||
|
|
.chunking(chunkingConfig)
|
||
|
|
.build();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Key parameter explanations:
|
||
|
|
//
|
||
|
|
// batchSize: Number of texts to embed at once (32-128 typical)
|
||
|
|
// - Larger batches are faster but use more memory
|
||
|
|
// - Smaller batches for resource-constrained environments
|
||
|
|
//
|
||
|
|
// normalize: Whether to normalize vectors (L2 norm)
|
||
|
|
// - true (recommended): Enables cosine similarity in vector DBs
|
||
|
|
// - false: Raw embedding values
|
||
|
|
//
|
||
|
|
// cacheDir: Where to store downloaded models
|
||
|
|
// - null: Uses .kreuzberg/embeddings/ in current directory
|
||
|
|
// - String path: Custom directory for model storage
|
||
|
|
//
|
||
|
|
// showDownloadProgress: Display download progress bar
|
||
|
|
// - Useful for monitoring large model downloads
|