import kreuzberg.config.EmbeddingConfig; import kreuzberg.config.EmbeddingModelType; import kreuzberg.config.ChunkingConfig; import kreuzberg.config.ExtractionConfig; public class EmbeddingConfigExample { public static void main(String[] args) { // Example 1: Preset model (recommended) // Fast, balanced, or quality preset configurations optimized for common use cases. EmbeddingConfig embeddingConfig = EmbeddingConfig.builder() .model(EmbeddingModelType.preset("balanced")) .batchSize(32) .normalize(true) .showDownloadProgress(true) .cacheDir("~/.cache/kreuzberg/embeddings") .build(); // Available presets: // - "fast" (384 dims): Quick prototyping, development, resource-constrained // - "balanced" (768 dims): Production, general-purpose RAG, English documents // - "quality" (1024 dims): Complex documents, maximum accuracy // - "multilingual" (768 dims): International documents, 100+ languages // Example 2: Custom ONNX model (requires embeddings feature) // Direct access to specific ONNX embedding models from HuggingFace with custom dimensions. embeddingConfig = EmbeddingConfig.builder() .model(EmbeddingModelType.custom("BAAI/bge-small-en-v1.5", 384)) .batchSize(32) .normalize(true) .showDownloadProgress(true) .cacheDir(null) // Uses default: .kreuzberg/embeddings/ .build(); // Popular ONNX-compatible models: // - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient // - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed // - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower // - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support // Example 3: Alternative Custom ONNX Model // For advanced users wanting different ONNX embedding models. embeddingConfig = EmbeddingConfig.builder() .model(EmbeddingModelType.custom("sentence-transformers/all-mpnet-base-v2", 768)) .batchSize(16) // Larger model requires smaller batch size .normalize(true) .showDownloadProgress(true) .cacheDir("/var/cache/embeddings") .build(); // Integration with ChunkingConfig // Add embeddings to your chunking configuration: ChunkingConfig chunkingConfig = ChunkingConfig.builder() .maxChars(1024) .maxOverlap(100) .preset("balanced") .embedding(EmbeddingConfig.builder() .model(EmbeddingModelType.preset("balanced")) .batchSize(32) .normalize(true) .build()) .build(); ExtractionConfig extractionConfig = ExtractionConfig.builder() .chunking(chunkingConfig) .build(); } } // Key parameter explanations: // // batchSize: Number of texts to embed at once (32-128 typical) // - Larger batches are faster but use more memory // - Smaller batches for resource-constrained environments // // normalize: Whether to normalize vectors (L2 norm) // - true (recommended): Enables cosine similarity in vector DBs // - false: Raw embedding values // // cacheDir: Where to store downloaded models // - null: Uses .kreuzberg/embeddings/ in current directory // - String path: Custom directory for model storage // // showDownloadProgress: Display download progress bar // - Useful for monitoring large model downloads