Files
fil/crates/kreuzberg/tests/issue_797_preset_embedding_regression.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

76 lines
2.8 KiB
Rust

//! Regression tests for issue #797: chunking preset must not auto-inject embeddings.
//!
//! When a `ChunkingConfig` carries a `preset` but no explicit `embedding`, the
//! extraction pipeline must leave every `chunk.embedding` as `None`. Before
//! the fix, `resolve_preset()` silently injected an `EmbeddingConfig`, causing
//! `generate_embeddings_for_chunks()` to run and populate embeddings that the
//! caller never requested.
#[cfg(feature = "chunking")]
mod preset_no_embedding {
use kreuzberg::core::config::{ChunkingConfig, ExtractionConfig};
use kreuzberg::core::extractor::extract_bytes;
/// A preset with no explicit `EmbeddingConfig` must not produce chunk embeddings.
#[tokio::test]
async fn test_preset_without_embedding_config_produces_no_embeddings() {
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
preset: Some("multilingual".to_string()),
embedding: None,
..Default::default()
}),
..Default::default()
};
// Small plain-text document — forces at least one chunk even with large defaults.
let text = b"Hello world. This is a short document used to verify that preset-based \
chunking does not unexpectedly generate embeddings.";
let result = extract_bytes(text, "text/plain", &config)
.await
.expect("extraction should succeed");
let chunks = result
.chunks
.expect("chunks should be produced when chunking is configured");
for (i, chunk) in chunks.iter().enumerate() {
assert!(
chunk.embedding.is_none(),
"chunk[{i}] should have no embedding when no EmbeddingConfig was supplied (#797)"
);
}
}
/// Regression guard: no preset, no embedding — chunks must still have no embeddings.
#[tokio::test]
async fn test_no_preset_no_embedding_produces_no_embeddings() {
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 50,
overlap: 10,
preset: None,
embedding: None,
..Default::default()
}),
..Default::default()
};
let text = b"Short text that will be chunked without any embedding configuration.";
let result = extract_bytes(text, "text/plain", &config)
.await
.expect("extraction should succeed");
let chunks = result.chunks.expect("chunks should be produced");
for (i, chunk) in chunks.iter().enumerate() {
assert!(
chunk.embedding.is_none(),
"chunk[{i}] must have no embedding when embedding is not configured"
);
}
}
}