This commit is contained in:
@@ -0,0 +1,75 @@
|
||||
//! Regression tests for issue #797: chunking preset must not auto-inject embeddings.
|
||||
//!
|
||||
//! When a `ChunkingConfig` carries a `preset` but no explicit `embedding`, the
|
||||
//! extraction pipeline must leave every `chunk.embedding` as `None`. Before
|
||||
//! the fix, `resolve_preset()` silently injected an `EmbeddingConfig`, causing
|
||||
//! `generate_embeddings_for_chunks()` to run and populate embeddings that the
|
||||
//! caller never requested.
|
||||
|
||||
#[cfg(feature = "chunking")]
mod preset_no_embedding {
    use kreuzberg::core::config::{ChunkingConfig, ExtractionConfig};
    use kreuzberg::core::extractor::extract_bytes;

    /// A preset with no explicit `EmbeddingConfig` must not produce chunk embeddings.
    ///
    /// Builds a chunking config that names the `"multilingual"` preset while
    /// leaving `embedding` as `None`, extracts a short plain-text document, and
    /// checks that every resulting chunk has `embedding == None`.
    ///
    /// # Panics
    ///
    /// Panics (fails the test) if extraction errors, if no chunks are returned,
    /// or if any chunk carries an embedding.
    #[tokio::test]
    async fn test_preset_without_embedding_config_produces_no_embeddings() {
        let config = ExtractionConfig {
            chunking: Some(ChunkingConfig {
                preset: Some("multilingual".to_string()),
                embedding: None,
                ..Default::default()
            }),
            ..Default::default()
        };

        // Small plain-text document — forces at least one chunk even with large defaults.
        let text = b"Hello world. This is a short document used to verify that preset-based \
                     chunking does not unexpectedly generate embeddings.";

        let result = extract_bytes(text, "text/plain", &config)
            .await
            .expect("extraction should succeed");

        let chunks = result
            .chunks
            .expect("chunks should be produced when chunking is configured");

        // Without this guard an empty chunk list would let the loop below pass
        // vacuously and the regression would go undetected.
        assert!(
            !chunks.is_empty(),
            "expected at least one chunk; otherwise the embedding check is vacuous (#797)"
        );

        for (i, chunk) in chunks.iter().enumerate() {
            assert!(
                chunk.embedding.is_none(),
                "chunk[{i}] should have no embedding when no EmbeddingConfig was supplied (#797)"
            );
        }
    }

    /// Regression guard: no preset, no embedding — chunks must still have no embeddings.
    ///
    /// Uses explicit `max_characters` / `overlap` values with both `preset` and
    /// `embedding` set to `None`, then checks that every resulting chunk has
    /// `embedding == None`.
    ///
    /// # Panics
    ///
    /// Panics (fails the test) if extraction errors, if no chunks are returned,
    /// or if any chunk carries an embedding.
    #[tokio::test]
    async fn test_no_preset_no_embedding_produces_no_embeddings() {
        let config = ExtractionConfig {
            chunking: Some(ChunkingConfig {
                max_characters: 50,
                overlap: 10,
                preset: None,
                embedding: None,
                ..Default::default()
            }),
            ..Default::default()
        };

        let text = b"Short text that will be chunked without any embedding configuration.";

        let result = extract_bytes(text, "text/plain", &config)
            .await
            .expect("extraction should succeed");

        let chunks = result.chunks.expect("chunks should be produced");

        // Guard against a vacuous pass: the per-chunk assertion below only
        // means something if there is at least one chunk to inspect.
        assert!(
            !chunks.is_empty(),
            "expected at least one chunk; otherwise the embedding check is vacuous"
        );

        for (i, chunk) in chunks.iter().enumerate() {
            assert!(
                chunk.embedding.is_none(),
                "chunk[{i}] must have no embedding when embedding is not configured"
            );
        }
    }
}
|
||||
Reference in New Issue
Block a user