76 lines
2.8 KiB
Rust
76 lines
2.8 KiB
Rust
|
|
//! Regression tests for issue #797: chunking preset must not auto-inject embeddings.
|
||
|
|
//!
|
||
|
|
//! When a `ChunkingConfig` carries a `preset` but no explicit `embedding`, the
|
||
|
|
//! extraction pipeline must leave every `chunk.embedding` as `None`. Before
|
||
|
|
//! the fix, `resolve_preset()` silently injected an `EmbeddingConfig`, causing
|
||
|
|
//! `generate_embeddings_for_chunks()` to run and populate embeddings that the
|
||
|
|
//! caller never requested.
|
||
|
|
|
||
|
|
#[cfg(feature = "chunking")]
|
||
|
|
mod preset_no_embedding {
|
||
|
|
use kreuzberg::core::config::{ChunkingConfig, ExtractionConfig};
|
||
|
|
use kreuzberg::core::extractor::extract_bytes;
|
||
|
|
|
||
|
|
/// A preset with no explicit `EmbeddingConfig` must not produce chunk embeddings.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_preset_without_embedding_config_produces_no_embeddings() {
|
||
|
|
let config = ExtractionConfig {
|
||
|
|
chunking: Some(ChunkingConfig {
|
||
|
|
preset: Some("multilingual".to_string()),
|
||
|
|
embedding: None,
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
// Small plain-text document — forces at least one chunk even with large defaults.
|
||
|
|
let text = b"Hello world. This is a short document used to verify that preset-based \
|
||
|
|
chunking does not unexpectedly generate embeddings.";
|
||
|
|
|
||
|
|
let result = extract_bytes(text, "text/plain", &config)
|
||
|
|
.await
|
||
|
|
.expect("extraction should succeed");
|
||
|
|
|
||
|
|
let chunks = result
|
||
|
|
.chunks
|
||
|
|
.expect("chunks should be produced when chunking is configured");
|
||
|
|
|
||
|
|
for (i, chunk) in chunks.iter().enumerate() {
|
||
|
|
assert!(
|
||
|
|
chunk.embedding.is_none(),
|
||
|
|
"chunk[{i}] should have no embedding when no EmbeddingConfig was supplied (#797)"
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Regression guard: no preset, no embedding — chunks must still have no embeddings.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_no_preset_no_embedding_produces_no_embeddings() {
|
||
|
|
let config = ExtractionConfig {
|
||
|
|
chunking: Some(ChunkingConfig {
|
||
|
|
max_characters: 50,
|
||
|
|
overlap: 10,
|
||
|
|
preset: None,
|
||
|
|
embedding: None,
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
let text = b"Short text that will be chunked without any embedding configuration.";
|
||
|
|
|
||
|
|
let result = extract_bytes(text, "text/plain", &config)
|
||
|
|
.await
|
||
|
|
.expect("extraction should succeed");
|
||
|
|
|
||
|
|
let chunks = result.chunks.expect("chunks should be produced");
|
||
|
|
|
||
|
|
for (i, chunk) in chunks.iter().enumerate() {
|
||
|
|
assert!(
|
||
|
|
chunk.embedding.is_none(),
|
||
|
|
"chunk[{i}] must have no embedding when embedding is not configured"
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|