This commit is contained in:
901
crates/kreuzberg/tests/stopwords_integration_test.rs
Normal file
901
crates/kreuzberg/tests/stopwords_integration_test.rs
Normal file
@@ -0,0 +1,901 @@
|
||||
//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
|
||||
//! pub(crate) APIs that the migration deliberately narrowed; gated until
|
||||
//! either (a) these APIs are re-exposed publicly, or (b) the test is
|
||||
//! rewritten against the public extraction surface.
|
||||
|
||||
#![cfg(any())]
|
||||
|
||||
// Original content preserved below inside a block comment; it is recompiled only once both the gating cfg and the comment delimiters are removed.
|
||||
// Disabled by the file-level cfg(any()) above.
|
||||
|
||||
/*
|
||||
//! Integration tests for stopwords with token reduction and keywords extraction.
|
||||
#![cfg(all(feature = "stopwords", feature = "quality"))]
|
||||
//!
|
||||
//! These tests verify that stopwords are properly integrated across different features:
|
||||
//! - Token reduction at all ReductionLevels
|
||||
//! - Keywords extraction (YAKE and RAKE algorithms)
|
||||
//! - CJK text processing
|
||||
//! - Multi-language documents
|
||||
//! - Language fallback mechanisms
|
||||
//! - Custom stopwords
|
||||
|
||||
use kreuzberg::stopwords::{STOPWORDS, get_stopwords, get_stopwords_with_fallback};
|
||||
use kreuzberg::text::token_reduction::{ReductionLevel, TokenReductionConfig, reduce_tokens};
|
||||
|
||||
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
||||
use kreuzberg::keywords::{KeywordConfig, extract_keywords};
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
fn count_stopwords(text: &str, lang: &str) -> usize {
|
||||
let stopwords = get_stopwords(lang).expect("Stopwords must exist for language");
|
||||
let words: Vec<&str> = text.split_whitespace().collect();
|
||||
|
||||
words
|
||||
.iter()
|
||||
.filter(|word| {
|
||||
let clean = word
|
||||
.chars()
|
||||
.filter(|c| c.is_alphabetic())
|
||||
.collect::<String>()
|
||||
.to_lowercase();
|
||||
|
||||
!clean.is_empty() && stopwords.contains(&clean)
|
||||
})
|
||||
.count()
|
||||
}
|
||||
|
||||
fn extract_content_words(text: &str, lang: &str) -> Vec<String> {
|
||||
let stopwords = get_stopwords(lang).expect("Stopwords must exist for language");
|
||||
let words: Vec<&str> = text.split_whitespace().collect();
|
||||
|
||||
words
|
||||
.iter()
|
||||
.filter_map(|word| {
|
||||
let clean = word
|
||||
.chars()
|
||||
.filter(|c| c.is_alphabetic())
|
||||
.collect::<String>()
|
||||
.to_lowercase();
|
||||
|
||||
if !clean.is_empty() && !stopwords.contains(&clean) && clean.len() > 1 {
|
||||
Some(clean)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Moderate reduction with an English hint drops space-delimited stopwords,
/// keeps the content words, and lowers the overall stopword count.
#[test]
fn test_stopwords_removed_during_moderate_token_reduction() {
    let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        language_hint: Some("en".to_string()),
        use_simd: false,
        ..Default::default()
    };
    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

    // Stopwords are checked with surrounding spaces, so only mid-sentence occurrences count.
    for stopword in ["the", "is", "and"] {
        assert!(
            !result.contains(&format!(" {} ", stopword)),
            "Should remove '{}'. Result: {}",
            stopword,
            result
        );
    }

    for term in ["quick", "brown", "fox", "jumping", "lazy"] {
        assert!(result.contains(term), "Should preserve '{}'. Result: {}", term, result);
    }

    let before = count_stopwords(input, "en");
    let after = count_stopwords(&result, "en");
    assert!(
        after < before,
        "Result should have fewer stopwords than original. Original: {}, Result: {}",
        before,
        after
    );
}
|
||||
|
||||
/// Compares Light, Moderate and Aggressive reduction on the same sentence:
/// Light keeps some stopwords, Moderate keeps fewer than Light, and
/// Aggressive output is no longer than Moderate output.
#[test]
fn test_stopwords_across_reduction_levels() {
    let text = "The machine learning model is trained on the large dataset and achieves good performance";

    // Runs the reducer on `text` at the given level with SIMD disabled.
    let reduce_at = |level: ReductionLevel| -> String {
        let config = TokenReductionConfig {
            level,
            use_simd: false,
            ..Default::default()
        };
        reduce_tokens(text, &config, Some("en")).expect("Operation failed")
    };

    let light_result = reduce_at(ReductionLevel::Light);
    let light_stopwords = count_stopwords(&light_result, "en");
    assert!(light_stopwords > 0, "Light reduction should preserve some stopwords");

    let moderate_result = reduce_at(ReductionLevel::Moderate);
    let moderate_stopwords = count_stopwords(&moderate_result, "en");
    assert!(
        moderate_stopwords < light_stopwords,
        "Moderate reduction should remove more stopwords than light. Light: {}, Moderate: {}",
        light_stopwords,
        moderate_stopwords
    );

    let aggressive_result = reduce_at(ReductionLevel::Aggressive);
    assert!(
        aggressive_result.len() <= moderate_result.len(),
        "Aggressive reduction should be more aggressive than moderate"
    );
}
|
||||
|
||||
/// Moderate reduction must keep the key content terms of the sentence, either
/// as a normalized content word or as a raw substring of the output.
#[test]
fn test_stopwords_preserve_semantic_meaning() {
    let input =
        "The artificial intelligence system is processing the natural language text for extracting meaningful insights";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
    let content_words = extract_content_words(&result, "en");

    for term in ["artificial", "intelligence", "processing", "natural", "language"] {
        let in_content_words = content_words.iter().any(|word| word.as_str() == term);
        assert!(
            in_content_words || result.contains(term),
            "Should preserve '{}'. Result: {}",
            term,
            result
        );
    }
}
|
||||
|
||||
/// Moderate reduction lowers the stopword count for English, Spanish and
/// German inputs, and keeps at least one Spanish content word.
#[test]
fn test_stopwords_with_multiple_languages() {
    // Every language uses the same Moderate / no-SIMD configuration.
    let reduce_moderate = |text: &str, lang: &str| -> String {
        let config = TokenReductionConfig {
            level: ReductionLevel::Moderate,
            use_simd: false,
            ..Default::default()
        };
        reduce_tokens(text, &config, Some(lang)).expect("Operation failed")
    };

    // English
    let en_input = "The computer science program is very comprehensive and includes many courses";
    let en_result = reduce_moderate(en_input, "en");
    let en_before = count_stopwords(en_input, "en");
    let en_after = count_stopwords(&en_result, "en");
    assert!(en_after < en_before, "English stopwords should be removed");

    // Spanish
    let es_input = "El programa de ciencias de la computación es muy completo y tiene muchos cursos";
    let es_result = reduce_moderate(es_input, "es");
    let es_before = count_stopwords(es_input, "es");
    let es_after = count_stopwords(&es_result, "es");
    assert!(es_after < es_before, "Spanish stopwords should be removed");

    let keeps_spanish_content = ["programa", "ciencias", "computación"]
        .iter()
        .any(|word| es_result.contains(*word));
    assert!(
        keeps_spanish_content,
        "Should preserve Spanish content words. Result: {}",
        es_result
    );

    // German
    let de_input = "Die künstliche Intelligenz ist ein wichtiges Forschungsgebiet der Informatik";
    let de_result = reduce_moderate(de_input, "de");
    let de_before = count_stopwords(de_input, "de");
    let de_after = count_stopwords(&de_result, "de");
    assert!(de_after < de_before, "German stopwords should be removed");
}
|
||||
|
||||
/// Reducing with the unsupported language code "xyz" should still lower the
/// count of English stopwords in the output.
#[test]
fn test_language_fallback_to_english_stopwords() {
    let input = "The system is processing the data with the algorithm";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    let reduced = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");

    let before = count_stopwords(input, "en");
    let after = count_stopwords(&reduced, "en");
    assert!(
        after < before,
        "Should fallback to English stopwords for unsupported language"
    );
}
|
||||
|
||||
/// Custom English stopwords supplied through the config are removed from the
/// output while other content words survive.
#[test]
fn test_custom_stopwords_integration() {
    let custom_terms = ["algorithm", "system", "data"];
    let custom_stopwords = HashMap::from([(
        "en".to_string(),
        custom_terms.iter().map(|term| term.to_string()).collect::<Vec<String>>(),
    )]);

    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        custom_stopwords: Some(custom_stopwords),
        ..Default::default()
    };

    let input = "The algorithm processes the data in the system efficiently";
    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

    for term in custom_terms {
        assert!(
            !result.contains(term),
            "Should remove custom stopword '{}'. Result: {}",
            term,
            result
        );
    }

    let keeps_content = ["processes", "efficiently"].iter().any(|word| result.contains(*word));
    assert!(
        keeps_content,
        "Should preserve non-stopword content. Result: {}",
        result
    );
}
|
||||
|
||||
/// Chinese input reduced at Moderate level yields non-empty output that still
/// contains at least one of the key terms.
#[test]
fn test_stopwords_with_chinese_text() {
    let input = "这个人工智能系统可以处理自然语言";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    let result = reduce_tokens(input, &config, Some("zh")).expect("Operation failed");

    assert!(
        !result.is_empty(),
        "Chinese text should be processed. Result: {}",
        result
    );

    let keeps_key_term = ["人工", "智能", "语言"].iter().any(|term| result.contains(*term));
    assert!(
        keeps_key_term,
        "Should preserve important Chinese terms. Result: {}",
        result
    );
}
|
||||
|
||||
/// Mixed English/Chinese input: the English article is removed while both
/// English and Chinese content survive.
#[test]
fn test_stopwords_with_mixed_cjk_english() {
    let input = "The machine learning model 机器学习模型 is processing data efficiently";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

    let has_article = result.contains(" the ") || result.contains("The ");
    assert!(!has_article, "Should remove English 'the'. Result: {}", result);

    let keeps_english = ["machine", "learning"].iter().any(|word| result.contains(*word));
    assert!(keeps_english, "Should preserve English content. Result: {}", result);

    let keeps_chinese = ["机器", "学习", "模型"].iter().any(|word| result.contains(*word));
    assert!(keeps_chinese, "Should preserve Chinese content. Result: {}", result);
}
|
||||
|
||||
/// Japanese input reduced at Moderate level must produce non-empty output.
#[test]
fn test_stopwords_with_japanese_text() {
    let input = "人工知能技術の研究開発";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    let reduced = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");

    assert!(
        !reduced.is_empty(),
        "Japanese text should be processed. Result: {}",
        reduced
    );
}
|
||||
|
||||
/// Korean input reduced at Moderate level must produce non-empty output.
#[test]
fn test_stopwords_with_korean_text() {
    let input = "인공 지능 기술 개발";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    let reduced = reduce_tokens(input, &config, Some("ko")).expect("Operation failed");

    assert!(
        !reduced.is_empty(),
        "Korean text should be processed. Result: {}",
        reduced
    );
}
|
||||
|
||||
/// RAKE keywords for English text must not consist solely of stopwords, and
/// at least one keyword must mention an expected technical term.
#[cfg(feature = "keywords-rake")]
#[test]
fn test_stopwords_excluded_from_rake_keywords() {
    let text = "The machine learning model is trained on a large dataset. \
                The model uses neural networks and deep learning algorithms. \
                The training process requires significant computational resources.";

    let config = KeywordConfig::rake().with_language("en").with_max_keywords(10);
    let keywords = extract_keywords(text, &config).expect("Operation failed");
    assert!(!keywords.is_empty(), "Should extract keywords");

    let en_stopwords = get_stopwords("en").expect("English stopwords must exist");

    for keyword in &keywords {
        // A keyword passes when at least one of its words is absent from the stopword set.
        let has_non_stopword = keyword.text.split_whitespace().any(|word| {
            let normalized = word
                .chars()
                .filter(|ch| ch.is_alphabetic())
                .collect::<String>()
                .to_lowercase();
            !en_stopwords.contains(&normalized)
        });

        assert!(
            has_non_stopword,
            "Keyword '{}' should not be composed entirely of stopwords",
            keyword.text
        );
    }

    let keyword_texts: Vec<String> = keywords.iter().map(|k| k.text.to_lowercase()).collect();

    let expected_terms = [
        "machine learning",
        "neural networks",
        "deep learning",
        "dataset",
        "model",
        "training",
    ];
    let found_expected = keyword_texts
        .iter()
        .any(|text| expected_terms.iter().any(|term| text.contains(*term)));

    assert!(
        found_expected,
        "Should extract meaningful technical keywords. Got: {:?}",
        keyword_texts
    );
}
|
||||
|
||||
/// Every YAKE keyword for English text must contain at least one word that
/// normalizes to a non-empty, non-stopword token.
#[cfg(feature = "keywords-yake")]
#[test]
fn test_stopwords_excluded_from_yake_keywords() {
    let text = "Natural language processing enables computers to understand human language. \
                Deep learning models achieve state-of-the-art performance in text analysis. \
                These systems can extract meaningful information from large text corpora.";

    let config = KeywordConfig::yake().with_language("en").with_max_keywords(10);
    let keywords = extract_keywords(text, &config).expect("Operation failed");
    assert!(!keywords.is_empty(), "Should extract keywords");

    let en_stopwords = get_stopwords("en").expect("English stopwords must exist");

    for keyword in &keywords {
        let has_content_word = keyword
            .text
            .split_whitespace()
            .map(|word| {
                word.chars()
                    .filter(|ch| ch.is_alphabetic())
                    .collect::<String>()
                    .to_lowercase()
            })
            .any(|normalized| !normalized.is_empty() && !en_stopwords.contains(&normalized));

        assert!(
            has_content_word,
            "Keyword '{}' should contain at least one content word (non-stopword)",
            keyword.text
        );
    }
}
|
||||
|
||||
/// RAKE keywords for Spanish text must not consist solely of Spanish
/// stopwords, and at least one keyword must mention an expected term.
#[cfg(feature = "keywords-rake")]
#[test]
fn test_keywords_respect_language_specific_stopwords() {
    let spanish_text = "El aprendizaje automático es una rama de la inteligencia artificial. \
                        Los modelos de aprendizaje profundo logran un rendimiento excepcional. \
                        Estos sistemas pueden procesar grandes cantidades de datos.";

    let config = KeywordConfig::rake().with_language("es").with_max_keywords(8);
    let keywords = extract_keywords(spanish_text, &config).expect("Operation failed");
    assert!(!keywords.is_empty(), "Should extract Spanish keywords");

    let es_stopwords = get_stopwords("es").expect("Spanish stopwords must exist");

    for keyword in &keywords {
        // A keyword passes when at least one of its words is absent from the stopword set.
        let has_non_stopword = keyword.text.split_whitespace().any(|word| {
            let normalized = word
                .chars()
                .filter(|ch| ch.is_alphabetic())
                .collect::<String>()
                .to_lowercase();
            !es_stopwords.contains(&normalized)
        });

        assert!(
            has_non_stopword,
            "Spanish keyword '{}' should not be all stopwords",
            keyword.text
        );
    }

    let keyword_texts: Vec<String> = keywords.iter().map(|k| k.text.to_lowercase()).collect();

    let expected_terms = ["aprendizaje", "inteligencia", "modelos", "datos"];
    let found_expected = keyword_texts
        .iter()
        .any(|text| expected_terms.iter().any(|term| text.contains(*term)));

    assert!(
        found_expected,
        "Should extract meaningful Spanish keywords. Got: {:?}",
        keyword_texts
    );
}
|
||||
|
||||
/// Input made only of English stopwords must come out shorter (in bytes)
/// after Moderate reduction.
#[test]
fn test_all_stopwords_text_reduction() {
    let input = "the is a an and or but of to in for on at by";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    let reduced = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

    assert!(
        reduced.len() < input.len(),
        "Text of all stopwords should be significantly reduced"
    );
}
|
||||
|
||||
/// Input made only of technical terms: every term must still appear in the
/// output, compared case-insensitively.
#[test]
fn test_no_stopwords_text_reduction() {
    let input = "PyTorch TensorFlow CUDA GPU optimization benchmark performance metrics";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
    let result_lower = result.to_lowercase();

    for word in input.split_whitespace() {
        let needle = word.to_lowercase();
        assert!(
            result_lower.contains(&needle),
            "Technical term '{}' should be preserved. Result: {}",
            word,
            result
        );
    }
}
|
||||
|
||||
/// Reduces a sentence with title-case stopwords and upper-case content words,
/// then checks that no lowercase "the"/"is" token remains and that the
/// upper-case content words are still present verbatim.
#[test]
fn test_mixed_case_stopwords_removal() {
    let input = "The SYSTEM Is Processing The DATA With The ALGORITHM";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

    // NOTE(review): only exact lowercase tokens are matched here; title-case
    // "The"/"Is" in the output would not trip these assertions.
    for stopword in ["the", "is"] {
        let absent = result.split_whitespace().all(|token| token != stopword);
        assert!(absent, "Should remove lowercase '{}'. Result: {}", stopword, result);
    }

    for term in ["SYSTEM", "DATA", "ALGORITHM"] {
        assert!(result.contains(term), "Should preserve '{}'. Result: {}", term, result);
    }
}
|
||||
|
||||
/// `reduce_tokens` at Moderate level lowers the English stopword count and
/// keeps at least one of the leading content words.
#[test]
fn test_reduce_tokens_function_with_stopwords() {
    let text = "The artificial intelligence system processes the natural language efficiently";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");

    let before = count_stopwords(text, "en");
    let after = count_stopwords(&result, "en");
    assert!(
        after < before,
        "reduce_tokens should remove stopwords. Original: {}, Result: {}",
        before,
        after
    );

    let keeps_content = ["artificial", "intelligence"].iter().any(|word| result.contains(*word));
    assert!(keeps_content, "Should preserve content words. Result: {}", result);
}
|
||||
|
||||
/// Punctuated input: either no space-delimited "the" remains, or fewer than
/// three output tokens contain "the"; content words must survive.
#[test]
fn test_stopwords_with_punctuation() {
    let input = "The system, which is processing the data, uses the algorithm.";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

    let tokens_with_the = result.split_whitespace().filter(|token| token.contains("the")).count();
    let no_spaced_the = !result.contains(" the ");
    assert!(
        no_spaced_the || tokens_with_the < 3,
        "Should remove most instances of 'the'. Result: {}",
        result
    );

    let keeps_content = ["system", "processing", "algorithm"]
        .iter()
        .any(|word| result.contains(*word));
    assert!(keeps_content, "Should preserve content words. Result: {}", result);
}
|
||||
|
||||
/// Numbers in the input must survive Moderate reduction alongside content words.
///
/// Each number is matched as a whole run of ASCII digits. A plain substring
/// check for "10" would be satisfied by the "10" inside "100" and so would
/// pass even if the standalone "10" had been dropped.
#[test]
fn test_stopwords_with_numbers() {
    let input = "The model has 100 layers and processes the data in 10 seconds";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");

    // Splits the output on every non-digit, leaving maximal digit runs to compare against.
    let has_number = |expected: &str| {
        result
            .split(|ch: char| !ch.is_ascii_digit())
            .any(|run| run == expected)
    };

    assert!(
        has_number("100"),
        "Should preserve number '100'. Result: {}",
        result
    );
    assert!(has_number("10"), "Should preserve number '10'. Result: {}", result);

    let keeps_content = ["model", "layers", "processes"]
        .iter()
        .any(|word| result.contains(*word));
    assert!(keeps_content, "Should preserve content words. Result: {}", result);
}
|
||||
|
||||
/// Three consecutive reductions of the same input with the same config must
/// produce identical output.
#[test]
fn test_stopwords_removal_consistency_across_calls() {
    let input = "The machine learning model is trained on the dataset";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };

    // All three runs happen before any comparison is made.
    let runs: Vec<String> = (0..3)
        .map(|_| reduce_tokens(input, &config, Some("en")).expect("Operation failed"))
        .collect();

    assert_eq!(runs[0], runs[1], "Results should be consistent across calls");
    assert_eq!(runs[1], runs[2], "Results should be consistent across calls");
}
|
||||
|
||||
/// A stopword-heavy paragraph repeated ten times must shrink in byte length
/// and in stopword count under Moderate reduction with parallelism disabled.
#[test]
fn test_stopwords_with_long_text() {
    let paragraph = "The machine learning model is trained on the large dataset. \
                     The training process uses the neural network architecture. \
                     The system processes the data efficiently and achieves the best performance. ";
    let input = paragraph.repeat(10);

    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        enable_parallel: false,
        ..Default::default()
    };

    let result = reduce_tokens(&input, &config, Some("en")).expect("Operation failed");

    let (input_len, result_len) = (input.len(), result.len());
    assert!(
        result_len < input_len,
        "Long stopword-heavy text should be reduced. Input: {} chars, Result: {} chars",
        input_len,
        result_len
    );

    let before = count_stopwords(&input, "en");
    let after = count_stopwords(&result, "en");
    assert!(
        after < before,
        "Should remove stopwords from long text. Original: {}, Result: {}",
        before,
        after
    );
}
|
||||
|
||||
/// `get_stopwords_with_fallback("xyz", "en")` must return a set of the same
/// size as the English set, and reducing with language "xyz" must drop the
/// space-delimited "the" and "is".
#[test]
fn test_get_stopwords_with_fallback_in_reduction() {
    let fallback = get_stopwords_with_fallback("xyz", "en");
    assert!(fallback.is_some(), "Should fallback to English");

    let english = get_stopwords("en").expect("Operation failed");
    let fallback_len = fallback.expect("Operation failed").len();
    assert_eq!(fallback_len, english.len(), "Fallback should return English stopwords");

    let input = "The system is processing the data";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };
    let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");

    let has_stopword = result.contains(" the ") || result.contains(" is ");
    assert!(!has_stopword, "Should use fallback stopwords. Result: {}", result);
}
|
||||
|
||||
/// The registry must hold exactly 64 language sets, and the English set must
/// have at least 70 entries including a handful of very common words.
#[test]
fn test_stopwords_registry_completeness() {
    assert_eq!(STOPWORDS.len(), 64, "Should have exactly 64 language stopword sets");

    let en_stopwords = get_stopwords("en").expect("English stopwords must exist");
    assert!(en_stopwords.len() >= 70, "English should have at least 70 stopwords");

    for word in ["the", "is", "and", "a", "an", "of", "to", "in", "for"] {
        assert!(en_stopwords.contains(word), "Should contain '{}'", word);
    }
}
|
||||
|
||||
/// A NaN semantic threshold at Maximum level must not produce control
/// characters in the output; a reduction error is treated as empty output.
#[test]
fn test_token_reduction_handles_nan_threshold() {
    let config = TokenReductionConfig {
        level: ReductionLevel::Maximum,
        semantic_threshold: f32::NAN,
        enable_semantic_clustering: true,
        target_reduction: Some(0.5),
        language_hint: Some("en".to_string()),
        ..Default::default()
    };

    let input = "Critical system update highlights performance improvements across distributed modules.";

    // An `Err` is tolerated here: it collapses to an empty string, which passes the check below.
    let result = reduce_tokens(input, &config, Some("en")).unwrap_or_default();

    let has_control_char = result.chars().any(|ch| ch.is_control());
    assert!(
        !has_control_char,
        "Result should not contain unexpected control characters"
    );
}
|
||||
|
||||
/// Japanese text containing an emoji must reduce without error and keep at
/// least one of the two key multibyte terms.
#[test]
fn test_token_reduction_handles_multibyte_utf8() {
    let input = "品質管理は重要です。🚀 高速抽出と漢字処理が求められています。";
    let config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        language_hint: Some("ja".to_string()),
        ..Default::default()
    };

    let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");

    let keeps_key_term = ["品質管理", "漢字処理"].iter().any(|term| result.contains(*term));
    assert!(
        keeps_key_term,
        "Important multibyte terms should survive reduction: {}",
        result
    );
}
|
||||
|
||||
/// Runs Aggressive reduction from eight scoped threads sharing one config and
/// requires each call to succeed with non-empty output.
#[test]
fn test_token_reduction_concurrent_access() {
    use std::sync::Arc;

    let input = "Concurrent reduction ensures thread safety without deadlocks or panics.";
    let shared_config = Arc::new(TokenReductionConfig {
        level: ReductionLevel::Aggressive,
        enable_parallel: true,
        ..Default::default()
    });

    // The scope joins every spawned thread before returning.
    std::thread::scope(|scope| {
        (0..8).for_each(|_| {
            let worker_config = Arc::clone(&shared_config);
            scope.spawn(move || {
                let reduced = reduce_tokens(input, &worker_config, Some("en")).expect("Operation failed");
                assert!(!reduced.is_empty());
            });
        });
    });
}
|
||||
/// Prints before/after samples of token reduction for English and Chinese
/// text, a comparison across reduction levels, and basic stopword statistics.
///
/// This is a demonstration: it makes no assertions about the reduced text and
/// only fails if a reduction or stopword lookup returns an error.
#[test]
fn demo_stopwords_effectiveness() {
    use kreuzberg::stopwords::get_stopwords;
    use kreuzberg::text::token_reduction::{ReductionLevel, TokenReductionConfig, reduce_tokens};

    // Percentage by which `after` is smaller than `before` (byte lengths).
    // Saturates at 0 so output longer than the input cannot underflow `usize`
    // and panic in debug builds. `before` is always a non-empty literal here.
    let percent_reduced = |before: usize, after: usize| -> usize { 100usize.saturating_sub(after * 100 / before) };

    let en_text = "The machine learning model is trained on the large dataset and achieves good performance";
    let en_config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };
    let en_result = reduce_tokens(en_text, &en_config, Some("en")).expect("Operation failed");

    println!("\n=== English Example ===");
    println!("BEFORE: {} chars", en_text.len());
    println!("{}", en_text);
    println!(
        "\nAFTER: {} chars ({}% reduction)",
        en_result.len(),
        percent_reduced(en_text.len(), en_result.len())
    );
    println!("{}", en_result);

    let zh_text = "这个人工智能系统可以处理自然语言";
    let zh_config = TokenReductionConfig {
        level: ReductionLevel::Moderate,
        use_simd: false,
        ..Default::default()
    };
    let zh_result = reduce_tokens(zh_text, &zh_config, Some("zh")).expect("Operation failed");

    println!("\n=== Chinese Example ===");
    println!("BEFORE: {}", zh_text);
    println!("AFTER: {}", zh_result);

    let text = "The artificial intelligence system processes the natural language efficiently";

    println!("\n=== Reduction Level Comparison ===");
    println!("ORIGINAL: {}", text);

    for level in [
        ReductionLevel::Light,
        ReductionLevel::Moderate,
        ReductionLevel::Aggressive,
    ] {
        let config = TokenReductionConfig {
            level,
            use_simd: false,
            ..Default::default()
        };
        let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");
        println!(
            "{:?}: {} chars -> {} chars ({}% reduction)",
            level,
            text.len(),
            result.len(),
            percent_reduced(text.len(), result.len())
        );
        println!(" {}", result);
    }

    let stopwords = get_stopwords("en").expect("Operation failed");
    println!("\n=== Stopwords Stats ===");
    println!("English stopwords: {}", stopwords.len());
    println!("Sample stopwords: {:?}", stopwords.iter().take(10).collect::<Vec<_>>());
}
|
||||
|
||||
*/
|
||||
Reference in New Issue
Block a user