//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises //! pub(crate) APIs that the migration deliberately narrowed; gated until //! either (a) these APIs are re-exposed publicly, or (b) the test is //! rewritten against the public extraction surface. #![cfg(any())] // Original content preserved below; recompiled once gating cfg drops. // Disabled by the file-level cfg(any()) above. /* //! Integration tests for stopwords with token reduction and keywords extraction. #![cfg(all(feature = "stopwords", feature = "quality"))] //! //! These tests verify that stopwords are properly integrated across different features: //! - Token reduction at all ReductionLevels //! - Keywords extraction (YAKE and RAKE algorithms) //! - CJK text processing //! - Multi-language documents //! - Language fallback mechanisms //! - Custom stopwords use kreuzberg::stopwords::{STOPWORDS, get_stopwords, get_stopwords_with_fallback}; use kreuzberg::text::token_reduction::{ReductionLevel, TokenReductionConfig, reduce_tokens}; #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))] use kreuzberg::keywords::{KeywordConfig, extract_keywords}; use std::collections::HashMap; fn count_stopwords(text: &str, lang: &str) -> usize { let stopwords = get_stopwords(lang).expect("Stopwords must exist for language"); let words: Vec<&str> = text.split_whitespace().collect(); words .iter() .filter(|word| { let clean = word .chars() .filter(|c| c.is_alphabetic()) .collect::() .to_lowercase(); !clean.is_empty() && stopwords.contains(&clean) }) .count() } fn extract_content_words(text: &str, lang: &str) -> Vec { let stopwords = get_stopwords(lang).expect("Stopwords must exist for language"); let words: Vec<&str> = text.split_whitespace().collect(); words .iter() .filter_map(|word| { let clean = word .chars() .filter(|c| c.is_alphabetic()) .collect::() .to_lowercase(); if !clean.is_empty() && !stopwords.contains(&clean) && clean.len() > 1 { Some(clean) } else { None } }) .collect() } #[test] fn test_stopwords_removed_during_moderate_token_reduction() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, language_hint: Some("en".to_string()), use_simd: false, ..Default::default() }; let input = "The quick brown fox is jumping over the lazy dog and running through the forest"; let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed"); assert!(!result.contains(" the "), "Should remove 'the'. Result: {}", result); assert!(!result.contains(" is "), "Should remove 'is'. Result: {}", result); assert!(!result.contains(" and "), "Should remove 'and'. Result: {}", result); assert!(result.contains("quick"), "Should preserve 'quick'. Result: {}", result); assert!(result.contains("brown"), "Should preserve 'brown'. Result: {}", result); assert!(result.contains("fox"), "Should preserve 'fox'. Result: {}", result); assert!( result.contains("jumping"), "Should preserve 'jumping'. Result: {}", result ); assert!(result.contains("lazy"), "Should preserve 'lazy'. Result: {}", result); let original_stopwords = count_stopwords(input, "en"); let result_stopwords = count_stopwords(&result, "en"); assert!( result_stopwords < original_stopwords, "Result should have fewer stopwords than original. Original: {}, Result: {}", original_stopwords, result_stopwords ); } #[test] fn test_stopwords_across_reduction_levels() { let text = "The machine learning model is trained on the large dataset and achieves good performance"; let light_config = TokenReductionConfig { level: ReductionLevel::Light, use_simd: false, ..Default::default() }; let light_result = reduce_tokens(text, &light_config, Some("en")).expect("Operation failed"); let light_stopwords = count_stopwords(&light_result, "en"); assert!(light_stopwords > 0, "Light reduction should preserve some stopwords"); let moderate_config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let moderate_result = reduce_tokens(text, &moderate_config, Some("en")).expect("Operation failed"); let moderate_stopwords = count_stopwords(&moderate_result, "en"); assert!( moderate_stopwords < light_stopwords, "Moderate reduction should remove more stopwords than light. Light: {}, Moderate: {}", light_stopwords, moderate_stopwords ); let aggressive_config = TokenReductionConfig { level: ReductionLevel::Aggressive, use_simd: false, ..Default::default() }; let aggressive_result = reduce_tokens(text, &aggressive_config, Some("en")).expect("Operation failed"); assert!( aggressive_result.len() <= moderate_result.len(), "Aggressive reduction should be more aggressive than moderate" ); } #[test] fn test_stopwords_preserve_semantic_meaning() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "The artificial intelligence system is processing the natural language text for extracting meaningful insights"; let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed"); let content_words = extract_content_words(&result, "en"); assert!( content_words.contains(&"artificial".to_string()) || result.contains("artificial"), "Should preserve 'artificial'. Result: {}", result ); assert!( content_words.contains(&"intelligence".to_string()) || result.contains("intelligence"), "Should preserve 'intelligence'. Result: {}", result ); assert!( content_words.contains(&"processing".to_string()) || result.contains("processing"), "Should preserve 'processing'. Result: {}", result ); assert!( content_words.contains(&"natural".to_string()) || result.contains("natural"), "Should preserve 'natural'. Result: {}", result ); assert!( content_words.contains(&"language".to_string()) || result.contains("language"), "Should preserve 'language'. Result: {}", result ); } #[test] fn test_stopwords_with_multiple_languages() { let en_config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let en_input = "The computer science program is very comprehensive and includes many courses"; let en_result = reduce_tokens(en_input, &en_config, Some("en")).expect("Operation failed"); let en_original_stopwords = count_stopwords(en_input, "en"); let en_result_stopwords = count_stopwords(&en_result, "en"); assert!( en_result_stopwords < en_original_stopwords, "English stopwords should be removed" ); let es_config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let es_input = "El programa de ciencias de la computación es muy completo y tiene muchos cursos"; let es_result = reduce_tokens(es_input, &es_config, Some("es")).expect("Operation failed"); let es_original_stopwords = count_stopwords(es_input, "es"); let es_result_stopwords = count_stopwords(&es_result, "es"); assert!( es_result_stopwords < es_original_stopwords, "Spanish stopwords should be removed" ); assert!( es_result.contains("programa") || es_result.contains("ciencias") || es_result.contains("computación"), "Should preserve Spanish content words. Result: {}", es_result ); let de_config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let de_input = "Die künstliche Intelligenz ist ein wichtiges Forschungsgebiet der Informatik"; let de_result = reduce_tokens(de_input, &de_config, Some("de")).expect("Operation failed"); let de_original_stopwords = count_stopwords(de_input, "de"); let de_result_stopwords = count_stopwords(&de_result, "de"); assert!( de_result_stopwords < de_original_stopwords, "German stopwords should be removed" ); } #[test] fn test_language_fallback_to_english_stopwords() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "The system is processing the data with the algorithm"; let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed"); let original_stopwords = count_stopwords(input, "en"); let result_stopwords = count_stopwords(&result, "en"); assert!( result_stopwords < original_stopwords, "Should fallback to English stopwords for unsupported language" ); } #[test] fn test_custom_stopwords_integration() { let mut custom_stopwords = HashMap::new(); custom_stopwords.insert( "en".to_string(), vec!["algorithm".to_string(), "system".to_string(), "data".to_string()], ); let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, custom_stopwords: Some(custom_stopwords), ..Default::default() }; let input = "The algorithm processes the data in the system efficiently"; let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed"); assert!( !result.contains("algorithm"), "Should remove custom stopword 'algorithm'. Result: {}", result ); assert!( !result.contains("system"), "Should remove custom stopword 'system'. Result: {}", result ); assert!( !result.contains("data"), "Should remove custom stopword 'data'. Result: {}", result ); assert!( result.contains("processes") || result.contains("efficiently"), "Should preserve non-stopword content. Result: {}", result ); } #[test] fn test_stopwords_with_chinese_text() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "这个人工智能系统可以处理自然语言"; let result = reduce_tokens(input, &config, Some("zh")).expect("Operation failed"); assert!( !result.is_empty(), "Chinese text should be processed. Result: {}", result ); assert!( result.contains("人工") || result.contains("智能") || result.contains("语言"), "Should preserve important Chinese terms. Result: {}", result ); } #[test] fn test_stopwords_with_mixed_cjk_english() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "The machine learning model 机器学习模型 is processing data efficiently"; let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed"); assert!( !result.contains(" the ") && !result.contains("The "), "Should remove English 'the'. Result: {}", result ); assert!( result.contains("machine") || result.contains("learning"), "Should preserve English content. Result: {}", result ); assert!( result.contains("机器") || result.contains("学习") || result.contains("模型"), "Should preserve Chinese content. Result: {}", result ); } #[test] fn test_stopwords_with_japanese_text() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "人工知能技術の研究開発"; let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed"); assert!( !result.is_empty(), "Japanese text should be processed. Result: {}", result ); } #[test] fn test_stopwords_with_korean_text() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "인공 지능 기술 개발"; let result = reduce_tokens(input, &config, Some("ko")).expect("Operation failed"); assert!( !result.is_empty(), "Korean text should be processed. Result: {}", result ); } #[cfg(feature = "keywords-rake")] #[test] fn test_stopwords_excluded_from_rake_keywords() { let text = "The machine learning model is trained on a large dataset. \ The model uses neural networks and deep learning algorithms. \ The training process requires significant computational resources."; let config = KeywordConfig::rake().with_language("en").with_max_keywords(10); let keywords = extract_keywords(text, &config).expect("Operation failed"); assert!(!keywords.is_empty(), "Should extract keywords"); let en_stopwords = get_stopwords("en").expect("English stopwords must exist"); for keyword in &keywords { let words: Vec<&str> = keyword.text.split_whitespace().collect(); let all_stopwords = words.iter().all(|word| { let clean = word .chars() .filter(|c| c.is_alphabetic()) .collect::() .to_lowercase(); en_stopwords.contains(&clean) }); assert!( !all_stopwords, "Keyword '{}' should not be composed entirely of stopwords", keyword.text ); } let keyword_texts: Vec = keywords.iter().map(|k| k.text.to_lowercase()).collect(); assert!( keyword_texts.iter().any(|k| k.contains("machine learning") || k.contains("neural networks") || k.contains("deep learning") || k.contains("dataset") || k.contains("model") || k.contains("training")), "Should extract meaningful technical keywords. Got: {:?}", keyword_texts ); } #[cfg(feature = "keywords-yake")] #[test] fn test_stopwords_excluded_from_yake_keywords() { let text = "Natural language processing enables computers to understand human language. \ Deep learning models achieve state-of-the-art performance in text analysis. \ These systems can extract meaningful information from large text corpora."; let config = KeywordConfig::yake().with_language("en").with_max_keywords(10); let keywords = extract_keywords(text, &config).expect("Operation failed"); assert!(!keywords.is_empty(), "Should extract keywords"); let en_stopwords = get_stopwords("en").expect("English stopwords must exist"); for keyword in &keywords { let has_content_word = keyword.text.split_whitespace().any(|word| { let clean = word .chars() .filter(|c| c.is_alphabetic()) .collect::() .to_lowercase(); !clean.is_empty() && !en_stopwords.contains(&clean) }); assert!( has_content_word, "Keyword '{}' should contain at least one content word (non-stopword)", keyword.text ); } } #[cfg(feature = "keywords-rake")] #[test] fn test_keywords_respect_language_specific_stopwords() { let spanish_text = "El aprendizaje automático es una rama de la inteligencia artificial. \ Los modelos de aprendizaje profundo logran un rendimiento excepcional. \ Estos sistemas pueden procesar grandes cantidades de datos."; let config = KeywordConfig::rake().with_language("es").with_max_keywords(8); let keywords = extract_keywords(spanish_text, &config).expect("Operation failed"); assert!(!keywords.is_empty(), "Should extract Spanish keywords"); let es_stopwords = get_stopwords("es").expect("Spanish stopwords must exist"); for keyword in &keywords { let words: Vec<&str> = keyword.text.split_whitespace().collect(); let all_stopwords = words.iter().all(|word| { let clean = word .chars() .filter(|c| c.is_alphabetic()) .collect::() .to_lowercase(); es_stopwords.contains(&clean) }); assert!( !all_stopwords, "Spanish keyword '{}' should not be all stopwords", keyword.text ); } let keyword_texts: Vec = keywords.iter().map(|k| k.text.to_lowercase()).collect(); assert!( keyword_texts.iter().any(|k| k.contains("aprendizaje") || k.contains("inteligencia") || k.contains("modelos") || k.contains("datos")), "Should extract meaningful Spanish keywords. Got: {:?}", keyword_texts ); } #[test] fn test_all_stopwords_text_reduction() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "the is a an and or but of to in for on at by"; let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed"); assert!( result.len() < input.len(), "Text of all stopwords should be significantly reduced" ); } #[test] fn test_no_stopwords_text_reduction() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "PyTorch TensorFlow CUDA GPU optimization benchmark performance metrics"; let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed"); let input_words: Vec<&str> = input.split_whitespace().collect(); let result_lower = result.to_lowercase(); for word in input_words { let word_lower = word.to_lowercase(); assert!( result_lower.contains(&word_lower), "Technical term '{}' should be preserved. Result: {}", word, result ); } } #[test] fn test_mixed_case_stopwords_removal() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "The SYSTEM Is Processing The DATA With The ALGORITHM"; let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed"); let result_words: Vec<&str> = result.split_whitespace().collect(); assert!( !result_words.contains(&"the"), "Should remove lowercase 'the'. Result: {}", result ); assert!( !result_words.contains(&"is"), "Should remove lowercase 'is'. Result: {}", result ); assert!( result.contains("SYSTEM"), "Should preserve 'SYSTEM'. Result: {}", result ); assert!(result.contains("DATA"), "Should preserve 'DATA'. Result: {}", result); assert!( result.contains("ALGORITHM"), "Should preserve 'ALGORITHM'. Result: {}", result ); } #[test] fn test_reduce_tokens_function_with_stopwords() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let text = "The artificial intelligence system processes the natural language efficiently"; let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed"); let original_stopwords = count_stopwords(text, "en"); let result_stopwords = count_stopwords(&result, "en"); assert!( result_stopwords < original_stopwords, "reduce_tokens should remove stopwords. Original: {}, Result: {}", original_stopwords, result_stopwords ); assert!( result.contains("artificial") || result.contains("intelligence"), "Should preserve content words. Result: {}", result ); } #[test] fn test_stopwords_with_punctuation() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "The system, which is processing the data, uses the algorithm."; let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed"); assert!( !result.contains(" the ") || result.split_whitespace().filter(|w| w.contains("the")).count() < 3, "Should remove most instances of 'the'. Result: {}", result ); assert!( result.contains("system") || result.contains("processing") || result.contains("algorithm"), "Should preserve content words. Result: {}", result ); } #[test] fn test_stopwords_with_numbers() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "The model has 100 layers and processes the data in 10 seconds"; let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed"); assert!( result.contains("100"), "Should preserve number '100'. Result: {}", result ); assert!(result.contains("10"), "Should preserve number '10'. Result: {}", result); assert!( result.contains("model") || result.contains("layers") || result.contains("processes"), "Should preserve content words. Result: {}", result ); } #[test] fn test_stopwords_removal_consistency_across_calls() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "The machine learning model is trained on the dataset"; let result1 = reduce_tokens(input, &config, Some("en")).expect("Operation failed"); let result2 = reduce_tokens(input, &config, Some("en")).expect("Operation failed"); let result3 = reduce_tokens(input, &config, Some("en")).expect("Operation failed"); assert_eq!(result1, result2, "Results should be consistent across calls"); assert_eq!(result2, result3, "Results should be consistent across calls"); } #[test] fn test_stopwords_with_long_text() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, enable_parallel: false, ..Default::default() }; let paragraph = "The machine learning model is trained on the large dataset. \ The training process uses the neural network architecture. \ The system processes the data efficiently and achieves the best performance. "; let input = paragraph.repeat(10); let result = reduce_tokens(&input, &config, Some("en")).expect("Operation failed"); assert!( result.len() < input.len(), "Long stopword-heavy text should be reduced. Input: {} chars, Result: {} chars", input.len(), result.len() ); let original_stopwords = count_stopwords(&input, "en"); let result_stopwords = count_stopwords(&result, "en"); assert!( result_stopwords < original_stopwords, "Should remove stopwords from long text. Original: {}, Result: {}", original_stopwords, result_stopwords ); } #[test] fn test_get_stopwords_with_fallback_in_reduction() { let primary_stopwords = get_stopwords_with_fallback("xyz", "en"); assert!(primary_stopwords.is_some(), "Should fallback to English"); let en_stopwords = get_stopwords("en").expect("Operation failed"); assert_eq!( primary_stopwords.expect("Operation failed").len(), en_stopwords.len(), "Fallback should return English stopwords" ); let config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let input = "The system is processing the data"; let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed"); assert!( !result.contains(" the ") && !result.contains(" is "), "Should use fallback stopwords. Result: {}", result ); } #[test] fn test_stopwords_registry_completeness() { assert_eq!(STOPWORDS.len(), 64, "Should have exactly 64 language stopword sets"); let en_stopwords = get_stopwords("en").expect("English stopwords must exist"); assert!(en_stopwords.len() >= 70, "English should have at least 70 stopwords"); assert!(en_stopwords.contains("the"), "Should contain 'the'"); assert!(en_stopwords.contains("is"), "Should contain 'is'"); assert!(en_stopwords.contains("and"), "Should contain 'and'"); assert!(en_stopwords.contains("a"), "Should contain 'a'"); assert!(en_stopwords.contains("an"), "Should contain 'an'"); assert!(en_stopwords.contains("of"), "Should contain 'of'"); assert!(en_stopwords.contains("to"), "Should contain 'to'"); assert!(en_stopwords.contains("in"), "Should contain 'in'"); assert!(en_stopwords.contains("for"), "Should contain 'for'"); } #[test] fn test_token_reduction_handles_nan_threshold() { let mut config = TokenReductionConfig { level: ReductionLevel::Maximum, semantic_threshold: f32::NAN, enable_semantic_clustering: true, target_reduction: Some(0.5), ..Default::default() }; config.language_hint = Some("en".to_string()); let input = "Critical system update highlights performance improvements across distributed modules."; let result = reduce_tokens(input, &config, Some("en")).unwrap_or_else(|_| String::new()); assert!( result.chars().all(|c| !c.is_control()), "Result should not contain unexpected control characters" ); } #[test] fn test_token_reduction_handles_multibyte_utf8() { let config = TokenReductionConfig { level: ReductionLevel::Moderate, language_hint: Some("ja".to_string()), ..Default::default() }; let input = "品質管理は重要です。🚀 高速抽出と漢字処理が求められています。"; let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed"); assert!( result.contains("品質管理") || result.contains("漢字処理"), "Important multibyte terms should survive reduction: {}", result ); } #[test] fn test_token_reduction_concurrent_access() { use std::sync::Arc; let config = Arc::new(TokenReductionConfig { level: ReductionLevel::Aggressive, enable_parallel: true, ..Default::default() }); let input = "Concurrent reduction ensures thread safety without deadlocks or panics."; std::thread::scope(|scope| { for _ in 0..8 { let cfg = Arc::clone(&config); scope.spawn(move || { let reduced = reduce_tokens(input, &cfg, Some("en")).expect("Operation failed"); assert!(!reduced.is_empty()); }); } }); } #[test] fn demo_stopwords_effectiveness() { use kreuzberg::stopwords::get_stopwords; use kreuzberg::text::token_reduction::{ReductionLevel, TokenReductionConfig, reduce_tokens}; let en_text = "The machine learning model is trained on the large dataset and achieves good performance"; let en_config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let en_result = reduce_tokens(en_text, &en_config, Some("en")).expect("Operation failed"); println!("\n=== English Example ==="); println!("BEFORE: {} chars", en_text.len()); println!("{}", en_text); println!( "\nAFTER: {} chars ({}% reduction)", en_result.len(), 100 - (en_result.len() * 100 / en_text.len()) ); println!("{}", en_result); let zh_text = "这个人工智能系统可以处理自然语言"; let zh_config = TokenReductionConfig { level: ReductionLevel::Moderate, use_simd: false, ..Default::default() }; let zh_result = reduce_tokens(zh_text, &zh_config, Some("zh")).expect("Operation failed"); println!("\n=== Chinese Example ==="); println!("BEFORE: {}", zh_text); println!("AFTER: {}", zh_result); let text = "The artificial intelligence system processes the natural language efficiently"; println!("\n=== Reduction Level Comparison ==="); println!("ORIGINAL: {}", text); for level in [ ReductionLevel::Light, ReductionLevel::Moderate, ReductionLevel::Aggressive, ] { let config = TokenReductionConfig { level, use_simd: false, ..Default::default() }; let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed"); println!( "{:?}: {} chars -> {} chars ({}% reduction)", level, text.len(), result.len(), 100 - (result.len() * 100 / text.len()) ); println!(" {}", result); } let stopwords = get_stopwords("en").expect("Operation failed"); println!("\n=== Stopwords Stats ==="); println!("English stopwords: {}", stopwords.len()); println!("Sample stopwords: {:?}", stopwords.iter().take(10).collect::>()); } */