Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,901 @@
//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
//! pub(crate) APIs that the migration deliberately narrowed; gated until
//! either (a) these APIs are re-exposed publicly, or (b) the test is
//! rewritten against the public extraction surface.
#![cfg(any())]
// Original content preserved below; recompiled once gating cfg drops.
// Disabled by the file-level cfg(any()) above.
/*
//! Integration tests for stopwords with token reduction and keywords extraction.
#![cfg(all(feature = "stopwords", feature = "quality"))]
//!
//! These tests verify that stopwords are properly integrated across different features:
//! - Token reduction at all ReductionLevels
//! - Keywords extraction (YAKE and RAKE algorithms)
//! - CJK text processing
//! - Multi-language documents
//! - Language fallback mechanisms
//! - Custom stopwords
use kreuzberg::stopwords::{STOPWORDS, get_stopwords, get_stopwords_with_fallback};
use kreuzberg::text::token_reduction::{ReductionLevel, TokenReductionConfig, reduce_tokens};
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
use kreuzberg::keywords::{KeywordConfig, extract_keywords};
use std::collections::HashMap;
fn count_stopwords(text: &str, lang: &str) -> usize {
let stopwords = get_stopwords(lang).expect("Stopwords must exist for language");
let words: Vec<&str> = text.split_whitespace().collect();
words
.iter()
.filter(|word| {
let clean = word
.chars()
.filter(|c| c.is_alphabetic())
.collect::<String>()
.to_lowercase();
!clean.is_empty() && stopwords.contains(&clean)
})
.count()
}
fn extract_content_words(text: &str, lang: &str) -> Vec<String> {
let stopwords = get_stopwords(lang).expect("Stopwords must exist for language");
let words: Vec<&str> = text.split_whitespace().collect();
words
.iter()
.filter_map(|word| {
let clean = word
.chars()
.filter(|c| c.is_alphabetic())
.collect::<String>()
.to_lowercase();
if !clean.is_empty() && !stopwords.contains(&clean) && clean.len() > 1 {
Some(clean)
} else {
None
}
})
.collect()
}
#[test]
fn test_stopwords_removed_during_moderate_token_reduction() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
language_hint: Some("en".to_string()),
use_simd: false,
..Default::default()
};
let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
assert!(!result.contains(" the "), "Should remove 'the'. Result: {}", result);
assert!(!result.contains(" is "), "Should remove 'is'. Result: {}", result);
assert!(!result.contains(" and "), "Should remove 'and'. Result: {}", result);
assert!(result.contains("quick"), "Should preserve 'quick'. Result: {}", result);
assert!(result.contains("brown"), "Should preserve 'brown'. Result: {}", result);
assert!(result.contains("fox"), "Should preserve 'fox'. Result: {}", result);
assert!(
result.contains("jumping"),
"Should preserve 'jumping'. Result: {}",
result
);
assert!(result.contains("lazy"), "Should preserve 'lazy'. Result: {}", result);
let original_stopwords = count_stopwords(input, "en");
let result_stopwords = count_stopwords(&result, "en");
assert!(
result_stopwords < original_stopwords,
"Result should have fewer stopwords than original. Original: {}, Result: {}",
original_stopwords,
result_stopwords
);
}
#[test]
fn test_stopwords_across_reduction_levels() {
let text = "The machine learning model is trained on the large dataset and achieves good performance";
let light_config = TokenReductionConfig {
level: ReductionLevel::Light,
use_simd: false,
..Default::default()
};
let light_result = reduce_tokens(text, &light_config, Some("en")).expect("Operation failed");
let light_stopwords = count_stopwords(&light_result, "en");
assert!(light_stopwords > 0, "Light reduction should preserve some stopwords");
let moderate_config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let moderate_result = reduce_tokens(text, &moderate_config, Some("en")).expect("Operation failed");
let moderate_stopwords = count_stopwords(&moderate_result, "en");
assert!(
moderate_stopwords < light_stopwords,
"Moderate reduction should remove more stopwords than light. Light: {}, Moderate: {}",
light_stopwords,
moderate_stopwords
);
let aggressive_config = TokenReductionConfig {
level: ReductionLevel::Aggressive,
use_simd: false,
..Default::default()
};
let aggressive_result = reduce_tokens(text, &aggressive_config, Some("en")).expect("Operation failed");
assert!(
aggressive_result.len() <= moderate_result.len(),
"Aggressive reduction should be more aggressive than moderate"
);
}
#[test]
fn test_stopwords_preserve_semantic_meaning() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input =
"The artificial intelligence system is processing the natural language text for extracting meaningful insights";
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
let content_words = extract_content_words(&result, "en");
assert!(
content_words.contains(&"artificial".to_string()) || result.contains("artificial"),
"Should preserve 'artificial'. Result: {}",
result
);
assert!(
content_words.contains(&"intelligence".to_string()) || result.contains("intelligence"),
"Should preserve 'intelligence'. Result: {}",
result
);
assert!(
content_words.contains(&"processing".to_string()) || result.contains("processing"),
"Should preserve 'processing'. Result: {}",
result
);
assert!(
content_words.contains(&"natural".to_string()) || result.contains("natural"),
"Should preserve 'natural'. Result: {}",
result
);
assert!(
content_words.contains(&"language".to_string()) || result.contains("language"),
"Should preserve 'language'. Result: {}",
result
);
}
#[test]
fn test_stopwords_with_multiple_languages() {
let en_config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let en_input = "The computer science program is very comprehensive and includes many courses";
let en_result = reduce_tokens(en_input, &en_config, Some("en")).expect("Operation failed");
let en_original_stopwords = count_stopwords(en_input, "en");
let en_result_stopwords = count_stopwords(&en_result, "en");
assert!(
en_result_stopwords < en_original_stopwords,
"English stopwords should be removed"
);
let es_config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let es_input = "El programa de ciencias de la computación es muy completo y tiene muchos cursos";
let es_result = reduce_tokens(es_input, &es_config, Some("es")).expect("Operation failed");
let es_original_stopwords = count_stopwords(es_input, "es");
let es_result_stopwords = count_stopwords(&es_result, "es");
assert!(
es_result_stopwords < es_original_stopwords,
"Spanish stopwords should be removed"
);
assert!(
es_result.contains("programa") || es_result.contains("ciencias") || es_result.contains("computación"),
"Should preserve Spanish content words. Result: {}",
es_result
);
let de_config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let de_input = "Die künstliche Intelligenz ist ein wichtiges Forschungsgebiet der Informatik";
let de_result = reduce_tokens(de_input, &de_config, Some("de")).expect("Operation failed");
let de_original_stopwords = count_stopwords(de_input, "de");
let de_result_stopwords = count_stopwords(&de_result, "de");
assert!(
de_result_stopwords < de_original_stopwords,
"German stopwords should be removed"
);
}
#[test]
fn test_language_fallback_to_english_stopwords() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input = "The system is processing the data with the algorithm";
let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");
let original_stopwords = count_stopwords(input, "en");
let result_stopwords = count_stopwords(&result, "en");
assert!(
result_stopwords < original_stopwords,
"Should fallback to English stopwords for unsupported language"
);
}
#[test]
fn test_custom_stopwords_integration() {
let mut custom_stopwords = HashMap::new();
custom_stopwords.insert(
"en".to_string(),
vec!["algorithm".to_string(), "system".to_string(), "data".to_string()],
);
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
custom_stopwords: Some(custom_stopwords),
..Default::default()
};
let input = "The algorithm processes the data in the system efficiently";
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
assert!(
!result.contains("algorithm"),
"Should remove custom stopword 'algorithm'. Result: {}",
result
);
assert!(
!result.contains("system"),
"Should remove custom stopword 'system'. Result: {}",
result
);
assert!(
!result.contains("data"),
"Should remove custom stopword 'data'. Result: {}",
result
);
assert!(
result.contains("processes") || result.contains("efficiently"),
"Should preserve non-stopword content. Result: {}",
result
);
}
#[test]
fn test_stopwords_with_chinese_text() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input = "这个人工智能系统可以处理自然语言";
let result = reduce_tokens(input, &config, Some("zh")).expect("Operation failed");
assert!(
!result.is_empty(),
"Chinese text should be processed. Result: {}",
result
);
assert!(
result.contains("人工") || result.contains("智能") || result.contains("语言"),
"Should preserve important Chinese terms. Result: {}",
result
);
}
#[test]
fn test_stopwords_with_mixed_cjk_english() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input = "The machine learning model 机器学习模型 is processing data efficiently";
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
assert!(
!result.contains(" the ") && !result.contains("The "),
"Should remove English 'the'. Result: {}",
result
);
assert!(
result.contains("machine") || result.contains("learning"),
"Should preserve English content. Result: {}",
result
);
assert!(
result.contains("机器") || result.contains("学习") || result.contains("模型"),
"Should preserve Chinese content. Result: {}",
result
);
}
#[test]
fn test_stopwords_with_japanese_text() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input = "人工知能技術の研究開発";
let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");
assert!(
!result.is_empty(),
"Japanese text should be processed. Result: {}",
result
);
}
#[test]
fn test_stopwords_with_korean_text() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input = "인공 지능 기술 개발";
let result = reduce_tokens(input, &config, Some("ko")).expect("Operation failed");
assert!(
!result.is_empty(),
"Korean text should be processed. Result: {}",
result
);
}
#[cfg(feature = "keywords-rake")]
#[test]
fn test_stopwords_excluded_from_rake_keywords() {
let text = "The machine learning model is trained on a large dataset. \
The model uses neural networks and deep learning algorithms. \
The training process requires significant computational resources.";
let config = KeywordConfig::rake().with_language("en").with_max_keywords(10);
let keywords = extract_keywords(text, &config).expect("Operation failed");
assert!(!keywords.is_empty(), "Should extract keywords");
let en_stopwords = get_stopwords("en").expect("English stopwords must exist");
for keyword in &keywords {
let words: Vec<&str> = keyword.text.split_whitespace().collect();
let all_stopwords = words.iter().all(|word| {
let clean = word
.chars()
.filter(|c| c.is_alphabetic())
.collect::<String>()
.to_lowercase();
en_stopwords.contains(&clean)
});
assert!(
!all_stopwords,
"Keyword '{}' should not be composed entirely of stopwords",
keyword.text
);
}
let keyword_texts: Vec<String> = keywords.iter().map(|k| k.text.to_lowercase()).collect();
assert!(
keyword_texts.iter().any(|k| k.contains("machine learning")
|| k.contains("neural networks")
|| k.contains("deep learning")
|| k.contains("dataset")
|| k.contains("model")
|| k.contains("training")),
"Should extract meaningful technical keywords. Got: {:?}",
keyword_texts
);
}
#[cfg(feature = "keywords-yake")]
#[test]
fn test_stopwords_excluded_from_yake_keywords() {
let text = "Natural language processing enables computers to understand human language. \
Deep learning models achieve state-of-the-art performance in text analysis. \
These systems can extract meaningful information from large text corpora.";
let config = KeywordConfig::yake().with_language("en").with_max_keywords(10);
let keywords = extract_keywords(text, &config).expect("Operation failed");
assert!(!keywords.is_empty(), "Should extract keywords");
let en_stopwords = get_stopwords("en").expect("English stopwords must exist");
for keyword in &keywords {
let has_content_word = keyword.text.split_whitespace().any(|word| {
let clean = word
.chars()
.filter(|c| c.is_alphabetic())
.collect::<String>()
.to_lowercase();
!clean.is_empty() && !en_stopwords.contains(&clean)
});
assert!(
has_content_word,
"Keyword '{}' should contain at least one content word (non-stopword)",
keyword.text
);
}
}
#[cfg(feature = "keywords-rake")]
#[test]
fn test_keywords_respect_language_specific_stopwords() {
let spanish_text = "El aprendizaje automático es una rama de la inteligencia artificial. \
Los modelos de aprendizaje profundo logran un rendimiento excepcional. \
Estos sistemas pueden procesar grandes cantidades de datos.";
let config = KeywordConfig::rake().with_language("es").with_max_keywords(8);
let keywords = extract_keywords(spanish_text, &config).expect("Operation failed");
assert!(!keywords.is_empty(), "Should extract Spanish keywords");
let es_stopwords = get_stopwords("es").expect("Spanish stopwords must exist");
for keyword in &keywords {
let words: Vec<&str> = keyword.text.split_whitespace().collect();
let all_stopwords = words.iter().all(|word| {
let clean = word
.chars()
.filter(|c| c.is_alphabetic())
.collect::<String>()
.to_lowercase();
es_stopwords.contains(&clean)
});
assert!(
!all_stopwords,
"Spanish keyword '{}' should not be all stopwords",
keyword.text
);
}
let keyword_texts: Vec<String> = keywords.iter().map(|k| k.text.to_lowercase()).collect();
assert!(
keyword_texts.iter().any(|k| k.contains("aprendizaje")
|| k.contains("inteligencia")
|| k.contains("modelos")
|| k.contains("datos")),
"Should extract meaningful Spanish keywords. Got: {:?}",
keyword_texts
);
}
#[test]
fn test_all_stopwords_text_reduction() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input = "the is a an and or but of to in for on at by";
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
assert!(
result.len() < input.len(),
"Text of all stopwords should be significantly reduced"
);
}
#[test]
fn test_no_stopwords_text_reduction() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input = "PyTorch TensorFlow CUDA GPU optimization benchmark performance metrics";
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
let input_words: Vec<&str> = input.split_whitespace().collect();
let result_lower = result.to_lowercase();
for word in input_words {
let word_lower = word.to_lowercase();
assert!(
result_lower.contains(&word_lower),
"Technical term '{}' should be preserved. Result: {}",
word,
result
);
}
}
#[test]
fn test_mixed_case_stopwords_removal() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input = "The SYSTEM Is Processing The DATA With The ALGORITHM";
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
let result_words: Vec<&str> = result.split_whitespace().collect();
assert!(
!result_words.contains(&"the"),
"Should remove lowercase 'the'. Result: {}",
result
);
assert!(
!result_words.contains(&"is"),
"Should remove lowercase 'is'. Result: {}",
result
);
assert!(
result.contains("SYSTEM"),
"Should preserve 'SYSTEM'. Result: {}",
result
);
assert!(result.contains("DATA"), "Should preserve 'DATA'. Result: {}", result);
assert!(
result.contains("ALGORITHM"),
"Should preserve 'ALGORITHM'. Result: {}",
result
);
}
#[test]
fn test_reduce_tokens_function_with_stopwords() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let text = "The artificial intelligence system processes the natural language efficiently";
let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");
let original_stopwords = count_stopwords(text, "en");
let result_stopwords = count_stopwords(&result, "en");
assert!(
result_stopwords < original_stopwords,
"reduce_tokens should remove stopwords. Original: {}, Result: {}",
original_stopwords,
result_stopwords
);
assert!(
result.contains("artificial") || result.contains("intelligence"),
"Should preserve content words. Result: {}",
result
);
}
#[test]
fn test_stopwords_with_punctuation() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input = "The system, which is processing the data, uses the algorithm.";
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
assert!(
!result.contains(" the ") || result.split_whitespace().filter(|w| w.contains("the")).count() < 3,
"Should remove most instances of 'the'. Result: {}",
result
);
assert!(
result.contains("system") || result.contains("processing") || result.contains("algorithm"),
"Should preserve content words. Result: {}",
result
);
}
#[test]
fn test_stopwords_with_numbers() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input = "The model has 100 layers and processes the data in 10 seconds";
let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
assert!(
result.contains("100"),
"Should preserve number '100'. Result: {}",
result
);
assert!(result.contains("10"), "Should preserve number '10'. Result: {}", result);
assert!(
result.contains("model") || result.contains("layers") || result.contains("processes"),
"Should preserve content words. Result: {}",
result
);
}
#[test]
fn test_stopwords_removal_consistency_across_calls() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input = "The machine learning model is trained on the dataset";
let result1 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
let result2 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
let result3 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
assert_eq!(result1, result2, "Results should be consistent across calls");
assert_eq!(result2, result3, "Results should be consistent across calls");
}
#[test]
fn test_stopwords_with_long_text() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
enable_parallel: false,
..Default::default()
};
let paragraph = "The machine learning model is trained on the large dataset. \
The training process uses the neural network architecture. \
The system processes the data efficiently and achieves the best performance. ";
let input = paragraph.repeat(10);
let result = reduce_tokens(&input, &config, Some("en")).expect("Operation failed");
assert!(
result.len() < input.len(),
"Long stopword-heavy text should be reduced. Input: {} chars, Result: {} chars",
input.len(),
result.len()
);
let original_stopwords = count_stopwords(&input, "en");
let result_stopwords = count_stopwords(&result, "en");
assert!(
result_stopwords < original_stopwords,
"Should remove stopwords from long text. Original: {}, Result: {}",
original_stopwords,
result_stopwords
);
}
#[test]
fn test_get_stopwords_with_fallback_in_reduction() {
let primary_stopwords = get_stopwords_with_fallback("xyz", "en");
assert!(primary_stopwords.is_some(), "Should fallback to English");
let en_stopwords = get_stopwords("en").expect("Operation failed");
assert_eq!(
primary_stopwords.expect("Operation failed").len(),
en_stopwords.len(),
"Fallback should return English stopwords"
);
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let input = "The system is processing the data";
let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");
assert!(
!result.contains(" the ") && !result.contains(" is "),
"Should use fallback stopwords. Result: {}",
result
);
}
#[test]
fn test_stopwords_registry_completeness() {
assert_eq!(STOPWORDS.len(), 64, "Should have exactly 64 language stopword sets");
let en_stopwords = get_stopwords("en").expect("English stopwords must exist");
assert!(en_stopwords.len() >= 70, "English should have at least 70 stopwords");
assert!(en_stopwords.contains("the"), "Should contain 'the'");
assert!(en_stopwords.contains("is"), "Should contain 'is'");
assert!(en_stopwords.contains("and"), "Should contain 'and'");
assert!(en_stopwords.contains("a"), "Should contain 'a'");
assert!(en_stopwords.contains("an"), "Should contain 'an'");
assert!(en_stopwords.contains("of"), "Should contain 'of'");
assert!(en_stopwords.contains("to"), "Should contain 'to'");
assert!(en_stopwords.contains("in"), "Should contain 'in'");
assert!(en_stopwords.contains("for"), "Should contain 'for'");
}
#[test]
fn test_token_reduction_handles_nan_threshold() {
let mut config = TokenReductionConfig {
level: ReductionLevel::Maximum,
semantic_threshold: f32::NAN,
enable_semantic_clustering: true,
target_reduction: Some(0.5),
..Default::default()
};
config.language_hint = Some("en".to_string());
let input = "Critical system update highlights performance improvements across distributed modules.";
let result = reduce_tokens(input, &config, Some("en")).unwrap_or_else(|_| String::new());
assert!(
result.chars().all(|c| !c.is_control()),
"Result should not contain unexpected control characters"
);
}
#[test]
fn test_token_reduction_handles_multibyte_utf8() {
let config = TokenReductionConfig {
level: ReductionLevel::Moderate,
language_hint: Some("ja".to_string()),
..Default::default()
};
let input = "品質管理は重要です。🚀 高速抽出と漢字処理が求められています。";
let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");
assert!(
result.contains("品質管理") || result.contains("漢字処理"),
"Important multibyte terms should survive reduction: {}",
result
);
}
#[test]
fn test_token_reduction_concurrent_access() {
use std::sync::Arc;
let config = Arc::new(TokenReductionConfig {
level: ReductionLevel::Aggressive,
enable_parallel: true,
..Default::default()
});
let input = "Concurrent reduction ensures thread safety without deadlocks or panics.";
std::thread::scope(|scope| {
for _ in 0..8 {
let cfg = Arc::clone(&config);
scope.spawn(move || {
let reduced = reduce_tokens(input, &cfg, Some("en")).expect("Operation failed");
assert!(!reduced.is_empty());
});
}
});
}
#[test]
fn demo_stopwords_effectiveness() {
use kreuzberg::stopwords::get_stopwords;
use kreuzberg::text::token_reduction::{ReductionLevel, TokenReductionConfig, reduce_tokens};
let en_text = "The machine learning model is trained on the large dataset and achieves good performance";
let en_config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let en_result = reduce_tokens(en_text, &en_config, Some("en")).expect("Operation failed");
println!("\n=== English Example ===");
println!("BEFORE: {} chars", en_text.len());
println!("{}", en_text);
println!(
"\nAFTER: {} chars ({}% reduction)",
en_result.len(),
100 - (en_result.len() * 100 / en_text.len())
);
println!("{}", en_result);
let zh_text = "这个人工智能系统可以处理自然语言";
let zh_config = TokenReductionConfig {
level: ReductionLevel::Moderate,
use_simd: false,
..Default::default()
};
let zh_result = reduce_tokens(zh_text, &zh_config, Some("zh")).expect("Operation failed");
println!("\n=== Chinese Example ===");
println!("BEFORE: {}", zh_text);
println!("AFTER: {}", zh_result);
let text = "The artificial intelligence system processes the natural language efficiently";
println!("\n=== Reduction Level Comparison ===");
println!("ORIGINAL: {}", text);
for level in [
ReductionLevel::Light,
ReductionLevel::Moderate,
ReductionLevel::Aggressive,
] {
let config = TokenReductionConfig {
level,
use_simd: false,
..Default::default()
};
let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");
println!(
"{:?}: {} chars -> {} chars ({}% reduction)",
level,
text.len(),
result.len(),
100 - (result.len() * 100 / text.len())
);
println!(" {}", result);
}
let stopwords = get_stopwords("en").expect("Operation failed");
println!("\n=== Stopwords Stats ===");
println!("English stopwords: {}", stopwords.len());
println!("Sample stopwords: {:?}", stopwords.iter().take(10).collect::<Vec<_>>());
}
*/