Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg/tests/stopwords_integration_test.rs
+++ b/crates/kreuzberg/tests/stopwords_integration_test.rs
@@ -0,0 +1,901 @@
+//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
+//! pub(crate) APIs that the migration deliberately narrowed; gated until
+//! either (a) these APIs are re-exposed publicly, or (b) the test is
+//! rewritten against the public extraction surface.
+
+#![cfg(any())]
+
+// Original content is preserved below inside a block comment. To re-enable it,
+// remove both the file-level cfg(any()) gate above and the enclosing block comment.
+
+/*
+//! Integration tests for stopwords with token reduction and keywords extraction.
+#![cfg(all(feature = "stopwords", feature = "quality"))]
+//!
+//! These tests verify that stopwords are properly integrated across different features:
+//! - Token reduction at all ReductionLevels
+//! - Keywords extraction (YAKE and RAKE algorithms)
+//! - CJK text processing
+//! - Multi-language documents
+//! - Language fallback mechanisms
+//! - Custom stopwords
+
+use kreuzberg::stopwords::{STOPWORDS, get_stopwords, get_stopwords_with_fallback};
+use kreuzberg::text::token_reduction::{ReductionLevel, TokenReductionConfig, reduce_tokens};
+
+#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
+use kreuzberg::keywords::{KeywordConfig, extract_keywords};
+
+use std::collections::HashMap;
+
+fn count_stopwords(text: &str, lang: &str) -> usize {
+    let stopwords = get_stopwords(lang).expect("Stopwords must exist for language");
+    let words: Vec<&str> = text.split_whitespace().collect();
+
+    words
+        .iter()
+        .filter(|word| {
+            let clean = word
+                .chars()
+                .filter(|c| c.is_alphabetic())
+                .collect::<String>()
+                .to_lowercase();
+
+            !clean.is_empty() && stopwords.contains(&clean)
+        })
+        .count()
+}
+
+/// Returns the lowercased, letters-only words of `text` that are not stopwords for `lang`.
+///
+/// Words shorter than two characters are dropped. Panics when `get_stopwords(lang)` yields no set.
+fn extract_content_words(text: &str, lang: &str) -> Vec<String> {
+    let stopwords = get_stopwords(lang).expect("Stopwords must exist for language");
+
+    text.split_whitespace()
+        .filter_map(|word| {
+            let clean = word
+                .chars()
+                .filter(|c| c.is_alphabetic())
+                .collect::<String>()
+                .to_lowercase();
+
+            // Measure length in characters, not bytes: `len()` is at least 2 for any
+            // single non-ASCII letter, which let one-letter words such as "é" through.
+            let keep = clean.chars().count() > 1 && !stopwords.contains(&clean);
+            keep.then_some(clean)
+        })
+        .collect()
+}
+
+#[test]
+fn test_stopwords_removed_during_moderate_token_reduction() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        language_hint: Some("en".to_string()),
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
+    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+
+    assert!(!result.contains(" the "), "Should remove 'the'. Result: {}", result);
+    assert!(!result.contains(" is "), "Should remove 'is'. Result: {}", result);
+    assert!(!result.contains(" and "), "Should remove 'and'. Result: {}", result);
+
+    assert!(result.contains("quick"), "Should preserve 'quick'. Result: {}", result);
+    assert!(result.contains("brown"), "Should preserve 'brown'. Result: {}", result);
+    assert!(result.contains("fox"), "Should preserve 'fox'. Result: {}", result);
+    assert!(
+        result.contains("jumping"),
+        "Should preserve 'jumping'. Result: {}",
+        result
+    );
+    assert!(result.contains("lazy"), "Should preserve 'lazy'. Result: {}", result);
+
+    let original_stopwords = count_stopwords(input, "en");
+    let result_stopwords = count_stopwords(&result, "en");
+
+    assert!(
+        result_stopwords < original_stopwords,
+        "Result should have fewer stopwords than original. Original: {}, Result: {}",
+        original_stopwords,
+        result_stopwords
+    );
+}
+
+#[test]
+fn test_stopwords_across_reduction_levels() {
+    let text = "The machine learning model is trained on the large dataset and achieves good performance";
+
+    let light_config = TokenReductionConfig {
+        level: ReductionLevel::Light,
+        use_simd: false,
+        ..Default::default()
+    };
+    let light_result = reduce_tokens(text, &light_config, Some("en")).expect("Operation failed");
+
+    let light_stopwords = count_stopwords(&light_result, "en");
+    assert!(light_stopwords > 0, "Light reduction should preserve some stopwords");
+
+    let moderate_config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+    let moderate_result = reduce_tokens(text, &moderate_config, Some("en")).expect("Operation failed");
+
+    let moderate_stopwords = count_stopwords(&moderate_result, "en");
+    assert!(
+        moderate_stopwords < light_stopwords,
+        "Moderate reduction should remove more stopwords than light. Light: {}, Moderate: {}",
+        light_stopwords,
+        moderate_stopwords
+    );
+
+    let aggressive_config = TokenReductionConfig {
+        level: ReductionLevel::Aggressive,
+        use_simd: false,
+        ..Default::default()
+    };
+    let aggressive_result = reduce_tokens(text, &aggressive_config, Some("en")).expect("Operation failed");
+
+    assert!(
+        aggressive_result.len() <= moderate_result.len(),
+        "Aggressive reduction should be more aggressive than moderate"
+    );
+}
+
+#[test]
+fn test_stopwords_preserve_semantic_meaning() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input =
+        "The artificial intelligence system is processing the natural language text for extracting meaningful insights";
+    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+
+    let content_words = extract_content_words(&result, "en");
+
+    assert!(
+        content_words.contains(&"artificial".to_string()) || result.contains("artificial"),
+        "Should preserve 'artificial'. Result: {}",
+        result
+    );
+    assert!(
+        content_words.contains(&"intelligence".to_string()) || result.contains("intelligence"),
+        "Should preserve 'intelligence'. Result: {}",
+        result
+    );
+    assert!(
+        content_words.contains(&"processing".to_string()) || result.contains("processing"),
+        "Should preserve 'processing'. Result: {}",
+        result
+    );
+    assert!(
+        content_words.contains(&"natural".to_string()) || result.contains("natural"),
+        "Should preserve 'natural'. Result: {}",
+        result
+    );
+    assert!(
+        content_words.contains(&"language".to_string()) || result.contains("language"),
+        "Should preserve 'language'. Result: {}",
+        result
+    );
+}
+
+#[test]
+fn test_stopwords_with_multiple_languages() {
+    let en_config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+    let en_input = "The computer science program is very comprehensive and includes many courses";
+    let en_result = reduce_tokens(en_input, &en_config, Some("en")).expect("Operation failed");
+
+    let en_original_stopwords = count_stopwords(en_input, "en");
+    let en_result_stopwords = count_stopwords(&en_result, "en");
+    assert!(
+        en_result_stopwords < en_original_stopwords,
+        "English stopwords should be removed"
+    );
+
+    let es_config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+    let es_input = "El programa de ciencias de la computación es muy completo y tiene muchos cursos";
+    let es_result = reduce_tokens(es_input, &es_config, Some("es")).expect("Operation failed");
+
+    let es_original_stopwords = count_stopwords(es_input, "es");
+    let es_result_stopwords = count_stopwords(&es_result, "es");
+    assert!(
+        es_result_stopwords < es_original_stopwords,
+        "Spanish stopwords should be removed"
+    );
+
+    assert!(
+        es_result.contains("programa") || es_result.contains("ciencias") || es_result.contains("computación"),
+        "Should preserve Spanish content words. Result: {}",
+        es_result
+    );
+
+    let de_config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+    let de_input = "Die künstliche Intelligenz ist ein wichtiges Forschungsgebiet der Informatik";
+    let de_result = reduce_tokens(de_input, &de_config, Some("de")).expect("Operation failed");
+
+    let de_original_stopwords = count_stopwords(de_input, "de");
+    let de_result_stopwords = count_stopwords(&de_result, "de");
+    assert!(
+        de_result_stopwords < de_original_stopwords,
+        "German stopwords should be removed"
+    );
+}
+
+#[test]
+fn test_language_fallback_to_english_stopwords() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "The system is processing the data with the algorithm";
+    let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");
+
+    let original_stopwords = count_stopwords(input, "en");
+    let result_stopwords = count_stopwords(&result, "en");
+
+    assert!(
+        result_stopwords < original_stopwords,
+        "Should fallback to English stopwords for unsupported language"
+    );
+}
+
+#[test]
+fn test_custom_stopwords_integration() {
+    let mut custom_stopwords = HashMap::new();
+    custom_stopwords.insert(
+        "en".to_string(),
+        vec!["algorithm".to_string(), "system".to_string(), "data".to_string()],
+    );
+
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        custom_stopwords: Some(custom_stopwords),
+        ..Default::default()
+    };
+
+    let input = "The algorithm processes the data in the system efficiently";
+    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+
+    assert!(
+        !result.contains("algorithm"),
+        "Should remove custom stopword 'algorithm'. Result: {}",
+        result
+    );
+    assert!(
+        !result.contains("system"),
+        "Should remove custom stopword 'system'. Result: {}",
+        result
+    );
+    assert!(
+        !result.contains("data"),
+        "Should remove custom stopword 'data'. Result: {}",
+        result
+    );
+
+    assert!(
+        result.contains("processes") || result.contains("efficiently"),
+        "Should preserve non-stopword content. Result: {}",
+        result
+    );
+}
+
+#[test]
+fn test_stopwords_with_chinese_text() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "这个人工智能系统可以处理自然语言";
+    let result = reduce_tokens(input, &config, Some("zh")).expect("Operation failed");
+
+    assert!(
+        !result.is_empty(),
+        "Chinese text should be processed. Result: {}",
+        result
+    );
+
+    assert!(
+        result.contains("人工") || result.contains("智能") || result.contains("语言"),
+        "Should preserve important Chinese terms. Result: {}",
+        result
+    );
+}
+
+#[test]
+fn test_stopwords_with_mixed_cjk_english() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "The machine learning model 机器学习模型 is processing data efficiently";
+    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+
+    assert!(
+        !result.contains(" the ") && !result.contains("The "),
+        "Should remove English 'the'. Result: {}",
+        result
+    );
+
+    assert!(
+        result.contains("machine") || result.contains("learning"),
+        "Should preserve English content. Result: {}",
+        result
+    );
+
+    assert!(
+        result.contains("机器") || result.contains("学习") || result.contains("模型"),
+        "Should preserve Chinese content. Result: {}",
+        result
+    );
+}
+
+#[test]
+fn test_stopwords_with_japanese_text() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "人工知能技術の研究開発";
+    let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");
+
+    assert!(
+        !result.is_empty(),
+        "Japanese text should be processed. Result: {}",
+        result
+    );
+}
+
+#[test]
+fn test_stopwords_with_korean_text() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "인공 지능 기술 개발";
+    let result = reduce_tokens(input, &config, Some("ko")).expect("Operation failed");
+
+    assert!(
+        !result.is_empty(),
+        "Korean text should be processed. Result: {}",
+        result
+    );
+}
+
+#[cfg(feature = "keywords-rake")]
+#[test]
+fn test_stopwords_excluded_from_rake_keywords() {
+    let text = "The machine learning model is trained on a large dataset. \
+                The model uses neural networks and deep learning algorithms. \
+                The training process requires significant computational resources.";
+
+    let config = KeywordConfig::rake().with_language("en").with_max_keywords(10);
+
+    let keywords = extract_keywords(text, &config).expect("Operation failed");
+
+    assert!(!keywords.is_empty(), "Should extract keywords");
+
+    let en_stopwords = get_stopwords("en").expect("English stopwords must exist");
+
+    for keyword in &keywords {
+        let words: Vec<&str> = keyword.text.split_whitespace().collect();
+
+        let all_stopwords = words.iter().all(|word| {
+            let clean = word
+                .chars()
+                .filter(|c| c.is_alphabetic())
+                .collect::<String>()
+                .to_lowercase();
+            en_stopwords.contains(&clean)
+        });
+
+        assert!(
+            !all_stopwords,
+            "Keyword '{}' should not be composed entirely of stopwords",
+            keyword.text
+        );
+    }
+
+    let keyword_texts: Vec<String> = keywords.iter().map(|k| k.text.to_lowercase()).collect();
+
+    assert!(
+        keyword_texts.iter().any(|k| k.contains("machine learning")
+            || k.contains("neural networks")
+            || k.contains("deep learning")
+            || k.contains("dataset")
+            || k.contains("model")
+            || k.contains("training")),
+        "Should extract meaningful technical keywords. Got: {:?}",
+        keyword_texts
+    );
+}
+
+#[cfg(feature = "keywords-yake")]
+#[test]
+fn test_stopwords_excluded_from_yake_keywords() {
+    let text = "Natural language processing enables computers to understand human language. \
+                Deep learning models achieve state-of-the-art performance in text analysis. \
+                These systems can extract meaningful information from large text corpora.";
+
+    let config = KeywordConfig::yake().with_language("en").with_max_keywords(10);
+
+    let keywords = extract_keywords(text, &config).expect("Operation failed");
+
+    assert!(!keywords.is_empty(), "Should extract keywords");
+
+    let en_stopwords = get_stopwords("en").expect("English stopwords must exist");
+
+    for keyword in &keywords {
+        let has_content_word = keyword.text.split_whitespace().any(|word| {
+            let clean = word
+                .chars()
+                .filter(|c| c.is_alphabetic())
+                .collect::<String>()
+                .to_lowercase();
+            !clean.is_empty() && !en_stopwords.contains(&clean)
+        });
+
+        assert!(
+            has_content_word,
+            "Keyword '{}' should contain at least one content word (non-stopword)",
+            keyword.text
+        );
+    }
+}
+
+#[cfg(feature = "keywords-rake")]
+#[test]
+fn test_keywords_respect_language_specific_stopwords() {
+    let spanish_text = "El aprendizaje automático es una rama de la inteligencia artificial. \
+                        Los modelos de aprendizaje profundo logran un rendimiento excepcional. \
+                        Estos sistemas pueden procesar grandes cantidades de datos.";
+
+    let config = KeywordConfig::rake().with_language("es").with_max_keywords(8);
+
+    let keywords = extract_keywords(spanish_text, &config).expect("Operation failed");
+
+    assert!(!keywords.is_empty(), "Should extract Spanish keywords");
+
+    let es_stopwords = get_stopwords("es").expect("Spanish stopwords must exist");
+
+    for keyword in &keywords {
+        let words: Vec<&str> = keyword.text.split_whitespace().collect();
+        let all_stopwords = words.iter().all(|word| {
+            let clean = word
+                .chars()
+                .filter(|c| c.is_alphabetic())
+                .collect::<String>()
+                .to_lowercase();
+            es_stopwords.contains(&clean)
+        });
+
+        assert!(
+            !all_stopwords,
+            "Spanish keyword '{}' should not be all stopwords",
+            keyword.text
+        );
+    }
+
+    let keyword_texts: Vec<String> = keywords.iter().map(|k| k.text.to_lowercase()).collect();
+    assert!(
+        keyword_texts.iter().any(|k| k.contains("aprendizaje")
+            || k.contains("inteligencia")
+            || k.contains("modelos")
+            || k.contains("datos")),
+        "Should extract meaningful Spanish keywords. Got: {:?}",
+        keyword_texts
+    );
+}
+
+#[test]
+fn test_all_stopwords_text_reduction() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "the is a an and or but of to in for on at by";
+    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+
+    assert!(
+        result.len() < input.len(),
+        "Text of all stopwords should be significantly reduced"
+    );
+}
+
+#[test]
+fn test_no_stopwords_text_reduction() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "PyTorch TensorFlow CUDA GPU optimization benchmark performance metrics";
+    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+
+    let input_words: Vec<&str> = input.split_whitespace().collect();
+    let result_lower = result.to_lowercase();
+
+    for word in input_words {
+        let word_lower = word.to_lowercase();
+        assert!(
+            result_lower.contains(&word_lower),
+            "Technical term '{}' should be preserved. Result: {}",
+            word,
+            result
+        );
+    }
+}
+
+#[test]
+fn test_mixed_case_stopwords_removal() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "The SYSTEM Is Processing The DATA With The ALGORITHM";
+    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+
+    let result_words: Vec<&str> = result.split_whitespace().collect();
+    assert!(
+        !result_words.contains(&"the"),
+        "Should remove lowercase 'the'. Result: {}",
+        result
+    );
+    assert!(
+        !result_words.contains(&"is"),
+        "Should remove lowercase 'is'. Result: {}",
+        result
+    );
+
+    assert!(
+        result.contains("SYSTEM"),
+        "Should preserve 'SYSTEM'. Result: {}",
+        result
+    );
+    assert!(result.contains("DATA"), "Should preserve 'DATA'. Result: {}", result);
+    assert!(
+        result.contains("ALGORITHM"),
+        "Should preserve 'ALGORITHM'. Result: {}",
+        result
+    );
+}
+
+#[test]
+fn test_reduce_tokens_function_with_stopwords() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let text = "The artificial intelligence system processes the natural language efficiently";
+    let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");
+
+    let original_stopwords = count_stopwords(text, "en");
+    let result_stopwords = count_stopwords(&result, "en");
+
+    assert!(
+        result_stopwords < original_stopwords,
+        "reduce_tokens should remove stopwords. Original: {}, Result: {}",
+        original_stopwords,
+        result_stopwords
+    );
+
+    assert!(
+        result.contains("artificial") || result.contains("intelligence"),
+        "Should preserve content words. Result: {}",
+        result
+    );
+}
+
+#[test]
+fn test_stopwords_with_punctuation() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "The system, which is processing the data, uses the algorithm.";
+    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+
+    assert!(
+        !result.contains(" the ") || result.split_whitespace().filter(|w| w.contains("the")).count() < 3,
+        "Should remove most instances of 'the'. Result: {}",
+        result
+    );
+
+    assert!(
+        result.contains("system") || result.contains("processing") || result.contains("algorithm"),
+        "Should preserve content words. Result: {}",
+        result
+    );
+}
+
+/// Numeric tokens must survive moderate reduction alongside the content words.
+#[test]
+fn test_stopwords_with_numbers() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "The model has 100 layers and processes the data in 10 seconds";
+    let result = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+
+    // Compare whole digit runs: a substring check for "10" is already satisfied by "100",
+    // so it would pass even when the standalone "10" had been dropped.
+    let has_number = |n: &str| result.split(|c: char| !c.is_ascii_digit()).any(|run| run == n);
+    assert!(has_number("100"), "Should preserve number '100'. Result: {}", result);
+    assert!(has_number("10"), "Should preserve number '10'. Result: {}", result);
+
+    assert!(
+        result.contains("model") || result.contains("layers") || result.contains("processes"),
+        "Should preserve content words. Result: {}",
+        result
+    );
+}
+
+#[test]
+fn test_stopwords_removal_consistency_across_calls() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "The machine learning model is trained on the dataset";
+
+    let result1 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+    let result2 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+    let result3 = reduce_tokens(input, &config, Some("en")).expect("Operation failed");
+
+    assert_eq!(result1, result2, "Results should be consistent across calls");
+    assert_eq!(result2, result3, "Results should be consistent across calls");
+}
+
+#[test]
+fn test_stopwords_with_long_text() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        enable_parallel: false,
+        ..Default::default()
+    };
+
+    let paragraph = "The machine learning model is trained on the large dataset. \
+                     The training process uses the neural network architecture. \
+                     The system processes the data efficiently and achieves the best performance. ";
+    let input = paragraph.repeat(10);
+
+    let result = reduce_tokens(&input, &config, Some("en")).expect("Operation failed");
+
+    assert!(
+        result.len() < input.len(),
+        "Long stopword-heavy text should be reduced. Input: {} chars, Result: {} chars",
+        input.len(),
+        result.len()
+    );
+
+    let original_stopwords = count_stopwords(&input, "en");
+    let result_stopwords = count_stopwords(&result, "en");
+
+    assert!(
+        result_stopwords < original_stopwords,
+        "Should remove stopwords from long text. Original: {}, Result: {}",
+        original_stopwords,
+        result_stopwords
+    );
+}
+
+#[test]
+fn test_get_stopwords_with_fallback_in_reduction() {
+    let primary_stopwords = get_stopwords_with_fallback("xyz", "en");
+    assert!(primary_stopwords.is_some(), "Should fallback to English");
+
+    let en_stopwords = get_stopwords("en").expect("Operation failed");
+    assert_eq!(
+        primary_stopwords.expect("Operation failed").len(),
+        en_stopwords.len(),
+        "Fallback should return English stopwords"
+    );
+
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+
+    let input = "The system is processing the data";
+    let result = reduce_tokens(input, &config, Some("xyz")).expect("Operation failed");
+
+    assert!(
+        !result.contains(" the ") && !result.contains(" is "),
+        "Should use fallback stopwords. Result: {}",
+        result
+    );
+}
+
+#[test]
+fn test_stopwords_registry_completeness() {
+    assert_eq!(STOPWORDS.len(), 64, "Should have exactly 64 language stopword sets");
+
+    let en_stopwords = get_stopwords("en").expect("English stopwords must exist");
+    assert!(en_stopwords.len() >= 70, "English should have at least 70 stopwords");
+
+    assert!(en_stopwords.contains("the"), "Should contain 'the'");
+    assert!(en_stopwords.contains("is"), "Should contain 'is'");
+    assert!(en_stopwords.contains("and"), "Should contain 'and'");
+    assert!(en_stopwords.contains("a"), "Should contain 'a'");
+    assert!(en_stopwords.contains("an"), "Should contain 'an'");
+    assert!(en_stopwords.contains("of"), "Should contain 'of'");
+    assert!(en_stopwords.contains("to"), "Should contain 'to'");
+    assert!(en_stopwords.contains("in"), "Should contain 'in'");
+    assert!(en_stopwords.contains("for"), "Should contain 'for'");
+}
+
+/// A NaN `semantic_threshold` must not panic; an `Err` is tolerated and treated as empty output.
+#[test]
+fn test_token_reduction_handles_nan_threshold() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Maximum,
+        language_hint: Some("en".to_string()),
+        semantic_threshold: f32::NAN,
+        enable_semantic_clustering: true,
+        target_reduction: Some(0.5),
+        ..Default::default()
+    };
+    let input = "Critical system update highlights performance improvements across distributed modules.";
+
+    let result = reduce_tokens(input, &config, Some("en")).unwrap_or_default();
+    assert!(
+        result.chars().all(|c| !c.is_control()),
+        "Result should not contain unexpected control characters"
+    );
+}
+
+#[test]
+fn test_token_reduction_handles_multibyte_utf8() {
+    let config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        language_hint: Some("ja".to_string()),
+        ..Default::default()
+    };
+
+    let input = "品質管理は重要です。🚀 高速抽出と漢字処理が求められています。";
+    let result = reduce_tokens(input, &config, Some("ja")).expect("Operation failed");
+
+    assert!(
+        result.contains("品質管理") || result.contains("漢字処理"),
+        "Important multibyte terms should survive reduction: {}",
+        result
+    );
+}
+
+#[test]
+fn test_token_reduction_concurrent_access() {
+    use std::sync::Arc;
+
+    let config = Arc::new(TokenReductionConfig {
+        level: ReductionLevel::Aggressive,
+        enable_parallel: true,
+        ..Default::default()
+    });
+
+    let input = "Concurrent reduction ensures thread safety without deadlocks or panics.";
+
+    std::thread::scope(|scope| {
+        for _ in 0..8 {
+            let cfg = Arc::clone(&config);
+            scope.spawn(move || {
+                let reduced = reduce_tokens(input, &cfg, Some("en")).expect("Operation failed");
+                assert!(!reduced.is_empty());
+            });
+        }
+    });
+}
+#[test]
+fn demo_stopwords_effectiveness() {
+    use kreuzberg::stopwords::get_stopwords;
+    use kreuzberg::text::token_reduction::{ReductionLevel, TokenReductionConfig, reduce_tokens};
+    // Demo only: prints samples and makes no assertions; percentages saturate at 0 instead of underflowing.
+    let en_text = "The machine learning model is trained on the large dataset and achieves good performance";
+    let en_config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+    let en_result = reduce_tokens(en_text, &en_config, Some("en")).expect("Operation failed");
+
+    println!("\n=== English Example ===");
+    println!("BEFORE: {} chars", en_text.len());
+    println!("{}", en_text);
+    println!(
+        "\nAFTER:  {} chars ({}% reduction)",
+        en_result.len(),
+        100usize.saturating_sub(en_result.len() * 100 / en_text.len())
+    );
+    println!("{}", en_result);
+
+    let zh_text = "这个人工智能系统可以处理自然语言";
+    let zh_config = TokenReductionConfig {
+        level: ReductionLevel::Moderate,
+        use_simd: false,
+        ..Default::default()
+    };
+    let zh_result = reduce_tokens(zh_text, &zh_config, Some("zh")).expect("Operation failed");
+
+    println!("\n=== Chinese Example ===");
+    println!("BEFORE: {}", zh_text);
+    println!("AFTER:  {}", zh_result);
+
+    let text = "The artificial intelligence system processes the natural language efficiently";
+
+    println!("\n=== Reduction Level Comparison ===");
+    println!("ORIGINAL: {}", text);
+
+    for level in [
+        ReductionLevel::Light,
+        ReductionLevel::Moderate,
+        ReductionLevel::Aggressive,
+    ] {
+        let config = TokenReductionConfig {
+            level,
+            use_simd: false,
+            ..Default::default()
+        };
+        let result = reduce_tokens(text, &config, Some("en")).expect("Operation failed");
+        println!(
+            "{:?}: {} chars -> {} chars ({}% reduction)",
+            level,
+            text.len(),
+            result.len(),
+            100usize.saturating_sub(result.len() * 100 / text.len())
+        );
+        println!("  {}", result);
+    }
+
+    let stopwords = get_stopwords("en").expect("Operation failed");
+    println!("\n=== Stopwords Stats ===");
+    println!("English stopwords: {}", stopwords.len());
+    println!("Sample stopwords: {:?}", stopwords.iter().take(10).collect::<Vec<_>>());
+}
+
+*/