This commit is contained in:
522
crates/kreuzberg/tests/keywords_quality.rs
Normal file
522
crates/kreuzberg/tests/keywords_quality.rs
Normal file
@@ -0,0 +1,522 @@
|
||||
//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
|
||||
//! pub(crate) APIs that the migration deliberately narrowed; gated until
|
||||
//! either (a) these APIs are re-exposed publicly, or (b) the test is
|
||||
//! rewritten against the public extraction surface.
|
||||
|
||||
#![cfg(any())]
|
||||
|
||||
// Original content preserved below; recompiled once gating cfg drops.
|
||||
// Disabled by the file-level cfg(any()) above.
|
||||
|
||||
/*
|
||||
//! Keyword extraction quality assessment tests.
|
||||
//!
|
||||
//! This module tests keyword extraction quality by comparing against ground truth keywords.
|
||||
//! Measures precision, recall, and F1 to ensure default configurations work well out of the box.
|
||||
//!
|
||||
//! Test philosophy:
|
||||
//! - Define ground truth keywords for test documents (domain experts would identify these)
|
||||
//! - Measure how well extracted keywords match ground truth
|
||||
//! - Assert minimum quality thresholds for precision/recall/F1
|
||||
//! - Verify domain relevance of extracted terms
|
||||
|
||||
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
||||
use kreuzberg::keywords::{KeywordConfig, extract_keywords};
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Ground truth keywords for ML document.
|
||||
/// These are the terms a machine learning expert would identify as key concepts.
|
||||
#[allow(dead_code)]
|
||||
fn get_ml_ground_truth() -> HashSet<&'static str> {
|
||||
[
|
||||
"machine learning",
|
||||
"artificial intelligence",
|
||||
"deep learning",
|
||||
"neural networks",
|
||||
"artificial neural networks",
|
||||
"convolutional neural networks",
|
||||
"algorithms",
|
||||
"training data",
|
||||
"supervised learning",
|
||||
"unsupervised learning",
|
||||
"semi-supervised",
|
||||
"natural language processing",
|
||||
"computer science",
|
||||
"model",
|
||||
"predictions",
|
||||
"data",
|
||||
"learning",
|
||||
]
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Ground truth keywords for climate change document.
|
||||
#[allow(dead_code)]
|
||||
fn get_climate_ground_truth() -> HashSet<&'static str> {
|
||||
[
|
||||
"climate change",
|
||||
"global warming",
|
||||
"greenhouse gases",
|
||||
"greenhouse gas emissions",
|
||||
"fossil fuels",
|
||||
"burning fossil fuels",
|
||||
"carbon dioxide",
|
||||
"methane",
|
||||
"temperatures",
|
||||
"weather patterns",
|
||||
"climate system",
|
||||
"human activities",
|
||||
"agriculture",
|
||||
"deforestation",
|
||||
"solar cycle",
|
||||
"earth",
|
||||
]
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[allow(dead_code)]
|
||||
struct KeywordQualityScores {
|
||||
precision: f64,
|
||||
recall: f64,
|
||||
f1: f64,
|
||||
exact_matches: usize,
|
||||
partial_matches: usize,
|
||||
total_extracted: usize,
|
||||
total_ground_truth: usize,
|
||||
}
|
||||
|
||||
impl KeywordQualityScores {
|
||||
fn new(exact_matches: usize, partial_matches: usize, total_extracted: usize, total_ground_truth: usize) -> Self {
|
||||
let precision = if total_extracted > 0 {
|
||||
(exact_matches + partial_matches) as f64 / total_extracted as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let recall = if total_ground_truth > 0 {
|
||||
(exact_matches + partial_matches) as f64 / total_ground_truth as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let f1 = if precision + recall > 0.0 {
|
||||
2.0 * precision * recall / (precision + recall)
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
Self {
|
||||
precision,
|
||||
recall,
|
||||
f1,
|
||||
exact_matches,
|
||||
partial_matches,
|
||||
total_extracted,
|
||||
total_ground_truth,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluate extracted keywords against ground truth.
|
||||
///
|
||||
/// Supports both exact matches and partial matches:
|
||||
/// - Exact: "machine learning" == "machine learning"
|
||||
/// - Partial: "machine" matches "machine learning" (subset)
|
||||
#[allow(dead_code)]
|
||||
fn evaluate_keyword_quality(extracted: &[&str], ground_truth: &HashSet<&str>) -> KeywordQualityScores {
|
||||
let extracted_lower: Vec<String> = extracted.iter().map(|s| s.to_lowercase()).collect();
|
||||
let ground_truth_lower: HashSet<String> = ground_truth.iter().map(|s| s.to_lowercase()).collect();
|
||||
|
||||
let mut exact_matches = 0;
|
||||
let mut partial_matches = 0;
|
||||
let mut matched_ground_truth: HashSet<String> = HashSet::new();
|
||||
|
||||
for extracted_kw in &extracted_lower {
|
||||
if ground_truth_lower.contains(extracted_kw) {
|
||||
exact_matches += 1;
|
||||
matched_ground_truth.insert(extracted_kw.clone());
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut found_partial = false;
|
||||
for gt_kw in &ground_truth_lower {
|
||||
if (gt_kw.contains(extracted_kw) || extracted_kw.contains(gt_kw)) && !matched_ground_truth.contains(gt_kw) {
|
||||
partial_matches += 1;
|
||||
matched_ground_truth.insert(gt_kw.clone());
|
||||
found_partial = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !found_partial {
|
||||
for gt_kw in &ground_truth_lower {
|
||||
let gt_words: Vec<&str> = gt_kw.split_whitespace().collect();
|
||||
let ex_words: HashSet<&str> = extracted_kw.split_whitespace().collect();
|
||||
|
||||
let overlap = gt_words.iter().filter(|w| ex_words.contains(*w)).count();
|
||||
if overlap >= gt_words.len() / 2 && overlap > 0 && !matched_ground_truth.contains(gt_kw) {
|
||||
partial_matches += 1;
|
||||
matched_ground_truth.insert(gt_kw.clone());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
KeywordQualityScores::new(
|
||||
exact_matches,
|
||||
partial_matches,
|
||||
extracted_lower.len(),
|
||||
ground_truth_lower.len(),
|
||||
)
|
||||
}
|
||||
|
||||
/// ML document text (subset for testing).
|
||||
#[allow(dead_code)]
|
||||
const ML_DOC_SAMPLE: &str = r#"
|
||||
Machine learning is a branch of artificial intelligence and computer science which focuses on the use of data and algorithms to imitate the way that humans learn.
|
||||
Machine learning algorithms build a model based on sample data, known as training data, to make predictions or decisions without being explicitly programmed to do so.
|
||||
Deep learning is a type of machine learning based on artificial neural networks. The learning process is deep because the structure of artificial neural networks consists of multiple input, output, and hidden layers.
|
||||
Neural networks can be used for supervised, semi-supervised, and unsupervised learning. Convolutional neural networks are commonly applied to analyzing visual imagery.
|
||||
Natural language processing is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language.
|
||||
"#;
|
||||
|
||||
/// Climate document text (subset for testing).
|
||||
#[allow(dead_code)]
|
||||
const CLIMATE_DOC_SAMPLE: &str = r#"
|
||||
Climate change refers to long-term shifts in temperatures and weather patterns. These shifts may be natural, such as through variations in the solar cycle.
|
||||
But since the 1800s, human activities have been the main driver of climate change, primarily due to burning fossil fuels like coal, oil, and gas.
|
||||
Burning fossil fuels generates greenhouse gas emissions that act like a blanket wrapped around the Earth, trapping the sun's heat and raising temperatures.
|
||||
The main greenhouse gases that are causing climate change include carbon dioxide and methane. These come from burning fossil fuels for energy, agriculture, and deforestation.
|
||||
Global warming is the long-term heating of Earth's climate system. Climate science reveals that human activity has been the dominant cause of climate change since the mid-20th century.
|
||||
"#;
|
||||
|
||||
#[cfg(feature = "keywords-yake")]
|
||||
#[test]
|
||||
fn test_yake_quality_ml_document_default_config() {
|
||||
let config = KeywordConfig::yake();
|
||||
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).expect("Operation failed");
|
||||
|
||||
assert!(!keywords.is_empty(), "Should extract keywords with default config");
|
||||
|
||||
let extracted: Vec<&str> = keywords.iter().map(|k| k.text.as_str()).collect();
|
||||
let ground_truth = get_ml_ground_truth();
|
||||
let scores = evaluate_keyword_quality(&extracted, &ground_truth);
|
||||
|
||||
println!("\nYAKE ML Document Quality (Default Config):");
|
||||
println!(" Extracted: {} keywords", scores.total_extracted);
|
||||
println!(" Ground truth: {} keywords", scores.total_ground_truth);
|
||||
println!(" Exact matches: {}", scores.exact_matches);
|
||||
println!(" Partial matches: {}", scores.partial_matches);
|
||||
println!(" Precision: {:.3}", scores.precision);
|
||||
println!(" Recall: {:.3}", scores.recall);
|
||||
println!(" F1: {:.3}", scores.f1);
|
||||
println!("\nExtracted keywords:");
|
||||
for (i, kw) in keywords.iter().enumerate().take(10) {
|
||||
println!(" {}: {} (score: {:.3})", i + 1, kw.text, kw.score);
|
||||
}
|
||||
|
||||
assert!(
|
||||
scores.precision >= 0.40,
|
||||
"YAKE precision too low with default config: {:.3} (expected >= 0.40). Only {}/{} keywords were relevant.",
|
||||
scores.precision,
|
||||
scores.exact_matches + scores.partial_matches,
|
||||
scores.total_extracted
|
||||
);
|
||||
|
||||
assert!(
|
||||
scores.recall >= 0.30,
|
||||
"YAKE recall too low with default config: {:.3} (expected >= 0.30). Only {}/{} ground truth keywords found.",
|
||||
scores.recall,
|
||||
scores.exact_matches + scores.partial_matches,
|
||||
scores.total_ground_truth
|
||||
);
|
||||
|
||||
assert!(
|
||||
scores.f1 >= 0.30,
|
||||
"YAKE F1 score too low with default config: {:.3} (expected >= 0.30). Precision: {:.3}, Recall: {:.3}",
|
||||
scores.f1,
|
||||
scores.precision,
|
||||
scores.recall
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(feature = "keywords-rake")]
|
||||
#[test]
|
||||
fn test_rake_quality_ml_document_default_config() {
|
||||
let config = KeywordConfig::rake();
|
||||
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).expect("Operation failed");
|
||||
|
||||
assert!(!keywords.is_empty(), "Should extract keywords with default config");
|
||||
|
||||
let extracted: Vec<&str> = keywords.iter().map(|k| k.text.as_str()).collect();
|
||||
let ground_truth = get_ml_ground_truth();
|
||||
let scores = evaluate_keyword_quality(&extracted, &ground_truth);
|
||||
|
||||
println!("\nRAKE ML Document Quality (Default Config):");
|
||||
println!(" Extracted: {} keywords", scores.total_extracted);
|
||||
println!(" Ground truth: {} keywords", scores.total_ground_truth);
|
||||
println!(" Exact matches: {}", scores.exact_matches);
|
||||
println!(" Partial matches: {}", scores.partial_matches);
|
||||
println!(" Precision: {:.3}", scores.precision);
|
||||
println!(" Recall: {:.3}", scores.recall);
|
||||
println!(" F1: {:.3}", scores.f1);
|
||||
println!("\nExtracted keywords:");
|
||||
for (i, kw) in keywords.iter().enumerate().take(10) {
|
||||
println!(" {}: {} (score: {:.3})", i + 1, kw.text, kw.score);
|
||||
}
|
||||
|
||||
assert!(
|
||||
scores.precision >= 0.40,
|
||||
"RAKE precision too low with default config: {:.3} (expected >= 0.40). Only {}/{} keywords were relevant.",
|
||||
scores.precision,
|
||||
scores.exact_matches + scores.partial_matches,
|
||||
scores.total_extracted
|
||||
);
|
||||
|
||||
assert!(
|
||||
scores.recall >= 0.30,
|
||||
"RAKE recall too low with default config: {:.3} (expected >= 0.30). Only {}/{} ground truth keywords found.",
|
||||
scores.recall,
|
||||
scores.exact_matches + scores.partial_matches,
|
||||
scores.total_ground_truth
|
||||
);
|
||||
|
||||
assert!(
|
||||
scores.f1 >= 0.30,
|
||||
"RAKE F1 score too low with default config: {:.3} (expected >= 0.30). Precision: {:.3}, Recall: {:.3}",
|
||||
scores.f1,
|
||||
scores.precision,
|
||||
scores.recall
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(feature = "keywords-yake")]
|
||||
#[test]
|
||||
fn test_yake_quality_climate_document_default_config() {
|
||||
let config = KeywordConfig::yake();
|
||||
let keywords = extract_keywords(CLIMATE_DOC_SAMPLE, &config).expect("Operation failed");
|
||||
|
||||
assert!(!keywords.is_empty(), "Should extract keywords with default config");
|
||||
|
||||
let extracted: Vec<&str> = keywords.iter().map(|k| k.text.as_str()).collect();
|
||||
let ground_truth = get_climate_ground_truth();
|
||||
let scores = evaluate_keyword_quality(&extracted, &ground_truth);
|
||||
|
||||
println!("\nYAKE Climate Document Quality (Default Config):");
|
||||
println!(" Extracted: {} keywords", scores.total_extracted);
|
||||
println!(" Ground truth: {} keywords", scores.total_ground_truth);
|
||||
println!(" Exact matches: {}", scores.exact_matches);
|
||||
println!(" Partial matches: {}", scores.partial_matches);
|
||||
println!(" Precision: {:.3}", scores.precision);
|
||||
println!(" Recall: {:.3}", scores.recall);
|
||||
println!(" F1: {:.3}", scores.f1);
|
||||
println!("\nExtracted keywords:");
|
||||
for (i, kw) in keywords.iter().enumerate().take(10) {
|
||||
println!(" {}: {} (score: {:.3})", i + 1, kw.text, kw.score);
|
||||
}
|
||||
|
||||
assert!(
|
||||
scores.precision >= 0.40,
|
||||
"YAKE precision too low: {:.3} (expected >= 0.40)",
|
||||
scores.precision
|
||||
);
|
||||
assert!(
|
||||
scores.recall >= 0.30,
|
||||
"YAKE recall too low: {:.3} (expected >= 0.30)",
|
||||
scores.recall
|
||||
);
|
||||
assert!(
|
||||
scores.f1 >= 0.30,
|
||||
"YAKE F1 too low: {:.3} (expected >= 0.30)",
|
||||
scores.f1
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(feature = "keywords-rake")]
|
||||
#[test]
|
||||
fn test_rake_quality_climate_document_default_config() {
|
||||
let config = KeywordConfig::rake();
|
||||
let keywords = extract_keywords(CLIMATE_DOC_SAMPLE, &config).expect("Operation failed");
|
||||
|
||||
assert!(!keywords.is_empty(), "Should extract keywords with default config");
|
||||
|
||||
let extracted: Vec<&str> = keywords.iter().map(|k| k.text.as_str()).collect();
|
||||
let ground_truth = get_climate_ground_truth();
|
||||
let scores = evaluate_keyword_quality(&extracted, &ground_truth);
|
||||
|
||||
println!("\nRAKE Climate Document Quality (Default Config):");
|
||||
println!(" Extracted: {} keywords", scores.total_extracted);
|
||||
println!(" Ground truth: {} keywords", scores.total_ground_truth);
|
||||
println!(" Exact matches: {}", scores.exact_matches);
|
||||
println!(" Partial matches: {}", scores.partial_matches);
|
||||
println!(" Precision: {:.3}", scores.precision);
|
||||
println!(" Recall: {:.3}", scores.recall);
|
||||
println!(" F1: {:.3}", scores.f1);
|
||||
println!("\nExtracted keywords:");
|
||||
for (i, kw) in keywords.iter().enumerate().take(10) {
|
||||
println!(" {}: {} (score: {:.3})", i + 1, kw.text, kw.score);
|
||||
}
|
||||
|
||||
assert!(
|
||||
scores.precision >= 0.40,
|
||||
"RAKE precision too low: {:.3} (expected >= 0.40)",
|
||||
scores.precision
|
||||
);
|
||||
assert!(
|
||||
scores.recall >= 0.30,
|
||||
"RAKE recall too low: {:.3} (expected >= 0.30)",
|
||||
scores.recall
|
||||
);
|
||||
assert!(
|
||||
scores.f1 >= 0.30,
|
||||
"RAKE F1 too low: {:.3} (expected >= 0.30)",
|
||||
scores.f1
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(all(feature = "keywords-yake", feature = "keywords-rake"))]
|
||||
#[test]
|
||||
fn test_yake_vs_rake_quality_comparison() {
|
||||
let yake_config = KeywordConfig::yake();
|
||||
let rake_config = KeywordConfig::rake();
|
||||
|
||||
let yake_keywords = extract_keywords(ML_DOC_SAMPLE, &yake_config).expect("Operation failed");
|
||||
let rake_keywords = extract_keywords(ML_DOC_SAMPLE, &rake_config).expect("Operation failed");
|
||||
|
||||
let yake_extracted: Vec<&str> = yake_keywords.iter().map(|k| k.text.as_str()).collect();
|
||||
let rake_extracted: Vec<&str> = rake_keywords.iter().map(|k| k.text.as_str()).collect();
|
||||
|
||||
let ground_truth = get_ml_ground_truth();
|
||||
let yake_scores = evaluate_keyword_quality(&yake_extracted, &ground_truth);
|
||||
let rake_scores = evaluate_keyword_quality(&rake_extracted, &ground_truth);
|
||||
|
||||
println!("\nYAKE vs RAKE Quality Comparison (ML Document):");
|
||||
println!(
|
||||
" YAKE F1: {:.3} (P: {:.3}, R: {:.3})",
|
||||
yake_scores.f1, yake_scores.precision, yake_scores.recall
|
||||
);
|
||||
println!(
|
||||
" RAKE F1: {:.3} (P: {:.3}, R: {:.3})",
|
||||
rake_scores.f1, rake_scores.precision, rake_scores.recall
|
||||
);
|
||||
|
||||
assert!(yake_scores.f1 >= 0.25, "YAKE F1 too low: {:.3}", yake_scores.f1);
|
||||
assert!(rake_scores.f1 >= 0.25, "RAKE F1 too low: {:.3}", rake_scores.f1);
|
||||
|
||||
let best_f1 = yake_scores.f1.max(rake_scores.f1);
|
||||
assert!(
|
||||
best_f1 >= 0.30,
|
||||
"Neither algorithm achieved F1 >= 0.30. Best: {:.3}",
|
||||
best_f1
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(feature = "keywords-yake")]
|
||||
#[test]
|
||||
fn test_yake_quality_with_optimized_config() {
|
||||
let config = KeywordConfig::yake()
|
||||
.with_max_keywords(15)
|
||||
.with_ngram_range(1, 3)
|
||||
.with_min_score(0.0);
|
||||
|
||||
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).expect("Operation failed");
|
||||
|
||||
let extracted: Vec<&str> = keywords.iter().map(|k| k.text.as_str()).collect();
|
||||
let ground_truth = get_ml_ground_truth();
|
||||
let scores = evaluate_keyword_quality(&extracted, &ground_truth);
|
||||
|
||||
println!("\nYAKE ML Document Quality (Optimized Config - max 15, ngrams 1-3):");
|
||||
println!(
|
||||
" F1: {:.3} (P: {:.3}, R: {:.3})",
|
||||
scores.f1, scores.precision, scores.recall
|
||||
);
|
||||
|
||||
assert!(
|
||||
scores.recall >= 0.35,
|
||||
"Optimized config should improve recall: {:.3} (expected >= 0.35)",
|
||||
scores.recall
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(feature = "keywords-rake")]
|
||||
#[test]
|
||||
fn test_rake_quality_with_optimized_config() {
|
||||
let config = KeywordConfig::rake()
|
||||
.with_max_keywords(15)
|
||||
.with_ngram_range(1, 3)
|
||||
.with_min_score(0.0);
|
||||
|
||||
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).expect("Operation failed");
|
||||
|
||||
let extracted: Vec<&str> = keywords.iter().map(|k| k.text.as_str()).collect();
|
||||
let ground_truth = get_ml_ground_truth();
|
||||
let scores = evaluate_keyword_quality(&extracted, &ground_truth);
|
||||
|
||||
println!("\nRAKE ML Document Quality (Optimized Config - max 15, ngrams 1-3):");
|
||||
println!(
|
||||
" F1: {:.3} (P: {:.3}, R: {:.3})",
|
||||
scores.f1, scores.precision, scores.recall
|
||||
);
|
||||
|
||||
assert!(
|
||||
scores.recall >= 0.35,
|
||||
"Optimized config should improve recall: {:.3} (expected >= 0.35)",
|
||||
scores.recall
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
||||
#[test]
|
||||
fn test_extracted_keywords_are_domain_relevant() {
|
||||
let config = KeywordConfig::default();
|
||||
let keywords = extract_keywords(ML_DOC_SAMPLE, &config).expect("Operation failed");
|
||||
|
||||
let ml_terms = [
|
||||
"machine",
|
||||
"learning",
|
||||
"artificial",
|
||||
"intelligence",
|
||||
"neural",
|
||||
"network",
|
||||
"deep",
|
||||
"algorithm",
|
||||
"data",
|
||||
"model",
|
||||
"training",
|
||||
"supervised",
|
||||
"unsupervised",
|
||||
"language",
|
||||
"processing",
|
||||
];
|
||||
|
||||
let relevant_count = keywords
|
||||
.iter()
|
||||
.filter(|kw| {
|
||||
let kw_lower = kw.text.to_lowercase();
|
||||
ml_terms.iter().any(|term| kw_lower.contains(term))
|
||||
})
|
||||
.count();
|
||||
|
||||
let relevance_ratio = relevant_count as f64 / keywords.len() as f64;
|
||||
|
||||
println!("\nDomain Relevance Check:");
|
||||
println!(" Extracted keywords: {}", keywords.len());
|
||||
println!(" Domain-relevant keywords: {}", relevant_count);
|
||||
println!(" Relevance ratio: {:.3}", relevance_ratio);
|
||||
|
||||
assert!(
|
||||
relevance_ratio >= 0.70,
|
||||
"Too many irrelevant keywords extracted. Relevance: {:.3} (expected >= 0.70). Relevant: {}/{}",
|
||||
relevance_ratio,
|
||||
relevant_count,
|
||||
keywords.len()
|
||||
);
|
||||
}
|
||||
|
||||
*/
|
||||
Reference in New Issue
Block a user