Files
fil/crates/kreuzberg/tests/ocr_quality.rs

633 lines
19 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
//! OCR quality assessment tests.
//!
//! This module tests OCR quality by comparing against ground truth (native PDF text layers).
//! Adopts techniques from scripts/ocr_quality_report.py:
//! - Token-based precision/recall/F1 scoring
//! - Numeric accuracy tracking (critical for tables, data)
//! - Layout fidelity (line count preservation)
//! - Markdown structure preservation
//!
//! Test philosophy:
//! - Compare OCR output against searchable PDF text (ground truth)
//! - Measure accuracy with precision, recall, F1 metrics
//! - Track numeric token accuracy separately (higher importance)
//! - Verify layout preservation (line counts, structure)
//! - Assert minimum quality thresholds
#![cfg(all(feature = "ocr", feature = "pdf"))]
mod helpers;
use helpers::*;
use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
use kreuzberg::extract_file_sync;
use std::collections::HashMap;
/// Precision / recall / F1 triple describing how closely a candidate token
/// multiset matches a reference one.
#[derive(Debug, Clone)]
struct TokenScores {
    /// Precision value exactly as supplied to [`TokenScores::new`].
    precision: f64,
    /// Recall value exactly as supplied to [`TokenScores::new`].
    recall: f64,
    /// Harmonic mean of `precision` and `recall`; `0.0` when their sum is zero.
    f1: f64,
}

impl TokenScores {
    /// Builds a score triple, deriving `f1` from `precision` and `recall`.
    ///
    /// The harmonic mean would divide by zero when `precision + recall` is
    /// zero, so `f1` is defined as `0.0` in that case.
    fn new(precision: f64, recall: f64) -> Self {
        let denominator = precision + recall;
        let f1 = if denominator == 0.0 {
            0.0
        } else {
            2.0 * precision * recall / denominator
        };
        Self { precision, recall, f1 }
    }
}
/// Normalizes `text` and returns how often each whitespace-separated token occurs.
///
/// Normalization steps, applied per character after lowercasing the input:
/// - en dash (U+2013) and em dash (U+2014) become `-`;
/// - control characters below U+0020 other than `\n`, `\r`, `\t` become a space;
/// - the punctuation characters ``( ) [ ] , . ; : + ` `` become a space.
///
/// The doc comment this replaces states that the scheme matches the Python
/// implementation (lowercase, normalize dashes, remove punctuation).
fn tokenize_text(text: &str) -> HashMap<String, usize> {
    const SEPARATORS: &str = "()[],.;:+`";

    // `String::to_lowercase` is used (rather than per-char lowercasing) so
    // context-sensitive mappings behave exactly as for whole-string lowercasing.
    let cleaned: String = text
        .to_lowercase()
        .chars()
        .map(|ch| match ch {
            '\u{2013}' | '\u{2014}' => '-',
            '\n' | '\r' | '\t' => ch,
            _ if ch < ' ' => ' ',
            _ if SEPARATORS.contains(ch) => ' ',
            _ => ch,
        })
        .collect();

    cleaned
        .split_whitespace()
        .fold(HashMap::new(), |mut counts, word| {
            *counts.entry(word.to_owned()).or_default() += 1;
            counts
        })
}
/// Selects the numeric tokens from a token-count map.
///
/// Each token first has leading/trailing `( ) [ ] { }` stripped. It is kept
/// when the remainder contains at least one ASCII digit and no ASCII letter;
/// counts of tokens that strip to the same text are added together.
/// Numeric tokens are scored separately because they matter most for tables,
/// data and figures.
fn extract_numeric_tokens(tokens: &HashMap<String, usize>) -> HashMap<String, usize> {
    tokens
        .iter()
        .map(|(raw, &occurrences)| {
            let core = raw.trim_matches(|c: char| matches!(c, '(' | ')' | '[' | ']' | '{' | '}'));
            (core, occurrences)
        })
        .filter(|(core, _)| {
            let has_digit = core.chars().any(|ch| ch.is_ascii_digit());
            let has_letter = core.chars().any(|ch| ch.is_ascii_alphabetic());
            has_digit && !has_letter
        })
        .fold(HashMap::new(), |mut numeric, (core, occurrences)| {
            *numeric.entry(core.to_owned()).or_default() += occurrences;
            numeric
        })
}
/// Scores a candidate token multiset against the reference one.
///
/// The overlap is the sum, over every token in `truth_tokens`, of the smaller
/// of its two counts. Precision is `overlap / candidate_total` and recall is
/// `overlap / truth_total`, each defined as `0.0` when its denominator is zero.
/// When both maps total zero tokens the result is a perfect `(1.0, 1.0)` score.
fn calculate_token_scores(
    truth_tokens: &HashMap<String, usize>,
    candidate_tokens: &HashMap<String, usize>,
) -> TokenScores {
    let truth_total: usize = truth_tokens.values().sum();
    let candidate_total: usize = candidate_tokens.values().sum();

    // Nothing expected and nothing produced counts as a perfect match.
    if (truth_total, candidate_total) == (0, 0) {
        return TokenScores::new(1.0, 1.0);
    }

    let mut shared: usize = 0;
    for (token, &expected) in truth_tokens {
        let found = candidate_tokens.get(token).copied().unwrap_or(0);
        shared += expected.min(found);
    }

    let ratio_of = |total: usize| {
        if total > 0 {
            shared as f64 / total as f64
        } else {
            0.0
        }
    };

    TokenScores::new(ratio_of(candidate_total), ratio_of(truth_total))
}
/// Returns the number of lines in `text` that contain something other than
/// whitespace. Used as the layout-fidelity measure.
fn count_lines(text: &str) -> usize {
    let mut non_blank = 0;
    for line in text.lines() {
        if !line.trim().is_empty() {
            non_blank += 1;
        }
    }
    non_blank
}
/// Relative difference between the OCR line count and the reference line
/// count, capped at `1.0` (`0.0` = identical, `1.0` = worst).
///
/// With no reference lines the result is `0.0` if the OCR output is also
/// empty and `1.0` otherwise.
fn layout_delta(truth_lines: usize, ocr_lines: usize) -> f64 {
    match (truth_lines, ocr_lines) {
        (0, 0) => 0.0,
        (0, _) => 1.0,
        (expected, actual) => {
            let expected = expected as f64;
            let relative = (actual as f64 - expected).abs() / expected;
            relative.min(1.0)
        }
    }
}
/// Forced Tesseract OCR of the memo fixture must reach a token-level F1 of at
/// least 0.70 against the text extracted with the default configuration.
#[test]
fn test_ocr_quality_simple_text_high_accuracy() {
    if skip_if_missing("pdfs/fake_memo.pdf") {
        return;
    }
    let file_path = get_test_file_path("pdfs/fake_memo.pdf");

    // Reference text: extraction with the default configuration.
    let truth_result =
        extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");
    assert!(
        truth_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        truth_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Candidate text: same file, OCR forced through the Tesseract backend.
    let ocr_config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };
    let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");
    assert!(
        ocr_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        ocr_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    println!("Truth content length: {}", truth_result.content.len());
    println!("OCR content length: {}", ocr_result.content.len());
    println!(
        "Truth first 100 chars: {:?}",
        truth_result.content.chars().take(100).collect::<String>()
    );
    println!(
        "OCR first 100 chars: {:?}",
        ocr_result.content.chars().take(100).collect::<String>()
    );

    let truth_tokens = tokenize_text(&truth_result.content);
    let ocr_tokens = tokenize_text(&ocr_result.content);

    // Report total token occurrences, i.e. the quantities precision and recall
    // are computed over. `HashMap::len()` would only give the number of
    // distinct tokens, which does not line up with the scores printed below.
    let truth_token_total: usize = truth_tokens.values().sum();
    let ocr_token_total: usize = ocr_tokens.values().sum();
    println!("Truth token count: {}", truth_token_total);
    println!("OCR token count: {}", ocr_token_total);

    let scores = calculate_token_scores(&truth_tokens, &ocr_tokens);
    println!("Simple text OCR quality:");
    println!(" Precision: {:.3}", scores.precision);
    println!(" Recall: {:.3}", scores.recall);
    println!(" F1: {:.3}", scores.f1);
    assert!(
        scores.f1 >= 0.70,
        "OCR F1 score too low: {:.3} (expected >= 0.70). Precision: {:.3}, Recall: {:.3}",
        scores.f1,
        scores.precision,
        scores.recall
    );
}
/// Numeric tokens recovered by forced Tesseract OCR of the tables fixture must
/// reach an F1 of at least 0.75 against those in the default-config extraction.
///
/// The threshold is only checked when the reference text contains numeric
/// tokens; otherwise the test reports that the check was skipped.
#[test]
fn test_ocr_quality_numeric_accuracy() {
    if skip_if_missing("pdfs/embedded_images_tables.pdf") {
        return;
    }
    let file_path = get_test_file_path("pdfs/embedded_images_tables.pdf");

    // Reference text: extraction with the default configuration.
    let truth_result =
        extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");
    assert!(
        truth_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        truth_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Candidate text: same file, OCR forced through the Tesseract backend.
    let ocr_config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };
    let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");
    assert!(
        ocr_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        ocr_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let truth_tokens = tokenize_text(&truth_result.content);
    let ocr_tokens = tokenize_text(&ocr_result.content);
    let truth_numeric = extract_numeric_tokens(&truth_tokens);
    let ocr_numeric = extract_numeric_tokens(&ocr_tokens);

    if !truth_numeric.is_empty() {
        let numeric_scores = calculate_token_scores(&truth_numeric, &ocr_numeric);

        // Totals of occurrences, matching what the scores are computed over
        // (`HashMap::len()` would count distinct tokens only).
        let truth_numeric_total: usize = truth_numeric.values().sum();
        let ocr_numeric_total: usize = ocr_numeric.values().sum();

        println!("Numeric token OCR quality:");
        println!(" Precision: {:.3}", numeric_scores.precision);
        println!(" Recall: {:.3}", numeric_scores.recall);
        println!(" F1: {:.3}", numeric_scores.f1);
        println!(" Numeric tokens in truth: {}", truth_numeric_total);
        println!(" Numeric tokens in OCR: {}", ocr_numeric_total);
        assert!(
            numeric_scores.f1 >= 0.75,
            "Numeric F1 score too low: {:.3} (expected >= 0.75). Numbers must be accurate!",
            numeric_scores.f1
        );
    } else {
        // Make the vacuous pass visible instead of silently asserting nothing.
        println!("Skipping numeric accuracy assertion: ground truth contains no numeric tokens");
    }
}
/// Layout fidelity check for the memo fixture.
///
/// Counts non-empty lines in the default-config extraction and in the forced
/// Tesseract OCR output, and requires the relative difference reported by
/// `layout_delta` to be at most 0.40.
#[test]
fn test_ocr_quality_layout_preservation() {
    const FIXTURE: &str = "pdfs/fake_memo.pdf";
    if skip_if_missing(FIXTURE) {
        return;
    }
    let pdf_path = get_test_file_path(FIXTURE);

    // Reference extraction with the default configuration.
    let reference = extract_file_sync(&pdf_path, None, &ExtractionConfig::default())
        .expect("Should extract ground truth text");
    assert!(reference.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(reference.detected_languages.is_none(), "Language detection not enabled");

    // Same file again, this time forcing OCR through Tesseract.
    let tesseract = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let forced_ocr = ExtractionConfig {
        ocr: Some(tesseract),
        force_ocr: true,
        ..Default::default()
    };
    let recognized = extract_file_sync(&pdf_path, None, &forced_ocr).expect("Should extract with OCR");
    assert!(recognized.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(recognized.detected_languages.is_none(), "Language detection not enabled");

    let expected_lines = count_lines(&reference.content);
    let actual_lines = count_lines(&recognized.content);
    let drift = layout_delta(expected_lines, actual_lines);

    println!("Layout preservation:");
    println!(" Truth lines: {}", expected_lines);
    println!(" OCR lines: {}", actual_lines);
    println!(" Layout delta: {:.3}", drift);
    assert!(
        drift <= 0.40,
        "Layout delta too high: {:.3} (expected <= 0.40). Truth: {} lines, OCR: {} lines",
        drift,
        expected_lines,
        actual_lines
    );
}
/// Forced Tesseract OCR of the code-and-formula fixture must reach a
/// token-level F1 of at least 0.60 against the default-config extraction.
#[test]
fn test_ocr_quality_technical_document() {
    const FIXTURE: &str = "pdfs/code_and_formula.pdf";
    if skip_if_missing(FIXTURE) {
        return;
    }
    let pdf_path = get_test_file_path(FIXTURE);

    // Reference extraction with the default configuration.
    let reference = extract_file_sync(&pdf_path, None, &ExtractionConfig::default())
        .expect("Should extract ground truth text");
    assert!(reference.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(reference.detected_languages.is_none(), "Language detection not enabled");

    // Same file again, this time forcing OCR through Tesseract.
    let tesseract = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let forced_ocr = ExtractionConfig {
        ocr: Some(tesseract),
        force_ocr: true,
        ..Default::default()
    };
    let recognized = extract_file_sync(&pdf_path, None, &forced_ocr).expect("Should extract with OCR");
    assert!(recognized.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(recognized.detected_languages.is_none(), "Language detection not enabled");

    let reference_tokens = tokenize_text(&reference.content);
    let recognized_tokens = tokenize_text(&recognized.content);
    let TokenScores { precision, recall, f1 } = calculate_token_scores(&reference_tokens, &recognized_tokens);

    println!("Technical document OCR quality:");
    println!(" Precision: {:.3}", precision);
    println!(" Recall: {:.3}", recall);
    println!(" F1: {:.3}", f1);
    assert!(
        f1 >= 0.60,
        "Technical document F1 score too low: {:.3} (expected >= 0.60)",
        f1
    );
}
/// Runs forced Tesseract OCR on the memo fixture three times with
/// `use_cache: false` and requires the second and third runs to score an F1
/// of at least 0.98 against the first.
#[test]
fn test_ocr_consistency_across_runs() {
    const FIXTURE: &str = "pdfs/fake_memo.pdf";
    if skip_if_missing(FIXTURE) {
        return;
    }
    let pdf_path = get_test_file_path(FIXTURE);

    let tesseract = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let uncached_ocr = ExtractionConfig {
        ocr: Some(tesseract),
        force_ocr: true,
        use_cache: false,
        ..Default::default()
    };

    let first = extract_file_sync(&pdf_path, None, &uncached_ocr).expect("First OCR run should succeed");
    let second = extract_file_sync(&pdf_path, None, &uncached_ocr).expect("Second OCR run should succeed");
    let third = extract_file_sync(&pdf_path, None, &uncached_ocr).expect("Third OCR run should succeed");

    // Same two sanity checks for every run, in run order.
    for run in [&first, &second, &third] {
        assert!(run.chunks.is_none(), "Chunks should be None without chunking config");
        assert!(run.detected_languages.is_none(), "Language detection not enabled");
    }

    let first_tokens = tokenize_text(&first.content);
    let second_tokens = tokenize_text(&second.content);
    let third_tokens = tokenize_text(&third.content);

    // The first run is the baseline both later runs are compared against.
    let f1_first_vs_second = calculate_token_scores(&first_tokens, &second_tokens).f1;
    let f1_first_vs_third = calculate_token_scores(&first_tokens, &third_tokens).f1;

    println!("OCR consistency across runs:");
    println!(" Run1 vs Run2 F1: {:.3}", f1_first_vs_second);
    println!(" Run1 vs Run3 F1: {:.3}", f1_first_vs_third);

    for f1 in [f1_first_vs_second, f1_first_vs_third] {
        assert!(
            f1 >= 0.98,
            "OCR inconsistent between runs: F1 {:.3} (expected >= 0.98)",
            f1
        );
    }
}
/// Runs forced Tesseract OCR on the memo fixture with `psm` set to 3 and to 6
/// and requires the two outputs to score a mutual F1 of at least 0.85.
#[test]
fn test_ocr_consistency_with_different_psm() {
    const FIXTURE: &str = "pdfs/fake_memo.pdf";
    if skip_if_missing(FIXTURE) {
        return;
    }
    let pdf_path = get_test_file_path(FIXTURE);

    // The two configurations differ only in the `psm` value.
    let config_with_psm = |psm| ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            tesseract_config: Some(kreuzberg::types::TesseractConfig {
                psm,
                ..Default::default()
            }),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };
    let psm3_config = config_with_psm(3);
    let psm6_config = config_with_psm(6);

    let psm3_output = extract_file_sync(&pdf_path, None, &psm3_config).expect("PSM 3 extraction should succeed");
    let psm6_output = extract_file_sync(&pdf_path, None, &psm6_config).expect("PSM 6 extraction should succeed");

    for output in [&psm3_output, &psm6_output] {
        assert!(output.chunks.is_none(), "Chunks should be None without chunking config");
        assert!(output.detected_languages.is_none(), "Language detection not enabled");
    }

    let psm3_tokens = tokenize_text(&psm3_output.content);
    let psm6_tokens = tokenize_text(&psm6_output.content);
    let agreement = calculate_token_scores(&psm3_tokens, &psm6_tokens).f1;

    println!("OCR consistency across PSM modes:");
    println!(" PSM 3 vs PSM 6 F1: {:.3}", agreement);
    assert!(
        agreement >= 0.85,
        "PSM modes produce too different results: F1 {:.3} (expected >= 0.85)",
        agreement
    );
}
/// Opt-in check on a large multi-page PDF: forced Tesseract OCR must yield at
/// least 50% as many tokens as the default-config extraction.
///
/// Only runs when the `KREUZBERG_RUN_FULL_OCR` environment variable is set.
#[test]
fn test_ocr_quality_multi_page_consistency() {
    const FIXTURE: &str = "pdfs/a_course_in_machine_learning_ciml_v0_9_all.pdf";
    if skip_if_missing(FIXTURE) {
        return;
    }

    let opted_in = std::env::var_os("KREUZBERG_RUN_FULL_OCR").is_some();
    if !opted_in {
        println!("Skipping test_ocr_quality_multi_page_consistency: set KREUZBERG_RUN_FULL_OCR=1 to enable");
        return;
    }

    let pdf_path = get_test_file_path(FIXTURE);

    // Reference extraction with the default configuration.
    let reference = extract_file_sync(&pdf_path, None, &ExtractionConfig::default())
        .expect("Should extract ground truth text");
    assert!(reference.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(reference.detected_languages.is_none(), "Language detection not enabled");

    // Same file again, this time forcing OCR through Tesseract.
    let tesseract = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let forced_ocr = ExtractionConfig {
        ocr: Some(tesseract),
        force_ocr: true,
        ..Default::default()
    };
    let recognized = extract_file_sync(&pdf_path, None, &forced_ocr).expect("Should extract with OCR");
    assert!(recognized.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(recognized.detected_languages.is_none(), "Language detection not enabled");

    // Totals are occurrence counts, not distinct-token counts.
    let reference_tokens = tokenize_text(&reference.content);
    let recognized_tokens = tokenize_text(&recognized.content);
    let truth_count: usize = reference_tokens.values().sum();
    let ocr_count: usize = recognized_tokens.values().sum();

    println!("Multi-page document quality:");
    println!(" Truth token count: {}", truth_count);
    println!(" OCR token count: {}", ocr_count);

    let minimum_expected = truth_count * 50 / 100;
    assert!(
        ocr_count >= minimum_expected,
        "OCR extracted too few tokens: {} (expected >= 50% of {})",
        ocr_count,
        truth_count
    );
}
/// Smoke test: forced Tesseract OCR with table detection enabled
/// (`table_min_confidence` 0.5) on the tables fixture must produce
/// non-blank content. The number of detected tables is only printed.
#[test]
fn test_ocr_quality_with_tables() {
    const FIXTURE: &str = "pdfs/embedded_images_tables.pdf";
    if skip_if_missing(FIXTURE) {
        return;
    }
    let pdf_path = get_test_file_path(FIXTURE);

    let table_aware_tesseract = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(kreuzberg::types::TesseractConfig {
            enable_table_detection: true,
            table_min_confidence: 0.5,
            ..Default::default()
        }),
        ..Default::default()
    };
    let forced_ocr = ExtractionConfig {
        ocr: Some(table_aware_tesseract),
        force_ocr: true,
        ..Default::default()
    };

    let extraction = extract_file_sync(&pdf_path, None, &forced_ocr).expect("Should extract with table detection");
    assert!(extraction.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extraction.detected_languages.is_none(), "Language detection not enabled");

    println!("Table extraction quality:");
    println!(" Tables found: {}", extraction.tables.len());
    println!(" Content length: {}", extraction.content.len());

    let has_text = !extraction.content.trim().is_empty();
    assert!(has_text, "OCR with tables should produce content");
}