//! OCR quality assessment tests. //! //! This module tests OCR quality by comparing against ground truth (native PDF text layers). //! Adopts techniques from scripts/ocr_quality_report.py: //! - Token-based precision/recall/F1 scoring //! - Numeric accuracy tracking (critical for tables, data) //! - Layout fidelity (line count preservation) //! - Markdown structure preservation //! //! Test philosophy: //! - Compare OCR output against searchable PDF text (ground truth) //! - Measure accuracy with precision, recall, F1 metrics //! - Track numeric token accuracy separately (higher importance) //! - Verify layout preservation (line counts, structure) //! - Assert minimum quality thresholds #![cfg(all(feature = "ocr", feature = "pdf"))] mod helpers; use helpers::*; use kreuzberg::core::config::{ExtractionConfig, OcrConfig}; use kreuzberg::extract_file_sync; use std::collections::HashMap; #[derive(Debug, Clone)] struct TokenScores { precision: f64, recall: f64, f1: f64, } impl TokenScores { fn new(precision: f64, recall: f64) -> Self { let f1 = if precision + recall == 0.0 { 0.0 } else { 2.0 * precision * recall / (precision + recall) }; Self { precision, recall, f1 } } } /// Tokenize and normalize text for comparison. /// Matches Python implementation: lowercase, normalize dashes, remove punctuation. fn tokenize_text(text: &str) -> HashMap { let normalized = text .to_lowercase() .replace(['\u{2013}', '\u{2014}'], "-") .chars() .map(|ch| { if ch >= ' ' || ch == '\n' || ch == '\r' || ch == '\t' { ch } else { ' ' } }) .collect::(); let normalized = normalized .chars() .map(|ch| if "()[],.;:+`".contains(ch) { ' ' } else { ch }) .collect::(); let mut tokens: HashMap = HashMap::new(); for token in normalized.split_whitespace() { *tokens.entry(token.to_string()).or_insert(0) += 1; } tokens } /// Extract numeric tokens from token map. /// Critical for measuring accuracy on tables, data, figures. 
fn extract_numeric_tokens(tokens: &HashMap<String, usize>) -> HashMap<String, usize> {
    let mut numeric_tokens: HashMap<String, usize> = HashMap::new();

    for (token, count) in tokens {
        // Compare on the bare value: surrounding brackets/braces are trimmed, so
        // e.g. "{42}" and "42" are merged into the same output key.
        let stripped = token.trim_matches(|c: char| "()[]{}".contains(c));

        // A token is numeric when it has at least one ASCII digit and no ASCII letter.
        let has_digit = stripped.chars().any(|ch| ch.is_ascii_digit());
        let has_letter = stripped.chars().any(|ch| ch.is_ascii_alphabetic());

        if has_digit && !has_letter {
            *numeric_tokens.entry(stripped.to_string()).or_default() += count;
        }
    }

    numeric_tokens
}

/// Calculate precision, recall, F1 for token sets.
///
/// Both maps are treated as multisets (token -> occurrence count). The overlap
/// is the sum, over every truth token, of the smaller of its two counts.
///
/// * precision = overlap / total candidate occurrences (0.0 if the candidate is empty)
/// * recall = overlap / total truth occurrences (0.0 if the truth is empty)
///
/// When both inputs are empty the result is a perfect score (1.0 / 1.0).
fn calculate_token_scores(
    truth_tokens: &HashMap<String, usize>,
    candidate_tokens: &HashMap<String, usize>,
) -> TokenScores {
    let truth_total: usize = truth_tokens.values().sum();
    let candidate_total: usize = candidate_tokens.values().sum();

    // Nothing expected and nothing produced counts as a perfect match.
    if truth_total == 0 && candidate_total == 0 {
        return TokenScores::new(1.0, 1.0);
    }

    let overlap: usize = truth_tokens
        .iter()
        .map(|(token, &truth_count)| {
            let candidate_count = candidate_tokens.get(token).copied().unwrap_or(0);
            truth_count.min(candidate_count)
        })
        .sum();

    let precision = if candidate_total > 0 {
        overlap as f64 / candidate_total as f64
    } else {
        0.0
    };

    let recall = if truth_total > 0 {
        overlap as f64 / truth_total as f64
    } else {
        0.0
    };

    TokenScores::new(precision, recall)
}

/// Count non-empty lines in text (layout fidelity metric).
///
/// A line is considered empty when it contains nothing but whitespace.
fn count_lines(text: &str) -> usize {
    text.lines().filter(|line| !line.trim().is_empty()).count()
}

/// Calculate relative layout delta (0.0 = perfect, 1.0 = worst).
fn layout_delta(truth_lines: usize, ocr_lines: usize) -> f64 {
    // With no reference lines, any OCR line at all is treated as the worst case.
    if truth_lines == 0 {
        return if ocr_lines == 0 { 0.0 } else { 1.0 };
    }

    // Relative difference in line count, capped at 1.0.
    let delta = (ocr_lines as f64 - truth_lines as f64).abs() / truth_lines as f64;
    delta.min(1.0)
}

/// Forced Tesseract OCR of `pdfs/fake_memo.pdf` must reach a token F1 of at
/// least 0.70 against the default (non-OCR) extraction of the same file.
#[test]
fn test_ocr_quality_simple_text_high_accuracy() {
    if skip_if_missing("pdfs/fake_memo.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/fake_memo.pdf");

    // Default extraction serves as the ground truth.
    let truth_result =
        extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");

    assert!(
        truth_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        truth_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let ocr_config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");

    assert!(
        ocr_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        ocr_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Diagnostics to make a failing run easier to investigate.
    println!("Truth content length: {}", truth_result.content.len());
    println!("OCR content length: {}", ocr_result.content.len());
    println!(
        "Truth first 100 chars: {:?}",
        &truth_result.content.chars().take(100).collect::<String>()
    );
    println!(
        "OCR first 100 chars: {:?}",
        &ocr_result.content.chars().take(100).collect::<String>()
    );

    let truth_tokens = tokenize_text(&truth_result.content);
    let ocr_tokens = tokenize_text(&ocr_result.content);

    // `len()` here is the number of distinct tokens, not total occurrences.
    println!("Truth token count: {}", truth_tokens.len());
    println!("OCR token count: {}", ocr_tokens.len());

    let scores = calculate_token_scores(&truth_tokens, &ocr_tokens);

    println!("Simple text OCR quality:");
    println!(" Precision: {:.3}", scores.precision);
    println!(" Recall: {:.3}", scores.recall);
    println!(" F1: {:.3}", scores.f1);

    assert!(
        scores.f1 >= 0.70,
        "OCR F1 score too low: {:.3} (expected >= 0.70). Precision: {:.3}, Recall: {:.3}",
        scores.f1,
        scores.precision,
        scores.recall
    );
}

/// Numeric tokens recovered by forced OCR of `pdfs/embedded_images_tables.pdf`
/// must reach an F1 of at least 0.75 against the default extraction.
///
/// The threshold is only checked when the ground truth contains numeric tokens.
#[test]
fn test_ocr_quality_numeric_accuracy() {
    if skip_if_missing("pdfs/embedded_images_tables.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/embedded_images_tables.pdf");

    let truth_result =
        extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");

    assert!(
        truth_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        truth_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let ocr_config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");

    assert!(
        ocr_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        ocr_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let truth_tokens = tokenize_text(&truth_result.content);
    let ocr_tokens = tokenize_text(&ocr_result.content);

    let truth_numeric = extract_numeric_tokens(&truth_tokens);
    let ocr_numeric = extract_numeric_tokens(&ocr_tokens);

    if !truth_numeric.is_empty() {
        let numeric_scores = calculate_token_scores(&truth_numeric, &ocr_numeric);

        println!("Numeric token OCR quality:");
        println!(" Precision: {:.3}", numeric_scores.precision);
        println!(" Recall: {:.3}", numeric_scores.recall);
        println!(" F1: {:.3}", numeric_scores.f1);
        println!(" Numeric tokens in truth: {}", truth_numeric.len());
        println!(" Numeric tokens in OCR: {}", ocr_numeric.len());

        assert!(
            numeric_scores.f1 >= 0.75,
            "Numeric F1 score too low: {:.3} (expected >= 0.75). Numbers must be accurate!",
            numeric_scores.f1
        );
    }
}

/// The non-empty line count of forced OCR output for `pdfs/fake_memo.pdf` may
/// differ from the default extraction by at most 40% (relative layout delta).
#[test]
fn test_ocr_quality_layout_preservation() {
    if skip_if_missing("pdfs/fake_memo.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/fake_memo.pdf");

    let truth_result =
        extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");

    assert!(
        truth_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        truth_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let ocr_config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");

    assert!(
        ocr_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        ocr_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let truth_lines = count_lines(&truth_result.content);
    let ocr_lines = count_lines(&ocr_result.content);
    let delta = layout_delta(truth_lines, ocr_lines);

    println!("Layout preservation:");
    println!(" Truth lines: {}", truth_lines);
    println!(" OCR lines: {}", ocr_lines);
    println!(" Layout delta: {:.3}", delta);

    assert!(
        delta <= 0.40,
        "Layout delta too high: {:.3} (expected <= 0.40). Truth: {} lines, OCR: {} lines",
        delta,
        truth_lines,
        ocr_lines
    );
}

/// Forced OCR of `pdfs/code_and_formula.pdf` must reach a token F1 of at least
/// 0.60 against the default extraction.
#[test]
fn test_ocr_quality_technical_document() {
    if skip_if_missing("pdfs/code_and_formula.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/code_and_formula.pdf");

    let truth_result =
        extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");

    assert!(
        truth_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        truth_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let ocr_config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");

    assert!(
        ocr_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        ocr_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let truth_tokens = tokenize_text(&truth_result.content);
    let ocr_tokens = tokenize_text(&ocr_result.content);
    let scores = calculate_token_scores(&truth_tokens, &ocr_tokens);

    println!("Technical document OCR quality:");
    println!(" Precision: {:.3}", scores.precision);
    println!(" Recall: {:.3}", scores.recall);
    println!(" F1: {:.3}", scores.f1);

    assert!(
        scores.f1 >= 0.60,
        "Technical document F1 score too low: {:.3} (expected >= 0.60)",
        scores.f1
    );
}

/// Three forced OCR runs over `pdfs/fake_memo.pdf` with `use_cache: false`
/// must agree with the first run at a token F1 of at least 0.98.
#[test]
fn test_ocr_consistency_across_runs() {
    if skip_if_missing("pdfs/fake_memo.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/fake_memo.pdf");

    let ocr_config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        use_cache: false,
        ..Default::default()
    };

    let result1 = extract_file_sync(&file_path, None, &ocr_config).expect("First OCR run should succeed");
    let result2 = extract_file_sync(&file_path, None, &ocr_config).expect("Second OCR run should succeed");
    let result3 = extract_file_sync(&file_path, None, &ocr_config).expect("Third OCR run should succeed");

    assert!(
        result1.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(result1.detected_languages.is_none(), "Language detection not enabled");
    assert!(
        result2.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(result2.detected_languages.is_none(), "Language detection not enabled");
    assert!(
        result3.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(result3.detected_languages.is_none(), "Language detection not enabled");

    let tokens1 = tokenize_text(&result1.content);
    let tokens2 = tokenize_text(&result2.content);
    let tokens3 = tokenize_text(&result3.content);

    // Run 1 is the reference; runs 2 and 3 are each compared against it.
    let scores_1_2 = calculate_token_scores(&tokens1, &tokens2);
    let scores_1_3 = calculate_token_scores(&tokens1, &tokens3);

    println!("OCR consistency across runs:");
    println!(" Run1 vs Run2 F1: {:.3}", scores_1_2.f1);
    println!(" Run1 vs Run3 F1: {:.3}", scores_1_3.f1);

    assert!(
        scores_1_2.f1 >= 0.98,
        "OCR inconsistent between runs: F1 {:.3} (expected >= 0.98)",
        scores_1_2.f1
    );
    assert!(
        scores_1_3.f1 >= 0.98,
        "OCR inconsistent between runs: F1 {:.3} (expected >= 0.98)",
        scores_1_3.f1
    );
}

/// Forced OCR of `pdfs/fake_memo.pdf` with Tesseract `psm: 3` and `psm: 6`
/// must agree with each other at a token F1 of at least 0.85.
#[test]
fn test_ocr_consistency_with_different_psm() {
    if skip_if_missing("pdfs/fake_memo.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/fake_memo.pdf");

    let config_psm3 = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            tesseract_config: Some(kreuzberg::types::TesseractConfig {
                psm: 3,
                ..Default::default()
            }),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let config_psm6 = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            tesseract_config: Some(kreuzberg::types::TesseractConfig {
                psm: 6,
                ..Default::default()
            }),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let result_psm3 = extract_file_sync(&file_path, None, &config_psm3).expect("PSM 3 extraction should succeed");
    let result_psm6 = extract_file_sync(&file_path, None, &config_psm6).expect("PSM 6 extraction should succeed");

    assert!(
        result_psm3.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        result_psm3.detected_languages.is_none(),
        "Language detection not enabled"
    );
    assert!(
        result_psm6.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        result_psm6.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let tokens_psm3 = tokenize_text(&result_psm3.content);
    let tokens_psm6 = tokenize_text(&result_psm6.content);
    let scores = calculate_token_scores(&tokens_psm3, &tokens_psm6);

    println!("OCR consistency across PSM modes:");
    println!(" PSM 3 vs PSM 6 F1: {:.3}", scores.f1);

    assert!(
        scores.f1 >= 0.85,
        "PSM modes produce too different results: F1 {:.3} (expected >= 0.85)",
        scores.f1
    );
}

/// Forced OCR of the multi-page `a_course_in_machine_learning_ciml_v0_9_all.pdf`
/// must yield at least 50% as many token occurrences as the default extraction.
///
/// Opt-in: the test returns early unless the `KREUZBERG_RUN_FULL_OCR`
/// environment variable is set.
#[test]
fn test_ocr_quality_multi_page_consistency() {
    if skip_if_missing("pdfs/a_course_in_machine_learning_ciml_v0_9_all.pdf") {
        return;
    }

    if std::env::var_os("KREUZBERG_RUN_FULL_OCR").is_none() {
        println!("Skipping test_ocr_quality_multi_page_consistency: set KREUZBERG_RUN_FULL_OCR=1 to enable");
        return;
    }

    let file_path = get_test_file_path("pdfs/a_course_in_machine_learning_ciml_v0_9_all.pdf");

    let truth_result =
        extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");

    assert!(
        truth_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        truth_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let ocr_config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");

    assert!(
        ocr_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        ocr_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let truth_tokens = tokenize_text(&truth_result.content);
    let ocr_tokens = tokenize_text(&ocr_result.content);

    // Total occurrences (sum of counts), not the number of distinct tokens.
    let truth_count: usize = truth_tokens.values().sum();
    let ocr_count: usize = ocr_tokens.values().sum();

    println!("Multi-page document quality:");
    println!(" Truth token count: {}", truth_count);
    println!(" OCR token count: {}", ocr_count);

    assert!(
        ocr_count >= (truth_count * 50 / 100),
        "OCR extracted too few tokens: {} (expected >= 50% of {})",
        ocr_count,
        truth_count
    );
}

/// Forced OCR of `pdfs/embedded_images_tables.pdf` with table detection enabled
/// must produce non-blank content.
///
/// The number of detected tables is printed for inspection but not asserted.
#[test]
fn test_ocr_quality_with_tables() {
    if skip_if_missing("pdfs/embedded_images_tables.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/embedded_images_tables.pdf");

    let ocr_config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            tesseract_config: Some(kreuzberg::types::TesseractConfig {
                enable_table_detection: true,
                table_min_confidence: 0.5,
                ..Default::default()
            }),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with table detection");

    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(result.detected_languages.is_none(), "Language detection not enabled");

    println!("Table extraction quality:");
    println!(" Tables found: {}", result.tables.len());
    println!(" Content length: {}", result.content.len());

    assert!(
        !result.content.trim().is_empty(),
        "OCR with tables should produce content"
    );
}