633 lines
19 KiB
Rust
633 lines
19 KiB
Rust
|
|
//! OCR quality assessment tests.
|
||
|
|
//!
|
||
|
|
//! This module tests OCR quality by comparing against ground truth (native PDF text layers).
|
||
|
|
//! Adopts techniques from scripts/ocr_quality_report.py:
|
||
|
|
//! - Token-based precision/recall/F1 scoring
|
||
|
|
//! - Numeric accuracy tracking (critical for tables, data)
|
||
|
|
//! - Layout fidelity (line count preservation)
|
||
|
|
//! - Markdown structure preservation
|
||
|
|
//!
|
||
|
|
//! Test philosophy:
|
||
|
|
//! - Compare OCR output against searchable PDF text (ground truth)
|
||
|
|
//! - Measure accuracy with precision, recall, F1 metrics
|
||
|
|
//! - Track numeric token accuracy separately (higher importance)
|
||
|
|
//! - Verify layout preservation (line counts, structure)
|
||
|
|
//! - Assert minimum quality thresholds
|
||
|
|
|
||
|
|
#![cfg(all(feature = "ocr", feature = "pdf"))]
|
||
|
|
|
||
|
|
mod helpers;
|
||
|
|
|
||
|
|
use helpers::*;
|
||
|
|
use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
|
||
|
|
use kreuzberg::extract_file_sync;
|
||
|
|
use std::collections::HashMap;
|
||
|
|
|
||
|
|
#[derive(Debug, Clone)]
|
||
|
|
struct TokenScores {
|
||
|
|
precision: f64,
|
||
|
|
recall: f64,
|
||
|
|
f1: f64,
|
||
|
|
}
|
||
|
|
|
||
|
|
impl TokenScores {
|
||
|
|
fn new(precision: f64, recall: f64) -> Self {
|
||
|
|
let f1 = if precision + recall == 0.0 {
|
||
|
|
0.0
|
||
|
|
} else {
|
||
|
|
2.0 * precision * recall / (precision + recall)
|
||
|
|
};
|
||
|
|
Self { precision, recall, f1 }
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Tokenize and normalize text for comparison.
|
||
|
|
/// Matches Python implementation: lowercase, normalize dashes, remove punctuation.
|
||
|
|
fn tokenize_text(text: &str) -> HashMap<String, usize> {
|
||
|
|
let normalized = text
|
||
|
|
.to_lowercase()
|
||
|
|
.replace(['\u{2013}', '\u{2014}'], "-")
|
||
|
|
.chars()
|
||
|
|
.map(|ch| {
|
||
|
|
if ch >= ' ' || ch == '\n' || ch == '\r' || ch == '\t' {
|
||
|
|
ch
|
||
|
|
} else {
|
||
|
|
' '
|
||
|
|
}
|
||
|
|
})
|
||
|
|
.collect::<String>();
|
||
|
|
|
||
|
|
let normalized = normalized
|
||
|
|
.chars()
|
||
|
|
.map(|ch| if "()[],.;:+`".contains(ch) { ' ' } else { ch })
|
||
|
|
.collect::<String>();
|
||
|
|
|
||
|
|
let mut tokens: HashMap<String, usize> = HashMap::new();
|
||
|
|
for token in normalized.split_whitespace() {
|
||
|
|
*tokens.entry(token.to_string()).or_insert(0) += 1;
|
||
|
|
}
|
||
|
|
tokens
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Extract numeric tokens from token map.
|
||
|
|
/// Critical for measuring accuracy on tables, data, figures.
|
||
|
|
fn extract_numeric_tokens(tokens: &HashMap<String, usize>) -> HashMap<String, usize> {
|
||
|
|
let mut numeric_tokens: HashMap<String, usize> = HashMap::new();
|
||
|
|
|
||
|
|
for (token, count) in tokens {
|
||
|
|
let stripped = token.trim_matches(|c: char| "()[]{}".contains(c));
|
||
|
|
|
||
|
|
if !stripped.chars().any(|ch| ch.is_ascii_digit()) {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
if stripped.chars().any(|ch| ch.is_ascii_alphabetic()) {
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
*numeric_tokens.entry(stripped.to_string()).or_insert(0) += count;
|
||
|
|
}
|
||
|
|
|
||
|
|
numeric_tokens
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Calculate precision, recall, F1 for token sets.
|
||
|
|
fn calculate_token_scores(
|
||
|
|
truth_tokens: &HashMap<String, usize>,
|
||
|
|
candidate_tokens: &HashMap<String, usize>,
|
||
|
|
) -> TokenScores {
|
||
|
|
let truth_total: usize = truth_tokens.values().sum();
|
||
|
|
let candidate_total: usize = candidate_tokens.values().sum();
|
||
|
|
|
||
|
|
if truth_total == 0 && candidate_total == 0 {
|
||
|
|
return TokenScores::new(1.0, 1.0);
|
||
|
|
}
|
||
|
|
|
||
|
|
let overlap: usize = truth_tokens
|
||
|
|
.keys()
|
||
|
|
.map(|token| {
|
||
|
|
let truth_count = truth_tokens.get(token).unwrap_or(&0);
|
||
|
|
let candidate_count = candidate_tokens.get(token).unwrap_or(&0);
|
||
|
|
truth_count.min(candidate_count)
|
||
|
|
})
|
||
|
|
.sum();
|
||
|
|
|
||
|
|
let precision = if candidate_total > 0 {
|
||
|
|
overlap as f64 / candidate_total as f64
|
||
|
|
} else {
|
||
|
|
0.0
|
||
|
|
};
|
||
|
|
|
||
|
|
let recall = if truth_total > 0 {
|
||
|
|
overlap as f64 / truth_total as f64
|
||
|
|
} else {
|
||
|
|
0.0
|
||
|
|
};
|
||
|
|
|
||
|
|
TokenScores::new(precision, recall)
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Count non-empty lines in text (layout fidelity metric).
|
||
|
|
fn count_lines(text: &str) -> usize {
|
||
|
|
text.lines().filter(|line| !line.trim().is_empty()).count()
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Calculate relative layout delta (0.0 = perfect, 1.0 = worst).
|
||
|
|
fn layout_delta(truth_lines: usize, ocr_lines: usize) -> f64 {
|
||
|
|
if truth_lines == 0 {
|
||
|
|
return if ocr_lines == 0 { 0.0 } else { 1.0 };
|
||
|
|
}
|
||
|
|
|
||
|
|
let delta = (ocr_lines as f64 - truth_lines as f64).abs() / truth_lines as f64;
|
||
|
|
delta.min(1.0)
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_ocr_quality_simple_text_high_accuracy() {
|
||
|
|
if skip_if_missing("pdfs/fake_memo.pdf") {
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let file_path = get_test_file_path("pdfs/fake_memo.pdf");
|
||
|
|
|
||
|
|
let truth_result =
|
||
|
|
extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
truth_result.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
truth_result.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
|
||
|
|
let ocr_config = ExtractionConfig {
|
||
|
|
ocr: Some(OcrConfig {
|
||
|
|
backend: "tesseract".to_string(),
|
||
|
|
language: "eng".to_string(),
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
force_ocr: true,
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
ocr_result.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
ocr_result.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
|
||
|
|
println!("Truth content length: {}", truth_result.content.len());
|
||
|
|
println!("OCR content length: {}", ocr_result.content.len());
|
||
|
|
println!(
|
||
|
|
"Truth first 100 chars: {:?}",
|
||
|
|
&truth_result.content.chars().take(100).collect::<String>()
|
||
|
|
);
|
||
|
|
println!(
|
||
|
|
"OCR first 100 chars: {:?}",
|
||
|
|
&ocr_result.content.chars().take(100).collect::<String>()
|
||
|
|
);
|
||
|
|
|
||
|
|
let truth_tokens = tokenize_text(&truth_result.content);
|
||
|
|
let ocr_tokens = tokenize_text(&ocr_result.content);
|
||
|
|
|
||
|
|
println!("Truth token count: {}", truth_tokens.len());
|
||
|
|
println!("OCR token count: {}", ocr_tokens.len());
|
||
|
|
|
||
|
|
let scores = calculate_token_scores(&truth_tokens, &ocr_tokens);
|
||
|
|
|
||
|
|
println!("Simple text OCR quality:");
|
||
|
|
println!(" Precision: {:.3}", scores.precision);
|
||
|
|
println!(" Recall: {:.3}", scores.recall);
|
||
|
|
println!(" F1: {:.3}", scores.f1);
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
scores.f1 >= 0.70,
|
||
|
|
"OCR F1 score too low: {:.3} (expected >= 0.70). Precision: {:.3}, Recall: {:.3}",
|
||
|
|
scores.f1,
|
||
|
|
scores.precision,
|
||
|
|
scores.recall
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_ocr_quality_numeric_accuracy() {
|
||
|
|
if skip_if_missing("pdfs/embedded_images_tables.pdf") {
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let file_path = get_test_file_path("pdfs/embedded_images_tables.pdf");
|
||
|
|
|
||
|
|
let truth_result =
|
||
|
|
extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
truth_result.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
truth_result.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
|
||
|
|
let ocr_config = ExtractionConfig {
|
||
|
|
ocr: Some(OcrConfig {
|
||
|
|
backend: "tesseract".to_string(),
|
||
|
|
language: "eng".to_string(),
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
force_ocr: true,
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
ocr_result.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
ocr_result.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
|
||
|
|
let truth_tokens = tokenize_text(&truth_result.content);
|
||
|
|
let ocr_tokens = tokenize_text(&ocr_result.content);
|
||
|
|
|
||
|
|
let truth_numeric = extract_numeric_tokens(&truth_tokens);
|
||
|
|
let ocr_numeric = extract_numeric_tokens(&ocr_tokens);
|
||
|
|
|
||
|
|
if !truth_numeric.is_empty() {
|
||
|
|
let numeric_scores = calculate_token_scores(&truth_numeric, &ocr_numeric);
|
||
|
|
|
||
|
|
println!("Numeric token OCR quality:");
|
||
|
|
println!(" Precision: {:.3}", numeric_scores.precision);
|
||
|
|
println!(" Recall: {:.3}", numeric_scores.recall);
|
||
|
|
println!(" F1: {:.3}", numeric_scores.f1);
|
||
|
|
println!(" Numeric tokens in truth: {}", truth_numeric.len());
|
||
|
|
println!(" Numeric tokens in OCR: {}", ocr_numeric.len());
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
numeric_scores.f1 >= 0.75,
|
||
|
|
"Numeric F1 score too low: {:.3} (expected >= 0.75). Numbers must be accurate!",
|
||
|
|
numeric_scores.f1
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_ocr_quality_layout_preservation() {
|
||
|
|
if skip_if_missing("pdfs/fake_memo.pdf") {
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let file_path = get_test_file_path("pdfs/fake_memo.pdf");
|
||
|
|
|
||
|
|
let truth_result =
|
||
|
|
extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
truth_result.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
truth_result.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
|
||
|
|
let ocr_config = ExtractionConfig {
|
||
|
|
ocr: Some(OcrConfig {
|
||
|
|
backend: "tesseract".to_string(),
|
||
|
|
language: "eng".to_string(),
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
force_ocr: true,
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
ocr_result.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
ocr_result.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
|
||
|
|
let truth_lines = count_lines(&truth_result.content);
|
||
|
|
let ocr_lines = count_lines(&ocr_result.content);
|
||
|
|
let delta = layout_delta(truth_lines, ocr_lines);
|
||
|
|
|
||
|
|
println!("Layout preservation:");
|
||
|
|
println!(" Truth lines: {}", truth_lines);
|
||
|
|
println!(" OCR lines: {}", ocr_lines);
|
||
|
|
println!(" Layout delta: {:.3}", delta);
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
delta <= 0.40,
|
||
|
|
"Layout delta too high: {:.3} (expected <= 0.40). Truth: {} lines, OCR: {} lines",
|
||
|
|
delta,
|
||
|
|
truth_lines,
|
||
|
|
ocr_lines
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_ocr_quality_technical_document() {
|
||
|
|
if skip_if_missing("pdfs/code_and_formula.pdf") {
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let file_path = get_test_file_path("pdfs/code_and_formula.pdf");
|
||
|
|
|
||
|
|
let truth_result =
|
||
|
|
extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
truth_result.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
truth_result.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
|
||
|
|
let ocr_config = ExtractionConfig {
|
||
|
|
ocr: Some(OcrConfig {
|
||
|
|
backend: "tesseract".to_string(),
|
||
|
|
language: "eng".to_string(),
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
force_ocr: true,
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
ocr_result.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
ocr_result.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
|
||
|
|
let truth_tokens = tokenize_text(&truth_result.content);
|
||
|
|
let ocr_tokens = tokenize_text(&ocr_result.content);
|
||
|
|
let scores = calculate_token_scores(&truth_tokens, &ocr_tokens);
|
||
|
|
|
||
|
|
println!("Technical document OCR quality:");
|
||
|
|
println!(" Precision: {:.3}", scores.precision);
|
||
|
|
println!(" Recall: {:.3}", scores.recall);
|
||
|
|
println!(" F1: {:.3}", scores.f1);
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
scores.f1 >= 0.60,
|
||
|
|
"Technical document F1 score too low: {:.3} (expected >= 0.60)",
|
||
|
|
scores.f1
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_ocr_consistency_across_runs() {
|
||
|
|
if skip_if_missing("pdfs/fake_memo.pdf") {
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let file_path = get_test_file_path("pdfs/fake_memo.pdf");
|
||
|
|
let ocr_config = ExtractionConfig {
|
||
|
|
ocr: Some(OcrConfig {
|
||
|
|
backend: "tesseract".to_string(),
|
||
|
|
language: "eng".to_string(),
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
force_ocr: true,
|
||
|
|
use_cache: false,
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
let result1 = extract_file_sync(&file_path, None, &ocr_config).expect("First OCR run should succeed");
|
||
|
|
let result2 = extract_file_sync(&file_path, None, &ocr_config).expect("Second OCR run should succeed");
|
||
|
|
let result3 = extract_file_sync(&file_path, None, &ocr_config).expect("Third OCR run should succeed");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
result1.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(result1.detected_languages.is_none(), "Language detection not enabled");
|
||
|
|
assert!(
|
||
|
|
result2.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(result2.detected_languages.is_none(), "Language detection not enabled");
|
||
|
|
assert!(
|
||
|
|
result3.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(result3.detected_languages.is_none(), "Language detection not enabled");
|
||
|
|
|
||
|
|
let tokens1 = tokenize_text(&result1.content);
|
||
|
|
let tokens2 = tokenize_text(&result2.content);
|
||
|
|
let tokens3 = tokenize_text(&result3.content);
|
||
|
|
|
||
|
|
let scores_1_2 = calculate_token_scores(&tokens1, &tokens2);
|
||
|
|
let scores_1_3 = calculate_token_scores(&tokens1, &tokens3);
|
||
|
|
|
||
|
|
println!("OCR consistency across runs:");
|
||
|
|
println!(" Run1 vs Run2 F1: {:.3}", scores_1_2.f1);
|
||
|
|
println!(" Run1 vs Run3 F1: {:.3}", scores_1_3.f1);
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
scores_1_2.f1 >= 0.98,
|
||
|
|
"OCR inconsistent between runs: F1 {:.3} (expected >= 0.98)",
|
||
|
|
scores_1_2.f1
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
scores_1_3.f1 >= 0.98,
|
||
|
|
"OCR inconsistent between runs: F1 {:.3} (expected >= 0.98)",
|
||
|
|
scores_1_3.f1
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_ocr_consistency_with_different_psm() {
|
||
|
|
if skip_if_missing("pdfs/fake_memo.pdf") {
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let file_path = get_test_file_path("pdfs/fake_memo.pdf");
|
||
|
|
|
||
|
|
let config_psm3 = ExtractionConfig {
|
||
|
|
ocr: Some(OcrConfig {
|
||
|
|
backend: "tesseract".to_string(),
|
||
|
|
language: "eng".to_string(),
|
||
|
|
tesseract_config: Some(kreuzberg::types::TesseractConfig {
|
||
|
|
psm: 3,
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
force_ocr: true,
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
let config_psm6 = ExtractionConfig {
|
||
|
|
ocr: Some(OcrConfig {
|
||
|
|
backend: "tesseract".to_string(),
|
||
|
|
language: "eng".to_string(),
|
||
|
|
tesseract_config: Some(kreuzberg::types::TesseractConfig {
|
||
|
|
psm: 6,
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
force_ocr: true,
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
let result_psm3 = extract_file_sync(&file_path, None, &config_psm3).expect("PSM 3 extraction should succeed");
|
||
|
|
let result_psm6 = extract_file_sync(&file_path, None, &config_psm6).expect("PSM 6 extraction should succeed");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
result_psm3.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
result_psm3.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
result_psm6.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
result_psm6.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
|
||
|
|
let tokens_psm3 = tokenize_text(&result_psm3.content);
|
||
|
|
let tokens_psm6 = tokenize_text(&result_psm6.content);
|
||
|
|
|
||
|
|
let scores = calculate_token_scores(&tokens_psm3, &tokens_psm6);
|
||
|
|
|
||
|
|
println!("OCR consistency across PSM modes:");
|
||
|
|
println!(" PSM 3 vs PSM 6 F1: {:.3}", scores.f1);
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
scores.f1 >= 0.85,
|
||
|
|
"PSM modes produce too different results: F1 {:.3} (expected >= 0.85)",
|
||
|
|
scores.f1
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_ocr_quality_multi_page_consistency() {
|
||
|
|
if skip_if_missing("pdfs/a_course_in_machine_learning_ciml_v0_9_all.pdf") {
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
if std::env::var_os("KREUZBERG_RUN_FULL_OCR").is_none() {
|
||
|
|
println!("Skipping test_ocr_quality_multi_page_consistency: set KREUZBERG_RUN_FULL_OCR=1 to enable");
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let file_path = get_test_file_path("pdfs/a_course_in_machine_learning_ciml_v0_9_all.pdf");
|
||
|
|
|
||
|
|
let truth_result =
|
||
|
|
extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
truth_result.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
truth_result.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
|
||
|
|
let ocr_config = ExtractionConfig {
|
||
|
|
ocr: Some(OcrConfig {
|
||
|
|
backend: "tesseract".to_string(),
|
||
|
|
language: "eng".to_string(),
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
force_ocr: true,
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
ocr_result.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
ocr_result.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
|
||
|
|
let truth_tokens = tokenize_text(&truth_result.content);
|
||
|
|
let ocr_tokens = tokenize_text(&ocr_result.content);
|
||
|
|
|
||
|
|
let truth_count: usize = truth_tokens.values().sum();
|
||
|
|
let ocr_count: usize = ocr_tokens.values().sum();
|
||
|
|
|
||
|
|
println!("Multi-page document quality:");
|
||
|
|
println!(" Truth token count: {}", truth_count);
|
||
|
|
println!(" OCR token count: {}", ocr_count);
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
ocr_count >= (truth_count * 50 / 100),
|
||
|
|
"OCR extracted too few tokens: {} (expected >= 50% of {})",
|
||
|
|
ocr_count,
|
||
|
|
truth_count
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[test]
|
||
|
|
fn test_ocr_quality_with_tables() {
|
||
|
|
if skip_if_missing("pdfs/embedded_images_tables.pdf") {
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
let file_path = get_test_file_path("pdfs/embedded_images_tables.pdf");
|
||
|
|
|
||
|
|
let ocr_config = ExtractionConfig {
|
||
|
|
ocr: Some(OcrConfig {
|
||
|
|
backend: "tesseract".to_string(),
|
||
|
|
language: "eng".to_string(),
|
||
|
|
tesseract_config: Some(kreuzberg::types::TesseractConfig {
|
||
|
|
enable_table_detection: true,
|
||
|
|
table_min_confidence: 0.5,
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
force_ocr: true,
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
let result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with table detection");
|
||
|
|
|
||
|
|
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
||
|
|
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
||
|
|
|
||
|
|
println!("Table extraction quality:");
|
||
|
|
println!(" Tables found: {}", result.tables.len());
|
||
|
|
println!(" Content length: {}", result.content.len());
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
!result.content.trim().is_empty(),
|
||
|
|
"OCR with tables should produce content"
|
||
|
|
);
|
||
|
|
}
|