This commit is contained in:
632
crates/kreuzberg/tests/ocr_quality.rs
Normal file
632
crates/kreuzberg/tests/ocr_quality.rs
Normal file
@@ -0,0 +1,632 @@
|
||||
//! OCR quality assessment tests.
|
||||
//!
|
||||
//! This module tests OCR quality by comparing against ground truth (native PDF text layers).
|
||||
//! Adopts techniques from scripts/ocr_quality_report.py:
|
||||
//! - Token-based precision/recall/F1 scoring
|
||||
//! - Numeric accuracy tracking (critical for tables, data)
|
||||
//! - Layout fidelity (line count preservation)
|
||||
//! - Markdown structure preservation
|
||||
//!
|
||||
//! Test philosophy:
|
||||
//! - Compare OCR output against searchable PDF text (ground truth)
|
||||
//! - Measure accuracy with precision, recall, F1 metrics
|
||||
//! - Track numeric token accuracy separately (higher importance)
|
||||
//! - Verify layout preservation (line counts, structure)
|
||||
//! - Assert minimum quality thresholds
|
||||
|
||||
#![cfg(all(feature = "ocr", feature = "pdf"))]
|
||||
|
||||
mod helpers;
|
||||
|
||||
use helpers::*;
|
||||
use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
|
||||
use kreuzberg::extract_file_sync;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Precision, recall and F1 produced by a token-level comparison.
#[derive(Debug, Clone)]
struct TokenScores {
    /// Precision value supplied to [`TokenScores::new`].
    precision: f64,
    /// Recall value supplied to [`TokenScores::new`].
    recall: f64,
    /// Harmonic mean of `precision` and `recall` (0.0 when they sum to zero).
    f1: f64,
}

impl TokenScores {
    /// Stores `precision` and `recall` and derives `f1` from them.
    ///
    /// `f1` is `2 * precision * recall / (precision + recall)`, or `0.0` when
    /// the denominator is exactly zero so the division is never `0 / 0`.
    fn new(precision: f64, recall: f64) -> Self {
        let denominator = precision + recall;
        let f1 = if denominator != 0.0 {
            2.0 * precision * recall / denominator
        } else {
            0.0
        };

        Self { precision, recall, f1 }
    }
}
|
||||
|
||||
/// Normalizes `text` and counts how often each whitespace-separated token occurs.
///
/// Normalization steps (applied per character after lowercasing):
/// - en dash (U+2013) and em dash (U+2014) become `-`
/// - control characters below U+0020, other than `\n`, `\r` and `\t`, become a space
/// - the punctuation characters ``( ) [ ] , . ; : + ` `` become a space
///
/// Returns a map from token to its number of occurrences.
fn tokenize_text(text: &str) -> HashMap<String, usize> {
    let cleaned: String = text
        .to_lowercase()
        .chars()
        .map(|ch| match ch {
            '\u{2013}' | '\u{2014}' => '-',
            // Line breaks and tabs are kept; they act as separators when splitting below.
            '\n' | '\r' | '\t' => ch,
            control if control < ' ' => ' ',
            '(' | ')' | '[' | ']' | ',' | '.' | ';' | ':' | '+' | '`' => ' ',
            other => other,
        })
        .collect();

    cleaned
        .split_whitespace()
        .fold(HashMap::new(), |mut counts, word| {
            *counts.entry(word.to_string()).or_insert(0) += 1;
            counts
        })
}
|
||||
|
||||
/// Selects the numeric entries of a token-count map.
///
/// Each token first has leading/trailing `()[]{}` characters trimmed. It is
/// kept when the trimmed form contains at least one ASCII digit and no ASCII
/// letter. Counts of tokens that trim to the same text are added together.
fn extract_numeric_tokens(tokens: &HashMap<String, usize>) -> HashMap<String, usize> {
    let is_bracket = |c: char| matches!(c, '(' | ')' | '[' | ']' | '{' | '}');

    tokens
        .iter()
        .map(|(raw, occurrences)| (raw.trim_matches(is_bracket), *occurrences))
        .filter(|(core, _)| {
            let has_digit = core.chars().any(|c| c.is_ascii_digit());
            has_digit && !core.chars().any(|c| c.is_ascii_alphabetic())
        })
        .fold(HashMap::new(), |mut numeric, (core, occurrences)| {
            *numeric.entry(core.to_string()).or_insert(0) += occurrences;
            numeric
        })
}
|
||||
|
||||
/// Calculate precision, recall, F1 for token sets.
|
||||
fn calculate_token_scores(
|
||||
truth_tokens: &HashMap<String, usize>,
|
||||
candidate_tokens: &HashMap<String, usize>,
|
||||
) -> TokenScores {
|
||||
let truth_total: usize = truth_tokens.values().sum();
|
||||
let candidate_total: usize = candidate_tokens.values().sum();
|
||||
|
||||
if truth_total == 0 && candidate_total == 0 {
|
||||
return TokenScores::new(1.0, 1.0);
|
||||
}
|
||||
|
||||
let overlap: usize = truth_tokens
|
||||
.keys()
|
||||
.map(|token| {
|
||||
let truth_count = truth_tokens.get(token).unwrap_or(&0);
|
||||
let candidate_count = candidate_tokens.get(token).unwrap_or(&0);
|
||||
truth_count.min(candidate_count)
|
||||
})
|
||||
.sum();
|
||||
|
||||
let precision = if candidate_total > 0 {
|
||||
overlap as f64 / candidate_total as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let recall = if truth_total > 0 {
|
||||
overlap as f64 / truth_total as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
TokenScores::new(precision, recall)
|
||||
}
|
||||
|
||||
/// Returns how many lines of `text` contain at least one non-whitespace
/// character (used as the layout fidelity metric).
fn count_lines(text: &str) -> usize {
    text.lines()
        .filter(|line| line.chars().any(|c| !c.is_whitespace()))
        .count()
}
|
||||
|
||||
/// Relative difference between two line counts, capped at 1.0.
///
/// Returns `|ocr_lines - truth_lines| / truth_lines`, limited to at most 1.0
/// (0.0 = identical counts). With no truth lines the result is 0.0 when the
/// OCR output is also empty and 1.0 otherwise.
fn layout_delta(truth_lines: usize, ocr_lines: usize) -> f64 {
    match (truth_lines, ocr_lines) {
        (0, 0) => 0.0,
        (0, _) => 1.0,
        (expected, actual) => {
            let expected = expected as f64;
            let relative = (actual as f64 - expected).abs() / expected;
            relative.min(1.0)
        }
    }
}
|
||||
|
||||
/// Forced Tesseract OCR of `pdfs/fake_memo.pdf` must reach a token-level F1
/// of at least 0.70 against the default extraction of the same file, which is
/// treated as ground truth.
///
/// Returns early (and passes) when `skip_if_missing` reports that the fixture
/// should be skipped.
#[test]
fn test_ocr_quality_simple_text_high_accuracy() {
    if skip_if_missing("pdfs/fake_memo.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/fake_memo.pdf");

    let truth_result =
        extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");

    assert!(
        truth_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        truth_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let ocr_config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");

    assert!(
        ocr_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        ocr_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Diagnostics shown on failure (or with `--nocapture`).
    println!("Truth content length: {}", truth_result.content.len());
    println!("OCR content length: {}", ocr_result.content.len());
    println!(
        "Truth first 100 chars: {:?}",
        &truth_result.content.chars().take(100).collect::<String>()
    );
    println!(
        "OCR first 100 chars: {:?}",
        &ocr_result.content.chars().take(100).collect::<String>()
    );

    let truth_tokens = tokenize_text(&truth_result.content);
    let ocr_tokens = tokenize_text(&ocr_result.content);

    // Report total occurrences, not the number of distinct map keys: the score
    // below weights every occurrence, and the multi-page test prints the same
    // label using summed counts.
    println!("Truth token count: {}", truth_tokens.values().sum::<usize>());
    println!("OCR token count: {}", ocr_tokens.values().sum::<usize>());

    let scores = calculate_token_scores(&truth_tokens, &ocr_tokens);

    println!("Simple text OCR quality:");
    println!("  Precision: {:.3}", scores.precision);
    println!("  Recall: {:.3}", scores.recall);
    println!("  F1: {:.3}", scores.f1);

    assert!(
        scores.f1 >= 0.70,
        "OCR F1 score too low: {:.3} (expected >= 0.70). Precision: {:.3}, Recall: {:.3}",
        scores.f1,
        scores.precision,
        scores.recall
    );
}
|
||||
|
||||
/// Forced Tesseract OCR of `pdfs/embedded_images_tables.pdf` must reach an F1
/// of at least 0.75 on numeric tokens against the default extraction of the
/// same file, which is treated as ground truth.
///
/// Returns early (and passes) when `skip_if_missing` reports that the fixture
/// should be skipped. If the ground truth contains no numeric tokens there is
/// nothing to score; the test still passes but says so in its output.
#[test]
fn test_ocr_quality_numeric_accuracy() {
    if skip_if_missing("pdfs/embedded_images_tables.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/embedded_images_tables.pdf");

    let truth_result =
        extract_file_sync(&file_path, None, &ExtractionConfig::default()).expect("Should extract ground truth text");

    assert!(
        truth_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        truth_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let ocr_config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let ocr_result = extract_file_sync(&file_path, None, &ocr_config).expect("Should extract with OCR");

    assert!(
        ocr_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        ocr_result.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let truth_tokens = tokenize_text(&truth_result.content);
    let ocr_tokens = tokenize_text(&ocr_result.content);

    let truth_numeric = extract_numeric_tokens(&truth_tokens);
    let ocr_numeric = extract_numeric_tokens(&ocr_tokens);

    if truth_numeric.is_empty() {
        // Make the vacuous pass visible instead of succeeding silently.
        println!("Skipping numeric accuracy check: ground truth contains no numeric tokens");
        return;
    }

    let numeric_scores = calculate_token_scores(&truth_numeric, &ocr_numeric);

    println!("Numeric token OCR quality:");
    println!("  Precision: {:.3}", numeric_scores.precision);
    println!("  Recall: {:.3}", numeric_scores.recall);
    println!("  F1: {:.3}", numeric_scores.f1);
    println!("  Numeric tokens in truth: {}", truth_numeric.len());
    println!("  Numeric tokens in OCR: {}", ocr_numeric.len());

    assert!(
        numeric_scores.f1 >= 0.75,
        "Numeric F1 score too low: {:.3} (expected >= 0.75). Numbers must be accurate!",
        numeric_scores.f1
    );
}
|
||||
|
||||
/// The non-empty line count of forced Tesseract OCR output for
/// `pdfs/fake_memo.pdf` must stay within a relative delta of 0.40 of the
/// line count from the default extraction of the same file.
///
/// Returns early (and passes) when `skip_if_missing` reports that the fixture
/// should be skipped.
#[test]
fn test_ocr_quality_layout_preservation() {
    if skip_if_missing("pdfs/fake_memo.pdf") {
        return;
    }

    let pdf_path = get_test_file_path("pdfs/fake_memo.pdf");

    // Reference extraction with the default configuration.
    let reference = extract_file_sync(&pdf_path, None, &ExtractionConfig::default())
        .expect("Should extract ground truth text");

    assert!(
        reference.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        reference.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Second pass over the same file with OCR forced on.
    let tesseract_eng = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let forced_ocr = ExtractionConfig {
        ocr: Some(tesseract_eng),
        force_ocr: true,
        ..Default::default()
    };

    let recognized = extract_file_sync(&pdf_path, None, &forced_ocr).expect("Should extract with OCR");

    assert!(
        recognized.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        recognized.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let expected_lines = count_lines(&reference.content);
    let actual_lines = count_lines(&recognized.content);
    let drift = layout_delta(expected_lines, actual_lines);

    println!("Layout preservation:");
    println!("  Truth lines: {}", expected_lines);
    println!("  OCR lines: {}", actual_lines);
    println!("  Layout delta: {:.3}", drift);

    assert!(
        drift <= 0.40,
        "Layout delta too high: {:.3} (expected <= 0.40). Truth: {} lines, OCR: {} lines",
        drift,
        expected_lines,
        actual_lines
    );
}
|
||||
|
||||
/// Forced Tesseract OCR of `pdfs/code_and_formula.pdf` must reach a
/// token-level F1 of at least 0.60 against the default extraction of the
/// same file, which is treated as ground truth.
///
/// Returns early (and passes) when `skip_if_missing` reports that the fixture
/// should be skipped.
#[test]
fn test_ocr_quality_technical_document() {
    if skip_if_missing("pdfs/code_and_formula.pdf") {
        return;
    }

    let pdf_path = get_test_file_path("pdfs/code_and_formula.pdf");

    // Reference extraction with the default configuration.
    let reference = extract_file_sync(&pdf_path, None, &ExtractionConfig::default())
        .expect("Should extract ground truth text");

    assert!(
        reference.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        reference.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Second pass over the same file with OCR forced on.
    let tesseract_eng = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let forced_ocr = ExtractionConfig {
        ocr: Some(tesseract_eng),
        force_ocr: true,
        ..Default::default()
    };

    let recognized = extract_file_sync(&pdf_path, None, &forced_ocr).expect("Should extract with OCR");

    assert!(
        recognized.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        recognized.detected_languages.is_none(),
        "Language detection not enabled"
    );

    let scores = {
        let expected = tokenize_text(&reference.content);
        let observed = tokenize_text(&recognized.content);
        calculate_token_scores(&expected, &observed)
    };

    println!("Technical document OCR quality:");
    println!("  Precision: {:.3}", scores.precision);
    println!("  Recall: {:.3}", scores.recall);
    println!("  F1: {:.3}", scores.f1);

    assert!(
        scores.f1 >= 0.60,
        "Technical document F1 score too low: {:.3} (expected >= 0.60)",
        scores.f1
    );
}
|
||||
|
||||
/// Runs forced Tesseract OCR on `pdfs/fake_memo.pdf` three times with
/// `use_cache: false` and requires the first run to agree with each later
/// run at a token-level F1 of at least 0.98.
///
/// Returns early (and passes) when `skip_if_missing` reports that the fixture
/// should be skipped.
#[test]
fn test_ocr_consistency_across_runs() {
    if skip_if_missing("pdfs/fake_memo.pdf") {
        return;
    }

    let pdf_path = get_test_file_path("pdfs/fake_memo.pdf");

    let tesseract_eng = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let uncached_ocr = ExtractionConfig {
        ocr: Some(tesseract_eng),
        force_ocr: true,
        use_cache: false,
        ..Default::default()
    };

    let first = extract_file_sync(&pdf_path, None, &uncached_ocr).expect("First OCR run should succeed");
    let second = extract_file_sync(&pdf_path, None, &uncached_ocr).expect("Second OCR run should succeed");
    let third = extract_file_sync(&pdf_path, None, &uncached_ocr).expect("Third OCR run should succeed");

    // Same sanity checks for every run, in run order.
    for run in [&first, &second, &third] {
        assert!(run.chunks.is_none(), "Chunks should be None without chunking config");
        assert!(run.detected_languages.is_none(), "Language detection not enabled");
    }

    let baseline = tokenize_text(&first.content);
    let second_tokens = tokenize_text(&second.content);
    let third_tokens = tokenize_text(&third.content);

    let first_vs_second = calculate_token_scores(&baseline, &second_tokens);
    let first_vs_third = calculate_token_scores(&baseline, &third_tokens);

    println!("OCR consistency across runs:");
    println!("  Run1 vs Run2 F1: {:.3}", first_vs_second.f1);
    println!("  Run1 vs Run3 F1: {:.3}", first_vs_third.f1);

    for comparison in [&first_vs_second, &first_vs_third] {
        assert!(
            comparison.f1 >= 0.98,
            "OCR inconsistent between runs: F1 {:.3} (expected >= 0.98)",
            comparison.f1
        );
    }
}
|
||||
|
||||
/// Runs forced Tesseract OCR on `pdfs/fake_memo.pdf` with `psm: 3` and with
/// `psm: 6`, and requires the two outputs to agree at a token-level F1 of at
/// least 0.85.
///
/// Returns early (and passes) when `skip_if_missing` reports that the fixture
/// should be skipped.
#[test]
fn test_ocr_consistency_with_different_psm() {
    if skip_if_missing("pdfs/fake_memo.pdf") {
        return;
    }

    let pdf_path = get_test_file_path("pdfs/fake_memo.pdf");

    // The two configurations differ only in the `psm` value.
    let config_with_psm = |psm| ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            tesseract_config: Some(kreuzberg::types::TesseractConfig {
                psm,
                ..Default::default()
            }),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let psm3_config = config_with_psm(3);
    let psm6_config = config_with_psm(6);

    let psm3_output = extract_file_sync(&pdf_path, None, &psm3_config).expect("PSM 3 extraction should succeed");
    let psm6_output = extract_file_sync(&pdf_path, None, &psm6_config).expect("PSM 6 extraction should succeed");

    // Same sanity checks for both outputs, PSM 3 first.
    for output in [&psm3_output, &psm6_output] {
        assert!(
            output.chunks.is_none(),
            "Chunks should be None without chunking config"
        );
        assert!(
            output.detected_languages.is_none(),
            "Language detection not enabled"
        );
    }

    let scores = {
        let psm3_tokens = tokenize_text(&psm3_output.content);
        let psm6_tokens = tokenize_text(&psm6_output.content);
        calculate_token_scores(&psm3_tokens, &psm6_tokens)
    };

    println!("OCR consistency across PSM modes:");
    println!("  PSM 3 vs PSM 6 F1: {:.3}", scores.f1);

    assert!(
        scores.f1 >= 0.85,
        "PSM modes produce too different results: F1 {:.3} (expected >= 0.85)",
        scores.f1
    );
}
|
||||
|
||||
/// Forced Tesseract OCR of the multi-page machine-learning course PDF must
/// yield at least 50% as many tokens as the default extraction of the same file.
///
/// Opt-in: after the fixture check, the test returns early unless the
/// `KREUZBERG_RUN_FULL_OCR` environment variable is set.
#[test]
fn test_ocr_quality_multi_page_consistency() {
    if skip_if_missing("pdfs/a_course_in_machine_learning_ciml_v0_9_all.pdf") {
        return;
    }

    let opted_in = std::env::var_os("KREUZBERG_RUN_FULL_OCR").is_some();
    if !opted_in {
        println!("Skipping test_ocr_quality_multi_page_consistency: set KREUZBERG_RUN_FULL_OCR=1 to enable");
        return;
    }

    let pdf_path = get_test_file_path("pdfs/a_course_in_machine_learning_ciml_v0_9_all.pdf");

    // Reference extraction with the default configuration.
    let reference = extract_file_sync(&pdf_path, None, &ExtractionConfig::default())
        .expect("Should extract ground truth text");

    assert!(
        reference.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        reference.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Second pass over the same file with OCR forced on.
    let tesseract_eng = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let forced_ocr = ExtractionConfig {
        ocr: Some(tesseract_eng),
        force_ocr: true,
        ..Default::default()
    };

    let recognized = extract_file_sync(&pdf_path, None, &forced_ocr).expect("Should extract with OCR");

    assert!(
        recognized.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        recognized.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Total occurrences (not distinct tokens) on each side.
    let truth_count = tokenize_text(&reference.content).values().sum::<usize>();
    let ocr_count = tokenize_text(&recognized.content).values().sum::<usize>();

    println!("Multi-page document quality:");
    println!("  Truth token count: {}", truth_count);
    println!("  OCR token count: {}", ocr_count);

    let minimum_tokens = truth_count * 50 / 100;
    assert!(
        ocr_count >= minimum_tokens,
        "OCR extracted too few tokens: {} (expected >= 50% of {})",
        ocr_count,
        truth_count
    );
}
|
||||
|
||||
/// Forced Tesseract OCR of `pdfs/embedded_images_tables.pdf` with
/// `enable_table_detection: true` and `table_min_confidence: 0.5` must
/// produce non-blank content. The number of tables found is printed but not
/// asserted.
///
/// Returns early (and passes) when `skip_if_missing` reports that the fixture
/// should be skipped.
#[test]
fn test_ocr_quality_with_tables() {
    if skip_if_missing("pdfs/embedded_images_tables.pdf") {
        return;
    }

    let pdf_path = get_test_file_path("pdfs/embedded_images_tables.pdf");

    let table_aware_tesseract = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(kreuzberg::types::TesseractConfig {
            enable_table_detection: true,
            table_min_confidence: 0.5,
            ..Default::default()
        }),
        ..Default::default()
    };
    let forced_ocr = ExtractionConfig {
        ocr: Some(table_aware_tesseract),
        force_ocr: true,
        ..Default::default()
    };

    let extraction = extract_file_sync(&pdf_path, None, &forced_ocr).expect("Should extract with table detection");

    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );

    println!("Table extraction quality:");
    println!("  Tables found: {}", extraction.tables.len());
    println!("  Content length: {}", extraction.content.len());

    let has_text = !extraction.content.trim().is_empty();
    assert!(has_text, "OCR with tables should produce content");
}
|
||||
Reference in New Issue
Block a user