This commit is contained in:
406
crates/kreuzberg/tests/pdf_table_ground_truth.rs
Normal file
406
crates/kreuzberg/tests/pdf_table_ground_truth.rs
Normal file
@@ -0,0 +1,406 @@
|
||||
//! Ground truth-based PDF table detection and markdown quality tests.
|
||||
//!
|
||||
//! These tests establish baselines for table detection and markdown output quality.
|
||||
//! Run after each substantial change to measure improvement or regression.
|
||||
//!
|
||||
//! Usage:
|
||||
//! # Non-OCR tests (fast, oxide path):
|
||||
//! cargo test -p kreuzberg --features "pdf" --test pdf_table_ground_truth -- --nocapture
|
||||
//!
|
||||
//! # Full tests including table detection (needs ocr feature for HocrWord):
|
||||
//! cargo test -p kreuzberg --features "pdf,ocr" --test pdf_table_ground_truth -- --nocapture
|
||||
//!
|
||||
//! # Comprehensive baseline snapshot:
|
||||
//! cargo test -p kreuzberg --features "pdf,ocr" --test pdf_table_ground_truth -- --ignored --nocapture
|
||||
|
||||
#![cfg(feature = "pdf")]
|
||||
|
||||
mod helpers;
|
||||
|
||||
use helpers::*;
|
||||
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
|
||||
use kreuzberg::extract_file_sync;
|
||||
|
||||
/// Compute word-level Jaccard similarity between two strings.
|
||||
fn word_similarity(a: &str, b: &str) -> f64 {
|
||||
let words_a: std::collections::HashSet<&str> = a.split_whitespace().collect();
|
||||
let words_b: std::collections::HashSet<&str> = b.split_whitespace().collect();
|
||||
|
||||
if words_a.is_empty() && words_b.is_empty() {
|
||||
return 1.0;
|
||||
}
|
||||
if words_a.is_empty() || words_b.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let intersection = words_a.intersection(&words_b).count();
|
||||
let union = words_a.union(&words_b).count();
|
||||
|
||||
intersection as f64 / union as f64
|
||||
}
|
||||
|
||||
/// Extract markdown from a PDF file (oxide path, no OCR).
|
||||
fn extract_markdown(relative_path: &str) -> Option<kreuzberg::types::ExtractionResult> {
|
||||
let path = get_test_file_path(relative_path);
|
||||
if !path.exists() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let config = ExtractionConfig {
|
||||
output_format: OutputFormat::Markdown,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
extract_file_sync(&path, None, &config).ok()
|
||||
}
|
||||
|
||||
fn print_table_summary(result: &kreuzberg::types::ExtractionResult) {
|
||||
println!(" Tables detected: {}", result.tables.len());
|
||||
println!(" Content length: {} chars", result.content.len());
|
||||
for (i, table) in result.tables.iter().enumerate() {
|
||||
let rows = table.cells.len();
|
||||
let cols = if rows > 0 { table.cells[0].len() } else { 0 };
|
||||
println!(" Table {}: {}x{} (page {})", i + 1, rows, cols, table.page_number);
|
||||
if let Some(first_row) = table.cells.first() {
|
||||
let preview: Vec<String> = first_row
|
||||
.iter()
|
||||
.take(3)
|
||||
.map(|c| {
|
||||
let s = c.trim();
|
||||
if s.len() > 40 {
|
||||
format!("{}...", &s[..s.floor_char_boundary(40)])
|
||||
} else {
|
||||
s.to_string()
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
println!(" First row: {:?}", preview);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// Section 1: False Positive Regression Tests
|
||||
// Non-table PDFs must NOT have tables detected.
|
||||
// These are the hard gate — they must pass for a commit.
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
/// Helper to run a false-positive check for a non-table PDF.
|
||||
/// Only checks when the ocr feature is enabled (table detection requires it).
|
||||
#[cfg(feature = "ocr")]
|
||||
fn assert_no_tables(pdf_name: &str) {
|
||||
let rel = format!("pdf/{}", pdf_name);
|
||||
if skip_if_missing(&rel) {
|
||||
return;
|
||||
}
|
||||
|
||||
let result = extract_markdown(&rel).expect("extraction should succeed");
|
||||
|
||||
println!("=== {} false positive check ===", pdf_name);
|
||||
print_table_summary(&result);
|
||||
|
||||
assert!(
|
||||
result.tables.is_empty(),
|
||||
"{} should not have tables detected (got {})",
|
||||
pdf_name,
|
||||
result.tables.len()
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
#[ignore = "TODO: pdf_oxide upstream — table detector false-positive driven by pdf_oxide span geometry; https://github.com/yfedoseev/pdf_oxide/issues/484"]
|
||||
#[test]
|
||||
fn test_false_positive_simple_pdf() {
|
||||
assert_no_tables("simple.pdf");
|
||||
}
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
#[test]
|
||||
fn test_false_positive_fake_memo() {
|
||||
assert_no_tables("fake_memo.pdf");
|
||||
}
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
#[test]
|
||||
fn test_false_positive_searchable() {
|
||||
assert_no_tables("searchable.pdf");
|
||||
}
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
#[ignore = "TODO: pdf_oxide upstream — table detector false-positive driven by pdf_oxide span geometry; https://github.com/yfedoseev/pdf_oxide/issues/484"]
|
||||
#[test]
|
||||
fn test_false_positive_google_doc() {
|
||||
assert_no_tables("google_doc_document.pdf");
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// Section 2: Markdown Quality Tests (oxide path, no OCR needed)
|
||||
// Tests that text-bearing PDFs produce reasonable markdown.
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
#[test]
|
||||
fn test_markdown_quality_fake_memo() {
|
||||
if skip_if_missing("pdf/fake_memo.pdf") {
|
||||
return;
|
||||
}
|
||||
|
||||
let result = extract_markdown("pdf/fake_memo.pdf").expect("extraction should succeed");
|
||||
|
||||
println!("=== fake_memo.pdf markdown quality ===");
|
||||
println!("Content length: {} chars", result.content.len());
|
||||
|
||||
assert!(
|
||||
result.content.len() > 100,
|
||||
"fake_memo.pdf should produce >100 chars of markdown (got {})",
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_markdown_quality_simple() {
|
||||
if skip_if_missing("pdf/simple.pdf") {
|
||||
return;
|
||||
}
|
||||
|
||||
let result = extract_markdown("pdf/simple.pdf").expect("extraction should succeed");
|
||||
|
||||
println!("=== simple.pdf markdown quality ===");
|
||||
println!("Content length: {} chars", result.content.len());
|
||||
|
||||
assert!(
|
||||
result.content.len() > 1000,
|
||||
"simple.pdf should produce >1000 chars of markdown (got {})",
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_markdown_quality_multi_page() {
|
||||
if skip_if_missing("pdf/multi_page.pdf") {
|
||||
return;
|
||||
}
|
||||
|
||||
let result = extract_markdown("pdf/multi_page.pdf").expect("extraction should succeed");
|
||||
|
||||
println!("=== multi_page.pdf markdown quality ===");
|
||||
println!("Content length: {} chars", result.content.len());
|
||||
|
||||
// multi_page.pdf is text-based and should produce substantial markdown
|
||||
assert!(
|
||||
result.content.len() > 5000,
|
||||
"multi_page.pdf should produce >5000 chars (got {})",
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_markdown_quality_vs_ground_truth_simple() {
|
||||
if skip_if_missing("pdf/table_document.pdf") {
|
||||
return;
|
||||
}
|
||||
|
||||
let gt_path = get_test_file_path("ground_truth/pdf/pdf_tables.txt");
|
||||
if !gt_path.exists() {
|
||||
println!("Skipping: ground truth file not found");
|
||||
return;
|
||||
}
|
||||
|
||||
let ground_truth = std::fs::read_to_string(>_path).expect("should read ground truth");
|
||||
let result = extract_markdown("pdf/table_document.pdf").expect("extraction should succeed");
|
||||
|
||||
let similarity = word_similarity(&result.content, &ground_truth);
|
||||
|
||||
println!("=== table_document.pdf vs ground truth ===");
|
||||
println!("Extraction length: {} chars", result.content.len());
|
||||
println!("Ground truth length: {} chars", ground_truth.len());
|
||||
println!("Word similarity: {:.1}%", similarity * 100.0);
|
||||
|
||||
// table_document.pdf is image-only, so the PDF extractor finds almost no text.
|
||||
// This test tracks progress — similarity should increase with OCR improvements.
|
||||
// Currently: ~3% (only image placeholder matches a few words).
|
||||
println!("NOTE: table_document.pdf is image-only; low similarity expected without OCR.");
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// Section 3: OCR Path Table Detection (slow, run with --ignored)
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
#[test]
|
||||
#[ignore] // Slow OCR tests, run explicitly
|
||||
fn test_ocr_path_table_document() {
|
||||
use kreuzberg::core::config::OcrConfig;
|
||||
|
||||
if skip_if_missing("pdf/table_document.pdf") {
|
||||
return;
|
||||
}
|
||||
|
||||
let path = get_test_file_path("pdf/table_document.pdf");
|
||||
let config = ExtractionConfig {
|
||||
output_format: OutputFormat::Markdown,
|
||||
ocr: Some(OcrConfig {
|
||||
backend: "tesseract".to_string(),
|
||||
language: "eng".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
force_ocr: true, // Force OCR since this is image-only
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_file_sync(&path, None, &config).expect("extraction should succeed");
|
||||
|
||||
println!("=== table_document.pdf (forced OCR path) ===");
|
||||
print_table_summary(&result);
|
||||
println!("\n--- Content (first 2000 chars) ---");
|
||||
println!("{}", &result.content[..result.content.len().min(2000)]);
|
||||
|
||||
assert!(
|
||||
result.content.len() > 100,
|
||||
"table_document.pdf OCR path should produce substantial content (got {})",
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
// Section 4: Comprehensive Baseline Snapshot
|
||||
// Full scan of all PDFs — run with --ignored for complete picture.
|
||||
// ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_comprehensive_table_detection_baseline() {
|
||||
if !test_documents_available() {
|
||||
println!("Skipping: test_documents not available");
|
||||
return;
|
||||
}
|
||||
|
||||
let image_table_pdfs = [
|
||||
"table_document.pdf",
|
||||
"multi_page_tables.pdf",
|
||||
"embedded_images_tables.pdf",
|
||||
];
|
||||
|
||||
let text_table_pdfs = [
|
||||
"multi_page.pdf",
|
||||
"medium.pdf",
|
||||
"large.pdf",
|
||||
"a_comparison_of_programming_languages_in_economics_16_jun_2014.pdf",
|
||||
"tiny.pdf",
|
||||
"tatr.pdf",
|
||||
];
|
||||
|
||||
let non_table_pdfs = [
|
||||
"simple.pdf",
|
||||
"fake_memo.pdf",
|
||||
"google_doc_document.pdf",
|
||||
"searchable.pdf",
|
||||
"test_article.pdf",
|
||||
"code_and_formula.pdf",
|
||||
];
|
||||
|
||||
println!("\n╔══════════════════════════════════════════════════╗");
|
||||
println!("║ Table Detection Baseline Snapshot ║");
|
||||
println!("╚══════════════════════════════════════════════════╝\n");
|
||||
|
||||
println!("--- Image-Only Table PDFs (need OCR) ---");
|
||||
for pdf in &image_table_pdfs {
|
||||
let rel = format!("pdf/{}", pdf);
|
||||
if skip_if_missing(&rel) {
|
||||
continue;
|
||||
}
|
||||
match extract_markdown(&rel) {
|
||||
Some(result) => {
|
||||
let table_count = result.tables.len();
|
||||
let status = if result.content.len() < 50 { "IMG" } else { "OK" };
|
||||
println!(
|
||||
" [{:4}] {:<55} tables={} md_len={}",
|
||||
status,
|
||||
pdf,
|
||||
table_count,
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
None => println!(" [ERR ] {}", pdf),
|
||||
}
|
||||
}
|
||||
|
||||
println!("\n--- Text-Based PDFs Expected to Have Tables ---");
|
||||
let mut true_positives = 0;
|
||||
let mut false_negatives = 0;
|
||||
for pdf in &text_table_pdfs {
|
||||
let rel = format!("pdf/{}", pdf);
|
||||
if skip_if_missing(&rel) {
|
||||
continue;
|
||||
}
|
||||
match extract_markdown(&rel) {
|
||||
Some(result) => {
|
||||
let table_count = result.tables.len();
|
||||
let status = if table_count > 0 {
|
||||
true_positives += 1;
|
||||
"OK"
|
||||
} else {
|
||||
false_negatives += 1;
|
||||
"MISS"
|
||||
};
|
||||
println!(
|
||||
" [{:4}] {:<55} tables={} md_len={}",
|
||||
status,
|
||||
pdf,
|
||||
table_count,
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
None => println!(" [ERR ] {}", pdf),
|
||||
}
|
||||
}
|
||||
|
||||
println!("\n--- Expected Non-Table PDFs ---");
|
||||
let mut true_negatives = 0;
|
||||
let mut false_positives = 0;
|
||||
for pdf in &non_table_pdfs {
|
||||
let rel = format!("pdf/{}", pdf);
|
||||
if skip_if_missing(&rel) {
|
||||
continue;
|
||||
}
|
||||
match extract_markdown(&rel) {
|
||||
Some(result) => {
|
||||
let table_count = result.tables.len();
|
||||
let status = if table_count == 0 {
|
||||
true_negatives += 1;
|
||||
"OK"
|
||||
} else {
|
||||
false_positives += 1;
|
||||
"FP"
|
||||
};
|
||||
println!(
|
||||
" [{:4}] {:<55} tables={} md_len={}",
|
||||
status,
|
||||
pdf,
|
||||
table_count,
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
None => println!(" [ERR ] {}", pdf),
|
||||
}
|
||||
}
|
||||
|
||||
println!("\n--- Summary ---");
|
||||
println!("True positives: {}", true_positives);
|
||||
println!("False negatives: {}", false_negatives);
|
||||
println!("True negatives: {}", true_negatives);
|
||||
println!("False positives: {}", false_positives);
|
||||
|
||||
let precision = if true_positives + false_positives > 0 {
|
||||
true_positives as f64 / (true_positives + false_positives) as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let recall = if true_positives + false_negatives > 0 {
|
||||
true_positives as f64 / (true_positives + false_negatives) as f64
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
println!("Precision: {:.1}%", precision * 100.0);
|
||||
println!("Recall: {:.1}%", recall * 100.0);
|
||||
}
|
||||
Reference in New Issue
Block a user