//! PDF table detection comprehensive test.
//!
//! This test file analyzes table detection across the PDF test documents
//! to understand the current state of table detection and identify false positives.
//!
//! Run with:
//!
//! ```text
//! cargo test --features pdf,ocr --test pdf_table_detection -- --ignored --nocapture 2>&1 | head -1000
//! ```
//!
//! This will extract tables from the selected PDFs and log:
//! - Filename
//! - Number of tables detected
//! - Dimensions of each table (rows x cols)
//! - A preview of the leading cells of each table (to verify legitimacy)

#![cfg(feature = "pdf")]

mod helpers;

use std::fmt::Write as _;

use helpers::*;
use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
use kreuzberg::extract_file_sync;

/// All PDF filenames in test_documents/pdf/.
///
/// The tests visible in this file work on small hand-picked subsets and do not
/// reference this list, which is why the `dead_code` lint is silenced here.
#[allow(dead_code)]
const ALL_PDFS: &[&str] = &[
    "100_g_networking_technology_overview_slides_toronto_august_2016.pdf",
    "5_level_paging_and_5_level_ept_intel_revision_1_1_may_2017.pdf",
    "a_brief_introduction_to_neural_networks_neuronalenetze_en_zeta2_2col_dkrieselcom.pdf",
    "a_brief_introduction_to_the_standard_annotation_language_sal_2006.pdf",
    "a_catalogue_of_optimizing_transformations_1971_allen_catalog.pdf",
    "a_comparison_of_programming_languages_in_economics_16_jun_2014.pdf",
    "a_comprehensive_study_of_convergent_and_commutative_replicated_data_types.pdf",
    "a_comprehensive_study_of_main_memory_partitioning_and_its_application_to_large_scale_comparison_and_radix_sort_sigmod14_i.pdf",
    "a_course_in_machine_learning_ciml_v0_9_all.pdf",
    "algebra_topology_differential_calculus_and_optimization_theory_for_computer_science_and_machine_learning_2019_math_deep.pdf",
    "an_introduction_to_statistical_learning_with_applications_in_r_islr_sixth_printing.pdf",
    "assembly_language_for_beginners_al4_b_en.pdf",
    "bayesian_data_analysis_third_edition_13th_feb_2020.pdf",
    "code_and_formula.pdf",
    "copy_protected.pdf",
    "embedded_images_tables.pdf",
    "fake_memo.pdf",
    "fundamentals_of_deep_learning_2014.pdf",
    "google_doc_document.pdf",
    "image_only_german_pdf.pdf",
    "intel_64_and_ia_32_architectures_software_developer_s_manual_combined_volumes_1_4_june_2021_325462_sdm_vol_1_2abcd_3abcd.pdf",
    "large.pdf",
    "medium.pdf",
    "multi_page_tables.pdf",
    "multi_page.pdf",
    "non_ascii_text.pdf",
    "non_searchable.pdf",
    "ocr_test_rotated_180.pdf",
    "ocr_test_rotated_270.pdf",
    "ocr_test_rotated_90.pdf",
    "ocr_test.pdf",
    "password_protected.pdf",
    "perfect_hash_functions_slides.pdf",
    "program_design_in_the_unix_environment.pdf",
    "proof_of_concept_or_gtfo_v13_october_18th_2016.pdf",
    "right_to_left_01.pdf",
    "sample_contract.pdf",
    "scanned.pdf",
    "searchable.pdf",
    "sharable_web_guide.pdf",
    "simple.pdf",
    "table_document.pdf",
    "tatr.pdf",
    "test_article.pdf",
    "the_hideous_name_1985_pike85hideous.pdf",
    "tiny.pdf",
    "with_images.pdf",
    "xerox_alta_link_series_mfp_sag_en_us_2.pdf",
];

/// Format cell content for display, truncating long text.
///
/// Cells longer than 50 bytes are cut at the nearest UTF-8 character boundary
/// at or before byte 50 and suffixed with `...`; shorter cells are returned
/// unchanged.
fn format_cell(cell: &str) -> String {
    const MAX_LEN: usize = 50;

    if cell.len() <= MAX_LEN {
        return cell.to_string();
    }

    // `str::is_char_boundary` is used rather than `str::floor_char_boundary`
    // so the helper also builds on toolchains that predate the latter's
    // stabilization. Index 0 is always a boundary, so the search cannot fail.
    let cut = (0..=MAX_LEN)
        .rev()
        .find(|&idx| cell.is_char_boundary(idx))
        .unwrap_or(0);

    format!("{}...", &cell[..cut])
}

/// Runs extraction over documents that are expected to contain no tables and
/// reports every table that is detected anyway.
///
/// Diagnostic only: results are printed rather than asserted, so the test is
/// `#[ignore]`d and meant to be run with `--ignored --nocapture`.
#[test]
#[ignore]
fn test_table_detection_false_positives() {
    const TOP: &str = "╔════════════════════════════════════════════════════════════════╗";
    const BOTTOM: &str = "╚════════════════════════════════════════════════════════════════╝";
    // Maximum number of rows and of columns shown per offending table.
    const PREVIEW_LIMIT: usize = 2;

    if !test_documents_available() {
        println!("Skipping: test_documents not available");
        return;
    }

    let non_table_pdfs = [
        "simple.pdf",
        "fake_memo.pdf",
        "google_doc_document.pdf",
        "searchable.pdf",
    ];

    // Titles are centred within the border's inner width so the right-hand
    // edge of the box lines up with the top and bottom rules.
    let inner_width = TOP.chars().count() - 2;
    let boxed = |title: &str| println!("║{title:^inner_width$}║");

    println!("\n");
    println!("{TOP}");
    boxed("False Positive Analysis - Non-Table Documents");
    boxed("These documents should NOT have tables detected");
    println!("{BOTTOM}");
    println!();

    // The configuration does not depend on the document, so build it once.
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: false,
        ..Default::default()
    };

    let mut correct_negatives = 0usize;
    // Documents with at least one detected table, and the tables themselves,
    // are counted separately so the summary can report both accurately.
    let mut false_positive_docs = 0usize;
    let mut false_positive_tables = 0usize;
    let mut errors = 0usize;

    for filename in non_table_pdfs {
        let path = get_test_file_path(&format!("pdf/{filename}"));

        if !path.exists() {
            println!("[SKIP] {} - file not found", filename);
            continue;
        }

        match extract_file_sync(&path, None, &config) {
            Ok(result) if result.tables.is_empty() => {
                println!(" [CORRECT] {} - no tables detected", filename);
                correct_negatives += 1;
            }
            Ok(result) => {
                println!(
                    " [FALSE POSITIVE] {} - detected {} tables (should have none)",
                    filename,
                    result.tables.len()
                );
                false_positive_docs += 1;
                false_positive_tables += result.tables.len();

                for (idx, table) in result.tables.iter().enumerate() {
                    let rows = table.cells.len();
                    // Column count is taken from the first row; later rows may
                    // be shorter, which the preview below tolerates.
                    let cols = table.cells.first().map_or(0, |row| row.len());
                    println!(" Table {}: {} rows × {} cols", idx + 1, rows, cols);

                    if cols == 0 {
                        continue;
                    }

                    let preview_cols = cols.min(PREVIEW_LIMIT);
                    for row in table.cells.iter().take(PREVIEW_LIMIT) {
                        let mut row_str = String::from(" | ");
                        for c in 0..preview_cols {
                            let cell_content = row.get(c).map_or("", |s| s.as_str());
                            // Writing into a `String` cannot fail.
                            let _ = write!(row_str, "{} | ", format_cell(cell_content));
                        }
                        if preview_cols < cols {
                            row_str.push_str("... |");
                        }
                        println!("{}", row_str);
                    }
                }
            }
            Err(e) => {
                println!(" [ERROR] {}: {}", filename, e);
                errors += 1;
            }
        }
    }

    println!();
    println!("{TOP}");
    boxed("False Positive Summary");
    println!("{BOTTOM}");
    println!();
    println!("Correct negatives (no tables): {}", correct_negatives);
    println!("False positives (detected): {}", false_positive_docs);
    println!("Extraction errors: {}", errors);
    if false_positive_docs > 0 {
        println!();
        println!(
            "WARNING: Detected {} false-positive tables in {} documents!",
            false_positive_tables, false_positive_docs
        );
        println!("These should be investigated to improve detection accuracy.");
    }
    println!();
}

/// Focused test on specific PDFs known to have tables.
///
/// For each document it prints the number of detected tables and, per table,
/// its page, dimensions, cell count, a preview of up to 10×10 cells and the
/// generated Markdown. Diagnostic only: nothing is asserted, so the test is
/// `#[ignore]`d and meant to be run with `--ignored --nocapture`.
#[test]
#[ignore]
fn test_table_detection_focus_on_table_documents() {
    const TOP: &str = "╔════════════════════════════════════════════════════════════════╗";
    const BOTTOM: &str = "╚════════════════════════════════════════════════════════════════╝";
    // Maximum number of rows and of columns shown per table.
    const PREVIEW_LIMIT: usize = 10;

    if !test_documents_available() {
        println!("Skipping: test_documents not available");
        return;
    }

    let table_pdfs = [
        "embedded_images_tables.pdf",
        "multi_page_tables.pdf",
        "table_document.pdf",
        "multi_page.pdf",
    ];

    // The title is centred within the border's inner width so the right-hand
    // edge of the box lines up with the top and bottom rules.
    let inner_width = TOP.chars().count() - 2;

    println!("\n");
    println!("{TOP}");
    println!(
        "║{:^inner_width$}║",
        "Focused Table Detection on Known Table Documents"
    );
    println!("{BOTTOM}");
    println!();

    // The configuration does not depend on the document, so build it once.
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: false,
        ..Default::default()
    };

    for filename in table_pdfs {
        let path = get_test_file_path(&format!("pdf/{filename}"));

        if !path.exists() {
            println!("[SKIP] {} - file not found", filename);
            continue;
        }

        println!("Analyzing: {}", filename);
        println!();

        match extract_file_sync(&path, None, &config) {
            Ok(result) => {
                println!(" Tables detected: {}", result.tables.len());

                if result.tables.is_empty() {
                    println!(" No tables detected - possible false negative");
                }

                for (idx, table) in result.tables.iter().enumerate() {
                    let rows = table.cells.len();
                    // Column count is taken from the first row; later rows may
                    // be shorter, which the preview below tolerates.
                    let cols = table.cells.first().map_or(0, |row| row.len());
                    // Sum the real row lengths rather than `rows * cols`, which
                    // would over- or under-count when rows are ragged.
                    let cell_count: usize = table.cells.iter().map(|row| row.len()).sum();

                    println!();
                    println!(" Table {} (page {}):", idx + 1, table.page_number);
                    println!(" Dimensions: {} rows × {} cols", rows, cols);
                    println!(" Cell count: {}", cell_count);

                    // Full preview (up to 10x10)
                    if cols > 0 {
                        let preview_cols = cols.min(PREVIEW_LIMIT);
                        println!(" Full preview:");
                        for row in table.cells.iter().take(PREVIEW_LIMIT) {
                            let mut row_str = String::from(" | ");
                            for c in 0..preview_cols {
                                let cell_content = row.get(c).map_or("", |s| s.as_str());
                                // Writing into a `String` cannot fail.
                                let _ = write!(row_str, "{} | ", format_cell(cell_content));
                            }
                            if preview_cols < cols {
                                row_str.push_str("... |");
                            }
                            println!("{}", row_str);
                        }
                        if rows > PREVIEW_LIMIT {
                            println!(" | ... |");
                        }
                    }

                    println!();
                    println!(" Markdown:");
                    println!("{}", table.markdown);
                    println!();
                }
            }
            Err(e) => {
                println!(" ERROR: {}", e);
            }
        }

        println!("─────────────────────────────────────────────────────────────────");
        println!();
    }
}