Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg/tests/pdf_table_ground_truth.rs
+++ b/crates/kreuzberg/tests/pdf_table_ground_truth.rs
@@ -0,0 +1,406 @@
+//! Ground truth-based PDF table detection and markdown quality tests.
+//!
+//! These tests establish baselines for table detection and markdown output quality.
+//! Run after each substantial change to measure improvement or regression.
+//!
+//! Usage:
+//!   # Non-OCR tests (fast, oxide path):
+//!   cargo test -p kreuzberg --features "pdf" --test pdf_table_ground_truth -- --nocapture
+//!
+//!   # Full tests including table detection (needs ocr feature for HocrWord):
+//!   cargo test -p kreuzberg --features "pdf,ocr" --test pdf_table_ground_truth -- --nocapture
+//!
+//!   # Comprehensive baseline snapshot:
+//!   cargo test -p kreuzberg --features "pdf,ocr" --test pdf_table_ground_truth -- --ignored --nocapture
+
+#![cfg(feature = "pdf")]
+
+mod helpers;
+
+use helpers::*;
+use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+use kreuzberg::extract_file_sync;
+
+/// Word-level Jaccard similarity of `a` and `b`.
+///
+/// Both inputs are split on whitespace into sets of distinct words. Two
+/// word-less inputs score 1.0 and exactly one word-less input scores 0.0.
+fn word_similarity(a: &str, b: &str) -> f64 {
+    let left = a.split_whitespace().collect::<std::collections::HashSet<&str>>();
+    let right = b.split_whitespace().collect::<std::collections::HashSet<&str>>();
+    match (left.is_empty(), right.is_empty()) {
+        (true, true) => 1.0,
+        (true, false) | (false, true) => 0.0,
+        (false, false) => {
+            let shared = left.iter().filter(|word| right.contains(*word)).count();
+            // Union size by inclusion-exclusion: |A| + |B| - |A and B|.
+            shared as f64 / (left.len() + right.len() - shared) as f64
+        }
+    }
+}
+
+/// Extract a test document as markdown with an otherwise-default configuration.
+/// Returns `None` when the resolved path does not exist or extraction fails.
+fn extract_markdown(relative_path: &str) -> Option<kreuzberg::types::ExtractionResult> {
+    let document = get_test_file_path(relative_path);
+    if document.exists() {
+        let markdown_only = ExtractionConfig {
+            output_format: OutputFormat::Markdown,
+            ..Default::default()
+        };
+        extract_file_sync(&document, None, &markdown_only).ok()
+    } else {
+        None
+    }
+}
+
+/// Print a human-readable overview of the tables found in `result`.
+///
+/// Emits the table count and content length, then for every table its
+/// dimensions, page number, and a preview of up to three first-row cells.
+fn print_table_summary(result: &kreuzberg::types::ExtractionResult) {
+    println!("  Tables detected: {}", result.tables.len());
+    println!("  Content length: {} chars", result.content.len());
+    for (index, table) in result.tables.iter().enumerate() {
+        let header = table.cells.first();
+        let (height, width) = (table.cells.len(), header.map_or(0, |row| row.len()));
+        println!("  Table {}: {}x{} (page {})", index + 1, height, width, table.page_number);
+        let Some(cells) = header else { continue };
+        // Each previewed cell is trimmed and cut to at most 40 bytes on a char boundary.
+        let mut preview: Vec<String> = Vec::new();
+        for cell in cells.iter().take(3) {
+            let text = cell.trim();
+            preview.push(match text.len() {
+                0..=40 => text.to_string(),
+                _ => format!("{}...", &text[..text.floor_char_boundary(40)]),
+            });
+        }
+        println!("    First row: {:?}", preview);
+    }
+}
+
+// ═══════════════════════════════════════════════════════════════════
+// Section 1: False Positive Regression Tests
+// Non-table PDFs must NOT have tables detected.
+// These are the hard gate — they must pass for a commit.
+// ═══════════════════════════════════════════════════════════════════
+
+/// Assert that extracting `pdf/<pdf_name>` as markdown reports no tables.
+///
+/// Returns without asserting when `skip_if_missing` returns true for the file.
+/// Compiled only with the `ocr` feature, which table detection requires.
+#[cfg(feature = "ocr")]
+fn assert_no_tables(pdf_name: &str) {
+    let rel = format!("pdf/{}", pdf_name);
+    if !skip_if_missing(&rel) {
+        let extraction = extract_markdown(&rel).expect("extraction should succeed");
+        let detected = extraction.tables.len();
+
+        println!("=== {} false positive check ===", pdf_name);
+        print_table_summary(&extraction);
+        assert!(
+            detected == 0,
+            "{} should not have tables detected (got {})",
+            pdf_name,
+            detected
+        );
+    }
+}
+
+#[cfg(feature = "ocr")]
+#[ignore = "TODO: pdf_oxide upstream — table detector false-positive driven by pdf_oxide span geometry; https://github.com/yfedoseev/pdf_oxide/issues/484"]
+#[test]
+fn test_false_positive_simple_pdf() {
+    assert_no_tables("simple.pdf"); // non-table fixture: any detected table fails the test
+}
+
+#[cfg(feature = "ocr")]
+#[test]
+fn test_false_positive_fake_memo() {
+    assert_no_tables("fake_memo.pdf"); // non-table fixture: any detected table fails the test
+}
+
+#[cfg(feature = "ocr")]
+#[test]
+fn test_false_positive_searchable() {
+    assert_no_tables("searchable.pdf"); // non-table fixture: any detected table fails the test
+}
+
+#[cfg(feature = "ocr")]
+#[ignore = "TODO: pdf_oxide upstream — table detector false-positive driven by pdf_oxide span geometry; https://github.com/yfedoseev/pdf_oxide/issues/484"]
+#[test]
+fn test_false_positive_google_doc() {
+    assert_no_tables("google_doc_document.pdf"); // non-table fixture: any detected table fails the test
+}
+
+// ═══════════════════════════════════════════════════════════════════
+// Section 2: Markdown Quality Tests (oxide path, no OCR needed)
+// Tests that text-bearing PDFs produce reasonable markdown.
+// ═══════════════════════════════════════════════════════════════════
+
+/// `fake_memo.pdf` must extract to more than 100 bytes of markdown content.
+#[test]
+fn test_markdown_quality_fake_memo() {
+    if skip_if_missing("pdf/fake_memo.pdf") {
+        return;
+    }
+
+    let extraction = extract_markdown("pdf/fake_memo.pdf").expect("extraction should succeed");
+    let produced = extraction.content.len();
+    println!("=== fake_memo.pdf markdown quality ===");
+    println!("Content length: {} chars", produced);
+    assert!(
+        produced > 100,
+        "fake_memo.pdf should produce >100 chars of markdown (got {})",
+        produced
+    );
+}
+
+/// `simple.pdf` must extract to more than 1000 bytes of markdown content.
+#[test]
+fn test_markdown_quality_simple() {
+    if skip_if_missing("pdf/simple.pdf") {
+        return;
+    }
+
+    let extraction = extract_markdown("pdf/simple.pdf").expect("extraction should succeed");
+    let produced = extraction.content.len();
+    println!("=== simple.pdf markdown quality ===");
+    println!("Content length: {} chars", produced);
+    assert!(
+        produced > 1000,
+        "simple.pdf should produce >1000 chars of markdown (got {})",
+        produced
+    );
+}
+
+/// `multi_page.pdf` must extract to more than 5000 bytes of markdown content.
+#[test]
+fn test_markdown_quality_multi_page() {
+    if skip_if_missing("pdf/multi_page.pdf") {
+        return;
+    }
+
+    let extraction = extract_markdown("pdf/multi_page.pdf").expect("extraction should succeed");
+    let produced = extraction.content.len();
+    println!("=== multi_page.pdf markdown quality ===");
+    println!("Content length: {} chars", produced);
+    // multi_page.pdf is text-based, so substantial markdown is expected.
+    assert!(
+        produced > 5000,
+        "multi_page.pdf should produce >5000 chars (got {})",
+        produced
+    );
+}
+
+/// Report word-level similarity between `table_document.pdf` output and its ground truth.
+///
+/// Purely informational: the score is printed and never asserted.
+#[test]
+fn test_markdown_quality_vs_ground_truth_simple() {
+    if skip_if_missing("pdf/table_document.pdf") {
+        return;
+    }
+    let gt_path = get_test_file_path("ground_truth/pdf/pdf_tables.txt");
+    if !gt_path.exists() {
+        println!("Skipping: ground truth file not found");
+        return;
+    }
+
+    let expected = std::fs::read_to_string(&gt_path).expect("should read ground truth");
+    let output = extract_markdown("pdf/table_document.pdf").expect("extraction should succeed");
+    let score = word_similarity(&output.content, &expected);
+
+    println!("=== table_document.pdf vs ground truth ===");
+    println!("Extraction length: {} chars", output.content.len());
+    println!("Ground truth length: {} chars", expected.len());
+    println!("Word similarity: {:.1}%", score * 100.0);
+
+    // The PDF is image-only, so extraction without OCR finds almost no text
+    // (about 3% similarity at the time of writing); OCR work should raise it.
+    println!("NOTE: table_document.pdf is image-only; low similarity expected without OCR.");
+}
+
+// ═══════════════════════════════════════════════════════════════════
+// Section 3: OCR Path Table Detection (slow, run with --ignored)
+// ═══════════════════════════════════════════════════════════════════
+
+/// Force OCR on `table_document.pdf` and require more than 100 bytes of content.
+#[cfg(feature = "ocr")]
+#[test]
+#[ignore] // Slow OCR tests, run explicitly
+fn test_ocr_path_table_document() {
+    use kreuzberg::core::config::OcrConfig;
+
+    if skip_if_missing("pdf/table_document.pdf") {
+        return;
+    }
+
+    let path = get_test_file_path("pdf/table_document.pdf");
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            language: "eng".to_string(),
+            ..Default::default()
+        }),
+        force_ocr: true, // Force OCR since this is image-only
+        ..Default::default()
+    };
+    let result = extract_file_sync(&path, None, &config).expect("extraction should succeed");
+
+    println!("=== table_document.pdf (forced OCR path) ===");
+    print_table_summary(&result);
+    println!("\n--- Content (first 2000 chars) ---");
+    // Cut on a char boundary: slicing at a raw byte offset panics inside multi-byte UTF-8.
+    println!("{}", &result.content[..result.content.floor_char_boundary(2000)]);
+    assert!(
+        result.content.len() > 100,
+        "table_document.pdf OCR path should produce substantial content (got {})",
+        result.content.len()
+    );
+}
+
+// ═══════════════════════════════════════════════════════════════════
+// Section 4: Comprehensive Baseline Snapshot
+// Full scan of all PDFs — run with --ignored for complete picture.
+// ═══════════════════════════════════════════════════════════════════
+
+/// Print a table-detection baseline across three groups of fixture PDFs.
+///
+/// Informational only: per-file status lines and overall precision/recall are
+/// printed and nothing is asserted. Missing files are skipped and failed
+/// extractions are printed as `ERR`; neither changes the counters.
+#[test]
+#[ignore]
+fn test_comprehensive_table_detection_baseline() {
+    if !test_documents_available() {
+        println!("Skipping: test_documents not available");
+        return;
+    }
+
+    // Fixtures listed under the "Image-Only Table PDFs" heading.
+    let image_table_pdfs = [
+        "table_document.pdf",
+        "multi_page_tables.pdf",
+        "embedded_images_tables.pdf",
+    ];
+
+    // Fixtures where a run without tables is counted as a false negative.
+    let text_table_pdfs = [
+        "multi_page.pdf",
+        "medium.pdf",
+        "large.pdf",
+        "a_comparison_of_programming_languages_in_economics_16_jun_2014.pdf",
+        "tiny.pdf",
+        "tatr.pdf",
+    ];
+
+    // Fixtures where any detected table is counted as a false positive.
+    let non_table_pdfs = [
+        "simple.pdf",
+        "fake_memo.pdf",
+        "google_doc_document.pdf",
+        "searchable.pdf",
+        "test_article.pdf",
+        "code_and_formula.pdf",
+    ];
+
+    // One row layout shared by every per-file status line.
+    let report = |status: &str, pdf: &str, tables: usize, md_len: usize| {
+        println!("  [{:4}] {:<55} tables={} md_len={}", status, pdf, tables, md_len);
+    };
+
+    // `hits / total` as a fraction, or 0.0 when `total` is zero.
+    let ratio = |hits: i32, total: i32| {
+        if total > 0 {
+            hits as f64 / total as f64
+        } else {
+            0.0
+        }
+    };
+
+    println!("\n╔══════════════════════════════════════════════════╗");
+    println!("║     Table Detection Baseline Snapshot             ║");
+    println!("╚══════════════════════════════════════════════════╝\n");
+
+    println!("--- Image-Only Table PDFs (need OCR) ---");
+    for &pdf in &image_table_pdfs {
+        let rel = format!("pdf/{}", pdf);
+        if skip_if_missing(&rel) {
+            continue;
+        }
+        let Some(extraction) = extract_markdown(&rel) else {
+            println!("  [ERR ] {}", pdf);
+            continue;
+        };
+        // Fewer than 50 bytes of content is labelled "IMG" instead of "OK".
+        let md_len = extraction.content.len();
+        let status = if md_len < 50 { "IMG" } else { "OK" };
+        report(status, pdf, extraction.tables.len(), md_len);
+    }
+
+    println!("\n--- Text-Based PDFs Expected to Have Tables ---");
+    let mut true_positives = 0;
+    let mut false_negatives = 0;
+    for &pdf in &text_table_pdfs {
+        let rel = format!("pdf/{}", pdf);
+        if skip_if_missing(&rel) {
+            continue;
+        }
+        let Some(extraction) = extract_markdown(&rel) else {
+            println!("  [ERR ] {}", pdf);
+            continue;
+        };
+        let found = extraction.tables.len();
+        let status = match found {
+            0 => {
+                false_negatives += 1;
+                "MISS"
+            }
+            _ => {
+                true_positives += 1;
+                "OK"
+            }
+        };
+        report(status, pdf, found, extraction.content.len());
+    }
+
+    println!("\n--- Expected Non-Table PDFs ---");
+    let mut true_negatives = 0;
+    let mut false_positives = 0;
+    for &pdf in &non_table_pdfs {
+        let rel = format!("pdf/{}", pdf);
+        if skip_if_missing(&rel) {
+            continue;
+        }
+        let Some(extraction) = extract_markdown(&rel) else {
+            println!("  [ERR ] {}", pdf);
+            continue;
+        };
+        let found = extraction.tables.len();
+        let status = match found {
+            0 => {
+                true_negatives += 1;
+                "OK"
+            }
+            _ => {
+                false_positives += 1;
+                "FP"
+            }
+        };
+        report(status, pdf, found, extraction.content.len());
+    }
+
+    println!("\n--- Summary ---");
+    println!("True positives:  {}", true_positives);
+    println!("False negatives: {}", false_negatives);
+    println!("True negatives:  {}", true_negatives);
+    println!("False positives: {}", false_positives);
+
+    // Precision = TP / (TP + FP); recall = TP / (TP + FN).
+    let precision = ratio(true_positives, true_positives + false_positives);
+    let recall = ratio(true_positives, true_positives + false_negatives);
+    println!("Precision: {:.1}%", precision * 100.0);
+    println!("Recall:    {:.1}%", recall * 100.0);
+}