Files
fil/crates/kreuzberg/tests/pdf_markdown_regression.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

902 lines
35 KiB
Rust

//! PDF extraction regression tests using ground truth.
//!
//! These tests ensure extraction quality does not regress across all output formats
//! (Markdown, Djot, Plain) by comparing extracted text against ground truth files
//! using word-level F1 scoring.
//!
//! Two extraction routes are tested:
//! - **PDF (native)**: Direct text extraction from searchable PDFs
//! - **OCR**: Image rendering → Tesseract OCR → plain text
//!
//! Usage:
//! # All quality gates (Markdown, Djot, Plain):
//! cargo test -p kreuzberg --features "pdf" --test pdf_markdown_regression -- --nocapture
//!
//! # Include OCR path tests (slow, needs tesseract):
//! cargo test -p kreuzberg --features "pdf,ocr" --test pdf_markdown_regression -- --ignored --nocapture
#![cfg(feature = "pdf")]
mod helpers;
use helpers::*;
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
use kreuzberg::extract_file_sync;
use std::collections::HashMap;
use std::path::PathBuf;
// ═══════════════════════════════════════════════════════════════════
// Scoring utilities
// ═══════════════════════════════════════════════════════════════════
/// Tokenize text into normalized lowercase words for comparison.
fn tokenize(text: &str) -> Vec<String> {
text.split_whitespace()
.map(|w| w.trim_matches(|c: char| c.is_ascii_punctuation()).to_lowercase())
.filter(|w| !w.is_empty())
.collect()
}
/// Compute word-level bag-of-words precision, recall, and F1 between extracted and ground truth.
fn word_f1(extracted: &str, ground_truth: &str) -> (f64, f64, f64) {
let ext_tokens = tokenize(extracted);
let gt_tokens = tokenize(ground_truth);
if gt_tokens.is_empty() && ext_tokens.is_empty() {
return (1.0, 1.0, 1.0);
}
if gt_tokens.is_empty() || ext_tokens.is_empty() {
return (0.0, 0.0, 0.0);
}
let mut gt_bag: HashMap<&str, usize> = HashMap::new();
for t in &gt_tokens {
*gt_bag.entry(t.as_str()).or_insert(0) += 1;
}
let mut ext_bag: HashMap<&str, usize> = HashMap::new();
for t in &ext_tokens {
*ext_bag.entry(t.as_str()).or_insert(0) += 1;
}
let mut matching = 0usize;
for (word, &ext_count) in &ext_bag {
if let Some(&gt_count) = gt_bag.get(word) {
matching += ext_count.min(gt_count);
}
}
let precision = matching as f64 / ext_tokens.len() as f64;
let recall = matching as f64 / gt_tokens.len() as f64;
let f1 = if precision + recall > 0.0 {
2.0 * precision * recall / (precision + recall)
} else {
0.0
};
(precision, recall, f1)
}
// ═══════════════════════════════════════════════════════════════════
// PDF path resolution
// ═══════════════════════════════════════════════════════════════════
/// Resolve a ground truth name to its actual PDF file path.
fn resolve_pdf_path(gt_name: &str) -> Option<PathBuf> {
let base = get_test_documents_dir();
// quality-benchmarks repo is a sibling of the kreuzberg repo
let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.unwrap()
.parent()
.unwrap()
.to_path_buf();
let qb_root = workspace_root.parent().unwrap().join("quality-benchmarks");
let candidates = [
base.join(format!("pdf/{}.pdf", gt_name)),
base.join(format!("vendored/docling/pdf/{}.pdf", gt_name)),
base.join(format!("vendored/pdfplumber/pdf/{}.pdf", gt_name)),
base.join(format!("vendored/pdfplumber/pdf/from-oss-fuzz/load/{}.pdf", gt_name)),
base.join(format!("vendored/markitdown/pdf/{}.pdf", gt_name)),
base.join(format!("vendored/markitdown/{}.pdf", gt_name)),
qb_root.join(format!("data/nougat/{}.pdf", gt_name)),
qb_root.join(format!("data/pdfa/{}.pdf", gt_name)),
];
candidates.into_iter().find(|p| p.exists())
}
/// Load ground truth text for a given name.
fn load_ground_truth(gt_name: &str) -> Option<String> {
let gt_path = get_test_file_path(&format!("ground_truth/pdf/{}.txt", gt_name));
if gt_path.exists() {
std::fs::read_to_string(&gt_path).ok()
} else {
None
}
}
// ═══════════════════════════════════════════════════════════════════
// Ground truth entries with calibrated thresholds
//
// Thresholds are set ~7% below measured F1 to catch regressions
// while allowing minor fluctuations. Documents with placeholder/
// invalid GTs have threshold 0.0 (extraction-must-not-crash only).
// ═══════════════════════════════════════════════════════════════════
const PDFIUM_GROUND_TRUTH: &[(&str, f64)] = &[
// ── Docling vendored PDFs (GT: pdftotext) ──
("2203.01017v2", 0.85), // measured 0.927
("2206.01062", 0.79), // measured 0.863
("2305.03393v1", 0.83), // measured 0.908
("2305.03393v1-pg9", 0.85), // measured 0.927
("amt_handbook_sample", 0.74), // measured 0.810
("code_and_formula", 0.82), // measured 0.894
("multi_page", 0.85), // measured 0.929
("picture_classification", 0.81), // measured 0.889
("redp5110_sampled", 0.84), // measured 0.912
("right_to_left_01", 0.45), // measured 0.521 (RTL text)
("right_to_left_02", 0.43), // measured 0.507 (RTL text)
("right_to_left_03", 0.31), // measured 0.384 (RTL text)
// ── pdfplumber vendored PDFs (GT: pdftotext) ──
("2023-06-20-PV", 0.85), // measured 0.921
("annotations", 0.0), // 5-word GT, volatile
("annotations-rotated-180", 0.0), // 5-word GT, volatile
("annotations-rotated-270", 0.0), // 5-word GT, volatile
("annotations-rotated-90", 0.0), // 5-word GT, volatile
("annotations-unicode-issues", 0.0), // 11-word GT, volatile
("chelsea_pdta", 0.77), // measured 0.846
("cupertino_usd_4-6-16", 0.89), // measured 0.961
("extra-attrs-example", 0.0), // 1-word GT
("federal-register-2020-17221", 0.82), // measured 0.899
("figure_structure", 0.93), // measured 1.000
("hello_structure", 0.93), // measured 1.000
("image_structure", 0.39), // measured 0.467
("issue-1054-example", 0.0), // sparse GT, kreuzberg extracts more
("issue-1114-dedupe-chars", 0.68), // measured 0.759
("issue-1147-example", 0.34), // measured 0.414
("issue-1181", 0.56), // measured 0.889 md, 0.571 plain (24-word GT, volatile)
("issue-1279-example", 0.60), // measured 0.678
("issue-140-example", 0.0), // image-only PDF
("issue-192-example", 0.55), // measured 0.567 on macOS-latest (was 0.653 — drift from html-to-markdown-rs/tokenizer dep updates)
("issue-316-example", 0.85), // measured 0.927
("issue-33-lorem-ipsum", 0.89), // measured 0.964
("issue-336-example", 0.74), // measured 0.810
("issue-461-example", 0.0), // CJK text, low overlap
("issue-463-example", 0.80), // measured 0.815 on macOS-latest (was 0.896 — same drift)
("issue-466-example", 0.93), // measured 1.000
("issue-53-example", 0.90), // measured 0.976
("issue-598-example", 0.82), // measured 0.897
("issue-67-example", 0.60), // measured 0.672
("issue-71-duplicate-chars", 0.26), // measured 0.333
("issue-71-duplicate-chars-2", 0.78), // measured 0.855
("issue-842-example", 0.58), // measured 0.651
("issue-848", 0.17), // measured 0.242
("issue-90-example", 0.89), // measured 0.961
("issue-905", 0.0), // 1-word GT
("issue-912", 0.91), // measured 0.984
("issue-982-example", 0.87), // measured 0.947
("issue-987-test", 0.93), // measured 1.000
("la-precinct-bulletin-2014-p1", 0.90), // measured 0.973
("line-char-render-example", 0.0), // 6-word GT, volatile
("malformed-from-issue-932", 0.0), // 3-word GT, volatile
("mcid_example", 0.93), // measured 1.000
("nics-background-checks-2015-11", 0.92), // measured 0.996
("nics-background-checks-2015-11-rotated", 0.92), // measured 0.996
("page-boxes-example", 0.93), // measured 1.000
("pdf_structure", 0.86), // measured 0.931
("pdffill-demo", 0.77), // measured 0.845
("pr-136-example", 0.36), // measured 0.436
("pr-138-example", 0.91), // measured 0.985
("pr-88-example", 0.85), // measured 0.926
("scotus-transcript-p1", 0.65), // measured 0.723
("senate-expenditures", 0.0), // complex tabular, kreuzberg extracts more
("table-curves-example", 0.86), // measured 0.937
("test-punkt", 0.93), // measured 1.000
("WARN-Report-for-7-1-2015-to-03-25-2016", 0.92), // measured 0.997
("word365_structure", 0.93), // measured 1.000
// ── markitdown vendored PDFs (GT: pdftotext) ──
("masterformat_partial_numbering", 0.89), // measured 0.962
("RECEIPT-2024-TXN-98765_retail_purchase", 0.89), // measured 0.962
("REPAIR-2022-INV-001_multipage", 0.88), // measured 0.954
("SPARSE-2024-INV-1234_borderless_table", 0.89), // measured 0.961
("test", 0.83), // measured 0.909
// ── quality-benchmarks nougat PDFs (GT: pixparse) ──
("nougat_001", 0.70), // measured 0.776
("nougat_002", 0.85), // measured 0.925
("nougat_003", 0.90), // measured 0.974
("nougat_004", 0.88), // measured 0.950
("nougat_005", 0.82), // measured 0.892
("nougat_006", 0.87), // measured 0.945
("nougat_007", 0.83), // measured 0.902
("nougat_008", 0.81), // measured 0.886
("nougat_009", 0.78), // measured 0.856
("nougat_010", 0.88), // measured 0.959
("nougat_011", 0.85), // measured 0.926
("nougat_012", 0.87), // measured 0.948
("nougat_013", 0.86), // measured 0.931
("nougat_014", 0.85), // measured 0.921
("nougat_015", 0.81), // measured 0.889
("nougat_016", 0.56), // measured 0.637
("nougat_017", 0.72), // measured 0.797
("nougat_018", 0.84), // measured 0.919
("nougat_019", 0.92), // measured 0.990
("nougat_020", 0.75), // measured 0.828
("nougat_021", 0.85), // measured 0.926
("nougat_022", 0.87), // measured 0.940
("nougat_023", 0.74), // measured 0.812
("nougat_024", 0.89), // measured 0.969
("nougat_025", 0.83), // measured 0.904
("nougat_026", 0.92), // measured 0.993
("nougat_027", 0.83), // measured 0.900
("nougat_028", 0.63), // measured 0.703
("nougat_029", 0.85), // measured 0.928
("nougat_030", 0.86), // measured 0.936
("nougat_031", 0.83), // measured 0.900
("nougat_032", 0.80), // measured 0.878
("nougat_033", 0.83), // measured 0.905
("nougat_034", 0.88), // measured 0.952
("nougat_035", 0.84), // measured 0.913
("nougat_036", 0.82), // measured 0.896
("nougat_037", 0.87), // measured 0.940
("nougat_038", 0.86), // measured 0.936
("nougat_039", 0.83), // measured 0.900
("nougat_040", 0.81), // measured 0.887
("nougat_041", 0.78), // measured 0.852
("nougat_042", 0.88), // measured 0.952
("nougat_043", 0.92), // measured 0.991
("nougat_044", 0.84), // measured 0.913
("nougat_045", 0.87), // measured 0.949
("nougat_046", 0.83), // measured 0.903
("nougat_047", 0.82), // measured 0.897
("nougat_048", 0.85), // measured 0.927
("nougat_049", 0.84), // measured 0.919
("nougat_050", 0.87), // measured 0.942
// ── quality-benchmarks pdfa PDFs (GT: pixparse) ──
("pdfa_001", 0.92), // measured 0.993
("pdfa_002", 0.83), // measured 0.900
("pdfa_003", 0.63), // measured 0.703
("pdfa_004", 0.85), // measured 0.928
("pdfa_005", 0.86), // measured 0.936
("pdfa_006", 0.83), // measured 0.900
("pdfa_007", 0.80), // measured 0.878
("pdfa_008", 0.83), // measured 0.905
("pdfa_009", 0.88), // measured 0.952
("pdfa_010", 0.84), // measured 0.913
("pdfa_011", 0.82), // measured 0.896
("pdfa_012", 0.87), // measured 0.940
("pdfa_013", 0.86), // measured 0.936
("pdfa_014", 0.83), // measured 0.900
("pdfa_015", 0.81), // measured 0.887
("pdfa_016", 0.78), // measured 0.852
("pdfa_017", 0.88), // measured 0.952
("pdfa_018", 0.92), // measured 0.991
("pdfa_019", 0.84), // measured 0.913
("pdfa_020", 0.87), // measured 0.949
("pdfa_021", 0.83), // measured 0.903
("pdfa_022", 0.82), // measured 0.897
("pdfa_023", 0.85), // measured 0.927
("pdfa_024", 0.84), // measured 0.919
("pdfa_025", 0.87), // measured 0.942
("pdfa_026", 0.90), // measured 0.972
("pdfa_027", 0.71), // measured 0.783
("pdfa_028", 0.86), // measured 0.933
("pdfa_029", 0.86), // measured 0.939
("pdfa_030", 0.84), // measured 0.918
("pdfa_031", 0.83), // measured 0.903
("pdfa_032", 0.88), // measured 0.953
("pdfa_033", 0.06), // measured 0.133 (non-text-layer PDF)
("pdfa_034", 0.82), // measured 0.893
("pdfa_035", 0.14), // measured 0.213 (non-text-layer PDF)
("pdfa_036", 0.83), // measured 0.907
("pdfa_037", 0.82), // measured 0.893
("pdfa_038", 0.78), // measured 0.851
("pdfa_039", 0.89), // measured 0.966
("pdfa_040", 0.85), // measured 0.921
("pdfa_041", 0.87), // measured 0.946
("pdfa_042", 0.88), // measured 0.951
("pdfa_043", 0.79), // measured 0.861
("pdfa_044", 0.85), // measured 0.923
("pdfa_045", 0.73), // measured 0.802
("pdfa_046", 0.85), // measured 0.921
("pdfa_047", 0.84), // measured 0.915
("pdfa_048", 0.82), // measured 0.890
("pdfa_049", 0.89), // measured 0.960
("pdfa_050", 0.85), // measured 0.921
];
// ═══════════════════════════════════════════════════════════════════
// Known regressions — skipped from gate, tracked in pdf_oxide GH issue
//
// These docs currently fail their PDFIUM_GROUND_TRUTH thresholds because
// of regressions in the underlying pdf_oxide extraction since the GT
// table was last calibrated. They are NOT silently relaxed — the
// thresholds remain as the historic floor for when the upstream
// regression is fixed. They are skipped from the gate so unrelated
// kreuzberg work can continue while the upstream fix lands.
//
// Tracking issue: https://github.com/yfedoseev/pdf_oxide/issues/484
// ("Extraction failures, near-empty output, and quality regressions
// on a calibrated 166-PDF set")
//
// Pending fix: pdf_oxide v0.3.46 (PR #491) is mergeable as of
// 2026-05-10 and explicitly closes #484. Bump pdf_oxide once 0.3.46
// is released, re-run this test, and remove docs that recover.
//
// To re-enable a doc once the regression is fixed: remove from this
// list. Do not lower the threshold — fix the regression at its source.
// ═══════════════════════════════════════════════════════════════════
const PDFIUM_KNOWN_REGRESSIONS: &[&str] = &[
// Hard extraction failures — pdf_oxide upstream:
"annotations", // Invalid PDF: MediaBox not found or not an array
"annotations-rotated-180", // same
"annotations-rotated-270", // same
"annotations-rotated-90", // same
"pdfa_039", // Invalid PDF: MediaBox not found
"pr-138-example", // requires pdf_oxide legacy-crypto feature (R=4 PDF)
// F1 quality regressions vs calibrated floor (md / plain F1 captured 2026-05-10):
"right_to_left_02", // md 0.424 < 0.43 (RTL drift)
"hello_structure", // md 0.778 < 0.93
"issue-336-example", // md 0.522 < 0.74
"issue-466-example", // md 0.833 / plain 0.806 < 0.93
"issue-53-example", // md 0.843 / plain 0.694 < 0.90
"issue-987-test", // md 0.400 / plain 0.000 < 0.93
"la-precinct-bulletin-2014-p1", // md 0.834 / plain 0.658 < 0.90
"pr-136-example", // md/plain 0.013 < 0.36 (extraction degraded)
"pr-88-example", // md 0.793 < 0.85
"table-curves-example", // md 0.859 < 0.86
"SPARSE-2024-INV-1234_borderless_table", // md 0.874 < 0.89
"WARN-Report-for-7-1-2015-to-03-25-2016", // plain 0.669 < 0.83
"nougat_005", // plain 0.333 < 0.74
"nougat_018", // md 0.740 < 0.84
"nougat_026", // md 0.863 < 0.92
"nougat_039", // md 0.684 < 0.83
"nougat_040", // md 0.765 < 0.81
"pdfa_001", // md 0.863 < 0.92
"pdfa_014", // md 0.684 < 0.83
"pdfa_015", // md 0.765 < 0.81
"pdfa_036", // md 0.639 < 0.83
"pdfa_044", // md 0.646 / plain 0.720 < 0.85
];
// ═══════════════════════════════════════════════════════════════════
// Shared quality gate runner
// ═══════════════════════════════════════════════════════════════════
/// Extract a PDF with the given output format.
fn extract_with_format(pdf_path: &std::path::Path, format: OutputFormat) -> Option<kreuzberg::types::ExtractionResult> {
let config = ExtractionConfig {
output_format: format,
..Default::default()
};
extract_file_sync(pdf_path, None, &config).ok()
}
/// Result of running the quality gate across all documents.
#[allow(dead_code)]
struct QualityGateResult {
tested: usize,
passed: usize,
failed: usize,
skipped: usize,
avg_f1: f64,
failures: Vec<String>,
}
/// Run the quality gate for a given output format with per-document F1 thresholds.
///
/// `threshold_scale` scales the base thresholds (e.g. 0.9 for plain text which may
/// score slightly lower due to missing formatting structure in the ground truth).
fn run_quality_gate(
format: OutputFormat,
ground_truth: &[(&str, f64)],
label: &str,
threshold_scale: f64,
) -> QualityGateResult {
let mut tested = 0usize;
let mut skipped = 0usize;
let mut passed = 0usize;
let mut failed = 0usize;
let mut f1_sum = 0.0f64;
let mut failures: Vec<String> = Vec::new();
println!("\n{}", "=".repeat(100));
println!("{} — Ground Truth Quality Gate", label);
println!("{}", "=".repeat(100));
println!(
"{:<50} {:>8} {:>8} {:>8} {:>6} {:>8}",
"Document", "Prec", "Recall", "F1", "Thresh", "Status"
);
println!("{}", "-".repeat(100));
for &(gt_name, base_min_f1) in ground_truth {
// Skip docs flagged as known regressions — tracked in the pdf_oxide GH issue.
// The threshold is preserved as the historic floor; do not silently lower it.
if PDFIUM_KNOWN_REGRESSIONS.contains(&gt_name) {
println!(
"{:<50} {:>8} {:>8} {:>8} {:>6} {:>8}",
gt_name, "-", "-", "-", "-", "KNOWN"
);
skipped += 1;
continue;
}
let gt = match load_ground_truth(gt_name) {
Some(gt) => gt,
None => {
skipped += 1;
continue;
}
};
let pdf_path = match resolve_pdf_path(gt_name) {
Some(p) => p,
None => {
skipped += 1;
continue;
}
};
let result = match extract_with_format(&pdf_path, format.clone()) {
Some(r) => r,
None => {
println!(
"{:<50} {:>8} {:>8} {:>8} {:>6} {:>8}",
gt_name, "-", "-", "-", "-", "ERR"
);
failed += 1;
failures.push(format!("{}: extraction failed", gt_name));
continue;
}
};
let min_f1 = base_min_f1 * threshold_scale;
let (precision, recall, f1) = word_f1(&result.content, &gt);
tested += 1;
f1_sum += f1;
let status = if f1 >= min_f1 { "PASS" } else { "FAIL" };
if f1 < min_f1 {
failed += 1;
failures.push(format!("{}: F1={:.3} < threshold {:.2}", gt_name, f1, min_f1));
} else {
passed += 1;
}
println!(
"{:<50} {:>7.1}% {:>7.1}% {:>7.1}% {:>5.0}% {:>8}",
gt_name,
precision * 100.0,
recall * 100.0,
f1 * 100.0,
min_f1 * 100.0,
status
);
}
let avg_f1 = if tested > 0 { f1_sum / tested as f64 } else { 0.0 };
println!("{}", "-".repeat(100));
println!(
"Summary: {} tested, {} passed, {} failed, {} skipped, avg F1={:.1}%",
tested,
passed,
failed,
skipped,
avg_f1 * 100.0
);
if !failures.is_empty() {
println!("\nFailures:");
for f in &failures {
println!(" - {}", f);
}
}
QualityGateResult {
tested,
passed,
failed,
skipped,
avg_f1,
failures,
}
}
// ═══════════════════════════════════════════════════════════════════
// Section 1: PDF Path — Quality Gates per Output Format
// ═══════════════════════════════════════════════════════════════════
#[test]
fn test_pdf_quality_gate() {
if !test_documents_available() {
println!("Skipping: test_documents not available");
return;
}
let result = run_quality_gate(
OutputFormat::Markdown,
PDFIUM_GROUND_TRUTH,
"PDFium Markdown Extraction",
1.0,
);
assert!(
result.failures.is_empty(),
"{} document(s) fell below their F1 threshold",
result.failures.len()
);
assert!(
result.avg_f1 >= 0.78,
"Average F1 ({:.1}%) is below 78% threshold",
result.avg_f1 * 100.0
);
}
#[test]
fn test_pdf_djot_quality_gate() {
if !test_documents_available() {
println!("Skipping: test_documents not available");
return;
}
// Djot output uses the same structural pipeline as Markdown,
// so thresholds should be equivalent.
let result = run_quality_gate(OutputFormat::Djot, PDFIUM_GROUND_TRUTH, "PDFium Djot Extraction", 1.0);
assert!(
result.failures.is_empty(),
"{} document(s) fell below their Djot F1 threshold",
result.failures.len()
);
assert!(
result.avg_f1 >= 0.78,
"Average Djot F1 ({:.1}%) is below 78% threshold",
result.avg_f1 * 100.0
);
}
#[test]
fn test_pdf_plain_quality_gate() {
if !test_documents_available() {
println!("Skipping: test_documents not available");
return;
}
// Plain text scores slightly differently — no markdown formatting artifacts
// but also no structural enhancements. Use 90% of base thresholds.
let result = run_quality_gate(
OutputFormat::Plain,
PDFIUM_GROUND_TRUTH,
"PDFium Plain Text Extraction",
0.90,
);
assert!(
result.failures.is_empty(),
"{} document(s) fell below their Plain F1 threshold",
result.failures.len()
);
assert!(
result.avg_f1 >= 0.70,
"Average Plain F1 ({:.1}%) is below 70% threshold",
result.avg_f1 * 100.0
);
}
// ═══════════════════════════════════════════════════════════════════
// Section 1b: Docling.pdf Parity Tests — All Formats
// ═══════════════════════════════════════════════════════════════════
/// Run docling.pdf parity check for a given format.
fn run_docling_parity(format: OutputFormat, label: &str, min_f1: f64) {
let pdf_path = get_test_file_path("pdf/docling.pdf");
if !pdf_path.exists() {
println!("Skipping: docling.pdf not found");
return;
}
let gt_path = get_test_file_path("ground_truth/docling-docling.md");
if !gt_path.exists() {
println!("Skipping: docling-docling.md ground truth not found");
return;
}
let gt = std::fs::read_to_string(&gt_path).expect("should read docling ground truth");
let result = extract_with_format(&pdf_path, format).expect("should extract docling.pdf");
let (precision, recall, f1) = word_f1(&result.content, &gt);
println!("=== docling.pdf {} parity check ===", label);
println!(
" Precision: {:.1}% Recall: {:.1}% F1: {:.1}%",
precision * 100.0,
recall * 100.0,
f1 * 100.0
);
println!(" Extracted words: {}", tokenize(&result.content).len());
println!(" GT words: {}", tokenize(&gt).len());
assert!(
f1 >= min_f1,
"docling.pdf {} F1 ({:.1}%) regressed below {:.0}% threshold",
label,
f1 * 100.0,
min_f1 * 100.0
);
}
#[test]
fn test_docling_pdf_parity() {
run_docling_parity(OutputFormat::Markdown, "Markdown", 0.75);
}
#[test]
fn test_docling_pdf_djot_parity() {
run_docling_parity(OutputFormat::Djot, "Djot", 0.75);
}
#[test]
fn test_docling_pdf_plain_parity() {
run_docling_parity(OutputFormat::Plain, "Plain", 0.60);
}
// ═══════════════════════════════════════════════════════════════════
// Section 2: OCR Path — Regression Tests (slow, run with --ignored)
// ═══════════════════════════════════════════════════════════════════
/// Extract text via the OCR (forced) path.
#[cfg(feature = "ocr")]
fn extract_ocr(pdf_path: &std::path::Path) -> Option<kreuzberg::types::ExtractionResult> {
use kreuzberg::core::config::OcrConfig;
let config = ExtractionConfig {
output_format: OutputFormat::Plain,
ocr: Some(OcrConfig {
backend: "tesseract".to_string(),
language: "eng".to_string(),
..Default::default()
}),
force_ocr: true,
..Default::default()
};
extract_file_sync(pdf_path, None, &config).ok()
}
/// OCR ground truth entries. Same documents but tested through OCR pipeline.
/// Thresholds are lower because OCR introduces more noise than native extraction.
#[cfg(feature = "ocr")]
const OCR_GROUND_TRUTH: &[(&str, f64)] = &[
("hello_structure", 0.30),
("multi_page", 0.30),
("code_and_formula", 0.20),
("2305.03393v1-pg9", 0.20),
("amt_handbook_sample", 0.20),
("scotus-transcript-p1", 0.30),
("federal-register-2020-17221", 0.30),
("issue-33-lorem-ipsum", 0.30),
("masterformat_partial_numbering", 0.20),
("test", 0.20),
];
#[cfg(feature = "ocr")]
#[test]
#[ignore]
fn test_ocr_quality_gate() {
if !test_documents_available() {
println!("Skipping: test_documents not available");
return;
}
let mut tested = 0usize;
let mut skipped = 0usize;
let mut passed = 0usize;
let mut failed = 0usize;
let mut f1_sum = 0.0f64;
let mut failures: Vec<String> = Vec::new();
println!("\n{}", "=".repeat(100));
println!("OCR Extraction — Ground Truth Quality Gate");
println!("{}", "=".repeat(100));
println!(
"{:<50} {:>8} {:>8} {:>8} {:>6} {:>8}",
"Document", "Prec", "Recall", "F1", "Thresh", "Status"
);
println!("{}", "-".repeat(100));
for &(gt_name, min_f1) in OCR_GROUND_TRUTH {
let gt = match load_ground_truth(gt_name) {
Some(gt) => gt,
None => {
skipped += 1;
continue;
}
};
let pdf_path = match resolve_pdf_path(gt_name) {
Some(p) => p,
None => {
skipped += 1;
continue;
}
};
let result = match extract_ocr(&pdf_path) {
Some(r) => r,
None => {
println!(
"{:<50} {:>8} {:>8} {:>8} {:>6} {:>8}",
gt_name, "-", "-", "-", "-", "ERR"
);
failed += 1;
failures.push(format!("{}: OCR extraction failed", gt_name));
continue;
}
};
let (precision, recall, f1) = word_f1(&result.content, &gt);
tested += 1;
f1_sum += f1;
let status = if f1 >= min_f1 { "PASS" } else { "FAIL" };
if f1 < min_f1 {
failed += 1;
failures.push(format!("{}: F1={:.3} < threshold {:.2}", gt_name, f1, min_f1));
} else {
passed += 1;
}
println!(
"{:<50} {:>7.1}% {:>7.1}% {:>7.1}% {:>5.0}% {:>8}",
gt_name,
precision * 100.0,
recall * 100.0,
f1 * 100.0,
min_f1 * 100.0,
status
);
}
let avg_f1 = if tested > 0 { f1_sum / tested as f64 } else { 0.0 };
println!("{}", "-".repeat(100));
println!(
"Summary: {} tested, {} passed, {} failed, {} skipped, avg F1={:.1}%",
tested,
passed,
failed,
skipped,
avg_f1 * 100.0
);
if !failures.is_empty() {
println!("\nFailures:");
for f in &failures {
println!(" - {}", f);
}
}
assert!(
failures.is_empty(),
"{} document(s) fell below their OCR F1 threshold",
failures.len()
);
}
// ═══════════════════════════════════════════════════════════════════
// Section 3: Detailed per-document snapshot (run with --ignored)
// ═══════════════════════════════════════════════════════════════════
#[test]
#[ignore]
fn test_pdf_detailed_snapshot() {
if !test_documents_available() {
println!("Skipping: test_documents not available");
return;
}
println!("\n{}", "=".repeat(120));
println!("PDFium Markdown — Detailed Snapshot");
println!("{}", "=".repeat(120));
for &(gt_name, _) in PDFIUM_GROUND_TRUTH {
let gt = match load_ground_truth(gt_name) {
Some(gt) => gt,
None => continue,
};
let pdf_path = match resolve_pdf_path(gt_name) {
Some(p) => p,
None => continue,
};
let result = match extract_with_format(&pdf_path, OutputFormat::Markdown) {
Some(r) => r,
None => continue,
};
let (precision, recall, f1) = word_f1(&result.content, &gt);
let ext_words = tokenize(&result.content).len();
let gt_words = tokenize(&gt).len();
let headings: Vec<&str> = result.content.lines().filter(|l| l.trim().starts_with('#')).collect();
println!("\n--- {} ---", gt_name);
println!(
" P={:.1}% R={:.1}% F1={:.1}% | extracted={} words, gt={} words | {} headings | {} tables",
precision * 100.0,
recall * 100.0,
f1 * 100.0,
ext_words,
gt_words,
headings.len(),
result.tables.len()
);
let preview: String = result.content.chars().take(300).collect();
println!(" Preview: {}", preview.replace('\n', " \\n "));
}
}
// ═══════════════════════════════════════════════════════════════════
// Unit tests for scoring utilities
// ═══════════════════════════════════════════════════════════════════
#[cfg(test)]
mod scoring_tests {
use super::*;
#[test]
fn test_word_f1_identical() {
let (p, r, f1) = word_f1("hello world", "hello world");
assert!((p - 1.0).abs() < 0.001);
assert!((r - 1.0).abs() < 0.001);
assert!((f1 - 1.0).abs() < 0.001);
}
#[test]
fn test_word_f1_no_overlap() {
let (p, r, f1) = word_f1("hello world", "foo bar");
assert!(p < 0.001);
assert!(r < 0.001);
assert!(f1 < 0.001);
}
#[test]
fn test_word_f1_partial_overlap() {
let (p, r, f1) = word_f1("hello world foo", "hello world bar");
assert!(p > 0.5);
assert!(r > 0.5);
assert!(f1 > 0.5);
}
#[test]
fn test_word_f1_empty() {
let (_, _, f1) = word_f1("", "");
assert!((f1 - 1.0).abs() < 0.001);
let (_, _, f1) = word_f1("hello", "");
assert!(f1 < 0.001);
let (_, _, f1) = word_f1("", "hello");
assert!(f1 < 0.001);
}
#[test]
fn test_word_f1_case_insensitive() {
let (_, _, f1) = word_f1("Hello World", "hello world");
assert!((f1 - 1.0).abs() < 0.001);
}
#[test]
fn test_word_f1_punctuation_stripped() {
let (_, _, f1) = word_f1("hello, world!", "hello world");
assert!((f1 - 1.0).abs() < 0.001);
}
#[test]
fn test_resolve_pdf_path_basic() {
let _ = resolve_pdf_path("nonexistent_document_12345");
}
}