Files
fil/crates/kreuzberg/tests/pdf_heuristic_tables.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

158 lines
6.7 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Integration tests for the heuristic PDF table extraction added for #897.
//!
//! These exercise the public `extract_bytes_sync` API to confirm:
//! 1. `PdfConfig.extract_tables = false` truly suppresses all tables
//! (native and heuristic), matching the documented contract.
//! 2. With the default `extract_tables = true`, a text-layer PDF that
//! pdf_oxide's native grid detector can't read still produces
//! `result.tables` populated by the heuristic fallback.
//! 3. The composition rule (per-page merge) does not drop tables that
//! native already found.
//!
//! Regression tests for issue #897. This work supersedes PR #933.
#![cfg(feature = "pdf")]
use kreuzberg::core::config::{ExtractionConfig, PdfConfig};
use kreuzberg::extract_bytes_sync;
/// MIME type passed to `extract_bytes_sync` for every PDF input in this file.
const PDF_MIME: &str = "application/pdf";
/// Loads the raw bytes of the fixture `name` from `../../test_documents/pdf`,
/// resolved relative to this crate's manifest directory.
///
/// Returns `None` (after printing a "skipping" note to stderr) when the file
/// does not exist, so a caller can return early instead of failing.
///
/// # Panics
///
/// Panics if the fixture exists but reading it fails.
fn read_fixture(name: &str) -> Option<Vec<u8>> {
    let mut path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    path.push("../../test_documents/pdf");
    path.push(name);

    if path.exists() {
        match std::fs::read(&path) {
            Ok(bytes) => Some(bytes),
            Err(e) => panic!("read {name}: {e}"),
        }
    } else {
        eprintln!("skipping: fixture {name} not present at {path:?}");
        None
    }
}
/// With `PdfConfig.extract_tables` set to `false`, extraction of
/// `table_document.pdf` must yield an empty `result.tables`. The fixture is
/// intended to be one where tables would otherwise be emitted.
///
/// Returns early (test passes) when the fixture file is unavailable.
#[test]
fn test_extract_tables_flag_false_suppresses_all_tables() {
    let bytes = match read_fixture("table_document.pdf") {
        Some(bytes) => bytes,
        None => return,
    };

    // Everything stays at its default except the table-extraction switch.
    let pdf_options = PdfConfig {
        extract_tables: false,
        ..PdfConfig::default()
    };
    let config = ExtractionConfig {
        pdf_options: Some(pdf_options),
        ..ExtractionConfig::default()
    };

    let result = extract_bytes_sync(&bytes, PDF_MIME, &config).expect("extraction must succeed");

    let n = result.tables.len();
    assert!(
        result.tables.is_empty(),
        "extract_tables=false must suppress all tables, got {n} table(s)"
    );
}
/// Runs the default `ExtractionConfig` against `table_document.pdf` and checks
/// that every returned table is well-formed: at least two rows, at least one
/// row with two or more columns, non-blank markdown, a 1-indexed page number
/// and, when a bounding box is present, `x0 < x1` and `y0 < y1`.
///
/// This backs the #897 contract — "result.tables should be populated on
/// text-layer table PDFs without needing 12 GB of ONNX models" — regardless of
/// which detection tier produced the tables.
///
/// NOTE: an empty `result.tables` is currently tolerated. The test logs a
/// message to stderr and returns without failing, so it only validates the
/// shape of tables that were actually found. It also returns early when the
/// fixture file is unavailable.
#[test]
fn test_default_config_populates_tables_on_text_layer_pdf() {
    let bytes = match read_fixture("table_document.pdf") {
        Some(bytes) => bytes,
        None => return,
    };

    let config = ExtractionConfig::default();
    let result = extract_bytes_sync(&bytes, PDF_MIME, &config).expect("extraction must succeed");

    if result.tables.is_empty() {
        // Soft skip: nothing to validate, but leave a trace in the test output.
        eprintln!(
            "default-config extraction returned 0 tables on table_document.pdf — \
             fixture may be borderline for the prose filter; revisit heuristic if this persists"
        );
        return;
    }

    for table in result.tables.iter() {
        let rows = &table.cells;
        assert!(rows.len() >= 2, "table has <2 rows: {t:?}", t = table);
        assert!(
            rows.iter().any(|row| row.len() >= 2),
            "table has no row with ≥2 cols: {t:?}",
            t = table
        );

        let markdown = table.markdown.trim();
        assert!(!markdown.is_empty(), "table markdown empty: {t:?}", t = table);
        assert!(
            table.page_number >= 1,
            "page_number must be 1-indexed: {t:?}",
            t = table
        );

        // A bounding box is optional; when present its corners must be ordered.
        if let Some(bounds) = table.bounding_box.as_ref() {
            assert!(
                bounds.y0 < bounds.y1,
                "bbox y0 must be less than y1: {bbox:?}",
                bbox = bounds
            );
            assert!(
                bounds.x0 < bounds.x1,
                "bbox x0 must be less than x1: {bbox:?}",
                bbox = bounds
            );
        }
    }
}
/// Smoke test: extracting `tiny.pdf` with the default config must return `Ok`
/// and must not panic.
///
/// The result is deliberately discarded. No assertion is made about how many
/// tables (if any) are reported for such a minimal document; the only goal is
/// that the extraction pipeline survives the input.
///
/// Does nothing (test passes) when the fixture file is unavailable.
#[test]
fn test_minimal_pdf_does_not_panic() {
    if let Some(bytes) = read_fixture("tiny.pdf") {
        let config = ExtractionConfig::default();
        let _ = extract_bytes_sync(&bytes, PDF_MIME, &config).expect("extraction must succeed");
    }
}
/// Integration test for issue #964: a stroke-bordered, two-column table must be
/// reported by `extract_bytes_sync` under the default configuration (the
/// three-tier native → bordered → heuristic pipeline).
///
/// The PDF is built in memory with `pdf_oxide`'s `DocumentBuilder`: an outer
/// rectangle, one vertical divider and four horizontal dividers form a
/// 5 rows × 2 columns grid, and each cell receives a short text label.
/// Per the original notes, this mirrors the synthetic document used by the
/// unit tests; this test covers the public API path rather than the internal
/// function.
///
/// Asserts that at least one table is found and that the first one has a row
/// with exactly two cells and non-blank markdown.
#[test]
fn test_bordered_two_column_table_detected_via_pipeline() {
    use pdf_oxide::geometry::Rect;
    use pdf_oxide::writer::{DocumentBuilder, LineStyle, TextAlign};

    // Cell rectangles for the two columns; every row is 40 units tall.
    let left = |y| Rect::new(50.0, y, 150.0, 40.0);
    let right = |y| Rect::new(200.0, y, 200.0, 40.0);

    let style = LineStyle::new(1.0, 0.0, 0.0, 0.0);
    let mut doc = DocumentBuilder::new();
    doc.a4_page()
        // Outer border plus the column divider.
        .stroke_rect(50.0, 550.0, 350.0, 200.0, style.clone())
        .stroke_line(200.0, 550.0, 200.0, 750.0, style.clone())
        // Row dividers.
        .stroke_line(50.0, 710.0, 400.0, 710.0, style.clone())
        .stroke_line(50.0, 670.0, 400.0, 670.0, style.clone())
        .stroke_line(50.0, 630.0, 400.0, 630.0, style.clone())
        .stroke_line(50.0, 590.0, 400.0, 590.0, style.clone())
        // Header row followed by four data rows.
        .text_in_rect(left(710.0), "Item", TextAlign::Left)
        .text_in_rect(right(710.0), "Status", TextAlign::Left)
        .text_in_rect(left(670.0), "8", TextAlign::Left)
        .text_in_rect(right(670.0), "Not correct", TextAlign::Left)
        .text_in_rect(left(630.0), "27", TextAlign::Left)
        .text_in_rect(right(630.0), "Incomplete", TextAlign::Left)
        .text_in_rect(left(590.0), "29,30", TextAlign::Left)
        .text_in_rect(right(590.0), "Missing data", TextAlign::Left)
        .text_in_rect(left(550.0), "45", TextAlign::Left)
        .text_in_rect(right(550.0), "Fixed", TextAlign::Left)
        .done();
    let bytes = doc.build().expect("build synthetic PDF");

    let config = ExtractionConfig::default();
    let result = extract_bytes_sync(&bytes, PDF_MIME, &config).expect("extraction must succeed");

    let Some(table) = result.tables.first() else {
        panic!("pipeline must detect the 2-column stroke-bordered table via the bordered tier");
    };

    let row_widths = table.cells.iter().map(|row| row.len()).collect::<Vec<_>>();
    assert!(
        row_widths.contains(&2),
        "detected table must have 2-column rows; got: {:?}",
        row_widths
    );

    let markdown = table.markdown.trim();
    assert!(!markdown.is_empty(), "table must produce non-empty markdown");
}