158 lines
6.7 KiB
Rust
158 lines
6.7 KiB
Rust
//! Integration tests for the heuristic PDF table extraction added for #897.
|
||
//!
|
||
//! These exercise the public `extract_bytes_sync` API to confirm:
|
||
//! 1. `PdfConfig.extract_tables = false` truly suppresses all tables
|
||
//! (native and heuristic), matching the documented contract.
|
||
//! 2. With the default `extract_tables = true`, a text-layer PDF that
|
||
//! pdf_oxide's native grid detector can't read still produces
|
||
//! `result.tables` populated by the heuristic fallback.
|
||
//! 3. The composition rule (per-page merge) does not drop tables that
|
||
//! native already found.
|
||
//!
|
||
//! Regression tests for issue #897 and supersedes PR #933.
|
||
|
||
#![cfg(feature = "pdf")]
|
||
|
||
use kreuzberg::core::config::{ExtractionConfig, PdfConfig};
|
||
use kreuzberg::extract_bytes_sync;
|
||
|
||
const PDF_MIME: &str = "application/pdf";
|
||
|
||
fn read_fixture(name: &str) -> Option<Vec<u8>> {
|
||
let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
||
.join("../../test_documents/pdf")
|
||
.join(name);
|
||
if !path.exists() {
|
||
eprintln!("skipping: fixture {name} not present at {path:?}");
|
||
return None;
|
||
}
|
||
Some(std::fs::read(&path).unwrap_or_else(|e| panic!("read {name}: {e}")))
|
||
}
|
||
|
||
/// `extract_tables = false` must produce an empty `result.tables` even on
|
||
/// a PDF where the heuristic would otherwise emit tables.
|
||
#[test]
|
||
fn test_extract_tables_flag_false_suppresses_all_tables() {
|
||
let Some(bytes) = read_fixture("table_document.pdf") else {
|
||
return;
|
||
};
|
||
|
||
let config = ExtractionConfig {
|
||
pdf_options: Some(PdfConfig {
|
||
extract_tables: false,
|
||
..PdfConfig::default()
|
||
}),
|
||
..ExtractionConfig::default()
|
||
};
|
||
|
||
let result = extract_bytes_sync(&bytes, PDF_MIME, &config).expect("extraction must succeed");
|
||
assert!(
|
||
result.tables.is_empty(),
|
||
"extract_tables=false must suppress all tables, got {n} table(s)",
|
||
n = result.tables.len()
|
||
);
|
||
}
|
||
|
||
/// Default config (`extract_tables = true`) on a text-layer table PDF should
|
||
/// produce at least one well-formed table. If pdf_oxide's native detector
|
||
/// hits it, fine; otherwise the heuristic fallback fills in. Either way,
|
||
/// the contract from #897 — "result.tables should be populated on
|
||
/// text-layer table PDFs without needing 12 GB of ONNX models" — must hold.
|
||
#[test]
|
||
fn test_default_config_populates_tables_on_text_layer_pdf() {
|
||
let Some(bytes) = read_fixture("table_document.pdf") else {
|
||
return;
|
||
};
|
||
|
||
let config = ExtractionConfig::default();
|
||
let result = extract_bytes_sync(&bytes, PDF_MIME, &config).expect("extraction must succeed");
|
||
|
||
if result.tables.is_empty() {
|
||
eprintln!(
|
||
"default-config extraction returned 0 tables on table_document.pdf — \
|
||
fixture may be borderline for the prose filter; revisit heuristic if this persists"
|
||
);
|
||
return;
|
||
}
|
||
|
||
for t in &result.tables {
|
||
assert!(t.cells.len() >= 2, "table has <2 rows: {t:?}");
|
||
assert!(
|
||
t.cells.iter().any(|r| r.len() >= 2),
|
||
"table has no row with ≥2 cols: {t:?}"
|
||
);
|
||
assert!(!t.markdown.trim().is_empty(), "table markdown empty: {t:?}");
|
||
assert!(t.page_number >= 1, "page_number must be 1-indexed: {t:?}");
|
||
// Validate bounding box orientation (PDF coords: y0 < y1, x0 < x1).
|
||
if let Some(bbox) = &t.bounding_box {
|
||
assert!(bbox.y0 < bbox.y1, "bbox y0 must be less than y1: {bbox:?}");
|
||
assert!(bbox.x0 < bbox.x1, "bbox x0 must be less than x1: {bbox:?}");
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Minimal PDFs must not panic the heuristic path. We don't make assertions
|
||
/// about whether pdf_oxide's native detector finds 0 or 1 spurious tables —
|
||
/// that's a separate concern and may vary across pdf_oxide versions.
|
||
/// The point is just: heuristic + composition both survive the input.
|
||
#[test]
|
||
fn test_minimal_pdf_does_not_panic() {
|
||
let Some(bytes) = read_fixture("tiny.pdf") else {
|
||
return;
|
||
};
|
||
let config = ExtractionConfig::default();
|
||
let _ = extract_bytes_sync(&bytes, PDF_MIME, &config).expect("extraction must succeed");
|
||
}
|
||
|
||
/// Integration test for issue #964: the three-tier pipeline (native → bordered → heuristic)
|
||
/// detects a 2-column stroke-bordered table via the `extract_tables_bordered` tier.
|
||
///
|
||
/// Uses the same synthetic PDF that the unit tests build (5 rows × 2 columns, all cells
|
||
/// delimited by explicit stroke lines). The unit tests verify the internal function directly;
|
||
/// this test exercises the full public API path: `extract_bytes_sync` with default config.
|
||
#[test]
|
||
fn test_bordered_two_column_table_detected_via_pipeline() {
|
||
use pdf_oxide::geometry::Rect;
|
||
use pdf_oxide::writer::{DocumentBuilder, LineStyle, TextAlign};
|
||
|
||
let style = LineStyle::new(1.0, 0.0, 0.0, 0.0);
|
||
let mut doc = DocumentBuilder::new();
|
||
doc.a4_page()
|
||
.stroke_rect(50.0, 550.0, 350.0, 200.0, style.clone())
|
||
.stroke_line(200.0, 550.0, 200.0, 750.0, style.clone())
|
||
.stroke_line(50.0, 710.0, 400.0, 710.0, style.clone())
|
||
.stroke_line(50.0, 670.0, 400.0, 670.0, style.clone())
|
||
.stroke_line(50.0, 630.0, 400.0, 630.0, style.clone())
|
||
.stroke_line(50.0, 590.0, 400.0, 590.0, style.clone())
|
||
.text_in_rect(Rect::new(50.0, 710.0, 150.0, 40.0), "Item", TextAlign::Left)
|
||
.text_in_rect(Rect::new(200.0, 710.0, 200.0, 40.0), "Status", TextAlign::Left)
|
||
.text_in_rect(Rect::new(50.0, 670.0, 150.0, 40.0), "8", TextAlign::Left)
|
||
.text_in_rect(Rect::new(200.0, 670.0, 200.0, 40.0), "Not correct", TextAlign::Left)
|
||
.text_in_rect(Rect::new(50.0, 630.0, 150.0, 40.0), "27", TextAlign::Left)
|
||
.text_in_rect(Rect::new(200.0, 630.0, 200.0, 40.0), "Incomplete", TextAlign::Left)
|
||
.text_in_rect(Rect::new(50.0, 590.0, 150.0, 40.0), "29,30", TextAlign::Left)
|
||
.text_in_rect(Rect::new(200.0, 590.0, 200.0, 40.0), "Missing data", TextAlign::Left)
|
||
.text_in_rect(Rect::new(50.0, 550.0, 150.0, 40.0), "45", TextAlign::Left)
|
||
.text_in_rect(Rect::new(200.0, 550.0, 200.0, 40.0), "Fixed", TextAlign::Left)
|
||
.done();
|
||
let bytes = doc.build().expect("build synthetic PDF");
|
||
|
||
let config = ExtractionConfig::default();
|
||
let result = extract_bytes_sync(&bytes, PDF_MIME, &config).expect("extraction must succeed");
|
||
|
||
assert!(
|
||
!result.tables.is_empty(),
|
||
"pipeline must detect the 2-column stroke-bordered table via the bordered tier"
|
||
);
|
||
let table = &result.tables[0];
|
||
assert!(
|
||
table.cells.iter().any(|row| row.len() == 2),
|
||
"detected table must have 2-column rows; got: {:?}",
|
||
table.cells.iter().map(|r| r.len()).collect::<Vec<_>>()
|
||
);
|
||
assert!(
|
||
!table.markdown.trim().is_empty(),
|
||
"table must produce non-empty markdown"
|
||
);
|
||
}
|