This commit is contained in:
49
crates/kreuzberg/tests/pdf_element_classification.rs
Normal file
49
crates/kreuzberg/tests/pdf_element_classification.rs
Normal file
@@ -0,0 +1,49 @@
|
||||
use kreuzberg::types::{ElementType, ResultFormat};
|
||||
use kreuzberg::{ExtractionConfig, OutputFormat, extract_bytes_sync};
|
||||
use std::path::Path;
|
||||
|
||||
/// Verifies that numbered chapter headings in an untagged ReportLab PDF are
|
||||
/// classified as Heading/Title, not ListItem (#961).
|
||||
#[test]
|
||||
fn numbered_chapters_in_untagged_pdf_become_headings() {
|
||||
let path = Path::new("test_documents/pdf/multipage_marketing.pdf");
|
||||
if !path.exists() {
|
||||
eprintln!("skipping: test_documents/pdf/multipage_marketing.pdf not found");
|
||||
return;
|
||||
}
|
||||
|
||||
let bytes = std::fs::read(path).expect("failed to read PDF");
|
||||
let config = ExtractionConfig {
|
||||
output_format: OutputFormat::Plain,
|
||||
result_format: ResultFormat::ElementBased,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let result = extract_bytes_sync(&bytes, "application/pdf", &config).expect("extraction failed");
|
||||
let elements = result.elements.unwrap_or_default();
|
||||
|
||||
let chapter_list_items: Vec<_> = elements
|
||||
.iter()
|
||||
.filter(|e| {
|
||||
e.element_type == ElementType::ListItem && e.text.chars().next().is_some_and(|c| c.is_ascii_digit())
|
||||
})
|
||||
.collect();
|
||||
|
||||
let numbered_headings: Vec<_> = elements
|
||||
.iter()
|
||||
.filter(|e| {
|
||||
matches!(e.element_type, ElementType::Heading | ElementType::Title)
|
||||
&& e.text.chars().next().is_some_and(|c| c.is_ascii_digit())
|
||||
})
|
||||
.collect();
|
||||
|
||||
assert!(
|
||||
chapter_list_items.is_empty(),
|
||||
"numbered chapter headings must not be ListItem; got: {:?}",
|
||||
chapter_list_items.iter().map(|e| &e.text).collect::<Vec<_>>()
|
||||
);
|
||||
assert!(
|
||||
!numbered_headings.is_empty(),
|
||||
"at least one numbered chapter heading must be promoted to Heading/Title"
|
||||
);
|
||||
}
|
||||
Reference in New Issue
Block a user