crates/kreuzberg/tests/pdf_element_classification.rs

use kreuzberg::types::{ElementType, ResultFormat};
use kreuzberg::{ExtractionConfig, OutputFormat, extract_bytes_sync};
use std::path::Path;

/// Regression test for #961: numbered chapter headings in an untagged
/// ReportLab PDF must be reported as Heading/Title elements, never as ListItem.
///
/// The test extracts `test_documents/pdf/multipage_marketing.pdf` in
/// element-based mode and inspects every element whose text starts with an
/// ASCII digit. If the fixture file is absent the test prints a notice and
/// returns without asserting anything.
#[test]
fn numbered_chapters_in_untagged_pdf_become_headings() {
    /// True when the first character of `text` is an ASCII digit.
    fn leads_with_digit(text: &str) -> bool {
        text.chars().next().is_some_and(|c| c.is_ascii_digit())
    }

    let pdf_path = Path::new("test_documents/pdf/multipage_marketing.pdf");
    // A missing fixture is treated as "skip", not as a failure.
    if !pdf_path.exists() {
        eprintln!("skipping: test_documents/pdf/multipage_marketing.pdf not found");
        return;
    }

    let pdf_bytes = std::fs::read(pdf_path).expect("failed to read PDF");
    let config = ExtractionConfig {
        output_format: OutputFormat::Plain,
        result_format: ResultFormat::ElementBased,
        ..Default::default()
    };

    let extraction = extract_bytes_sync(&pdf_bytes, "application/pdf", &config).expect("extraction failed");
    // No element list at all is handled as an empty list.
    let elements = extraction.elements.unwrap_or_default();

    // Single pass over the digit-led elements: remember the text of every one
    // typed as ListItem and count those typed as Heading or Title.
    let mut digit_led_list_items = Vec::new();
    let mut digit_led_headings = 0usize;
    for element in elements.iter().filter(|el| leads_with_digit(&el.text)) {
        match element.element_type {
            ElementType::ListItem => digit_led_list_items.push(&element.text),
            ElementType::Heading | ElementType::Title => digit_led_headings += 1,
            _ => {}
        }
    }

    assert!(
        digit_led_list_items.is_empty(),
        "numbered chapter headings must not be ListItem; got: {:?}",
        digit_led_list_items
    );
    assert!(
        digit_led_headings > 0,
        "at least one numbered chapter heading must be promoted to Heading/Title"
    );
}
Nomad changes 2026-06-01 23:40:55 +02:00

```rust
use kreuzberg::types::{ElementType, ResultFormat};
use kreuzberg::{ExtractionConfig, OutputFormat, extract_bytes_sync};
use std::path::Path;

/// Verifies that numbered chapter headings in an untagged ReportLab PDF are
/// classified as Heading/Title, not ListItem (#961).
#[test]
fn numbered_chapters_in_untagged_pdf_become_headings() {
    let path = Path::new("test_documents/pdf/multipage_marketing.pdf");
    if !path.exists() {
        eprintln!("skipping: test_documents/pdf/multipage_marketing.pdf not found");
        return;
    }

    let bytes = std::fs::read(path).expect("failed to read PDF");
    let config = ExtractionConfig {
        output_format: OutputFormat::Plain,
        result_format: ResultFormat::ElementBased,
        ..Default::default()
    };

    let result = extract_bytes_sync(&bytes, "application/pdf", &config).expect("extraction failed");
    let elements = result.elements.unwrap_or_default();

    let chapter_list_items: Vec<_> = elements
        .iter()
        .filter(|e| {
            e.element_type == ElementType::ListItem && e.text.chars().next().is_some_and(|c| c.is_ascii_digit())
        })
        .collect();

    let numbered_headings: Vec<_> = elements
        .iter()
        .filter(|e| {
            matches!(e.element_type, ElementType::Heading | ElementType::Title)
                && e.text.chars().next().is_some_and(|c| c.is_ascii_digit())
        })
        .collect();

    assert!(
        chapter_list_items.is_empty(),
        "numbered chapter headings must not be ListItem; got: {:?}",
        chapter_list_items.iter().map(|e| &e.text).collect::<Vec<_>>()
    );
    assert!(
        !numbered_headings.is_empty(),
        "at least one numbered chapter heading must be promoted to Heading/Title"
    );
}
```