Files
fil/crates/kreuzberg/tests/pdf_chunk_page_metadata.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

103 lines
3.4 KiB
Rust

//! Integration tests verifying that chunk `first_page`/`last_page` are populated
//! for all chunks when extracting multi-page PDFs with chunking enabled.
#![cfg(all(feature = "pdf", feature = "chunking"))]
mod helpers;
use helpers::*;
use kreuzberg::core::config::{ChunkingConfig, ExtractionConfig};
use kreuzberg::extract_file_sync;
/// Every chunk extracted from a multi-page PDF must carry both `first_page`
/// and `last_page`.
///
/// The fixture is extracted with chunking enabled (500 characters per chunk,
/// 50 characters of overlap). The test fails, listing the offending chunk
/// indices, if any chunk has `None` for either page field. It is meant to guard
/// against `recompute_boundaries_from_pages` failing to locate a page's content
/// inside `result.content`, which would leave the chunker without valid byte
/// offsets to derive page numbers from.
///
/// When the fixture file is not available the test logs a note to stderr and
/// returns early instead of failing.
#[test]
fn chunks_from_multi_page_pdf_all_have_page_metadata() {
    let fixture = "pdf/multi_page.pdf";
    if skip_if_missing(fixture) {
        eprintln!("skipping: fixture pdf/multi_page.pdf not found");
        return;
    }

    // Small chunks so that the fixture is split into several pieces.
    let chunking = ChunkingConfig {
        max_characters: 500,
        overlap: 50,
        ..Default::default()
    };
    let config = ExtractionConfig {
        chunking: Some(chunking),
        ..Default::default()
    };

    let pdf_path = get_test_file_path(fixture);
    let extraction =
        extract_file_sync(pdf_path, None, &config).expect("multi_page.pdf extraction should succeed");
    let chunks = extraction
        .chunks
        .expect("chunking was configured — chunks must be present");
    assert!(!chunks.is_empty(), "multi-page PDF should produce at least one chunk");

    // Gather the indices of chunks lacking either page bound; they are reported
    // in the failure message below.
    let missing_indices: Vec<_> = chunks
        .iter()
        .filter(|chunk| !(chunk.metadata.first_page.is_some() && chunk.metadata.last_page.is_some()))
        .map(|chunk| chunk.metadata.chunk_index)
        .collect();

    assert!(
        missing_indices.is_empty(),
        "{} of {} chunks have null page metadata (first_page or last_page is None). \
         Chunk indices with null metadata: {:?}",
        missing_indices.len(),
        chunks.len(),
        missing_indices
    );
}
/// The `first_page` of successive chunks from a multi-page PDF must never
/// decrease.
///
/// The fixture is extracted with chunking enabled (500 characters per chunk,
/// 50 characters of overlap). The test first requires every chunk to have a
/// `first_page`, then walks the chunks in order and checks that each
/// `first_page` is at least as large as the previous one. Only `first_page` is
/// compared; `last_page` is not inspected here. This is a secondary property
/// that would break if `recompute_boundaries_from_pages` miscalculated
/// `search_offset` for any page.
///
/// When the fixture file is not available the test logs a note to stderr and
/// returns early instead of failing.
#[test]
fn chunks_from_multi_page_pdf_have_monotonic_page_numbers() {
    let fixture = "pdf/multi_page.pdf";
    if skip_if_missing(fixture) {
        eprintln!("skipping: fixture pdf/multi_page.pdf not found");
        return;
    }

    // Small chunks so that the fixture is split into several pieces.
    let chunking = ChunkingConfig {
        max_characters: 500,
        overlap: 50,
        ..Default::default()
    };
    let config = ExtractionConfig {
        chunking: Some(chunking),
        ..Default::default()
    };

    let pdf_path = get_test_file_path(fixture);
    let extraction =
        extract_file_sync(pdf_path, None, &config).expect("multi_page.pdf extraction should succeed");
    let chunks = extraction
        .chunks
        .expect("chunking was configured — chunks must be present");

    // Precondition: the ordering check is only meaningful when no chunk lacks
    // a first page.
    assert!(
        !chunks.iter().any(|chunk| chunk.metadata.first_page.is_none()),
        "all chunks must have first_page before checking order"
    );

    // Running lower bound: the first_page of the most recently visited chunk.
    let mut floor = 0u32;
    let paged_chunks = chunks
        .iter()
        .filter_map(|chunk| chunk.metadata.first_page.map(|page| (chunk, page)));
    for (chunk, first) in paged_chunks {
        assert!(
            first >= floor,
            "chunk {} first_page ({}) must be >= previous first_page ({})",
            chunk.metadata.chunk_index,
            first,
            floor
        );
        floor = first;
    }
}