This commit is contained in:
102
crates/kreuzberg/tests/pdf_chunk_page_metadata.rs
Normal file
102
crates/kreuzberg/tests/pdf_chunk_page_metadata.rs
Normal file
@@ -0,0 +1,102 @@
|
||||
//! Integration tests verifying that chunk `first_page`/`last_page` are populated
|
||||
//! for all chunks when extracting multi-page PDFs with chunking enabled.
|
||||
|
||||
#![cfg(all(feature = "pdf", feature = "chunking"))]
|
||||
|
||||
mod helpers;
|
||||
|
||||
use helpers::*;
|
||||
use kreuzberg::core::config::{ChunkingConfig, ExtractionConfig};
|
||||
use kreuzberg::extract_file_sync;
|
||||
|
||||
/// Every chunk extracted from a multi-page PDF must carry both `first_page` and `last_page`.
///
/// The test extracts `pdf/multi_page.pdf` with chunking enabled (500 characters per
/// chunk, 50 characters of overlap) and fails if any chunk is missing either page bound.
/// A missing bound indicates that `recompute_boundaries_from_pages` could not locate
/// some page's content inside `result.content`, leaving the chunker without valid byte
/// offsets for that page.
///
/// The test returns early (without failing) when the fixture is not available.
#[test]
fn chunks_from_multi_page_pdf_all_have_page_metadata() {
    let fixture = "pdf/multi_page.pdf";
    if skip_if_missing(fixture) {
        eprintln!("skipping: fixture pdf/multi_page.pdf not found");
        return;
    }

    // Small chunks with overlap so that a multi-page document yields several chunks.
    let chunking = ChunkingConfig {
        max_characters: 500,
        overlap: 50,
        ..Default::default()
    };
    let config = ExtractionConfig {
        chunking: Some(chunking),
        ..Default::default()
    };

    let extraction = extract_file_sync(get_test_file_path(fixture), None, &config)
        .expect("multi_page.pdf extraction should succeed");
    let chunks = extraction
        .chunks
        .expect("chunking was configured — chunks must be present");

    assert!(!chunks.is_empty(), "multi-page PDF should produce at least one chunk");

    // Indices of the chunks that lack at least one of the two page bounds.
    let missing_indices: Vec<_> = chunks
        .iter()
        .filter(|chunk| {
            !(chunk.metadata.first_page.is_some() && chunk.metadata.last_page.is_some())
        })
        .map(|chunk| chunk.metadata.chunk_index)
        .collect();

    assert!(
        missing_indices.is_empty(),
        "{} of {} chunks have null page metadata (first_page or last_page is None). \
         Chunk indices with null metadata: {:?}",
        missing_indices.len(),
        chunks.len(),
        missing_indices
    );
}
|
||||
|
||||
/// The `first_page` of successive chunks from a multi-page PDF must never decrease.
///
/// The test extracts `pdf/multi_page.pdf` with chunking enabled (500 characters per
/// chunk, 50 characters of overlap), requires every chunk to have a `first_page`, and
/// then walks the chunks in order checking that each `first_page` is greater than or
/// equal to the previous one. An out-of-order value would point at
/// `recompute_boundaries_from_pages` miscalculating `search_offset` for some page.
///
/// The test returns early (without failing) when the fixture is not available.
#[test]
fn chunks_from_multi_page_pdf_have_monotonic_page_numbers() {
    let fixture = "pdf/multi_page.pdf";
    if skip_if_missing(fixture) {
        eprintln!("skipping: fixture pdf/multi_page.pdf not found");
        return;
    }

    // Small chunks with overlap so that a multi-page document yields several chunks.
    let chunking = ChunkingConfig {
        max_characters: 500,
        overlap: 50,
        ..Default::default()
    };
    let config = ExtractionConfig {
        chunking: Some(chunking),
        ..Default::default()
    };

    let extraction = extract_file_sync(get_test_file_path(fixture), None, &config)
        .expect("multi_page.pdf extraction should succeed");
    let chunks = extraction
        .chunks
        .expect("chunking was configured — chunks must be present");

    // Precondition: the ordering check below is only meaningful when no chunk lacks
    // its starting page.
    assert!(
        !chunks.iter().any(|chunk| chunk.metadata.first_page.is_none()),
        "all chunks must have first_page before checking order"
    );

    // Pair each chunk's index with its starting page, in chunk order.
    let page_starts = chunks.iter().filter_map(|chunk| {
        chunk
            .metadata
            .first_page
            .map(|page| (chunk.metadata.chunk_index, page))
    });

    let mut previous_page = 0u32;
    for (index, page) in page_starts {
        assert!(
            page >= previous_page,
            "chunk {} first_page ({}) must be >= previous first_page ({})",
            index,
            page,
            previous_page
        );
        previous_page = page;
    }
}
|
||||
Reference in New Issue
Block a user