103 lines
3.4 KiB
Rust
103 lines
3.4 KiB
Rust
//! Integration tests verifying that chunk `first_page`/`last_page` are populated
|
|
//! for all chunks when extracting multi-page PDFs with chunking enabled.
|
|
|
|
#![cfg(all(feature = "pdf", feature = "chunking"))]
|
|
|
|
mod helpers;
|
|
|
|
use helpers::*;
|
|
use kreuzberg::core::config::{ChunkingConfig, ExtractionConfig};
|
|
use kreuzberg::extract_file_sync;
|
|
|
|
/// All chunks produced from a multi-page PDF must have non-null page metadata.
|
|
///
|
|
/// Verifies that `recompute_boundaries_from_pages` successfully locates every
|
|
/// page's content inside `result.content` so the chunker receives valid byte
|
|
/// offsets and can populate `first_page`/`last_page` on every chunk.
|
|
#[test]
|
|
fn chunks_from_multi_page_pdf_all_have_page_metadata() {
|
|
if skip_if_missing("pdf/multi_page.pdf") {
|
|
eprintln!("skipping: fixture pdf/multi_page.pdf not found");
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig {
|
|
chunking: Some(ChunkingConfig {
|
|
max_characters: 500,
|
|
overlap: 50,
|
|
..Default::default()
|
|
}),
|
|
..Default::default()
|
|
};
|
|
|
|
let result = extract_file_sync(get_test_file_path("pdf/multi_page.pdf"), None, &config)
|
|
.expect("multi_page.pdf extraction should succeed");
|
|
|
|
let chunks = result.chunks.expect("chunking was configured — chunks must be present");
|
|
|
|
assert!(!chunks.is_empty(), "multi-page PDF should produce at least one chunk");
|
|
|
|
let null_page_chunks: Vec<_> = chunks
|
|
.iter()
|
|
.filter(|c| c.metadata.first_page.is_none() || c.metadata.last_page.is_none())
|
|
.collect();
|
|
|
|
assert!(
|
|
null_page_chunks.is_empty(),
|
|
"{} of {} chunks have null page metadata (first_page or last_page is None). \
|
|
Chunk indices with null metadata: {:?}",
|
|
null_page_chunks.len(),
|
|
chunks.len(),
|
|
null_page_chunks
|
|
.iter()
|
|
.map(|c| c.metadata.chunk_index)
|
|
.collect::<Vec<_>>()
|
|
);
|
|
}
|
|
|
|
/// Chunks from a multi-page PDF must have monotonically non-decreasing page numbers.
|
|
///
|
|
/// Verifies that page boundaries are contiguous and in order — a secondary property
|
|
/// that would be violated if `recompute_boundaries_from_pages` miscalculated
|
|
/// `search_offset` for any page.
|
|
#[test]
|
|
fn chunks_from_multi_page_pdf_have_monotonic_page_numbers() {
|
|
if skip_if_missing("pdf/multi_page.pdf") {
|
|
eprintln!("skipping: fixture pdf/multi_page.pdf not found");
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig {
|
|
chunking: Some(ChunkingConfig {
|
|
max_characters: 500,
|
|
overlap: 50,
|
|
..Default::default()
|
|
}),
|
|
..Default::default()
|
|
};
|
|
|
|
let result = extract_file_sync(get_test_file_path("pdf/multi_page.pdf"), None, &config)
|
|
.expect("multi_page.pdf extraction should succeed");
|
|
|
|
let chunks = result.chunks.expect("chunking was configured — chunks must be present");
|
|
|
|
assert!(
|
|
chunks.iter().all(|c| c.metadata.first_page.is_some()),
|
|
"all chunks must have first_page before checking order"
|
|
);
|
|
|
|
let mut prev_first_page = 0u32;
|
|
for chunk in &chunks {
|
|
if let Some(first) = chunk.metadata.first_page {
|
|
assert!(
|
|
first >= prev_first_page,
|
|
"chunk {} first_page ({}) must be >= previous first_page ({})",
|
|
chunk.metadata.chunk_index,
|
|
first,
|
|
prev_first_page
|
|
);
|
|
prev_first_page = first;
|
|
}
|
|
}
|
|
}
|