1317 lines
50 KiB
Rust
1317 lines
50 KiB
Rust
|
|
//! Regression tests for PDF image extraction in markdown output.
//!
//! Verifies that embedded images in PDFs produce proper `![](image_N.ext)`
//! references instead of empty `![]()` placeholders.
|
||
|
|
|
||
|
|
#![cfg(feature = "pdf")]
|
||
|
|
|
||
|
|
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
|
||
|
|
use kreuzberg::core::extractor::extract_file;
|
||
|
|
use std::path::PathBuf;
|
||
|
|
|
||
|
|
mod helpers;
|
||
|
|
|
||
|
|
fn test_documents_dir() -> PathBuf {
|
||
|
|
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||
|
|
.parent()
|
||
|
|
.unwrap()
|
||
|
|
.parent()
|
||
|
|
.unwrap()
|
||
|
|
.join("test_documents")
|
||
|
|
}
|
||
|
|
|
||
|
|
fn extract_markdown(relative_path: &str) -> kreuzberg::types::ExtractionResult {
|
||
|
|
use kreuzberg::core::config::ImageExtractionConfig;
|
||
|
|
let path = test_documents_dir().join(relative_path);
|
||
|
|
let config = ExtractionConfig {
|
||
|
|
output_format: OutputFormat::Markdown,
|
||
|
|
images: Some(ImageExtractionConfig {
|
||
|
|
extract_images: true,
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
let rt = tokio::runtime::Runtime::new().unwrap();
|
||
|
|
rt.block_on(extract_file(&path, None, &config)).unwrap()
|
||
|
|
}
|
||
|
|
|
||
|
|
/// The marketing fixture must never render an image as a bare `![]()`.
#[test]
fn test_multipage_marketing_no_empty_image_refs() {
    let extraction = extract_markdown("pdf/multipage_marketing.pdf");
    let markdown = &extraction.content;

    // An empty reference means an image element was emitted without a target.
    let has_empty_ref = markdown.contains("![]()");
    assert!(
        !has_empty_ref,
        "Markdown output must not contain empty image references ![](), got:\n{}",
        markdown
    );
}
|
||
|
|
|
||
|
|
/// The marketing fixture must render at least one populated image reference.
///
/// The check looks for the `![](image_` prefix, i.e. the start of a
/// `![](image_<index>.<ext>)` placeholder.
#[test]
fn test_multipage_marketing_has_image_refs() {
    let result = extract_markdown("pdf/multipage_marketing.pdf");
    let content = &result.content;

    // Must contain at least one proper image reference
    assert!(
        content.contains("![](image_"),
        "Markdown output must contain image references like ![](image_N.ext), got:\n{}",
        content
    );
}
|
||
|
|
|
||
|
|
/// The marketing fixture must yield a non-empty `images` list in which at
/// least one entry carries a non-empty `data` payload.
#[test]
fn test_multipage_marketing_images_populated() {
    let result = extract_markdown("pdf/multipage_marketing.pdf");

    // The images collection has to exist and hold at least one entry.
    let extracted = result.images.as_ref().expect("images field must be Some");
    assert!(!extracted.is_empty(), "Extraction result must contain extracted images");

    // One image with a payload is enough; not every entry needs data.
    let any_payload = extracted.iter().any(|image| !image.data.is_empty());
    assert!(
        any_payload,
        "At least some images should have actual pixel data, got {} images total but none with data",
        extracted.len()
    );
}
|
||
|
|
|
||
|
|
/// The Docling fixture must never render an image as a bare `![]()`.
#[test]
fn test_docling_no_empty_image_refs() {
    let extraction = extract_markdown("pdf/docling.pdf");
    let markdown = &extraction.content;

    let has_empty_ref = markdown.contains("![]()");
    assert!(
        !has_empty_ref,
        "Docling markdown must not contain empty image references ![](), got:\n{}",
        markdown
    );
}
|
||
|
|
|
||
|
|
/// The Docling fixture must render at least one populated image reference.
///
/// The check looks for the `![](image_` prefix, i.e. the start of a
/// `![](image_<index>.<ext>)` placeholder.
#[test]
fn test_docling_has_image_refs() {
    let result = extract_markdown("pdf/docling.pdf");
    let content = &result.content;

    // Docling has at least 1 figure
    assert!(
        content.contains("![](image_"),
        "Docling markdown must contain image references, got:\n{}",
        content
    );
}
|
||
|
|
|
||
|
|
/// Spot-checks that key terms from the Docling technical report are present
/// in the extracted Markdown.
#[test]
fn test_docling_content_quality() {
    let extraction = extract_markdown("pdf/docling.pdf");
    let text = &extraction.content;

    // Terms that must each appear on their own.
    for required in ["Docling", "PDF"] {
        assert!(text.contains(required), "Must contain '{}'", required);
    }

    // Either phrasing of the table-recognition topic is acceptable.
    let table_terms = ["table structure recognition", "TableFormer"];
    let mentions_tables = table_terms.iter().any(|term| text.contains(*term));
    assert!(
        mentions_tables,
        "Must mention table structure recognition or TableFormer"
    );
}
|
||
|
|
|
||
|
|
/// Regression test for issue #752: structured output was ~1000x slower than
/// text on Ghostscript-produced PDFs with many inline images (~1,924 per page).
///
/// Root cause: `populate_images_from_oxide` used `Vec::contains` (O(N)) inside
/// the per-page object loop, giving O(N²) overall. The fix converts the list to
/// an `AHashSet` before the loop so each lookup is O(1).
///
/// The repro file is not committed to the repository because of its size, so
/// the test returns early when it is absent. To reproduce locally, generate a
/// Ghostscript vector decomposition PDF and place it at:
///   test_documents/pdf/ghostscript_inline_images_repro.pdf
#[test]
fn test_ghostscript_inline_images_completes_in_reasonable_time() {
    // Before the fix, a single-page PDF with ~1,924 inline images took ~56 seconds.
    // After the fix it should complete in well under 10 seconds even on slow CI.
    const TIME_BUDGET: std::time::Duration = std::time::Duration::from_secs(10);

    let path = test_documents_dir().join("pdf/ghostscript_inline_images_repro.pdf");
    if !path.exists() {
        eprintln!("SKIP: test_documents/pdf/ghostscript_inline_images_repro.pdf not present");
        return;
    }

    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let runtime = tokio::runtime::Runtime::new().unwrap();

    let started = std::time::Instant::now();
    let _result = runtime
        .block_on(extract_file(&path, None, &config))
        .expect("extraction must succeed for Ghostscript inline-image PDF");
    let elapsed = started.elapsed();

    // The file has no text, so the content may be empty or minimal; only the
    // elapsed time is checked.
    assert!(
        elapsed < TIME_BUDGET,
        "Ghostscript inline-image PDF must extract in under 10 seconds, took {elapsed:?}"
    );
}
|
||
|
|
|
||
|
|
// ─── Regression tests for issue #796 ────────────────────────────────────────
|
||
|
|
//
|
||
|
|
// Before the fix, setting `images.extract_images = false` (or
|
||
|
|
// `pdf_options.extract_images = false`) still caused full base64 image data to
|
||
|
|
// appear in `ExtractionResult.images` when `output_format` was `Markdown` or
|
||
|
|
// `Djot`. The root cause was that `inject_placeholders` in `extraction.rs`
|
||
|
|
// defaulted to `true` without checking `extract_images`, allowing the structure
|
||
|
|
// pipeline to call `populate_images_from_oxide` unconditionally.
|
||
|
|
|
||
|
|
/// Helper: extract with a specific output format and images explicitly disabled
|
||
|
|
/// via `ImageExtractionConfig.extract_images = false`.
|
||
|
|
fn extract_no_images(relative_path: &str, fmt: OutputFormat) -> kreuzberg::types::ExtractionResult {
|
||
|
|
use kreuzberg::core::config::ImageExtractionConfig;
|
||
|
|
let path = test_documents_dir().join(relative_path);
|
||
|
|
let config = ExtractionConfig {
|
||
|
|
output_format: fmt,
|
||
|
|
images: Some(ImageExtractionConfig {
|
||
|
|
extract_images: false,
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
let rt = tokio::runtime::Runtime::new().unwrap();
|
||
|
|
rt.block_on(kreuzberg::core::extractor::extract_file(&path, None, &config))
|
||
|
|
.unwrap()
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Helper: extract with a specific output format and images disabled via
|
||
|
|
/// `PdfConfig.extract_images = false`.
|
||
|
|
fn extract_no_images_via_pdf_options(relative_path: &str, fmt: OutputFormat) -> kreuzberg::types::ExtractionResult {
|
||
|
|
use kreuzberg::core::config::pdf::PdfConfig;
|
||
|
|
let path = test_documents_dir().join(relative_path);
|
||
|
|
let config = ExtractionConfig {
|
||
|
|
output_format: fmt,
|
||
|
|
pdf_options: Some(PdfConfig {
|
||
|
|
extract_images: false,
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
let rt = tokio::runtime::Runtime::new().unwrap();
|
||
|
|
rt.block_on(kreuzberg::core::extractor::extract_file(&path, None, &config))
|
||
|
|
.unwrap()
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Regression #796: with `extract_images=false` and `output_format=Markdown`
/// the result must carry no images.
///
/// Uses `embedded_images_tables.pdf`, a PDF known to contain images. Before
/// the fix, `ExtractionResult.images` came back with full base64 data despite
/// the flag.
#[test]
fn test_regression_796_markdown_no_images_when_disabled_via_images_config() {
    let result = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Markdown);

    // `None` and `Some(empty)` both count as "no images".
    let image_count = result.images.as_ref().map_or(0, |list| list.len());
    assert!(
        image_count == 0,
        "images.extract_images=false must produce an empty images list even for \
         output_format=Markdown. Got {} image(s).",
        image_count
    );

    // Confirm the text content was still extracted (no regression on content).
    let has_text = !result.content.is_empty();
    assert!(
        has_text,
        "Content must still be extracted when images are disabled"
    );
}
|
||
|
|
|
||
|
|
/// Regression #796: with `extract_images=false` and `output_format=Djot` the
/// result must carry no images.
#[test]
fn test_regression_796_djot_no_images_when_disabled_via_images_config() {
    let result = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Djot);

    // `None` and `Some(empty)` both count as "no images".
    let image_count = result.images.as_ref().map_or(0, |list| list.len());
    assert!(
        image_count == 0,
        "images.extract_images=false must produce an empty images list even for \
         output_format=Djot. Got {} image(s).",
        image_count
    );
}
|
||
|
|
|
||
|
|
/// Regression #796: disabling images through `pdf_options.extract_images`
/// must also be honoured when `output_format=Markdown`.
#[test]
fn test_regression_796_markdown_no_images_when_disabled_via_pdf_options() {
    let result = extract_no_images_via_pdf_options("pdf/embedded_images_tables.pdf", OutputFormat::Markdown);

    // `None` and `Some(empty)` both count as "no images".
    let image_count = result.images.as_ref().map_or(0, |list| list.len());
    assert!(
        image_count == 0,
        "pdf_options.extract_images=false must produce an empty images list even for \
         output_format=Markdown. Got {} image(s).",
        image_count
    );
}
|
||
|
|
|
||
|
|
/// Sanity check for #796: with `extract_images=true` the images list must
/// still be populated (the fix must not suppress images when they are wanted).
#[test]
fn test_regression_796_markdown_images_present_when_enabled() {
    // `extract_markdown` builds exactly the configuration this test needs:
    // Markdown output with `extract_images: true` and defaults elsewhere.
    let result = extract_markdown("pdf/embedded_images_tables.pdf");

    let extracted = result
        .images
        .as_ref()
        .expect("images must be Some when extract_images=true");
    let has_images = !extracted.is_empty();
    assert!(
        has_images,
        "images list must be non-empty when extract_images=true and the PDF contains images"
    );
}
|
||
|
|
|
||
|
|
/// Plain-text baseline: plain output with images disabled must carry no
/// images. This already passed before the #796 fix and is kept as a safety net.
#[test]
fn test_regression_796_plain_no_images_when_disabled() {
    let result = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Plain);

    // `None` and `Some(empty)` both count as "no images".
    let image_count = result.images.as_ref().map_or(0, |list| list.len());
    assert!(
        image_count == 0,
        "Plain output with extract_images=false must have no images. Got {} image(s).",
        image_count
    );
}
|
||
|
|
|
||
|
|
// ─── Content-level image suppression tests ───────────────────────────────────
|
||
|
|
//
|
||
|
|
// The earlier #796 tests only assert `result.images.is_empty()`. That field is
|
||
|
|
// gated separately (extraction.rs:112) and is always empty when
|
||
|
|
// `extract_images=false`, even if the `inject_placeholders` guard at line 216 is
|
||
|
|
// removed. The guard controls whether `ElementKind::Image` elements are injected
|
||
|
|
// into the InternalDocument — which in turn controls whether image placeholder
|
||
|
|
// references (`![]()` / `![](image_N.ext)`) appear in `result.content`.
|
||
|
|
//
|
||
|
|
// The Djot renderer (`djot.rs`) lacked the `doc.images.get()` None check that
|
||
|
|
// comrak_bridge, html_styled, and plain all have. Removing the guard would cause
|
||
|
|
// `![]()` to leak into Djot content with no test catching it.
|
||
|
|
//
|
||
|
|
// JSON renderer gap (out of scope): json.rs emits `{"type":"image","alt":null,"src":null}`
|
||
|
|
// for orphaned elements — null fields are valid structured JSON and produce no broken
|
||
|
|
// markup, so it is intentionally not addressed here.
|
||
|
|
|
||
|
|
/// Djot content must not contain image markup when `extract_images=false`.
///
/// End-to-end contract test: requires both the `inject_placeholders` guard in
/// `extraction.rs` AND the Djot renderer's `None` guard to be absent before it
/// fails. The renderer-level unit test `test_djot_renderer_skips_orphaned_image_element`
/// in `djot.rs` is the minimal proof that the renderer fix works independently.
///
/// Failure messages show the first 400 characters of the content. The preview
/// is built with `chars()` rather than a byte-range slice, so it cannot panic
/// on a multi-byte character straddling the cut-off.
#[test]
fn test_djot_content_has_no_image_refs_when_extraction_disabled() {
    let result = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Djot);

    // Empty references: an image element rendered without a target.
    assert!(
        !result.content.contains("![]()"),
        "Djot output must not contain empty ![]() refs when extract_images=false.\n\
         Got content:\n{}",
        result.content.chars().take(400).collect::<String>()
    );

    // Populated references: the `![](image_<index>.<ext>)` placeholder form.
    assert!(
        !result.content.contains("![](image_"),
        "Djot output must not contain image placeholder refs when extract_images=false.\n\
         Got content:\n{}",
        result.content.chars().take(400).collect::<String>()
    );

    // Text content must still be present — no regression on extraction.
    assert!(
        !result.content.is_empty(),
        "Djot content must not be empty when images are disabled"
    );
}
|
||
|
|
|
||
|
|
/// Markdown content must not contain image markup when `extract_images=false`.
///
/// comrak_bridge already has a None guard so this would pass even without the
/// extraction-level guard, but it pins the end-to-end contract explicitly.
///
/// Failure messages show the first 400 characters of the content. The preview
/// is built with `chars()` rather than a byte-range slice, so it cannot panic
/// on a multi-byte character straddling the cut-off.
#[test]
fn test_markdown_content_has_no_image_refs_when_extraction_disabled() {
    let result = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Markdown);

    // Empty references: an image element rendered without a target.
    assert!(
        !result.content.contains("![]()"),
        "Markdown output must not contain empty ![]() refs when extract_images=false.\n\
         Got content:\n{}",
        result.content.chars().take(400).collect::<String>()
    );

    // Populated references: the `![](image_<index>.<ext>)` placeholder form.
    assert!(
        !result.content.contains("![](image_"),
        "Markdown output must not contain image placeholder refs when extract_images=false.\n\
         Got content:\n{}",
        result.content.chars().take(400).collect::<String>()
    );
}
|
||
|
|
|
||
|
|
/// Djot content must not contain image markup when disabled via `pdf_options.extract_images`.
///
/// Verifies both config paths are covered — mirrors the existing `result.images`
/// test for the pdf_options path.
#[test]
fn test_djot_content_has_no_image_refs_when_disabled_via_pdf_options() {
    let result = extract_no_images_via_pdf_options("pdf/embedded_images_tables.pdf", OutputFormat::Djot);

    // Empty references: an image element rendered without a target.
    assert!(
        !result.content.contains("![]()"),
        "Djot output (pdf_options path) must not contain ![]() when extract_images=false"
    );

    // Populated references: the `![](image_<index>.<ext>)` placeholder form.
    assert!(
        !result.content.contains("![](image_"),
        "Djot output (pdf_options path) must not contain image refs when extract_images=false"
    );
}
|
||
|
|
|
||
|
|
// ─── Page-level and chunk-level image index references ────────────────────────
|
||
|
|
//
|
||
|
|
// Pages carry `image_indices: Vec<usize>` — zero-based indices into the
|
||
|
|
// top-level `ExtractionResult.images` collection. Chunks carry the same field.
|
||
|
|
|
||
|
|
fn extract_with_pages_and_images(relative_path: &str) -> kreuzberg::types::ExtractionResult {
|
||
|
|
use kreuzberg::core::config::{ImageExtractionConfig, PageConfig};
|
||
|
|
let path = test_documents_dir().join(relative_path);
|
||
|
|
let config = ExtractionConfig {
|
||
|
|
output_format: OutputFormat::Markdown,
|
||
|
|
pages: Some(PageConfig {
|
||
|
|
extract_pages: true,
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
images: Some(ImageExtractionConfig {
|
||
|
|
extract_images: true,
|
||
|
|
..Default::default()
|
||
|
|
}),
|
||
|
|
..Default::default()
|
||
|
|
};
|
||
|
|
let rt = tokio::runtime::Runtime::new().unwrap();
|
||
|
|
rt.block_on(extract_file(&path, None, &config)).unwrap()
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Pages that contain images must have non-empty `image_indices` pointing into
/// `ExtractionResult.images`. Every index must be in-bounds, and the image it
/// points at must report the same `page_number` as the referencing page.
#[test]
fn test_page_image_indices_are_valid_when_images_extracted() {
    let result = extract_with_pages_and_images("pdf/embedded_images_tables.pdf");

    let images = result.images.as_ref().expect("images must be Some");
    assert!(!images.is_empty(), "fixture must have extracted images");

    let pages = result
        .pages
        .as_ref()
        .expect("pages must be Some when extract_pages=true");
    assert!(!pages.is_empty(), "fixture must have pages");

    // At least one page must carry image_indices (not all pages need images).
    assert!(
        pages.iter().any(|p| !p.image_indices.is_empty()),
        "at least one page must have image_indices populated when the PDF contains images"
    );

    // Every index must be in-bounds and the referenced image must report
    // belonging to this page (cross-validation: wrong-page bugs would pass a
    // bounds-only check).
    for page in pages {
        // `slot` is the position inside `image_indices`; `idx` is the value
        // stored there. The failure message reports both.
        for (slot, &idx) in page.image_indices.iter().enumerate() {
            assert!(
                (idx as usize) < images.len(),
                "page {} image_indices[{}] = {} is out of bounds (images.len() = {})",
                page.page_number,
                slot,
                idx,
                images.len()
            );
            let img_page = images[idx as usize].page_number;
            assert_eq!(
                img_page,
                Some(page.page_number),
                "image at index {} has page_number {:?} but is referenced by page {}",
                idx,
                img_page,
                page.page_number
            );
        }
    }
}
|
||
|
|
|
||
|
|
/// With image extraction disabled, no page may carry any `image_indices`.
///
/// If the result has no `pages` at all the test returns without asserting.
#[test]
fn test_page_image_indices_empty_when_images_disabled() {
    use kreuzberg::core::config::{ImageExtractionConfig, PageConfig};

    let page_settings = PageConfig {
        extract_pages: true,
        ..Default::default()
    };
    let image_settings = ImageExtractionConfig {
        extract_images: false,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        pages: Some(page_settings),
        images: Some(image_settings),
        ..Default::default()
    };
    let document = test_documents_dir().join("pdf/embedded_images_tables.pdf");

    let runtime = tokio::runtime::Runtime::new().unwrap();
    let result = runtime.block_on(extract_file(&document, None, &config)).unwrap();

    let Some(pages) = result.pages.as_ref() else {
        return;
    };
    for page in pages {
        let no_refs = page.image_indices.is_empty();
        assert!(
            no_refs,
            "page {} must have no image_indices when extract_images=false",
            page.page_number
        );
    }
}
|
||
|
|
|
||
|
|
/// Extracts the fixture at `relative_path` as Markdown with page extraction,
/// image extraction, and chunking (`max_characters: 500`) all configured.
///
/// Panics if the Tokio runtime cannot be created or if extraction fails.
#[cfg(feature = "chunking")]
fn extract_with_pages_images_and_chunks(relative_path: &str) -> kreuzberg::types::ExtractionResult {
    use kreuzberg::core::config::{ChunkingConfig, ImageExtractionConfig, PageConfig};

    let page_settings = PageConfig {
        extract_pages: true,
        ..Default::default()
    };
    let image_settings = ImageExtractionConfig {
        extract_images: true,
        ..Default::default()
    };
    let chunk_settings = ChunkingConfig {
        max_characters: 500,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        pages: Some(page_settings),
        images: Some(image_settings),
        chunking: Some(chunk_settings),
        ..Default::default()
    };
    let document = test_documents_dir().join(relative_path);

    let runtime = tokio::runtime::Runtime::new().unwrap();
    let outcome = runtime.block_on(extract_file(&document, None, &config));
    outcome.unwrap()
}
|
||
|
|
|
||
|
|
/// Chunks that span pages containing images must have non-empty `image_indices`.
/// Every index must be in-bounds, and the referenced image's `page_number` must
/// fall within the chunk's `[first_page, last_page]` range.
///
/// The page-range check only runs for chunks whose `first_page` and
/// `last_page` are both set.
#[cfg(feature = "chunking")]
#[test]
fn test_chunk_image_indices_are_valid_when_images_extracted() {
    let result = extract_with_pages_images_and_chunks("pdf/embedded_images_tables.pdf");

    let images = result.images.as_ref().expect("images must be Some");
    assert!(!images.is_empty(), "fixture must have extracted images");

    let chunks = result
        .chunks
        .as_ref()
        .expect("chunks must be Some when chunking is configured");
    assert!(!chunks.is_empty(), "fixture must produce chunks");

    // At least one chunk must carry image_indices.
    assert!(
        chunks.iter().any(|c| !c.metadata.image_indices.is_empty()),
        "at least one chunk must have image_indices when the PDF contains images"
    );

    for chunk in chunks {
        // `slot` is the position inside `image_indices`; `idx` is the value
        // stored there. The failure message reports both.
        for (slot, &idx) in chunk.metadata.image_indices.iter().enumerate() {
            // In-bounds check.
            assert!(
                (idx as usize) < images.len(),
                "chunk image_indices[{}] = {} is out of bounds (images.len() = {})",
                slot,
                idx,
                images.len()
            );

            // Cross-validation: referenced image must belong to a page within
            // the chunk's page range.
            if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) {
                let img_page = images[idx as usize]
                    .page_number
                    .expect("image referenced by a chunk must have a page_number set");
                assert!(
                    (first..=last).contains(&img_page),
                    "image at index {} is on page {} but chunk covers pages [{}, {}]",
                    idx,
                    img_page,
                    first,
                    last
                );
            }
        }
    }
}
|
||
|
|
|
||
|
|
/// Regression for #985: `max_images_per_page` must cap the number of images
/// returned for each page.
///
/// Before the fix, `extract_image_positions` ran a complete decompression pass
/// over every page unconditionally (even when extract_images=false), then
/// `extract_images_with_data` ran a second pass. The `.take(N)` limit only
/// clipped the returned slice; it did not stop the decompression work.
///
/// After the fix:
/// - When extract_images=false, NO decompression occurs at all (the main hang fix).
/// - When extract_images=true, a single pass runs and the cap is respected in output.
///   The per-page decompression cost for images beyond the cap is a pdf_oxide
///   upstream limitation: `extract_images()` is eager. Eliminating that
///   remaining cost requires a count-limited API upstream.
///
/// The test returns early when the fixture PDF is not present.
#[test]
fn test_max_images_per_page_cap_respected_in_output() {
    use kreuzberg::core::config::ImageExtractionConfig;
    use std::collections::HashMap;

    let path = test_documents_dir().join("pdf/installatiehandleiding_kombi_kompakt_hr.pdf");
    if !path.exists() {
        eprintln!("skipping: test PDF not present at {}", path.display());
        return;
    }

    let cap: u32 = 5;
    let image_settings = ImageExtractionConfig {
        extract_images: true,
        max_images_per_page: Some(cap),
        ..Default::default()
    };
    let config = ExtractionConfig {
        images: Some(image_settings),
        ..Default::default()
    };

    let runtime = tokio::runtime::Runtime::new().unwrap();
    let result = runtime
        .block_on(extract_file(&path, None, &config))
        .expect("extraction must succeed");

    let images = result
        .images
        .as_ref()
        .expect("images must be Some when extract_images=true");

    // Tally images per page; an image without a page number is counted on page 1.
    let mut per_page: HashMap<u32, usize> = HashMap::new();
    for img in images {
        let tally = per_page.entry(img.page_number.unwrap_or(1)).or_insert(0);
        *tally += 1;
    }

    // Cap must be respected per page in the output.
    let limit = cap as usize;
    for (page, count) in &per_page {
        assert!(
            *count <= limit,
            "page {page} has {count} images; cap={cap} must be respected"
        );
    }
}
|
||
|
|
|
||
|
|
/// Regression for #985 (no-images case): with the default configuration no
/// images may be returned for the image-dense fixture.
///
/// Before the fix, `extract_image_positions` ran unconditionally and triggered
/// a full decompression pass over every image on every page, even when the
/// caller never asked for image data. After the fix the decompression path is
/// skipped entirely when images are not requested.
///
/// The test returns early when the fixture PDF is not present.
#[test]
fn test_no_images_returned_when_extraction_disabled_on_dense_pdf() {
    let path = test_documents_dir().join("pdf/installatiehandleiding_kombi_kompakt_hr.pdf");
    if !path.exists() {
        eprintln!("skipping: test PDF not present at {}", path.display());
        return;
    }

    let config = ExtractionConfig::default(); // extract_images defaults to false

    let runtime = tokio::runtime::Runtime::new().unwrap();
    let result = runtime
        .block_on(extract_file(&path, None, &config))
        .expect("extraction must succeed");

    // No images should be returned when extraction is disabled: either the
    // field is `None` or it holds an empty list.
    let images_absent = result.images.as_ref().is_none_or(|list| list.is_empty());
    assert!(
        images_absent,
        "images must be absent when extract_images=false"
    );
}
|
||
|
|
|
||
|
|
/// Positions derived from extracted data must be consistent with the Markdown placeholders.
///
/// When inject_placeholders=true, the renderer emits `![](image_N.ext)` links where N
/// is the image_index. Every such N must have a corresponding entry in result.images.
/// Also verifies that image_index values are unique — the derivation loop must not emit
/// duplicate global indices.
///
/// If the result carries no images, the test returns without asserting.
#[test]
fn test_image_positions_consistent_with_image_data() {
    use kreuzberg::core::config::{ImageExtractionConfig, OutputFormat};

    let path = test_documents_dir().join("pdf/embedded_images_tables.pdf");
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        images: Some(ImageExtractionConfig {
            extract_images: true,
            inject_placeholders: true,
            ..Default::default()
        }),
        ..Default::default()
    };

    let rt = tokio::runtime::Runtime::new().unwrap();
    let result = rt.block_on(extract_file(&path, None, &config)).unwrap();

    let images = match result.images.as_ref() {
        Some(imgs) if !imgs.is_empty() => imgs,
        _ => return, // no images in this PDF — nothing to verify
    };

    // image_index values must be unique across the returned set. Once the loop
    // finishes, `known` holds every index present in result.images and doubles
    // as the lookup set for the placeholder check below.
    let mut known: std::collections::HashSet<u32> = std::collections::HashSet::new();
    for img in images {
        assert!(
            known.insert(img.image_index),
            "image_index {} appears more than once — position derivation emitted duplicates",
            img.image_index
        );
    }

    // Every `![](image_N.ext)` placeholder in Markdown must resolve to an index in
    // result.images. This would fail if inject_placeholders emitted a reference for
    // an image that was never extracted (orphaned placeholder).
    let re = regex::Regex::new(r"!\[\]\(image_(\d+)\.[a-z]+\)").unwrap();
    for cap in re.captures_iter(&result.content) {
        let idx: u32 = cap[1].parse().unwrap();
        assert!(
            known.contains(&idx),
            "Markdown contains `![](image_{idx}.*)` but result.images has no entry \
             with image_index={idx} — orphaned placeholder"
        );
    }
}
|
||
|
|
|
||
|
|
/// Regression for #985 (double-decompression fix): the text-only extraction
/// path must skip `extract_images_with_data` entirely.
///
/// When `extract_images` is `false` (the default), `extraction.rs` must not
/// enter the images branch at all. This test observes that through the output:
/// `result.images` must be `None` or empty for a PDF that actually contains
/// images. If `extract_images_with_data` were called unconditionally, the
/// result would be `Some(non_empty_vec)` instead.
#[test]
fn test_no_decompression_when_images_disabled() {
    let path = test_documents_dir().join("pdf/embedded_images_tables.pdf");
    assert!(path.exists(), "missing fixture: {}", path.display());

    // Default config: extract_images defaults to false.
    let config = ExtractionConfig::default();
    let runtime = tokio::runtime::Runtime::new().unwrap();
    let result = runtime
        .block_on(extract_file(&path, None, &config))
        .expect("extraction must succeed");

    // The text-only path must not return any image data; `None` and an empty
    // list both count as zero images.
    let image_count = result.images.as_ref().map_or(0, |list| list.len());
    assert!(
        image_count == 0,
        "images must be absent on text-only extraction (extract_images=false). \
         Got {} image(s) — extract_images_with_data was called when it should not have been.",
        image_count
    );

    // Text content must still be present — no regression on the extraction itself.
    let has_text = !result.content.is_empty();
    assert!(
        has_text,
        "text content must still be extracted when images are disabled"
    );
}
|
||
|
|
|
||
|
|
/// Trace-level check for #985: `extract_images_with_data` must NOT run when
/// `extract_images` is false (the default).
///
/// `test_no_decompression_when_images_disabled` only looks at the returned
/// value; this test looks at tracing output instead. An event with target
/// `kreuzberg::pdf::oxide::images` and field `event = "decompression_started"`
/// is emitted at the top of `extract_images_with_data`, so if no such event is
/// captured during the extraction, the function was not entered.
#[test]
fn test_no_decompression_trace_when_images_disabled() {
    use std::sync::{Arc, Mutex};
    use tracing_subscriber::{EnvFilter, layer::SubscriberExt as _};

    /// Tracing target of the events this test is interested in.
    const IMAGES_TARGET: &str = "kreuzberg::pdf::oxide::images";

    /// Shared list of `(target, value of the "event" field)` pairs.
    type CapturedEvents = Arc<Mutex<Vec<(String, Option<String>)>>>;

    // ── Field visitor ───────────────────────────────────────────────────────

    /// Remembers the value of the field named `event`, if the event has one.
    struct EventFieldVisitor(Option<String>);

    impl tracing::field::Visit for EventFieldVisitor {
        fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
            if field.name() == "event" {
                self.0 = Some(value.to_owned());
            }
        }

        fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
            if field.name() == "event" {
                self.0 = Some(format!("{value:?}"));
            }
        }
    }

    // ── Captured-event layer ────────────────────────────────────────────────

    /// Layer that stores every event emitted under `IMAGES_TARGET`.
    #[derive(Clone, Default)]
    struct EventCapture {
        events: CapturedEvents,
    }

    impl<S> tracing_subscriber::Layer<S> for EventCapture
    where
        S: tracing::Subscriber,
    {
        fn on_event(&self, event: &tracing::Event<'_>, _ctx: tracing_subscriber::layer::Context<'_, S>) {
            // Only record events from our target to avoid unbounded accumulation.
            let target = event.metadata().target();
            if target != IMAGES_TARGET {
                return;
            }

            let mut visitor = EventFieldVisitor(None);
            event.record(&mut visitor);

            self.events.lock().unwrap().push((target.to_owned(), visitor.0));
        }
    }

    // ── Test body ───────────────────────────────────────────────────────────

    let fixture = test_documents_dir().join("pdf/embedded_images_tables.pdf");
    assert!(fixture.exists(), "missing fixture: {}", fixture.display());

    let capture = EventCapture::default();

    // DEBUG level so the event would be visible if the function ran.
    let subscriber = tracing_subscriber::registry()
        .with(EnvFilter::new("debug"))
        .with(capture.clone());

    // The runtime is built and driven inside `with_default` so the extraction
    // runs while this subscriber is the default one.
    let result = tracing::subscriber::with_default(subscriber, || {
        let config = ExtractionConfig::default(); // extract_images defaults to false
        let runtime = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .unwrap();
        runtime
            .block_on(kreuzberg::core::extractor::extract_file(&fixture, None, &config))
            .expect("extraction must succeed")
    });

    // Output assertion: no image data returned.
    let image_count = result.images.as_ref().map_or(0, |v| v.len());
    assert!(image_count == 0, "images must be absent when extract_images=false");

    // Trace assertion: the decompression_started event must not have fired.
    let recorded = capture.events.lock().unwrap();
    let started_count = recorded
        .iter()
        .filter(|(target, event_field)| {
            target.as_str() == IMAGES_TARGET && event_field.as_deref() == Some("decompression_started")
        })
        .count();

    assert!(
        started_count == 0,
        "extract_images_with_data must not be entered when extract_images=false; \
         got {} decompression_started event(s)",
        started_count
    );
}
|
||
|
|
|
||
|
|
// ─── ocr_inline_images decompression path ─────────────────────────────────────
|
||
|
|
//
|
||
|
|
// When `ocr_inline_images=true`, the extraction branch condition
|
||
|
|
// `images_extraction_enabled || ocr_inline_images` is true regardless of
|
||
|
|
// `extract_images`. Images are decompressed and stored in `result.images` even
|
||
|
|
// when `ImageExtractionConfig.extract_images = false`. Without this test, a
|
||
|
|
// regression that short-circuits the extraction when `images_extraction_enabled`
|
||
|
|
// is false would go undetected.
|
||
|
|
//
|
||
|
|
// Note: unbounded decompression when `ocr_inline_images=true` and
|
||
|
|
// `config.images=None` (no cap) is a known limitation tracked separately in
|
||
|
|
// kreuzberg#989. Set `config.images.max_images_per_page` to apply a cap.
|
||
|
|
|
||
|
|
/// `ocr_inline_images=true` must force image decompression even when
/// `extract_images=false`.
///
/// Prior to the #985 fix this path was risky in two ways: the unconditional
/// `extract_image_positions` call ran even with `extract_images=false`, and on
/// the oxide path decompression had no bound. No test covered the OCR path, so
/// a regression that turned decompression off for `ocr_inline_images=true`
/// would have gone unnoticed. This test pins that behaviour by requiring
/// `result.images` to be `Some` and non-empty for a fixture with embedded
/// images.
#[test]
fn test_ocr_inline_images_enters_decompression_path() {
    use kreuzberg::PdfConfig;
    use kreuzberg::core::config::ImageExtractionConfig;

    let fixture = test_documents_dir().join("pdf/embedded_images_tables.pdf");
    assert!(fixture.exists(), "missing fixture: {}", fixture.display());

    // Explicitly disable extract_images — images_extraction_enabled will be false.
    let images_off = ImageExtractionConfig {
        extract_images: false,
        ..Default::default()
    };
    // Enable ocr_inline_images — this must force entry into the extraction branch.
    let inline_ocr_on = PdfConfig {
        ocr_inline_images: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        images: Some(images_off),
        pdf_options: Some(inline_ocr_on),
        ..Default::default()
    };

    let runtime = tokio::runtime::Runtime::new().unwrap();
    let outcome = runtime.block_on(extract_file(&fixture, None, &config));
    let result = outcome.unwrap();

    // Decompression is expected despite extract_images=false, because
    // ocr_inline_images=true enters the extraction branch regardless.
    let extracted = result.images.as_ref().expect(
        "result.images must be Some when ocr_inline_images=true, \
         even if extract_images=false — the extraction branch must be entered",
    );
    assert!(
        !extracted.is_empty(),
        "embedded_images_tables.pdf has embedded images; result.images must be non-empty \
         when ocr_inline_images=true forces entry into the decompression branch"
    );
}
|
||
|
|
|
||
|
|
// ─── include_page_rasters integration tests ──────────────────────────────────
|
||
|
|
//
|
||
|
|
// These tests exercise the full pipeline: ExtractionConfig with
|
||
|
|
// include_page_rasters=true → PDF OCR via Tesseract (per-page rendering) →
|
||
|
|
// merge/reindex in mod.rs → ExtractionResult.images contains PageRaster entries.
|
||
|
|
//
|
||
|
|
// They are the minimum proof that build_page_raster_image is actually called and
|
||
|
|
// that the result survives the merge/reindex at mod.rs:501-507. The unit tests
|
||
|
|
// for build_page_raster_image in ocr.rs verify the helper itself; these tests
|
||
|
|
// verify the integration path.
|
||
|
|
|
||
|
|
/// With `force_ocr=true`, turning on `include_page_rasters` must yield
/// `ImageKind::PageRaster` entries in `ExtractionResult.images`.
///
/// Checks performed:
/// - at least one `PageRaster` entry exists (per-page rendering ran);
/// - each raster carries `page_number = Some(N)` with N >= 1 (1-based);
/// - each raster has non-empty `data` (PNG bytes were captured);
/// - `image_index` is unique over the whole result set (the reindex in
///   mod.rs:501-507 produced no collisions);
/// - no `page_rasters` processing warning is present (Tesseract uses the
///   per-page path, not the bypass).
///
/// The test returns early, printing a SKIP line, when the fixture is absent.
#[cfg(feature = "ocr")]
#[test]
fn test_include_page_rasters_produces_rasters_on_force_ocr_pdf() {
    use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig};
    use kreuzberg::extract_file_sync;
    use kreuzberg::types::ImageKind;

    let fixture = test_documents_dir().join("pdf/fake_memo.pdf");
    if !fixture.exists() {
        eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present");
        return;
    }

    let tesseract_eng = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let rasters_on = ImageExtractionConfig {
        include_page_rasters: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(tesseract_eng),
        force_ocr: true,
        images: Some(rasters_on),
        use_cache: false,
        ..Default::default()
    };

    let result = extract_file_sync(&fixture, None, &config).expect("force_ocr extraction must succeed");

    let all_images = result
        .images
        .as_ref()
        .expect("images must be Some when include_page_rasters=true");

    let page_rasters: Vec<_> = all_images
        .iter()
        .filter(|img| matches!(img.image_kind, Some(ImageKind::PageRaster)))
        .collect();

    assert!(
        !page_rasters.is_empty(),
        "include_page_rasters=true must produce at least one PageRaster entry; \
         got {} total images but none with PageRaster kind",
        all_images.len()
    );

    for raster in &page_rasters {
        // A raster must know which page it came from, and pages are 1-based.
        match raster.page_number {
            None => panic!(
                "PageRaster at image_index={} must have page_number set",
                raster.image_index
            ),
            Some(page) => assert!(
                page >= 1,
                "PageRaster page_number must be >= 1 (1-based), got {}",
                page
            ),
        }
        assert!(
            !raster.data.is_empty(),
            "PageRaster at image_index={} must have non-empty PNG data",
            raster.image_index
        );
    }

    // image_index values must be unique — reindex in mod.rs:501-507 must not collide.
    let mut seen_indices = std::collections::HashSet::new();
    for img in all_images {
        let first_time = seen_indices.insert(img.image_index);
        assert!(
            first_time,
            "image_index {} appears more than once — reindex produced duplicates",
            img.image_index
        );
    }

    // No page_rasters warning: Tesseract processes per-page, so the bypass never fires.
    let raster_warning_messages: Vec<_> = result
        .processing_warnings
        .iter()
        .filter(|w| w.source.as_ref() == "page_rasters")
        .map(|w| w.message.as_ref())
        .collect();
    assert!(
        raster_warning_messages.is_empty(),
        "no page_rasters warning expected for Tesseract per-page OCR; got: {:?}",
        raster_warning_messages
    );
}
|
||
|
|
|
||
|
|
/// With `include_page_rasters=false` (the default), no `PageRaster` entries
/// may appear, even though `force_ocr=true` triggers per-page rendering.
///
/// Regression guard: raster capture is gated on the config flag, and dropping
/// that gate would make this test fail.
///
/// The test returns early, printing a SKIP line, when the fixture is absent.
#[cfg(feature = "ocr")]
#[test]
fn test_include_page_rasters_false_does_not_capture_rasters() {
    use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig};
    use kreuzberg::extract_file_sync;
    use kreuzberg::types::ImageKind;

    let fixture = test_documents_dir().join("pdf/fake_memo.pdf");
    if !fixture.exists() {
        eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present");
        return;
    }

    let tesseract_eng = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let rasters_off = ImageExtractionConfig {
        include_page_rasters: false,
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(tesseract_eng),
        force_ocr: true,
        images: Some(rasters_off),
        use_cache: false,
        ..Default::default()
    };

    let result = extract_file_sync(&fixture, None, &config).expect("force_ocr extraction must succeed");

    // A missing image list contributes zero rasters, same as an empty one.
    let raster_count = result
        .images
        .iter()
        .flatten()
        .filter(|img| img.image_kind == Some(ImageKind::PageRaster))
        .count();

    assert_eq!(
        raster_count, 0,
        "include_page_rasters=false must not produce any PageRaster images; got {raster_count}"
    );
}
|
||
|
|
|
||
|
|
/// The `force_ocr_pages` route through `extract_mixed_ocr_native` must also
/// emit `PageRaster` entries when `include_page_rasters=true`.
///
/// This is a different code path from `force_ocr=true`: the mixed-OCR path in
/// `extract_mixed_ocr_native` (ocr.rs) encodes only the selected pages rather
/// than every page. The test confirms that per-page raster capture on that
/// path works end-to-end and that the entries are numbered correctly.
///
/// The test returns early, printing a SKIP line, when the fixture is absent.
#[cfg(feature = "ocr")]
#[test]
fn test_include_page_rasters_on_force_ocr_pages_path() {
    use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig};
    use kreuzberg::extract_file_sync;
    use kreuzberg::types::ImageKind;

    let fixture = test_documents_dir().join("pdf/fake_memo.pdf");
    if !fixture.exists() {
        eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present");
        return;
    }

    let tesseract_eng = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let rasters_on = ImageExtractionConfig {
        include_page_rasters: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(tesseract_eng),
        force_ocr_pages: Some(vec![1]),
        images: Some(rasters_on),
        use_cache: false,
        ..Default::default()
    };

    let result = extract_file_sync(&fixture, None, &config).expect("force_ocr_pages extraction must succeed");

    let all_images = result
        .images
        .as_ref()
        .expect("images must be Some when include_page_rasters=true");

    let page_rasters: Vec<_> = all_images
        .iter()
        .filter(|img| matches!(img.image_kind, Some(ImageKind::PageRaster)))
        .collect();

    assert!(
        !page_rasters.is_empty(),
        "include_page_rasters=true on force_ocr_pages=[1] must produce at least one PageRaster; \
         got {} total images but none with PageRaster kind",
        all_images.len()
    );

    // Only page 1 was selected — all rasters must reference page 1.
    for raster in &page_rasters {
        assert_eq!(
            raster.page_number,
            Some(1),
            "force_ocr_pages=[1] rasters must all be page_number=1; got {:?}",
            raster.page_number
        );
        assert!(
            !raster.data.is_empty(),
            "PageRaster at image_index={} must have non-empty PNG data",
            raster.image_index
        );
    }

    // image_index values must be unique across the full result set.
    let mut seen_indices = std::collections::HashSet::new();
    for img in all_images {
        let first_time = seen_indices.insert(img.image_index);
        assert!(
            first_time,
            "image_index {} appears more than once — reindex produced duplicates",
            img.image_index
        );
    }
}
|
||
|
|
|
||
|
|
/// Out-of-range `force_ocr_pages` must not trigger a `page_rasters` warning.
///
/// If `force_ocr_pages` lists only pages that do not exist (e.g. page 99 of a
/// 1-page PDF), `extract_mixed_ocr_native` returns `None` for rasters because
/// `render_selected_pages_for_ocr` yields an empty list. That is NOT a
/// document-level bypass, so no `ProcessingWarning` with
/// `source = "page_rasters"` should be emitted, even with
/// `include_page_rasters=true`.
///
/// The test returns early, printing a SKIP line, when the fixture is absent.
#[cfg(feature = "ocr")]
#[test]
fn test_include_page_rasters_no_warning_on_out_of_range_pages() {
    use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig};
    use kreuzberg::extract_file_sync;

    let fixture = test_documents_dir().join("pdf/fake_memo.pdf");
    if !fixture.exists() {
        eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present");
        return;
    }

    let tesseract_eng = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let rasters_on = ImageExtractionConfig {
        include_page_rasters: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(tesseract_eng),
        force_ocr_pages: Some(vec![99]),
        images: Some(rasters_on),
        use_cache: false,
        ..Default::default()
    };

    let result = extract_file_sync(&fixture, None, &config).expect("out-of-range force_ocr_pages must not error");

    let has_raster_warning = result
        .processing_warnings
        .iter()
        .any(|w| w.source.as_ref() == "page_rasters");

    assert!(
        !has_raster_warning,
        "force_ocr_pages with all-out-of-range pages must not emit a page_rasters warning \
         (no document-level bypass occurred); got: {:?}",
        result
            .processing_warnings
            .iter()
            .map(|w| (w.source.as_ref(), w.message.as_ref()))
            .collect::<Vec<_>>()
    );
}
|
||
|
|
|
||
|
|
/// When `include_page_rasters=true` but the active OCR backend uses document-level
/// processing (bypassing per-page rendering), a `ProcessingWarning` with
/// `source = "page_rasters"` must be emitted.
///
/// This is the only scenario where `None` rasters flow through the `force_ocr`
/// path while `used_ocr=true`. Exercises the warning guard in
/// `PdfExtractor::extract` (mod.rs).
///
/// A mock backend with `supports_document_processing() = true` is registered so
/// that `extract_with_ocr` takes the document-level bypass instead of per-page
/// rendering. The mock returns an empty result — enough to trigger the warning.
///
/// The mock is unregistered before the extraction outcome is unwrapped, so an
/// extraction error cannot leave it behind in the backend registry.
///
/// The test returns early, printing a SKIP line, when the fixture is absent.
#[cfg(feature = "ocr")]
#[test]
fn test_include_page_rasters_emits_warning_on_document_level_ocr_bypass() {
    use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig};
    use kreuzberg::core::extractor::extract_file;
    use kreuzberg::plugins::{OcrBackend, OcrBackendType, Plugin};
    use kreuzberg::types::ExtractionResult;
    use std::path::Path;
    use std::sync::Arc;

    /// Name the mock registers under; shared by the plugin, the OCR config and
    /// the unregister call so the three cannot drift apart.
    const MOCK_BACKEND_NAME: &str = "doc-level-mock-raster-warn";

    let pdf_path = test_documents_dir().join("pdf/fake_memo.pdf");
    if !pdf_path.exists() {
        eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present");
        return;
    }

    /// OCR backend that only supports whole-document processing.
    struct DocLevelMock;

    #[async_trait::async_trait]
    impl OcrBackend for DocLevelMock {
        fn backend_type(&self) -> OcrBackendType {
            OcrBackendType::Custom
        }
        fn supports_language(&self, _: &str) -> bool {
            true
        }
        async fn process_image(&self, _: &[u8], _: &OcrConfig) -> kreuzberg::Result<ExtractionResult> {
            // Reaching this means the per-page path was taken, which is the
            // opposite of what this test sets up.
            panic!("process_image must not be called for a document-level backend")
        }
        fn supports_document_processing(&self) -> bool {
            true
        }
        async fn process_document(&self, _: &Path, _: &OcrConfig) -> kreuzberg::Result<ExtractionResult> {
            Ok(ExtractionResult::default())
        }
    }

    impl Plugin for DocLevelMock {
        fn name(&self) -> &str {
            MOCK_BACKEND_NAME
        }
        fn version(&self) -> String {
            "1.0.0".to_string()
        }
        fn initialize(&self) -> kreuzberg::Result<()> {
            Ok(())
        }
        fn shutdown(&self) -> kreuzberg::Result<()> {
            Ok(())
        }
    }

    kreuzberg::plugins::register_ocr_backend(Arc::new(DocLevelMock)).unwrap();

    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: MOCK_BACKEND_NAME.to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        images: Some(ImageExtractionConfig {
            include_page_rasters: true,
            ..Default::default()
        }),
        use_cache: false,
        ..Default::default()
    };

    let rt = tokio::runtime::Runtime::new().unwrap();
    let outcome = rt.block_on(extract_file(&pdf_path, None, &config));

    // Always unregister first; unwrapping `outcome` before this would skip the
    // cleanup whenever extraction returns an error.
    let unregistered = kreuzberg::plugins::unregister_ocr_backend(MOCK_BACKEND_NAME);

    // Report an extraction failure ahead of a cleanup failure: it is the more
    // informative of the two.
    let result = outcome.expect("document-level OCR mock must succeed");
    unregistered.unwrap();

    let raster_warning = result
        .processing_warnings
        .iter()
        .find(|w| w.source.as_ref() == "page_rasters");

    assert!(
        raster_warning.is_some(),
        "expected a page_rasters ProcessingWarning when include_page_rasters=true \
         and OCR backend uses document-level processing; got warnings: {:?}",
        result
            .processing_warnings
            .iter()
            .map(|w| (w.source.as_ref(), w.message.as_ref()))
            .collect::<Vec<_>>()
    );
}
|
||
|
|
|
||
|
|
/// `image_indices` on chunks must be empty when image extraction is disabled.
///
/// Extracts a PDF fixture as Markdown with `extract_images: false` and
/// chunking enabled (`max_characters: 500`), then checks that every returned
/// chunk has an empty `metadata.image_indices`.
#[cfg(feature = "chunking")]
#[test]
fn test_chunk_image_indices_empty_when_images_disabled() {
    use kreuzberg::core::config::{ChunkingConfig, ImageExtractionConfig};

    let path = test_documents_dir().join("pdf/embedded_images_tables.pdf");
    // Same up-front fixture check as the other tests on this PDF, so a missing
    // file is reported by name instead of as an opaque extraction error.
    assert!(path.exists(), "missing fixture: {}", path.display());

    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        images: Some(ImageExtractionConfig {
            extract_images: false,
            ..Default::default()
        }),
        chunking: Some(ChunkingConfig {
            max_characters: 500,
            ..Default::default()
        }),
        ..Default::default()
    };
    let rt = tokio::runtime::Runtime::new().unwrap();
    let result = rt
        .block_on(extract_file(&path, None, &config))
        .expect("extraction must succeed");

    // NOTE(review): when `result.chunks` is `None` nothing is checked and the
    // test passes vacuously. Presumably chunking always yields `Some` for this
    // config — confirm, then consider asserting that here.
    if let Some(chunks) = result.chunks.as_ref() {
        for chunk in chunks {
            assert!(
                chunk.metadata.image_indices.is_empty(),
                "chunk must have no image_indices when extract_images=false"
            );
        }
    }
}
|