Files
fil/crates/kreuzberg/tests/pdf_image_extraction_tests.rs

1317 lines
50 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
//! Regression tests for PDF image extraction in markdown output.
//!
//! Verifies that embedded images in PDFs produce proper `![](image_N.fmt)`
//! references instead of empty `![]()` placeholders.
#![cfg(feature = "pdf")]
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
use kreuzberg::core::extractor::extract_file;
use std::path::PathBuf;
mod helpers;
/// Locates the shared `test_documents` directory, which sits two directory
/// levels above this crate's manifest directory.
///
/// # Panics
/// Panics if the manifest directory has fewer than two ancestors.
fn test_documents_dir() -> PathBuf {
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    // `ancestors()` yields the path itself first, so index 2 is the grandparent.
    let grandparent = manifest_dir.ancestors().nth(2).unwrap();
    grandparent.join("test_documents")
}
/// Extracts the fixture at `relative_path` (resolved against the test
/// documents directory) as Markdown with image extraction switched on.
///
/// # Panics
/// Panics if the Tokio runtime cannot be created or the extraction fails.
fn extract_markdown(relative_path: &str) -> kreuzberg::types::ExtractionResult {
    use kreuzberg::core::config::ImageExtractionConfig;

    let image_settings = ImageExtractionConfig {
        extract_images: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        images: Some(image_settings),
        ..Default::default()
    };
    let document = test_documents_dir().join(relative_path);
    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(extract_file(&document, None, &config)).unwrap()
}
/// The marketing fixture's Markdown must never contain an empty `![]()`
/// image reference.
#[test]
fn test_multipage_marketing_no_empty_image_refs() {
    let extraction = extract_markdown("pdf/multipage_marketing.pdf");
    let markdown = &extraction.content;
    let has_empty_ref = markdown.contains("![]()");
    assert!(
        !has_empty_ref,
        "Markdown output must not contain empty image references ![](), got:\n{}",
        markdown
    );
}
/// The marketing fixture's Markdown must contain at least one named image
/// reference of the form `![](image_…)`.
#[test]
fn test_multipage_marketing_has_image_refs() {
    let extraction = extract_markdown("pdf/multipage_marketing.pdf");
    let markdown = &extraction.content;
    let has_named_ref = markdown.contains("![](image_");
    assert!(
        has_named_ref,
        "Markdown output must contain image references like ![](image_N.png), got:\n{}",
        markdown
    );
}
/// The marketing fixture must yield a non-empty image list in which at least
/// one image carries actual data bytes.
#[test]
fn test_multipage_marketing_images_populated() {
    let extraction = extract_markdown("pdf/multipage_marketing.pdf");
    let extracted = extraction.images.as_ref().expect("images field must be Some");
    assert!(!extracted.is_empty(), "Extraction result must contain extracted images");

    // One image with a payload is enough; not every image needs data.
    let any_with_data = extracted.iter().any(|image| !image.data.is_empty());
    assert!(
        any_with_data,
        "At least some images should have actual pixel data, got {} images total but none with data",
        extracted.len()
    );
}
/// The Docling fixture's Markdown must never contain an empty `![]()` image
/// reference.
#[test]
fn test_docling_no_empty_image_refs() {
    let extraction = extract_markdown("pdf/docling.pdf");
    let markdown = &extraction.content;
    let has_empty_ref = markdown.contains("![]()");
    assert!(
        !has_empty_ref,
        "Docling markdown must not contain empty image references ![](), got:\n{}",
        markdown
    );
}
/// The Docling fixture has at least one figure, so its Markdown must contain
/// a named `![](image_…)` reference.
#[test]
fn test_docling_has_image_refs() {
    let extraction = extract_markdown("pdf/docling.pdf");
    let markdown = &extraction.content;
    let has_named_ref = markdown.contains("![](image_");
    assert!(
        has_named_ref,
        "Docling markdown must contain image references, got:\n{}",
        markdown
    );
}
/// Key phrases from the Docling technical report must survive extraction.
#[test]
fn test_docling_content_quality() {
    let extraction = extract_markdown("pdf/docling.pdf");
    let markdown = &extraction.content;

    assert!(markdown.contains("Docling"), "Must contain 'Docling'");
    assert!(markdown.contains("PDF"), "Must contain 'PDF'");

    // Either spelling of the table-recognition topic is acceptable.
    let mentions_tables = ["table structure recognition", "TableFormer"]
        .iter()
        .any(|phrase| markdown.contains(phrase));
    assert!(
        mentions_tables,
        "Must mention table structure recognition or TableFormer"
    );
}
/// Regression test for issue #752: on Ghostscript-produced PDFs with many
/// inline images (~1,924 per page), structured output was ~1000x slower than
/// plain text.
///
/// Root cause: `populate_images_from_oxide` called `Vec::contains` (O(N))
/// inside the per-page object loop, giving O(N²) overall. The fix converts the
/// list to an `AHashSet` before the loop for O(1) lookups.
///
/// The repro file is too large to commit, so the test skips when it is
/// missing. To reproduce locally, generate a Ghostscript vector decomposition
/// PDF and place it at:
/// test_documents/pdf/ghostscript_inline_images_repro.pdf
#[test]
fn test_ghostscript_inline_images_completes_in_reasonable_time() {
    let repro = test_documents_dir().join("pdf/ghostscript_inline_images_repro.pdf");
    if !repro.exists() {
        eprintln!("SKIP: test_documents/pdf/ghostscript_inline_images_repro.pdf not present");
        return;
    }

    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let runtime = tokio::runtime::Runtime::new().unwrap();

    let timer = std::time::Instant::now();
    let outcome = runtime.block_on(extract_file(&repro, None, &config));
    // The file has no text, so the content may be empty or minimal; only
    // success and timing are checked.
    let _extraction = outcome.expect("extraction must succeed for Ghostscript inline-image PDF");
    let elapsed = timer.elapsed();

    // Before the fix a single page with ~1,924 inline images took ~56 seconds;
    // afterwards it should finish well under 10 seconds even on slow CI.
    assert!(
        elapsed < std::time::Duration::from_secs(10),
        "Ghostscript inline-image PDF must extract in under 10 seconds, took {elapsed:?}"
    );
}
// ─── Regression tests for issue #796 ────────────────────────────────────────
//
// Before the fix, setting `images.extract_images = false` (or
// `pdf_options.extract_images = false`) still caused full base64 image data to
// appear in `ExtractionResult.images` when `output_format` was `Markdown` or
// `Djot`. The root cause was that `inject_placeholders` in `extraction.rs`
// defaulted to `true` without checking `extract_images`, allowing the structure
// pipeline to call `populate_images_from_oxide` unconditionally.
/// Helper: runs extraction in the requested output format with images
/// explicitly turned off through `ImageExtractionConfig.extract_images = false`.
///
/// # Panics
/// Panics if the Tokio runtime cannot be created or the extraction fails.
fn extract_no_images(relative_path: &str, fmt: OutputFormat) -> kreuzberg::types::ExtractionResult {
    use kreuzberg::core::config::ImageExtractionConfig;

    let images_off = ImageExtractionConfig {
        extract_images: false,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: fmt,
        images: Some(images_off),
        ..Default::default()
    };
    let document = test_documents_dir().join(relative_path);
    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime
        .block_on(kreuzberg::core::extractor::extract_file(&document, None, &config))
        .unwrap()
}
/// Helper: runs extraction in the requested output format with images turned
/// off through `PdfConfig.extract_images = false`.
///
/// # Panics
/// Panics if the Tokio runtime cannot be created or the extraction fails.
fn extract_no_images_via_pdf_options(relative_path: &str, fmt: OutputFormat) -> kreuzberg::types::ExtractionResult {
    use kreuzberg::core::config::pdf::PdfConfig;

    let pdf_images_off = PdfConfig {
        extract_images: false,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: fmt,
        pdf_options: Some(pdf_images_off),
        ..Default::default()
    };
    let document = test_documents_dir().join(relative_path);
    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime
        .block_on(kreuzberg::core::extractor::extract_file(&document, None, &config))
        .unwrap()
}
/// Regression #796: with extract_images=false and output_format=Markdown the
/// image list must be absent or empty.
///
/// The fixture `embedded_images_tables.pdf` is known to contain images. Before
/// the fix, `ExtractionResult.images` came back with full base64 data despite
/// the flag.
#[test]
fn test_regression_796_markdown_no_images_when_disabled_via_images_config() {
    let extraction = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Markdown);

    // A missing list counts as zero images.
    let image_count = extraction.images.as_ref().map_or(0, |imgs| imgs.len());
    assert!(
        image_count == 0,
        "images.extract_images=false must produce an empty images list even for \
         output_format=Markdown. Got {} image(s).",
        image_count
    );

    // Text must still come through when images are disabled.
    let has_text = !extraction.content.is_empty();
    assert!(
        has_text,
        "Content must still be extracted when images are disabled"
    );
}
/// Regression #796: the Djot output format must likewise return no images
/// when extract_images=false.
#[test]
fn test_regression_796_djot_no_images_when_disabled_via_images_config() {
    let extraction = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Djot);

    // A missing list counts as zero images.
    let image_count = extraction.images.as_ref().map_or(0, |imgs| imgs.len());
    assert!(
        image_count == 0,
        "images.extract_images=false must produce an empty images list even for \
         output_format=Djot. Got {} image(s).",
        image_count
    );
}
/// Regression #796: disabling images through `pdf_options.extract_images`
/// must also be honoured when output_format=Markdown.
#[test]
fn test_regression_796_markdown_no_images_when_disabled_via_pdf_options() {
    let extraction = extract_no_images_via_pdf_options("pdf/embedded_images_tables.pdf", OutputFormat::Markdown);

    // A missing list counts as zero images.
    let image_count = extraction.images.as_ref().map_or(0, |imgs| imgs.len());
    assert!(
        image_count == 0,
        "pdf_options.extract_images=false must produce an empty images list even for \
         output_format=Markdown. Got {} image(s).",
        image_count
    );
}
/// Sanity check for #796: with extract_images=true the image list must still
/// be populated, so the fix did not over-suppress.
#[test]
fn test_regression_796_markdown_images_present_when_enabled() {
    use kreuzberg::core::config::ImageExtractionConfig;

    let images_on = ImageExtractionConfig {
        extract_images: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        images: Some(images_on),
        ..Default::default()
    };
    let document = test_documents_dir().join("pdf/embedded_images_tables.pdf");
    let runtime = tokio::runtime::Runtime::new().unwrap();
    let extraction = runtime
        .block_on(kreuzberg::core::extractor::extract_file(&document, None, &config))
        .unwrap();

    let extracted = extraction
        .images
        .as_ref()
        .expect("images must be Some when extract_images=true");
    let has_images = !extracted.is_empty();
    assert!(
        has_images,
        "images list must be non-empty when extract_images=true and the PDF contains images"
    );
}
/// Plain-text baseline: plain output must never carry images. This already
/// passed before the #796 fix and is kept as a safety net.
#[test]
fn test_regression_796_plain_no_images_when_disabled() {
    let extraction = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Plain);

    // A missing list counts as zero images.
    let image_count = extraction.images.as_ref().map_or(0, |imgs| imgs.len());
    assert!(
        image_count == 0,
        "Plain output with extract_images=false must have no images. Got {} image(s).",
        image_count
    );
}
// ─── Content-level image suppression tests ───────────────────────────────────
//
// The earlier #796 tests only assert `result.images.is_empty()`. That field is
// gated separately (extraction.rs:112) and is always empty when
// `extract_images=false`, even if the `inject_placeholders` guard at line 216 is
// removed. The guard controls whether `ElementKind::Image` elements are injected
// into the InternalDocument — which in turn controls whether image placeholder
// references (`![]()` / `![](image_N.fmt)`) appear in `result.content`.
//
// The Djot renderer (`djot.rs`) lacked the `doc.images.get()` None check that
// comrak_bridge, html_styled, and plain all have. Removing the guard would cause
// `![]()` to leak into Djot content with no test catching it.
//
// JSON renderer gap (out of scope): json.rs emits `{"type":"image","alt":null,"src":null}`
// for orphaned elements — null fields are valid structured JSON and produce no broken
// markup, so it is intentionally not addressed here.
/// Djot content must not contain image markup when `extract_images=false`.
///
/// End-to-end contract test: requires both the `inject_placeholders` guard in
/// `extraction.rs` AND the Djot renderer's `None` guard to be absent before it
/// fails. The renderer-level unit test `test_djot_renderer_skips_orphaned_image_element`
/// in `djot.rs` is the minimal proof that the renderer fix works independently.
#[test]
fn test_djot_content_has_no_image_refs_when_extraction_disabled() {
    /// Returns the longest prefix of `text` that is at most `max_bytes` bytes
    /// long and ends on a UTF-8 character boundary.
    ///
    /// Slicing at a raw byte offset panics when the offset falls inside a
    /// multi-byte character, which would replace the assertion message with an
    /// unrelated slicing panic.
    fn preview(text: &str, max_bytes: usize) -> &str {
        let mut end = text.len().min(max_bytes);
        // Offset 0 is always a boundary, so this loop terminates.
        while !text.is_char_boundary(end) {
            end -= 1;
        }
        &text[..end]
    }

    let result = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Djot);
    let excerpt = preview(&result.content, 400);
    assert!(
        !result.content.contains("![]()"),
        "Djot output must not contain empty ![]() refs when extract_images=false.\n\
         Got content:\n{}",
        excerpt
    );
    assert!(
        !result.content.contains("![](image_"),
        "Djot output must not contain image placeholder refs when extract_images=false.\n\
         Got content:\n{}",
        excerpt
    );
    // Text content must still be present — no regression on extraction.
    assert!(
        !result.content.is_empty(),
        "Djot content must not be empty when images are disabled"
    );
}
/// Markdown content must not contain image markup when `extract_images=false`.
///
/// comrak_bridge already has a None guard so this would pass even without the
/// extraction-level guard, but it pins the end-to-end contract explicitly.
#[test]
fn test_markdown_content_has_no_image_refs_when_extraction_disabled() {
    /// Returns the longest prefix of `text` that is at most `max_bytes` bytes
    /// long and ends on a UTF-8 character boundary.
    ///
    /// Slicing at a raw byte offset panics when the offset falls inside a
    /// multi-byte character, which would replace the assertion message with an
    /// unrelated slicing panic.
    fn preview(text: &str, max_bytes: usize) -> &str {
        let mut end = text.len().min(max_bytes);
        // Offset 0 is always a boundary, so this loop terminates.
        while !text.is_char_boundary(end) {
            end -= 1;
        }
        &text[..end]
    }

    let result = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Markdown);
    let excerpt = preview(&result.content, 400);
    assert!(
        !result.content.contains("![]()"),
        "Markdown output must not contain empty ![]() refs when extract_images=false.\n\
         Got content:\n{}",
        excerpt
    );
    assert!(
        !result.content.contains("![](image_"),
        "Markdown output must not contain image placeholder refs when extract_images=false.\n\
         Got content:\n{}",
        excerpt
    );
}
/// Djot content must be free of image markup when images are disabled through
/// `pdf_options.extract_images`.
///
/// Covers the second configuration path, mirroring the existing
/// `result.images` test for pdf_options.
#[test]
fn test_djot_content_has_no_image_refs_when_disabled_via_pdf_options() {
    let extraction = extract_no_images_via_pdf_options("pdf/embedded_images_tables.pdf", OutputFormat::Djot);
    let djot = &extraction.content;

    let has_empty_ref = djot.contains("![]()");
    assert!(
        !has_empty_ref,
        "Djot output (pdf_options path) must not contain ![]() when extract_images=false"
    );

    let has_named_ref = djot.contains("![](image_");
    assert!(
        !has_named_ref,
        "Djot output (pdf_options path) must not contain image refs when extract_images=false"
    );
}
// ─── Page-level and chunk-level image index references ────────────────────────
//
// Pages carry `image_indices: Vec<usize>` — zero-based indices into the
// top-level `ExtractionResult.images` collection. Chunks carry the same field.
/// Extracts the fixture at `relative_path` as Markdown with both per-page
/// extraction and image extraction enabled.
///
/// # Panics
/// Panics if the Tokio runtime cannot be created or the extraction fails.
fn extract_with_pages_and_images(relative_path: &str) -> kreuzberg::types::ExtractionResult {
    use kreuzberg::core::config::{ImageExtractionConfig, PageConfig};

    let page_settings = PageConfig {
        extract_pages: true,
        ..Default::default()
    };
    let image_settings = ImageExtractionConfig {
        extract_images: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        pages: Some(page_settings),
        images: Some(image_settings),
        ..Default::default()
    };
    let document = test_documents_dir().join(relative_path);
    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(extract_file(&document, None, &config)).unwrap()
}
/// Pages with images must expose non-empty `image_indices` that point into
/// `ExtractionResult.images`, and every such index must be in-bounds.
#[test]
fn test_page_image_indices_are_valid_when_images_extracted() {
    let extraction = extract_with_pages_and_images("pdf/embedded_images_tables.pdf");

    let images = extraction.images.as_ref().expect("images must be Some");
    assert!(!images.is_empty(), "fixture must have extracted images");

    let pages = extraction
        .pages
        .as_ref()
        .expect("pages must be Some when extract_pages=true");
    assert!(!pages.is_empty(), "fixture must have pages");

    // Not every page needs images, but at least one must reference some.
    let some_page_has_images = pages.iter().any(|p| !p.image_indices.is_empty());
    assert!(
        some_page_has_images,
        "at least one page must have image_indices populated when the PDF contains images"
    );

    // A bounds check alone would miss wrong-page bugs, so each referenced
    // image must also report the page that references it.
    for page in pages.iter() {
        for idx in page.image_indices.iter().copied() {
            let slot = idx as usize;
            assert!(
                slot < images.len(),
                "page {} image_indices[{}] = {} is out of bounds (images.len() = {})",
                page.page_number,
                idx,
                idx,
                images.len()
            );
            let img_page = images[slot].page_number;
            assert_eq!(
                img_page,
                Some(page.page_number),
                "image at index {} has page_number {:?} but is referenced by page {}",
                idx,
                img_page,
                page.page_number
            );
        }
    }
}
/// With image extraction disabled, no page may carry any `image_indices`.
///
/// If the result has no pages at all, there is nothing to check and the test
/// passes.
#[test]
fn test_page_image_indices_empty_when_images_disabled() {
    use kreuzberg::core::config::{ImageExtractionConfig, PageConfig};

    let page_settings = PageConfig {
        extract_pages: true,
        ..Default::default()
    };
    let images_off = ImageExtractionConfig {
        extract_images: false,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        pages: Some(page_settings),
        images: Some(images_off),
        ..Default::default()
    };
    let document = test_documents_dir().join("pdf/embedded_images_tables.pdf");
    let runtime = tokio::runtime::Runtime::new().unwrap();
    let extraction = runtime.block_on(extract_file(&document, None, &config)).unwrap();

    // Flattening the Option walks zero pages when `pages` is None.
    for page in extraction.pages.iter().flatten() {
        assert!(
            page.image_indices.is_empty(),
            "page {} must have no image_indices when extract_images=false",
            page.page_number
        );
    }
}
/// Extracts the fixture at `relative_path` as Markdown with page extraction,
/// image extraction and chunking (500-character maximum) all enabled.
///
/// # Panics
/// Panics if the Tokio runtime cannot be created or the extraction fails.
#[cfg(feature = "chunking")]
fn extract_with_pages_images_and_chunks(relative_path: &str) -> kreuzberg::types::ExtractionResult {
    use kreuzberg::core::config::{ChunkingConfig, ImageExtractionConfig, PageConfig};

    let page_settings = PageConfig {
        extract_pages: true,
        ..Default::default()
    };
    let image_settings = ImageExtractionConfig {
        extract_images: true,
        ..Default::default()
    };
    let chunk_settings = ChunkingConfig {
        max_characters: 500,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        pages: Some(page_settings),
        images: Some(image_settings),
        chunking: Some(chunk_settings),
        ..Default::default()
    };
    let document = test_documents_dir().join(relative_path);
    let runtime = tokio::runtime::Runtime::new().unwrap();
    runtime.block_on(extract_file(&document, None, &config)).unwrap()
}
/// Chunks covering pages with images must expose non-empty `image_indices`.
/// Each index must be in-bounds, and the referenced image's `page_number`
/// must lie within the chunk's `[first_page, last_page]` range.
#[cfg(feature = "chunking")]
#[test]
fn test_chunk_image_indices_are_valid_when_images_extracted() {
    let extraction = extract_with_pages_images_and_chunks("pdf/embedded_images_tables.pdf");

    let images = extraction.images.as_ref().expect("images must be Some");
    assert!(!images.is_empty(), "fixture must have extracted images");

    let chunks = extraction
        .chunks
        .as_ref()
        .expect("chunks must be Some when chunking is configured");
    assert!(!chunks.is_empty(), "fixture must produce chunks");

    // At least one chunk has to reference an image.
    let some_chunk_has_images = chunks.iter().any(|c| !c.metadata.image_indices.is_empty());
    assert!(
        some_chunk_has_images,
        "at least one chunk must have image_indices when the PDF contains images"
    );

    for chunk in chunks.iter() {
        let page_span = (chunk.metadata.first_page, chunk.metadata.last_page);
        for idx in chunk.metadata.image_indices.iter().copied() {
            let slot = idx as usize;
            assert!(
                slot < images.len(),
                "chunk image_indices[{}] = {} is out of bounds (images.len() = {})",
                idx,
                idx,
                images.len()
            );

            // The page-range cross-check only applies when the chunk knows
            // both ends of its page range.
            if let (Some(first), Some(last)) = page_span {
                let img_page = images[slot]
                    .page_number
                    .expect("image referenced by a chunk must have a page_number set");
                assert!(
                    (first..=last).contains(&img_page),
                    "image at index {} is on page {} but chunk covers pages [{}, {}]",
                    idx,
                    img_page,
                    first,
                    last
                );
            }
        }
    }
}
/// Regression for #985: `max_images_per_page` must limit how many images are
/// returned for each page.
///
/// Before the fix, `extract_image_positions` ran a full decompression pass
/// over every page unconditionally (even with extract_images=false), and
/// `extract_images_with_data` then ran a second pass. The `.take(N)` limit
/// only trimmed the returned slice; it did not stop the decompression work.
///
/// After the fix:
/// - With extract_images=false, NO decompression happens at all (the main hang fix).
/// - With extract_images=true, a single pass runs and the cap holds in the output.
///   Decompression of images beyond the cap still costs time because pdf_oxide's
///   `extract_images()` is eager; removing that cost needs a count-limited
///   API upstream.
///
/// Skips when the dense fixture PDF is not present.
#[test]
fn test_max_images_per_page_cap_respected_in_output() {
    use kreuzberg::core::config::ImageExtractionConfig;
    use std::collections::HashMap;

    let document = test_documents_dir().join("pdf/installatiehandleiding_kombi_kompakt_hr.pdf");
    if !document.exists() {
        eprintln!("skipping: test PDF not present at {}", document.display());
        return;
    }

    let cap: u32 = 5;
    let capped_images = ImageExtractionConfig {
        extract_images: true,
        max_images_per_page: Some(cap),
        ..Default::default()
    };
    let config = ExtractionConfig {
        images: Some(capped_images),
        ..Default::default()
    };
    let runtime = tokio::runtime::Runtime::new().unwrap();
    let extraction = runtime
        .block_on(extract_file(&document, None, &config))
        .expect("extraction must succeed");
    let images = extraction
        .images
        .as_ref()
        .expect("images must be Some when extract_images=true");

    // Tally images per page; an image without a page number is counted under page 1.
    let mut per_page: HashMap<u32, usize> = HashMap::new();
    for page in images.iter().map(|img| img.page_number.unwrap_or(1)) {
        *per_page.entry(page).or_insert(0) += 1;
    }

    for (page, count) in per_page {
        assert!(
            count <= cap as usize,
            "page {page} has {count} images; cap={cap} must be respected"
        );
    }
}
/// Regression for #985 (no-images case): with extract_images=false the result
/// must contain no images.
///
/// Before the fix, `extract_image_positions` ran unconditionally and forced a
/// full decompression pass over every image on every page, even though the
/// caller never asked for image data. After the fix that path is skipped
/// entirely when images are not requested.
///
/// Skips when the dense fixture PDF is not present.
#[test]
fn test_no_images_returned_when_extraction_disabled_on_dense_pdf() {
    let document = test_documents_dir().join("pdf/installatiehandleiding_kombi_kompakt_hr.pdf");
    if !document.exists() {
        eprintln!("skipping: test PDF not present at {}", document.display());
        return;
    }

    // The default configuration leaves extract_images off.
    let config = ExtractionConfig::default();
    let runtime = tokio::runtime::Runtime::new().unwrap();
    let extraction = runtime
        .block_on(extract_file(&document, None, &config))
        .expect("extraction must succeed");

    // Both a missing list and an empty list count as "no images".
    let images_absent = extraction.images.as_ref().map_or(true, |imgs| imgs.is_empty());
    assert!(
        images_absent,
        "images must be absent when extract_images=false"
    );
}
/// Image positions derived from extracted data must agree with the Markdown
/// placeholders.
///
/// With inject_placeholders=true the renderer emits `![](image_N.ext)` links
/// where N is the image_index. Every such N must match an entry in
/// result.images. The test also checks that image_index values are unique,
/// i.e. the derivation loop never emits duplicate global indices.
#[test]
fn test_image_positions_consistent_with_image_data() {
    use kreuzberg::core::config::ImageExtractionConfig;
    use std::collections::HashSet;

    let placeholder_images = ImageExtractionConfig {
        extract_images: true,
        inject_placeholders: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        images: Some(placeholder_images),
        ..Default::default()
    };
    let document = test_documents_dir().join("pdf/embedded_images_tables.pdf");
    let runtime = tokio::runtime::Runtime::new().unwrap();
    let extraction = runtime.block_on(extract_file(&document, None, &config)).unwrap();

    // No images in this PDF means there is nothing to verify.
    let Some(images) = extraction.images.as_ref().filter(|imgs| !imgs.is_empty()) else {
        return;
    };

    // Uniqueness check. Once the loop finishes, `known` holds every
    // image_index in the result and doubles as the lookup set below.
    let mut known: HashSet<u32> = HashSet::new();
    for img in images.iter() {
        let newly_seen = known.insert(img.image_index);
        assert!(
            newly_seen,
            "image_index {} appears more than once — position derivation emitted duplicates",
            img.image_index
        );
    }

    // Every `![](image_N.ext)` placeholder must resolve to a known index. This
    // fails if a reference was emitted for an image that was never extracted.
    let placeholder_re = regex::Regex::new(r"!\[\]\(image_(\d+)\.[a-z]+\)").unwrap();
    for placeholder in placeholder_re.captures_iter(&extraction.content) {
        let idx: u32 = placeholder[1].parse().unwrap();
        assert!(
            known.contains(&idx),
            "Markdown contains `![](image_{idx}.ext)` but result.images has no entry \
             with image_index={idx} orphaned placeholder"
        );
    }
}
/// Regression for #985 (double-decompression fix): the text-only extraction
/// path must skip `extract_images_with_data` entirely.
///
/// With `extract_images` left `false` (the default), `extraction.rs` must not
/// enter the images branch. This test checks that by confirming
/// `result.images` is `None` or empty. If `extract_images_with_data` were
/// called unconditionally, the result would be `Some(non_empty_vec)` for a
/// PDF that actually contains images.
#[test]
fn test_no_decompression_when_images_disabled() {
    let document = test_documents_dir().join("pdf/embedded_images_tables.pdf");
    assert!(document.exists(), "missing fixture: {}", document.display());

    // The default configuration leaves extract_images off.
    let config = ExtractionConfig::default();
    let runtime = tokio::runtime::Runtime::new().unwrap();
    let extraction = runtime
        .block_on(kreuzberg::core::extractor::extract_file(&document, None, &config))
        .expect("extraction must succeed");

    // A missing list counts as zero images.
    let image_count = extraction.images.as_ref().map_or(0, |imgs| imgs.len());
    assert!(
        image_count == 0,
        "images must be absent on text-only extraction (extract_images=false). \
         Got {} image(s) extract_images_with_data was called when it should not have been.",
        image_count
    );

    // The text itself must still be extracted.
    let has_text = !extraction.content.is_empty();
    assert!(
        has_text,
        "text content must still be extracted when images are disabled"
    );
}
/// Trace-level assertion for #985: `extract_images_with_data` must NOT be
/// entered while `extract_images` is false (the default).
///
/// This complements `test_no_decompression_when_images_disabled`, which only
/// looks at the output. An event with target `kreuzberg::pdf::oxide::images`
/// and field `event = "decompression_started"` is emitted at the top of
/// `extract_images_with_data`; if no such event is captured, the function was
/// not called.
#[test]
fn test_no_decompression_trace_when_images_disabled() {
    use std::sync::{Arc, Mutex};
    use tracing_subscriber::{EnvFilter, layer::SubscriberExt as _};

    /// Only events from this target are recorded.
    const IMAGES_TARGET: &str = "kreuzberg::pdf::oxide::images";

    /// Field visitor that keeps the value of the `event` field, if any.
    struct EventFieldVisitor(Option<String>);

    impl tracing::field::Visit for EventFieldVisitor {
        fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
            if field.name() == "event" {
                self.0 = Some(value.to_owned());
            }
        }

        fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
            if field.name() == "event" {
                self.0 = Some(format!("{value:?}"));
            }
        }
    }

    /// Layer that stores `(target, event field)` pairs for matching events.
    #[allow(clippy::type_complexity)]
    #[derive(Clone, Default)]
    struct EventCapture {
        events: Arc<Mutex<Vec<(String, Option<String>)>>>,
    }

    impl<S> tracing_subscriber::Layer<S> for EventCapture
    where
        S: tracing::Subscriber,
    {
        fn on_event(&self, event: &tracing::Event<'_>, _ctx: tracing_subscriber::layer::Context<'_, S>) {
            // Ignore every other target to avoid unbounded accumulation.
            let target = event.metadata().target();
            if target != IMAGES_TARGET {
                return;
            }
            let mut visitor = EventFieldVisitor(None);
            event.record(&mut visitor);
            self.events.lock().unwrap().push((target.to_owned(), visitor.0));
        }
    }

    let document = test_documents_dir().join("pdf/embedded_images_tables.pdf");
    assert!(document.exists(), "missing fixture: {}", document.display());

    let capture = EventCapture::default();
    // DEBUG level ensures the event would be visible if the function ran.
    let subscriber = tracing_subscriber::registry()
        .with(EnvFilter::new("debug"))
        .with(capture.clone());

    // The runtime is built inside `with_default` so all events reach the layer.
    let extraction = tracing::subscriber::with_default(subscriber, || {
        // The default configuration leaves extract_images off.
        let config = ExtractionConfig::default();
        let runtime = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .unwrap();
        runtime
            .block_on(kreuzberg::core::extractor::extract_file(&document, None, &config))
            .expect("extraction must succeed")
    });

    // Output assertion: no image data returned.
    let images_absent = extraction.images.as_ref().map_or(true, |imgs| imgs.is_empty());
    assert!(
        images_absent,
        "images must be absent when extract_images=false"
    );

    // Trace assertion: the decompression_started event must not have fired.
    let recorded = capture.events.lock().unwrap();
    let decompression_count = recorded
        .iter()
        .filter(|(target, event_field)| {
            target.as_str() == IMAGES_TARGET && event_field.as_deref() == Some("decompression_started")
        })
        .count();
    assert!(
        decompression_count == 0,
        "extract_images_with_data must not be entered when extract_images=false; \
         got {} decompression_started event(s)",
        decompression_count
    );
}
// ─── ocr_inline_images decompression path ─────────────────────────────────────
//
// When `ocr_inline_images=true`, the extraction branch condition
// `images_extraction_enabled || ocr_inline_images` is true regardless of
// `extract_images`. Images are decompressed and stored in `result.images` even
// when `ImageExtractionConfig.extract_images = false`. Without this test, a
// regression that short-circuits the extraction when `images_extraction_enabled`
// is false would go undetected.
//
// Note: unbounded decompression when `ocr_inline_images=true` and
// `config.images=None` (no cap) is a known limitation tracked separately in
// kreuzberg#989. Set `config.images.max_images_per_page` to apply a cap.
/// With `ocr_inline_images=true` and `extract_images=false`, images must still
/// be decompressed, because `ocr_inline_images` forces entry into the
/// extraction branch.
///
/// Before the fix for #985 this path was doubly dangerous: the unconditional
/// `extract_image_positions` call ran even with `extract_images=false`, and on
/// the oxide path decompression was unbounded. No test covered the OCR path,
/// so a regression that disabled decompression for `ocr_inline_images=true`
/// would have gone unnoticed.
#[test]
fn test_ocr_inline_images_enters_decompression_path() {
    use kreuzberg::PdfConfig;
    use kreuzberg::core::config::ImageExtractionConfig;

    let document = test_documents_dir().join("pdf/embedded_images_tables.pdf");
    assert!(document.exists(), "missing fixture: {}", document.display());

    // extract_images is explicitly off, so images_extraction_enabled is false.
    let images_off = ImageExtractionConfig {
        extract_images: false,
        ..Default::default()
    };
    // ocr_inline_images is on and must force entry into the extraction branch.
    let ocr_inline = PdfConfig {
        ocr_inline_images: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        images: Some(images_off),
        pdf_options: Some(ocr_inline),
        ..Default::default()
    };
    let runtime = tokio::runtime::Runtime::new().unwrap();
    let extraction = runtime.block_on(extract_file(&document, None, &config)).unwrap();

    // Images must be present despite extract_images=false.
    let extracted = extraction.images.as_ref().expect(
        "result.images must be Some when ocr_inline_images=true, \
         even if extract_images=false the extraction branch must be entered",
    );
    let has_images = !extracted.is_empty();
    assert!(
        has_images,
        "embedded_images_tables.pdf has embedded images; result.images must be non-empty \
         when ocr_inline_images=true forces entry into the decompression branch"
    );
}
// ─── include_page_rasters integration tests ──────────────────────────────────
//
// These tests exercise the full pipeline: ExtractionConfig with
// include_page_rasters=true → PDF OCR via Tesseract (per-page rendering) →
// merge/reindex in mod.rs → ExtractionResult.images contains PageRaster entries.
//
// They are the minimum proof that build_page_raster_image is actually called and
// that the result survives the merge/reindex at mod.rs:501-507. The unit tests
// for build_page_raster_image in ocr.rs verify the helper itself; these tests
// verify the integration path.
/// Enabling `include_page_rasters` on a PDF with `force_ocr=true` must produce
/// `ImageKind::PageRaster` entries in `ExtractionResult.images`.
///
/// Verifies:
/// - At least one `PageRaster` entry is present (per-page rendering ran).
/// - Every raster has `page_number = Some(N)` where N >= 1 (1-based assignment).
/// - Every raster has non-empty `data` (actual PNG bytes were captured).
/// - `image_index` values are unique across the full result set (the reindex
///   step did not produce collisions).
/// - No `page_rasters` processing warning (Tesseract uses per-page path, not bypass).
///
/// The test prints a SKIP line and returns early when the `fake_memo.pdf`
/// fixture is not present.
#[cfg(feature = "ocr")]
#[test]
fn test_include_page_rasters_produces_rasters_on_force_ocr_pdf() {
    use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig};
    use kreuzberg::extract_file_sync;
    use kreuzberg::types::ImageKind;

    let path = test_documents_dir().join("pdf/fake_memo.pdf");
    if !path.exists() {
        eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present");
        return;
    }

    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        images: Some(ImageExtractionConfig {
            include_page_rasters: true,
            ..Default::default()
        }),
        // Caching is disabled so the extraction pipeline actually runs.
        use_cache: false,
        ..Default::default()
    };

    let result = extract_file_sync(&path, None, &config).expect("force_ocr extraction must succeed");
    let images = result
        .images
        .as_ref()
        .expect("images must be Some when include_page_rasters=true");

    let rasters: Vec<_> = images
        .iter()
        .filter(|img| img.image_kind == Some(ImageKind::PageRaster))
        .collect();
    assert!(
        !rasters.is_empty(),
        "include_page_rasters=true must produce at least one PageRaster entry; \
         got {} total images but none with PageRaster kind",
        images.len()
    );

    for raster in &rasters {
        // A single match replaces the former `is_some()` + double `unwrap()`;
        // the failure messages are unchanged.
        match raster.page_number {
            Some(page_number) => assert!(
                page_number >= 1,
                "PageRaster page_number must be >= 1 (1-based), got {}",
                page_number
            ),
            None => panic!(
                "PageRaster at image_index={} must have page_number set",
                raster.image_index
            ),
        }
        assert!(
            !raster.data.is_empty(),
            "PageRaster at image_index={} must have non-empty PNG data",
            raster.image_index
        );
    }

    // image_index values must be unique across *all* images, not just rasters.
    let mut seen = std::collections::HashSet::new();
    for img in images {
        assert!(
            seen.insert(img.image_index),
            "image_index {} appears more than once — reindex produced duplicates",
            img.image_index
        );
    }

    // No page_rasters warning: Tesseract processes per-page, so the bypass never fires.
    let raster_warnings: Vec<_> = result
        .processing_warnings
        .iter()
        .filter(|w| w.source.as_ref() == "page_rasters")
        .collect();
    assert!(
        raster_warnings.is_empty(),
        "no page_rasters warning expected for Tesseract per-page OCR; got: {:?}",
        raster_warnings.iter().map(|w| w.message.as_ref()).collect::<Vec<_>>()
    );
}
/// With `include_page_rasters=false`, a `force_ocr=true` extraction must not
/// yield a single `ImageKind::PageRaster` image.
///
/// Regression guard: raster capture is expected to be gated on the config
/// flag; dropping that gate would make this test fail. A missing `images`
/// field counts as zero rasters.
///
/// The test prints a SKIP line and returns early when the `fake_memo.pdf`
/// fixture is not present.
#[cfg(feature = "ocr")]
#[test]
fn test_include_page_rasters_false_does_not_capture_rasters() {
    use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig};
    use kreuzberg::extract_file_sync;
    use kreuzberg::types::ImageKind;

    let fixture = test_documents_dir().join("pdf/fake_memo.pdf");
    if !fixture.exists() {
        eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present");
        return;
    }

    let tesseract = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng"),
        ..Default::default()
    };
    // The flag is spelled out explicitly so the test does not depend on the
    // default value staying `false`.
    let image_options = ImageExtractionConfig {
        include_page_rasters: false,
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(tesseract),
        force_ocr: true,
        images: Some(image_options),
        use_cache: false,
        ..Default::default()
    };

    let extraction = extract_file_sync(&fixture, None, &config).expect("force_ocr extraction must succeed");

    // `images == None` is treated the same as an empty image list.
    let raster_count = extraction.images.as_ref().map_or(0, |all_images| {
        all_images
            .iter()
            .filter(|image| image.image_kind == Some(ImageKind::PageRaster))
            .count()
    });
    assert_eq!(
        raster_count, 0,
        "include_page_rasters=false must not produce any PageRaster images; got {raster_count}"
    );
}
/// `include_page_rasters=true` must also yield `PageRaster` entries when OCR
/// is requested through `force_ocr_pages` rather than `force_ocr`.
///
/// Per the original test notes, this goes through the mixed-OCR path
/// (`extract_mixed_ocr_native` in ocr.rs), which encodes only the selected
/// pages instead of every page. The test checks that:
/// - at least one `PageRaster` is produced for `force_ocr_pages=[1]`;
/// - every raster reports `page_number == Some(1)` and has non-empty `data`;
/// - `image_index` values are unique across the whole result.
///
/// The test prints a SKIP line and returns early when the `fake_memo.pdf`
/// fixture is not present.
#[cfg(feature = "ocr")]
#[test]
fn test_include_page_rasters_on_force_ocr_pages_path() {
    use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig};
    use kreuzberg::extract_file_sync;
    use kreuzberg::types::ImageKind;

    let fixture = test_documents_dir().join("pdf/fake_memo.pdf");
    if !fixture.exists() {
        eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present");
        return;
    }

    let tesseract = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng"),
        ..Default::default()
    };
    let image_options = ImageExtractionConfig {
        include_page_rasters: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(tesseract),
        // Only the first page is selected for OCR.
        force_ocr_pages: Some(vec![1]),
        images: Some(image_options),
        use_cache: false,
        ..Default::default()
    };

    let extraction =
        extract_file_sync(&fixture, None, &config).expect("force_ocr_pages extraction must succeed");
    let all_images = extraction
        .images
        .as_ref()
        .expect("images must be Some when include_page_rasters=true");

    let page_rasters: Vec<_> = all_images
        .iter()
        .filter(|image| image.image_kind == Some(ImageKind::PageRaster))
        .collect();
    assert!(
        !page_rasters.is_empty(),
        "include_page_rasters=true on force_ocr_pages=[1] must produce at least one PageRaster; \
         got {} total images but none with PageRaster kind",
        all_images.len()
    );

    // Page 1 was the only selection, so no raster may point anywhere else.
    for page_raster in &page_rasters {
        assert_eq!(
            page_raster.page_number,
            Some(1),
            "force_ocr_pages=[1] rasters must all be page_number=1; got {:?}",
            page_raster.page_number
        );
        assert!(
            !page_raster.data.is_empty(),
            "PageRaster at image_index={} must have non-empty PNG data",
            page_raster.image_index
        );
    }

    // `HashSet::insert` returns false on a repeat, which flags a duplicate index.
    let mut seen_indices = std::collections::HashSet::new();
    for image in all_images {
        assert!(
            seen_indices.insert(image.image_index),
            "image_index {} appears more than once — reindex produced duplicates",
            image.image_index
        );
    }
}
/// Selecting only out-of-range pages via `force_ocr_pages` (page 99 here) must
/// not emit a `ProcessingWarning` whose `source` is `"page_rasters"`, even with
/// `include_page_rasters=true`.
///
/// Per the original test notes, `extract_mixed_ocr_native` returns `None` for
/// rasters in this situation because `render_selected_pages_for_ocr` produces
/// an empty list; that is not a document-level bypass, so the warning would be
/// misleading. The extraction itself must also succeed rather than error.
///
/// The test prints a SKIP line and returns early when the `fake_memo.pdf`
/// fixture is not present.
#[cfg(feature = "ocr")]
#[test]
fn test_include_page_rasters_no_warning_on_out_of_range_pages() {
    use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig};
    use kreuzberg::extract_file_sync;

    let fixture = test_documents_dir().join("pdf/fake_memo.pdf");
    if !fixture.exists() {
        eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present");
        return;
    }

    let tesseract = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng"),
        ..Default::default()
    };
    let image_options = ImageExtractionConfig {
        include_page_rasters: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(tesseract),
        // Page 99 is intended to lie beyond the end of the fixture.
        force_ocr_pages: Some(vec![99]),
        images: Some(image_options),
        use_cache: false,
        ..Default::default()
    };

    let extraction =
        extract_file_sync(&fixture, None, &config).expect("out-of-range force_ocr_pages must not error");

    let has_raster_warning = extraction
        .processing_warnings
        .iter()
        .any(|warning| warning.source.as_ref() == "page_rasters");
    assert!(
        !has_raster_warning,
        "force_ocr_pages with all-out-of-range pages must not emit a page_rasters warning \
         (no document-level bypass occurred); got: {:?}",
        extraction
            .processing_warnings
            .iter()
            .map(|w| (w.source.as_ref(), w.message.as_ref()))
            .collect::<Vec<_>>()
    );
}
/// When `include_page_rasters=true` but the active OCR backend uses document-level
/// processing (bypassing per-page rendering), a `ProcessingWarning` with
/// `source = "page_rasters"` must be emitted.
///
/// This is the only scenario where `None` rasters flow through the `force_ocr`
/// path while `used_ocr=true`. Exercises the warning guard in
/// `PdfExtractor::extract` (mod.rs).
///
/// A mock backend with `supports_document_processing() = true` is registered so
/// that `extract_with_ocr` takes the document-level bypass instead of per-page
/// rendering. The mock returns an empty result — enough to trigger the warning.
///
/// The mock is unregistered *before* the extraction outcome is unwrapped, so an
/// extraction error does not leave the backend registered for other tests that
/// run in the same process.
///
/// The test prints a SKIP line and returns early when the `fake_memo.pdf`
/// fixture is not present.
#[cfg(feature = "ocr")]
#[test]
fn test_include_page_rasters_emits_warning_on_document_level_ocr_bypass() {
    use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig};
    use kreuzberg::core::extractor::extract_file;
    use kreuzberg::plugins::{OcrBackend, OcrBackendType, Plugin};
    use kreuzberg::types::ExtractionResult;
    use std::path::Path;
    use std::sync::Arc;

    /// Single source of truth for the mock's name: it is reported by
    /// `Plugin::name`, selected in `OcrConfig`, and used to unregister.
    const MOCK_BACKEND_NAME: &str = "doc-level-mock-raster-warn";

    let pdf_path = test_documents_dir().join("pdf/fake_memo.pdf");
    if !pdf_path.exists() {
        eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present");
        return;
    }

    /// OCR backend that advertises document-level processing and returns an
    /// empty result from it.
    struct DocLevelMock;

    #[async_trait::async_trait]
    impl OcrBackend for DocLevelMock {
        fn backend_type(&self) -> OcrBackendType {
            OcrBackendType::Custom
        }

        fn supports_language(&self, _: &str) -> bool {
            true
        }

        // Reaching the per-image entry point means the bypass was not taken.
        async fn process_image(&self, _: &[u8], _: &OcrConfig) -> kreuzberg::Result<ExtractionResult> {
            panic!("process_image must not be called for a document-level backend")
        }

        fn supports_document_processing(&self) -> bool {
            true
        }

        async fn process_document(&self, _: &Path, _: &OcrConfig) -> kreuzberg::Result<ExtractionResult> {
            Ok(ExtractionResult::default())
        }
    }

    impl Plugin for DocLevelMock {
        fn name(&self) -> &str {
            MOCK_BACKEND_NAME
        }

        fn version(&self) -> String {
            "1.0.0".to_string()
        }

        fn initialize(&self) -> kreuzberg::Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> kreuzberg::Result<()> {
            Ok(())
        }
    }

    kreuzberg::plugins::register_ocr_backend(Arc::new(DocLevelMock)).unwrap();

    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: MOCK_BACKEND_NAME.to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        images: Some(ImageExtractionConfig {
            include_page_rasters: true,
            ..Default::default()
        }),
        use_cache: false,
        ..Default::default()
    };

    let rt = tokio::runtime::Runtime::new().unwrap();
    let outcome = rt.block_on(extract_file(&pdf_path, None, &config));

    // Clean up first, inspect the outcome second: unwrapping before this call
    // would skip the unregistration whenever extraction returned an error.
    kreuzberg::plugins::unregister_ocr_backend(MOCK_BACKEND_NAME).unwrap();

    let result = outcome.expect("document-level OCR mock must succeed");

    let has_raster_warning = result
        .processing_warnings
        .iter()
        .any(|w| w.source.as_ref() == "page_rasters");
    assert!(
        has_raster_warning,
        "expected a page_rasters ProcessingWarning when include_page_rasters=true \
         and OCR backend uses document-level processing; got warnings: {:?}",
        result
            .processing_warnings
            .iter()
            .map(|w| (w.source.as_ref(), w.message.as_ref()))
            .collect::<Vec<_>>()
    );
}
/// With `extract_images=false`, no chunk may carry any `image_indices`.
///
/// Extraction runs in Markdown mode with chunking enabled
/// (`max_characters = 500`) against `embedded_images_tables.pdf`.
#[cfg(feature = "chunking")]
#[test]
fn test_chunk_image_indices_empty_when_images_disabled() {
    use kreuzberg::core::config::{ChunkingConfig, ImageExtractionConfig};

    let fixture = test_documents_dir().join("pdf/embedded_images_tables.pdf");

    let image_options = ImageExtractionConfig {
        extract_images: false,
        ..Default::default()
    };
    let chunk_options = ChunkingConfig {
        max_characters: 500,
        ..Default::default()
    };
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        images: Some(image_options),
        chunking: Some(chunk_options),
        ..Default::default()
    };

    let runtime = tokio::runtime::Runtime::new().unwrap();
    let extraction = runtime.block_on(extract_file(&fixture, None, &config)).unwrap();

    // Iterating the Option yields nothing when `chunks` is None.
    // NOTE(review): in that case the test passes without checking anything —
    // confirm whether `chunks` should be required to be Some here.
    for chunk in extraction.chunks.iter().flatten() {
        assert!(
            chunk.metadata.image_indices.is_empty(),
            "chunk must have no image_indices when extract_images=false"
        );
    }
}