//! Regression tests for PDF image extraction in markdown output. //! //! Verifies that embedded images in PDFs produce proper `![](image_N.fmt)` //! references instead of empty `![]()` placeholders. #![cfg(feature = "pdf")] use kreuzberg::core::config::{ExtractionConfig, OutputFormat}; use kreuzberg::core::extractor::extract_file; use std::path::PathBuf; mod helpers; fn test_documents_dir() -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")) .parent() .unwrap() .parent() .unwrap() .join("test_documents") } fn extract_markdown(relative_path: &str) -> kreuzberg::types::ExtractionResult { use kreuzberg::core::config::ImageExtractionConfig; let path = test_documents_dir().join(relative_path); let config = ExtractionConfig { output_format: OutputFormat::Markdown, images: Some(ImageExtractionConfig { extract_images: true, ..Default::default() }), ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); rt.block_on(extract_file(&path, None, &config)).unwrap() } #[test] fn test_multipage_marketing_no_empty_image_refs() { let result = extract_markdown("pdf/multipage_marketing.pdf"); let content = &result.content; // Must not contain empty image references assert!( !content.contains("![]()"), "Markdown output must not contain empty image references ![](), got:\n{}", content ); } #[test] fn test_multipage_marketing_has_image_refs() { let result = extract_markdown("pdf/multipage_marketing.pdf"); let content = &result.content; // Must contain at least one proper image reference assert!( content.contains("![](image_"), "Markdown output must contain image references like ![](image_N.png), got:\n{}", content ); } #[test] fn test_multipage_marketing_images_populated() { let result = extract_markdown("pdf/multipage_marketing.pdf"); // Extraction result must have images with actual data let images = result.images.as_ref().expect("images field must be Some"); assert!(!images.is_empty(), "Extraction result must contain extracted images"); // At least some images should have non-empty data let images_with_data = images.iter().filter(|img| !img.data.is_empty()).count(); assert!( images_with_data > 0, "At least some images should have actual pixel data, got {} images total but none with data", images.len() ); } #[test] fn test_docling_no_empty_image_refs() { let result = extract_markdown("pdf/docling.pdf"); let content = &result.content; assert!( !content.contains("![]()"), "Docling markdown must not contain empty image references ![](), got:\n{}", content ); } #[test] fn test_docling_has_image_refs() { let result = extract_markdown("pdf/docling.pdf"); let content = &result.content; // Docling has at least 1 figure assert!( content.contains("![](image_"), "Docling markdown must contain image references, got:\n{}", content ); } #[test] fn test_docling_content_quality() { let result = extract_markdown("pdf/docling.pdf"); let content = &result.content; // Verify key content from the Docling technical report is present assert!(content.contains("Docling"), "Must contain 'Docling'"); assert!(content.contains("PDF"), "Must contain 'PDF'"); assert!( content.contains("table structure recognition") || content.contains("TableFormer"), "Must mention table structure recognition or TableFormer" ); } /// Regression test for issue #752: structured output was ~1000x slower than text /// on Ghostscript-produced PDFs with many inline images (~1,924 per page). /// /// Root cause: `populate_images_from_oxide` used `Vec::contains` (O(N)) inside /// the per-page object loop — O(N²) total. Fixed by converting to `AHashSet` for /// O(1) lookup before the loop. /// /// This test skips when the repro file is absent (it is not committed to the /// repository due to size). To reproduce locally, generate a Ghostscript vector /// decomposition PDF and place it at: /// test_documents/pdf/ghostscript_inline_images_repro.pdf #[test] fn test_ghostscript_inline_images_completes_in_reasonable_time() { let path = test_documents_dir().join("pdf/ghostscript_inline_images_repro.pdf"); if !path.exists() { eprintln!("SKIP: test_documents/pdf/ghostscript_inline_images_repro.pdf not present"); return; } let config = kreuzberg::core::config::ExtractionConfig { output_format: kreuzberg::core::config::OutputFormat::Markdown, ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); let start = std::time::Instant::now(); let result = rt .block_on(kreuzberg::core::extractor::extract_file(&path, None, &config)) .expect("extraction must succeed for Ghostscript inline-image PDF"); let elapsed = start.elapsed(); // Before the fix, a single-page PDF with ~1,924 inline images took ~56 seconds. // After the fix it should complete in well under 10 seconds even on slow CI. assert!( elapsed.as_secs() < 10, "Ghostscript inline-image PDF must extract in under 10 seconds, took {elapsed:?}" ); // The file has no text — content may be empty or minimal; that is expected. let _ = result; } // ─── Regression tests for issue #796 ──────────────────────────────────────── // // Before the fix, setting `images.extract_images = false` (or // `pdf_options.extract_images = false`) still caused full base64 image data to // appear in `ExtractionResult.images` when `output_format` was `Markdown` or // `Djot`. The root cause was that `inject_placeholders` in `extraction.rs` // defaulted to `true` without checking `extract_images`, allowing the structure // pipeline to call `populate_images_from_oxide` unconditionally. /// Helper: extract with a specific output format and images explicitly disabled /// via `ImageExtractionConfig.extract_images = false`. fn extract_no_images(relative_path: &str, fmt: OutputFormat) -> kreuzberg::types::ExtractionResult { use kreuzberg::core::config::ImageExtractionConfig; let path = test_documents_dir().join(relative_path); let config = ExtractionConfig { output_format: fmt, images: Some(ImageExtractionConfig { extract_images: false, ..Default::default() }), ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); rt.block_on(kreuzberg::core::extractor::extract_file(&path, None, &config)) .unwrap() } /// Helper: extract with a specific output format and images disabled via /// `PdfConfig.extract_images = false`. fn extract_no_images_via_pdf_options(relative_path: &str, fmt: OutputFormat) -> kreuzberg::types::ExtractionResult { use kreuzberg::core::config::pdf::PdfConfig; let path = test_documents_dir().join(relative_path); let config = ExtractionConfig { output_format: fmt, pdf_options: Some(PdfConfig { extract_images: false, ..Default::default() }), ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); rt.block_on(kreuzberg::core::extractor::extract_file(&path, None, &config)) .unwrap() } /// Regression #796: images must be absent when extract_images=false, output_format=Markdown. /// /// Uses `embedded_images_tables.pdf` — a known-image PDF. Before the fix, this /// returned `ExtractionResult.images` with full base64 data despite the flag. #[test] fn test_regression_796_markdown_no_images_when_disabled_via_images_config() { let result = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Markdown); assert!( result.images.as_ref().map(|v| v.is_empty()).unwrap_or(true), "images.extract_images=false must produce an empty images list even for \ output_format=Markdown. Got {} image(s).", result.images.as_ref().map(|v| v.len()).unwrap_or(0) ); // Confirm the text content was still extracted (no regression on content). assert!( !result.content.is_empty(), "Content must still be extracted when images are disabled" ); } /// Regression #796: same assertion for Djot output format. #[test] fn test_regression_796_djot_no_images_when_disabled_via_images_config() { let result = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Djot); assert!( result.images.as_ref().map(|v| v.is_empty()).unwrap_or(true), "images.extract_images=false must produce an empty images list even for \ output_format=Djot. Got {} image(s).", result.images.as_ref().map(|v| v.len()).unwrap_or(0) ); } /// Regression #796: the pdf_options.extract_images path must also be respected /// when output_format=Markdown. #[test] fn test_regression_796_markdown_no_images_when_disabled_via_pdf_options() { let result = extract_no_images_via_pdf_options("pdf/embedded_images_tables.pdf", OutputFormat::Markdown); assert!( result.images.as_ref().map(|v| v.is_empty()).unwrap_or(true), "pdf_options.extract_images=false must produce an empty images list even for \ output_format=Markdown. Got {} image(s).", result.images.as_ref().map(|v| v.len()).unwrap_or(0) ); } /// Sanity check: images must still appear when extract_images=true (no regression). #[test] fn test_regression_796_markdown_images_present_when_enabled() { use kreuzberg::core::config::ImageExtractionConfig; let path = test_documents_dir().join("pdf/embedded_images_tables.pdf"); let config = ExtractionConfig { output_format: OutputFormat::Markdown, images: Some(ImageExtractionConfig { extract_images: true, ..Default::default() }), ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); let result = rt .block_on(kreuzberg::core::extractor::extract_file(&path, None, &config)) .unwrap(); let images = result .images .as_ref() .expect("images must be Some when extract_images=true"); assert!( !images.is_empty(), "images list must be non-empty when extract_images=true and the PDF contains images" ); } /// Plain-text baseline: images must never appear for plain output (already passing /// before the fix; kept as a safety net). #[test] fn test_regression_796_plain_no_images_when_disabled() { let result = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Plain); assert!( result.images.as_ref().map(|v| v.is_empty()).unwrap_or(true), "Plain output with extract_images=false must have no images. Got {} image(s).", result.images.as_ref().map(|v| v.len()).unwrap_or(0) ); } // ─── Content-level image suppression tests ─────────────────────────────────── // // The earlier #796 tests only assert `result.images.is_empty()`. That field is // gated separately (extraction.rs:112) and is always empty when // `extract_images=false`, even if the `inject_placeholders` guard at line 216 is // removed. The guard controls whether `ElementKind::Image` elements are injected // into the InternalDocument — which in turn controls whether image placeholder // references (`![]()` / `![](image_N.fmt)`) appear in `result.content`. // // The Djot renderer (`djot.rs`) lacked the `doc.images.get()` None check that // comrak_bridge, html_styled, and plain all have. Removing the guard would cause // `![]()` to leak into Djot content with no test catching it. // // JSON renderer gap (out of scope): json.rs emits `{"type":"image","alt":null,"src":null}` // for orphaned elements — null fields are valid structured JSON and produce no broken // markup, so it is intentionally not addressed here. /// Djot content must not contain image markup when `extract_images=false`. /// /// End-to-end contract test: requires both the `inject_placeholders` guard in /// `extraction.rs` AND the Djot renderer's `None` guard to be absent before it /// fails. The renderer-level unit test `test_djot_renderer_skips_orphaned_image_element` /// in `djot.rs` is the minimal proof that the renderer fix works independently. #[test] fn test_djot_content_has_no_image_refs_when_extraction_disabled() { let result = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Djot); assert!( !result.content.contains("![]()"), "Djot output must not contain empty ![]() refs when extract_images=false.\n\ Got content:\n{}", &result.content[..result.content.len().min(400)] ); assert!( !result.content.contains("![](image_"), "Djot output must not contain image placeholder refs when extract_images=false.\n\ Got content:\n{}", &result.content[..result.content.len().min(400)] ); // Text content must still be present — no regression on extraction. assert!( !result.content.is_empty(), "Djot content must not be empty when images are disabled" ); } /// Markdown content must not contain image markup when `extract_images=false`. /// /// comrak_bridge already has a None guard so this would pass even without the /// extraction-level guard, but it pins the end-to-end contract explicitly. #[test] fn test_markdown_content_has_no_image_refs_when_extraction_disabled() { let result = extract_no_images("pdf/embedded_images_tables.pdf", OutputFormat::Markdown); assert!( !result.content.contains("![]()"), "Markdown output must not contain empty ![]() refs when extract_images=false.\n\ Got content:\n{}", &result.content[..result.content.len().min(400)] ); assert!( !result.content.contains("![](image_"), "Markdown output must not contain image placeholder refs when extract_images=false.\n\ Got content:\n{}", &result.content[..result.content.len().min(400)] ); } /// Djot content must not contain image markup when disabled via `pdf_options.extract_images`. /// /// Verifies both config paths are covered — mirrors the existing `result.images` /// test for the pdf_options path. #[test] fn test_djot_content_has_no_image_refs_when_disabled_via_pdf_options() { let result = extract_no_images_via_pdf_options("pdf/embedded_images_tables.pdf", OutputFormat::Djot); assert!( !result.content.contains("![]()"), "Djot output (pdf_options path) must not contain ![]() when extract_images=false" ); assert!( !result.content.contains("![](image_"), "Djot output (pdf_options path) must not contain image refs when extract_images=false" ); } // ─── Page-level and chunk-level image index references ──────────────────────── // // Pages carry `image_indices: Vec` — zero-based indices into the // top-level `ExtractionResult.images` collection. Chunks carry the same field. fn extract_with_pages_and_images(relative_path: &str) -> kreuzberg::types::ExtractionResult { use kreuzberg::core::config::{ImageExtractionConfig, PageConfig}; let path = test_documents_dir().join(relative_path); let config = ExtractionConfig { output_format: OutputFormat::Markdown, pages: Some(PageConfig { extract_pages: true, ..Default::default() }), images: Some(ImageExtractionConfig { extract_images: true, ..Default::default() }), ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); rt.block_on(extract_file(&path, None, &config)).unwrap() } /// Pages that contain images must have non-empty `image_indices` pointing into /// `ExtractionResult.images`. Every index must be in-bounds. #[test] fn test_page_image_indices_are_valid_when_images_extracted() { let result = extract_with_pages_and_images("pdf/embedded_images_tables.pdf"); let images = result.images.as_ref().expect("images must be Some"); assert!(!images.is_empty(), "fixture must have extracted images"); let pages = result .pages .as_ref() .expect("pages must be Some when extract_pages=true"); assert!(!pages.is_empty(), "fixture must have pages"); // At least one page must carry image_indices (not all pages need images). let pages_with_images: Vec<_> = pages.iter().filter(|p| !p.image_indices.is_empty()).collect(); assert!( !pages_with_images.is_empty(), "at least one page must have image_indices populated when the PDF contains images" ); // Every index must be in-bounds and the referenced image must report // belonging to this page (cross-validation: wrong-page bugs would pass a // bounds-only check). for page in pages { for &idx in &page.image_indices { assert!( (idx as usize) < images.len(), "page {} image_indices[{}] = {} is out of bounds (images.len() = {})", page.page_number, idx, idx, images.len() ); let img_page = images[idx as usize].page_number; assert_eq!( img_page, Some(page.page_number), "image at index {} has page_number {:?} but is referenced by page {}", idx, img_page, page.page_number ); } } } /// `image_indices` on pages must be empty when image extraction is disabled. #[test] fn test_page_image_indices_empty_when_images_disabled() { use kreuzberg::core::config::{ImageExtractionConfig, PageConfig}; let path = test_documents_dir().join("pdf/embedded_images_tables.pdf"); let config = ExtractionConfig { output_format: OutputFormat::Markdown, pages: Some(PageConfig { extract_pages: true, ..Default::default() }), images: Some(ImageExtractionConfig { extract_images: false, ..Default::default() }), ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); let result = rt.block_on(extract_file(&path, None, &config)).unwrap(); if let Some(pages) = result.pages.as_ref() { for page in pages { assert!( page.image_indices.is_empty(), "page {} must have no image_indices when extract_images=false", page.page_number ); } } } #[cfg(feature = "chunking")] fn extract_with_pages_images_and_chunks(relative_path: &str) -> kreuzberg::types::ExtractionResult { use kreuzberg::core::config::{ChunkingConfig, ImageExtractionConfig, PageConfig}; let path = test_documents_dir().join(relative_path); let config = ExtractionConfig { output_format: OutputFormat::Markdown, pages: Some(PageConfig { extract_pages: true, ..Default::default() }), images: Some(ImageExtractionConfig { extract_images: true, ..Default::default() }), chunking: Some(ChunkingConfig { max_characters: 500, ..Default::default() }), ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); rt.block_on(extract_file(&path, None, &config)).unwrap() } /// Chunks that span pages containing images must have non-empty `image_indices`. /// Every index must be in-bounds, and the referenced image's `page_number` must /// fall within the chunk's `[first_page, last_page]` range. #[cfg(feature = "chunking")] #[test] fn test_chunk_image_indices_are_valid_when_images_extracted() { let result = extract_with_pages_images_and_chunks("pdf/embedded_images_tables.pdf"); let images = result.images.as_ref().expect("images must be Some"); assert!(!images.is_empty(), "fixture must have extracted images"); let chunks = result .chunks .as_ref() .expect("chunks must be Some when chunking is configured"); assert!(!chunks.is_empty(), "fixture must produce chunks"); // At least one chunk must carry image_indices. let chunks_with_images: Vec<_> = chunks.iter().filter(|c| !c.metadata.image_indices.is_empty()).collect(); assert!( !chunks_with_images.is_empty(), "at least one chunk must have image_indices when the PDF contains images" ); for chunk in chunks { for &idx in &chunk.metadata.image_indices { // In-bounds check. assert!( (idx as usize) < images.len(), "chunk image_indices[{}] = {} is out of bounds (images.len() = {})", idx, idx, images.len() ); // Cross-validation: referenced image must belong to a page within // the chunk's page range. if let (Some(first), Some(last)) = (chunk.metadata.first_page, chunk.metadata.last_page) { let img_page = images[idx as usize] .page_number .expect("image referenced by a chunk must have a page_number set"); assert!( img_page >= first && img_page <= last, "image at index {} is on page {} but chunk covers pages [{}, {}]", idx, img_page, first, last ); } } } } /// Regression for #985: max_images_per_page must cap the output count per page. /// /// Before the fix, `extract_image_positions` ran a complete decompression pass /// over every page unconditionally (even when extract_images=false), then /// `extract_images_with_data` ran a second pass. The `.take(N)` limit only /// clipped the returned slice — it did not stop the decompression work. /// /// After the fix: /// - When extract_images=false, NO decompression occurs at all (the main hang fix). /// - When extract_images=true, a single pass runs and the cap is respected in output. /// The per-page decompression cost for images beyond the cap is a pdf_oxide /// upstream limitation: `extract_images()` is eager. Eliminating that /// remaining cost requires a count-limited API upstream. #[test] fn test_max_images_per_page_cap_respected_in_output() { use kreuzberg::core::config::ImageExtractionConfig; use std::collections::HashMap; let path = test_documents_dir().join("pdf/installatiehandleiding_kombi_kompakt_hr.pdf"); if !path.exists() { eprintln!("skipping: test PDF not present at {}", path.display()); return; } let cap: u32 = 5; let config = ExtractionConfig { images: Some(ImageExtractionConfig { extract_images: true, max_images_per_page: Some(cap), ..Default::default() }), ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); let result = rt .block_on(extract_file(&path, None, &config)) .expect("extraction must succeed"); let images = result .images .as_ref() .expect("images must be Some when extract_images=true"); // Cap must be respected per page in the output. let mut per_page: HashMap = HashMap::new(); for img in images { *per_page.entry(img.page_number.unwrap_or(1)).or_default() += 1; } for (page, count) in &per_page { assert!( *count <= cap as usize, "page {page} has {count} images; cap={cap} must be respected" ); } } /// Regression for #985 (no-images case): when extract_images=false, no images /// are returned and the result is consistent with the fix. /// /// Before the fix, `extract_image_positions` ran unconditionally and triggered /// a full decompression pass over every image on every page — even when the /// caller never asked for image data. After the fix the decompression path is /// skipped entirely when images are not requested. #[test] fn test_no_images_returned_when_extraction_disabled_on_dense_pdf() { let path = test_documents_dir().join("pdf/installatiehandleiding_kombi_kompakt_hr.pdf"); if !path.exists() { eprintln!("skipping: test PDF not present at {}", path.display()); return; } let config = ExtractionConfig::default(); // extract_images defaults to false let rt = tokio::runtime::Runtime::new().unwrap(); let result = rt .block_on(extract_file(&path, None, &config)) .expect("extraction must succeed"); // No images should be returned when extraction is disabled. assert!( result.images.is_none() || result.images.as_ref().is_some_and(|v| v.is_empty()), "images must be absent when extract_images=false" ); } /// Positions derived from extracted data must be consistent with the Markdown placeholders. /// /// When inject_placeholders=true, the renderer emits `![](image_N.ext)` links where N /// is the image_index. Every such N must have a corresponding entry in result.images. /// Also verifies that image_index values are unique — the derivation loop must not emit /// duplicate global indices. #[test] fn test_image_positions_consistent_with_image_data() { use kreuzberg::core::config::{ImageExtractionConfig, OutputFormat}; let path = test_documents_dir().join("pdf/embedded_images_tables.pdf"); let config = ExtractionConfig { output_format: OutputFormat::Markdown, images: Some(ImageExtractionConfig { extract_images: true, inject_placeholders: true, ..Default::default() }), ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); let result = rt.block_on(extract_file(&path, None, &config)).unwrap(); let images = match result.images.as_ref() { Some(imgs) if !imgs.is_empty() => imgs, _ => return, // no images in this PDF — nothing to verify }; // image_index values must be unique across the returned set. let mut seen = std::collections::HashSet::new(); for img in images { assert!( seen.insert(img.image_index), "image_index {} appears more than once — position derivation emitted duplicates", img.image_index ); } // Every `![](image_N.ext)` placeholder in Markdown must resolve to an index in // result.images. This would fail if inject_placeholders emitted a reference for // an image that was never extracted (orphaned placeholder). let known: std::collections::HashSet = images.iter().map(|i| i.image_index).collect(); let re = regex::Regex::new(r"!\[\]\(image_(\d+)\.[a-z]+\)").unwrap(); for cap in re.captures_iter(&result.content) { let idx: u32 = cap[1].parse().unwrap(); assert!( known.contains(&idx), "Markdown contains `![](image_{idx}.ext)` but result.images has no entry \ with image_index={idx} — orphaned placeholder" ); } } /// Regression for #985 (double-decompression fix): the text-only extraction path must /// skip `extract_images_with_data` entirely. /// /// When `extract_images` is `false` (the default), `extraction.rs` must not enter the /// images branch at all — verified here by confirming that `result.images` is `None` /// (or empty) and that the call completes without decompressing any image data. /// This is the minimal structural proof that the guard in `extraction.rs` works: /// if `extract_images_with_data` were called unconditionally, the result would be /// `Some(non_empty_vec)` for a PDF that actually contains images. #[test] fn test_no_decompression_when_images_disabled() { let path = test_documents_dir().join("pdf/embedded_images_tables.pdf"); assert!(path.exists(), "missing fixture: {}", path.display()); // Default config: extract_images defaults to false. let config = ExtractionConfig::default(); let rt = tokio::runtime::Runtime::new().unwrap(); let result = rt .block_on(kreuzberg::core::extractor::extract_file(&path, None, &config)) .expect("extraction must succeed"); // The text-only path must not return any image data. assert!( result.images.as_ref().is_none_or(|v| v.is_empty()), "images must be absent on text-only extraction (extract_images=false). \ Got {} image(s) — extract_images_with_data was called when it should not have been.", result.images.as_ref().map_or(0, |v| v.len()) ); // Text content must still be present — no regression on the extraction itself. assert!( !result.content.is_empty(), "text content must still be extracted when images are disabled" ); } /// Trace-span assertion for #985: `extract_images_with_data` must NOT be entered /// when `extract_images` is false (the default). /// /// This directly proves the decompression code path was skipped — complementing /// `test_no_decompression_when_images_disabled` which only observes the output. /// An event with target `kreuzberg::pdf::oxide::images` and field /// `event = "decompression_started"` is emitted at the top of /// `extract_images_with_data`; absence of that event is structural proof the /// function was not called. #[test] fn test_no_decompression_trace_when_images_disabled() { use std::sync::{Arc, Mutex}; use tracing_subscriber::{EnvFilter, layer::SubscriberExt as _}; // ── Captured-event layer ──────────────────────────────────────────────── #[allow(clippy::type_complexity)] #[derive(Clone, Default)] struct EventCapture { events: Arc)>>>, } impl tracing_subscriber::Layer for EventCapture where S: tracing::Subscriber, { fn on_event(&self, event: &tracing::Event<'_>, _ctx: tracing_subscriber::layer::Context<'_, S>) { let target = event.metadata().target().to_owned(); // Only record events from our target to avoid unbounded accumulation. if target != "kreuzberg::pdf::oxide::images" { return; } // Walk the fields to capture the `event` key if present. struct FieldVisitor(Option); impl tracing::field::Visit for FieldVisitor { fn record_str(&mut self, field: &tracing::field::Field, value: &str) { if field.name() == "event" { self.0 = Some(value.to_owned()); } } fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { if field.name() == "event" { self.0 = Some(format!("{value:?}")); } } } let mut visitor = FieldVisitor(None); event.record(&mut visitor); self.events.lock().unwrap().push((target, visitor.0)); } } // ── Test body ─────────────────────────────────────────────────────────── let path = test_documents_dir().join("pdf/embedded_images_tables.pdf"); assert!(path.exists(), "missing fixture: {}", path.display()); let capture = EventCapture::default(); let capture_clone = capture.clone(); // Enable DEBUG so the tracing event would be visible if the function ran. let filter = EnvFilter::new("debug"); let subscriber = tracing_subscriber::registry().with(filter).with(capture_clone); // Wrap the runtime inside with_default so all spans/events are recorded. let result = tracing::subscriber::with_default(subscriber, || { let config = ExtractionConfig::default(); // extract_images defaults to false tokio::runtime::Builder::new_current_thread() .enable_all() .build() .unwrap() .block_on(kreuzberg::core::extractor::extract_file(&path, None, &config)) .expect("extraction must succeed") }); // Output assertion: no image data returned. assert!( result.images.as_ref().is_none_or(|v| v.is_empty()), "images must be absent when extract_images=false" ); // Trace assertion: the decompression_started event must not have fired. let events = capture.events.lock().unwrap(); let decompression_events: Vec<_> = events .iter() .filter(|(target, event_field)| { target == "kreuzberg::pdf::oxide::images" && event_field.as_deref() == Some("decompression_started") }) .collect(); assert!( decompression_events.is_empty(), "extract_images_with_data must not be entered when extract_images=false; \ got {} decompression_started event(s)", decompression_events.len() ); } // ─── ocr_inline_images decompression path ───────────────────────────────────── // // When `ocr_inline_images=true`, the extraction branch condition // `images_extraction_enabled || ocr_inline_images` is true regardless of // `extract_images`. Images are decompressed and stored in `result.images` even // when `ImageExtractionConfig.extract_images = false`. Without this test, a // regression that short-circuits the extraction when `images_extraction_enabled` // is false would go undetected. // // Note: unbounded decompression when `ocr_inline_images=true` and // `config.images=None` (no cap) is a known limitation tracked separately in // kreuzberg#989. Set `config.images.max_images_per_page` to apply a cap. /// When `ocr_inline_images=true` and `extract_images=false`, images must still /// be decompressed — `ocr_inline_images` forces entry into the extraction branch. /// /// Before the fix for #985 this path was doubly dangerous: the unconditional /// `extract_image_positions` call ran even when `extract_images=false`, and on /// the oxide path the decompression was unbounded. The OCR path was never /// covered by a test, so a regression disabling decompression for /// `ocr_inline_images=true` would be invisible. #[test] fn test_ocr_inline_images_enters_decompression_path() { use kreuzberg::PdfConfig; use kreuzberg::core::config::ImageExtractionConfig; let path = test_documents_dir().join("pdf/embedded_images_tables.pdf"); assert!(path.exists(), "missing fixture: {}", path.display()); let config = ExtractionConfig { output_format: OutputFormat::Markdown, // Explicitly disable extract_images — images_extraction_enabled will be false. images: Some(ImageExtractionConfig { extract_images: false, ..Default::default() }), // Enable ocr_inline_images — this must force entry into the extraction branch. pdf_options: Some(PdfConfig { ocr_inline_images: true, ..Default::default() }), ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); let result = rt.block_on(extract_file(&path, None, &config)).unwrap(); // Images must be decompressed even though extract_images=false, because // ocr_inline_images=true enters the extraction branch regardless. let images = result.images.as_ref().expect( "result.images must be Some when ocr_inline_images=true, \ even if extract_images=false — the extraction branch must be entered", ); assert!( !images.is_empty(), "embedded_images_tables.pdf has embedded images; result.images must be non-empty \ when ocr_inline_images=true forces entry into the decompression branch" ); } // ─── include_page_rasters integration tests ────────────────────────────────── // // These tests exercise the full pipeline: ExtractionConfig with // include_page_rasters=true → PDF OCR via Tesseract (per-page rendering) → // merge/reindex in mod.rs → ExtractionResult.images contains PageRaster entries. // // They are the minimum proof that build_page_raster_image is actually called and // that the result survives the merge/reindex at mod.rs:501-507. The unit tests // for build_page_raster_image in ocr.rs verify the helper itself; these tests // verify the integration path. /// Enabling `include_page_rasters` on a PDF with `force_ocr=true` must produce /// `ImageKind::PageRaster` entries in `ExtractionResult.images`. /// /// Verifies: /// - At least one `PageRaster` entry is present (per-page rendering ran). /// - Every raster has `page_number = Some(N)` where N >= 1 (1-based assignment). /// - Every raster has non-empty `data` (actual PNG bytes were captured). /// - `image_index` values are unique across the full result set (reindex in /// mod.rs:501-507 did not produce collisions). /// - No `page_rasters` processing warning (Tesseract uses per-page path, not bypass). #[cfg(feature = "ocr")] #[test] fn test_include_page_rasters_produces_rasters_on_force_ocr_pdf() { use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig}; use kreuzberg::extract_file_sync; use kreuzberg::types::ImageKind; let path = test_documents_dir().join("pdf/fake_memo.pdf"); if !path.exists() { eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present"); return; } let config = ExtractionConfig { ocr: Some(OcrConfig { backend: "tesseract".to_string(), language: "eng".to_string(), ..Default::default() }), force_ocr: true, images: Some(ImageExtractionConfig { include_page_rasters: true, ..Default::default() }), use_cache: false, ..Default::default() }; let result = extract_file_sync(&path, None, &config).expect("force_ocr extraction must succeed"); let images = result .images .as_ref() .expect("images must be Some when include_page_rasters=true"); let rasters: Vec<_> = images .iter() .filter(|img| img.image_kind == Some(ImageKind::PageRaster)) .collect(); assert!( !rasters.is_empty(), "include_page_rasters=true must produce at least one PageRaster entry; \ got {} total images but none with PageRaster kind", images.len() ); for raster in &rasters { assert!( raster.page_number.is_some(), "PageRaster at image_index={} must have page_number set", raster.image_index ); assert!( raster.page_number.unwrap() >= 1, "PageRaster page_number must be >= 1 (1-based), got {}", raster.page_number.unwrap() ); assert!( !raster.data.is_empty(), "PageRaster at image_index={} must have non-empty PNG data", raster.image_index ); } // image_index values must be unique — reindex in mod.rs:501-507 must not collide. let mut seen = std::collections::HashSet::new(); for img in images { assert!( seen.insert(img.image_index), "image_index {} appears more than once — reindex produced duplicates", img.image_index ); } // No page_rasters warning: Tesseract processes per-page, so the bypass never fires. let raster_warnings: Vec<_> = result .processing_warnings .iter() .filter(|w| w.source.as_ref() == "page_rasters") .collect(); assert!( raster_warnings.is_empty(), "no page_rasters warning expected for Tesseract per-page OCR; got: {:?}", raster_warnings.iter().map(|w| w.message.as_ref()).collect::>() ); } /// `include_page_rasters=false` (the default) must not produce any `PageRaster` /// entries even when `force_ocr=true` triggers per-page rendering. /// /// Regression guard: the raster capture is conditional on the config flag; /// removing that condition would cause this to fail. #[cfg(feature = "ocr")] #[test] fn test_include_page_rasters_false_does_not_capture_rasters() { use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig}; use kreuzberg::extract_file_sync; use kreuzberg::types::ImageKind; let path = test_documents_dir().join("pdf/fake_memo.pdf"); if !path.exists() { eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present"); return; } let config = ExtractionConfig { ocr: Some(OcrConfig { backend: "tesseract".to_string(), language: "eng".to_string(), ..Default::default() }), force_ocr: true, images: Some(ImageExtractionConfig { include_page_rasters: false, ..Default::default() }), use_cache: false, ..Default::default() }; let result = extract_file_sync(&path, None, &config).expect("force_ocr extraction must succeed"); let raster_count = result .images .as_ref() .map(|imgs| { imgs.iter() .filter(|i| i.image_kind == Some(ImageKind::PageRaster)) .count() }) .unwrap_or(0); assert_eq!( raster_count, 0, "include_page_rasters=false must not produce any PageRaster images; got {raster_count}" ); } /// `force_ocr_pages` path through `extract_mixed_ocr_native` must also produce /// `PageRaster` entries when `include_page_rasters=true`. /// /// This exercises a different code path than `force_ocr=true`: the mixed-OCR /// path in `extract_mixed_ocr_native` (ocr.rs) encodes only the selected pages, /// not all pages. Verifies that the per-page raster capture in that path works /// end-to-end and produces correctly numbered entries. #[cfg(feature = "ocr")] #[test] fn test_include_page_rasters_on_force_ocr_pages_path() { use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig}; use kreuzberg::extract_file_sync; use kreuzberg::types::ImageKind; let path = test_documents_dir().join("pdf/fake_memo.pdf"); if !path.exists() { eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present"); return; } let config = ExtractionConfig { ocr: Some(OcrConfig { backend: "tesseract".to_string(), language: "eng".to_string(), ..Default::default() }), force_ocr_pages: Some(vec![1]), images: Some(ImageExtractionConfig { include_page_rasters: true, ..Default::default() }), use_cache: false, ..Default::default() }; let result = extract_file_sync(&path, None, &config).expect("force_ocr_pages extraction must succeed"); let images = result .images .as_ref() .expect("images must be Some when include_page_rasters=true"); let rasters: Vec<_> = images .iter() .filter(|img| img.image_kind == Some(ImageKind::PageRaster)) .collect(); assert!( !rasters.is_empty(), "include_page_rasters=true on force_ocr_pages=[1] must produce at least one PageRaster; \ got {} total images but none with PageRaster kind", images.len() ); // Only page 1 was selected — all rasters must reference page 1. for raster in &rasters { assert_eq!( raster.page_number, Some(1), "force_ocr_pages=[1] rasters must all be page_number=1; got {:?}", raster.page_number ); assert!( !raster.data.is_empty(), "PageRaster at image_index={} must have non-empty PNG data", raster.image_index ); } // image_index values must be unique across the full result set. let mut seen = std::collections::HashSet::new(); for img in images { assert!( seen.insert(img.image_index), "image_index {} appears more than once — reindex produced duplicates", img.image_index ); } } /// When `force_ocr_pages` contains only page numbers that are out of range (e.g., /// page 99 on a 1-page PDF), `extract_mixed_ocr_native` returns `None` for rasters /// because `render_selected_pages_for_ocr` produces an empty list. This is NOT a /// document-level bypass, so no `ProcessingWarning` with `source = "page_rasters"` /// should be emitted even when `include_page_rasters=true`. #[cfg(feature = "ocr")] #[test] fn test_include_page_rasters_no_warning_on_out_of_range_pages() { use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig}; use kreuzberg::extract_file_sync; let path = test_documents_dir().join("pdf/fake_memo.pdf"); if !path.exists() { eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present"); return; } let config = ExtractionConfig { ocr: Some(OcrConfig { backend: "tesseract".to_string(), language: "eng".to_string(), ..Default::default() }), force_ocr_pages: Some(vec![99]), images: Some(ImageExtractionConfig { include_page_rasters: true, ..Default::default() }), use_cache: false, ..Default::default() }; let result = extract_file_sync(&path, None, &config).expect("out-of-range force_ocr_pages must not error"); let raster_warning = result .processing_warnings .iter() .find(|w| w.source.as_ref() == "page_rasters"); assert!( raster_warning.is_none(), "force_ocr_pages with all-out-of-range pages must not emit a page_rasters warning \ (no document-level bypass occurred); got: {:?}", result .processing_warnings .iter() .map(|w| (w.source.as_ref(), w.message.as_ref())) .collect::>() ); } /// When `include_page_rasters=true` but the active OCR backend uses document-level /// processing (bypassing per-page rendering), a `ProcessingWarning` with /// `source = "page_rasters"` must be emitted. /// /// This is the only scenario where `None` rasters flow through the `force_ocr` /// path while `used_ocr=true`. Exercises the warning guard in /// `PdfExtractor::extract` (mod.rs). /// /// A mock backend with `supports_document_processing() = true` is registered so /// that `extract_with_ocr` takes the document-level bypass instead of per-page /// rendering. The mock returns an empty result — enough to trigger the warning. #[cfg(feature = "ocr")] #[test] fn test_include_page_rasters_emits_warning_on_document_level_ocr_bypass() { use kreuzberg::core::config::{ImageExtractionConfig, OcrConfig}; use kreuzberg::core::extractor::extract_file; use kreuzberg::plugins::{OcrBackend, OcrBackendType, Plugin}; use kreuzberg::types::ExtractionResult; use std::path::Path; use std::sync::Arc; let pdf_path = test_documents_dir().join("pdf/fake_memo.pdf"); if !pdf_path.exists() { eprintln!("SKIP: test_documents/pdf/fake_memo.pdf not present"); return; } struct DocLevelMock; #[async_trait::async_trait] impl OcrBackend for DocLevelMock { fn backend_type(&self) -> OcrBackendType { OcrBackendType::Custom } fn supports_language(&self, _: &str) -> bool { true } async fn process_image(&self, _: &[u8], _: &OcrConfig) -> kreuzberg::Result { panic!("process_image must not be called for a document-level backend") } fn supports_document_processing(&self) -> bool { true } async fn process_document(&self, _: &Path, _: &OcrConfig) -> kreuzberg::Result { Ok(ExtractionResult::default()) } } impl Plugin for DocLevelMock { fn name(&self) -> &str { "doc-level-mock-raster-warn" } fn version(&self) -> String { "1.0.0".to_string() } fn initialize(&self) -> kreuzberg::Result<()> { Ok(()) } fn shutdown(&self) -> kreuzberg::Result<()> { Ok(()) } } kreuzberg::plugins::register_ocr_backend(Arc::new(DocLevelMock)).unwrap(); let config = ExtractionConfig { ocr: Some(OcrConfig { backend: "doc-level-mock-raster-warn".to_string(), ..Default::default() }), force_ocr: true, images: Some(ImageExtractionConfig { include_page_rasters: true, ..Default::default() }), use_cache: false, ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); let result = rt .block_on(extract_file(&pdf_path, None, &config)) .expect("document-level OCR mock must succeed"); kreuzberg::plugins::unregister_ocr_backend("doc-level-mock-raster-warn").unwrap(); let raster_warning = result .processing_warnings .iter() .find(|w| w.source.as_ref() == "page_rasters"); assert!( raster_warning.is_some(), "expected a page_rasters ProcessingWarning when include_page_rasters=true \ and OCR backend uses document-level processing; got warnings: {:?}", result .processing_warnings .iter() .map(|w| (w.source.as_ref(), w.message.as_ref())) .collect::>() ); } /// `image_indices` on chunks must be empty when image extraction is disabled. #[cfg(feature = "chunking")] #[test] fn test_chunk_image_indices_empty_when_images_disabled() { use kreuzberg::core::config::{ChunkingConfig, ImageExtractionConfig}; let path = test_documents_dir().join("pdf/embedded_images_tables.pdf"); let config = ExtractionConfig { output_format: OutputFormat::Markdown, images: Some(ImageExtractionConfig { extract_images: false, ..Default::default() }), chunking: Some(ChunkingConfig { max_characters: 500, ..Default::default() }), ..Default::default() }; let rt = tokio::runtime::Runtime::new().unwrap(); let result = rt.block_on(extract_file(&path, None, &config)).unwrap(); if let Some(chunks) = result.chunks.as_ref() { for chunk in chunks { assert!( chunk.metadata.image_indices.is_empty(), "chunk must have no image_indices when extract_images=false" ); } } }