162 lines
5.2 KiB
Rust
162 lines
5.2 KiB
Rust
|
|
//! Regression tests for https://github.com/kreuzberg-dev/kreuzberg/issues/671
|
||
|
|
//!
|
||
|
|
//! ImageExtractionConfig.inject_placeholders was silently ignored on PPTX:
|
||
|
|
//! setting inject_placeholders=False had no effect and all three configs
|
||
|
|
//! (default / inject_placeholders=False / extract_images=False) produced
|
||
|
|
//! byte-identical output that still contained 34 `![` image refs.
|
||
|
|
//!
|
||
|
|
//! Root cause: inject_placeholders was defined in ImageExtractionConfig but
|
||
|
|
//! never read. Image references were injected unconditionally in
|
||
|
|
//! Slide::to_markdown regardless of the flag value.
|
||
|
|
|
||
|
|
mod helpers;
|
||
|
|
|
||
|
|
use kreuzberg::core::config::{ExtractionConfig, ImageExtractionConfig, OutputFormat};
|
||
|
|
use kreuzberg::core::extractor::extract_file_sync;
|
||
|
|
use std::path::Path;
|
||
|
|
|
||
|
|
fn extract_md(path: &Path, config: ExtractionConfig) -> String {
|
||
|
|
if !path.exists() {
|
||
|
|
return String::new();
|
||
|
|
}
|
||
|
|
extract_file_sync(path, None, &config)
|
||
|
|
.expect("extraction must succeed")
|
||
|
|
.content
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Core invariant: when the default Markdown output contains `![`, extracting
/// the same file with `inject_placeholders: false` must yield strictly fewer
/// `![` occurrences.
///
/// Returns early without failing when the fixture file is missing or when the
/// default output contains no `![` at all.
#[test]
fn test_inject_placeholders_false_removes_image_refs() {
    let fixture = Path::new("test_documents/pptx/powerpoint_with_image.pptx");
    if !fixture.exists() {
        eprintln!("Skipping: fixture not found at {}", fixture.display());
        return;
    }

    let with_placeholders = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        use_cache: false,
        ..Default::default()
    };
    let images_without_placeholders = ImageExtractionConfig {
        inject_placeholders: false,
        ..Default::default()
    };
    let without_placeholders = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        use_cache: false,
        images: Some(images_without_placeholders),
        ..Default::default()
    };

    let baseline_md = extract_md(fixture, with_placeholders);
    let suppressed_md = extract_md(fixture, without_placeholders);

    let count_refs = |md: &str| md.matches("![").count();

    // Without any image reference in the baseline the comparison below would
    // be meaningless, so bail out instead of asserting.
    let baseline_refs = count_refs(&baseline_md);
    if baseline_refs == 0 {
        eprintln!("Skipping: fixture contains no image references");
        return;
    }
    let suppressed_refs = count_refs(&suppressed_md);

    assert!(
        suppressed_refs < baseline_refs,
        "inject_placeholders=false must reduce image reference count \
         (got {} vs {} with default). The flag is still being ignored.",
        suppressed_refs,
        baseline_refs,
    );
}
|
||
|
|
|
||
|
|
/// Turning `inject_placeholders` off must leave the non-image text intact.
///
/// Words are counted on every output line that does not begin (after leading
/// whitespace) with `![`. The count with `inject_placeholders: false` may be
/// at most five words lower than the count with the default config.
///
/// Returns early without failing when the fixture file is missing.
#[test]
fn test_inject_placeholders_false_preserves_text_content() {
    let fixture = Path::new("test_documents/pptx/powerpoint_with_image.pptx");
    if !fixture.exists() {
        eprintln!("Skipping: fixture not found at {}", fixture.display());
        return;
    }

    let with_placeholders = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        use_cache: false,
        ..Default::default()
    };
    let without_placeholders = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        use_cache: false,
        images: Some(ImageExtractionConfig {
            inject_placeholders: false,
            ..Default::default()
        }),
        ..Default::default()
    };

    let baseline_md = extract_md(fixture, with_placeholders);
    let suppressed_md = extract_md(fixture, without_placeholders);

    // Whitespace-separated word count over all lines that are not image refs.
    let text_word_count = |md: &str| -> usize {
        md.lines()
            .filter(|line| !line.trim_start().starts_with("!["))
            .map(|line| line.split_whitespace().count())
            .sum()
    };

    let baseline_words = text_word_count(&baseline_md);
    let suppressed_words = text_word_count(&suppressed_md);

    // Tolerate a shortfall of up to five words relative to the baseline.
    let minimum_words = baseline_words.saturating_sub(5);

    assert!(
        suppressed_words >= minimum_words,
        "Text content must be preserved when inject_placeholders=false \
         (got {} vs {} words after stripping image lines)",
        suppressed_words,
        baseline_words,
    );
}
|
||
|
|
|
||
|
|
/// Guards against the symptom from the issue report, where every config gave
/// byte-identical output: if the default Markdown output contains `![`, the
/// output produced with `inject_placeholders: false` must not be equal to it.
///
/// Returns early without failing when the fixture file is missing or when the
/// default output contains no `![`.
#[test]
fn test_configs_produce_different_output() {
    let fixture = Path::new("test_documents/pptx/powerpoint_with_image.pptx");
    if !fixture.exists() {
        eprintln!("Skipping: fixture not found at {}", fixture.display());
        return;
    }

    let with_placeholders = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        use_cache: false,
        ..Default::default()
    };
    let baseline_md = extract_md(fixture, with_placeholders);

    let without_placeholders = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        use_cache: false,
        images: Some(ImageExtractionConfig {
            inject_placeholders: false,
            ..Default::default()
        }),
        ..Default::default()
    };
    let suppressed_md = extract_md(fixture, without_placeholders);

    // Nothing to compare when the baseline has no image reference at all.
    if baseline_md.find("![").is_none() {
        eprintln!("Skipping: fixture contains no image references");
        return;
    }

    assert_ne!(
        baseline_md, suppressed_md,
        "default config and inject_placeholders=false must produce different output \
         when the PPTX contains images. The flag is still being ignored."
    );
}
|