//! Regression tests for PPTX/PPSX extraction bugs
//!
//! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
//! GitHub Issue #329: Extracting images from PPTX results in reversed page numbers
//!
//! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
//! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
//! Bug 3: Image page numbers reversed - image on slide 1 reports page_number=2
#![cfg(feature = "office")]
use kreuzberg::{ExtractionConfig, ImageExtractionConfig, extract_file};
use std::io::Write;
use tempfile::NamedTempFile;
use zip::CompressionMethod;
use zip::write::{FileOptions, ZipWriter};
/// Regression test: a PPSX (PowerPoint Show) file is extracted without a MIME hint.
///
/// PPSX files use MIME type `application/vnd.openxmlformats-officedocument.presentationml.slideshow`
/// instead of PPTX's `application/vnd.openxmlformats-officedocument.presentationml.presentation`.
///
/// The internal structure is identical to PPTX - same slide XML format.
///
/// The sample is looked up at `test_documents/pptx/sample.ppsx`, two directory levels
/// above `CARGO_MANIFEST_DIR`. When the file is absent the test prints a notice and
/// returns early instead of failing.
///
/// GitHub Issue #321 Bug 2
#[tokio::test]
async fn test_ppsx_slideshow_extraction() {
    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .expect("Operation failed")
        .parent()
        .expect("Operation failed");
    let test_file = workspace_root.join("test_documents/pptx/sample.ppsx");
    if !test_file.exists() {
        println!("Skipping test: PPSX test file not found at {:?}", test_file);
        return;
    }
    // No MIME type is supplied here, so the extension-based detection path is exercised.
    let result = extract_file(&test_file, None, &ExtractionConfig::default()).await;
    match result {
        Ok(extraction) => {
            assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
            // Truncate by characters, not bytes: slicing a string at a fixed byte offset
            // panics when that offset lands inside a multi-byte UTF-8 character.
            let preview: String = extraction.content.chars().take(200).collect();
            println!("✅ PPSX extraction succeeded!");
            // The label says "chars", so report the character count rather than the byte length.
            println!(" Content length: {} chars", extraction.content.chars().count());
            println!(" Content preview: {}", preview);
        }
        Err(e) => {
            panic!(
                "PPSX extraction failed with error: {:?}\n\
                 This is GitHub Issue #321 Bug 2: PPSX files should be supported.\n\
                 PPSX MIME type (application/vnd.openxmlformats-officedocument.presentationml.slideshow) \
                 needs to be added to extension-to-MIME mapping.",
                e
            );
        }
    }
}
/// Checks PPSX extraction when the caller passes the slideshow MIME type explicitly.
///
/// Supplying the MIME type means the outcome does not hinge on MIME detection; the test
/// only asserts that extraction succeeds and yields non-empty content.
///
/// The sample is looked up at `test_documents/pptx/sample.ppsx`, two directory levels
/// above `CARGO_MANIFEST_DIR`; a missing file causes an early return, not a failure.
///
/// GitHub Issue #321 Bug 2
#[tokio::test]
async fn test_ppsx_with_explicit_mime_type() {
    const PPSX_MIME: &str = "application/vnd.openxmlformats-officedocument.presentationml.slideshow";

    // Walk up two levels from the crate manifest directory.
    let manifest_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
    let parent_dir = manifest_dir.parent().expect("Operation failed");
    let repo_root = parent_dir.parent().expect("Operation failed");

    let test_file = repo_root.join("test_documents/pptx/sample.ppsx");
    if !test_file.exists() {
        println!("Skipping test: PPSX test file not found at {:?}", test_file);
        return;
    }

    let default_config = ExtractionConfig::default();
    let outcome = extract_file(&test_file, Some(PPSX_MIME), &default_config).await;

    // Fail fast on an extraction error so the success path reads straight through.
    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(e) => panic!(
            "PPSX extraction with explicit MIME type failed: {:?}\n\
             The PPTX extractor should handle PPSX content (identical XML structure).",
            e
        ),
    };

    assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
    println!("✅ PPSX extraction with explicit MIME type succeeded!");
}
/// Checks that a PPTX containing a shape with no `txBody` is still extracted.
///
/// Shapes that act as image placeholders are meant to hold a picture rather than text,
/// so they carry no `txBody` child. The parser is expected to skip such shapes instead
/// of aborting, and the text of the surrounding shapes must still be returned.
///
/// GitHub Issue #321 Bug 1
#[tokio::test]
async fn test_pptx_with_image_placeholder_no_txbody() {
    const PPTX_MIME: &str = "application/vnd.openxmlformats-officedocument.presentationml.presentation";

    // Assemble the fixture archive in a temporary file with a `.pptx` suffix.
    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
    {
        let mut zip = ZipWriter::new(&mut temp_file);
        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);

        // Starts a new archive entry and writes its payload in one step.
        let mut add_entry = |name: &str, payload: &[u8]| {
            zip.start_file(name, options).expect("Operation failed");
            zip.write_all(payload).expect("Operation failed");
        };

        // NOTE(review): the payloads below look markup-free as written — confirm the raw
        // strings still carry the intended XML.
        add_entry(
            "[Content_Types].xml",
            br#"
"#,
        );
        add_entry(
            "_rels/.rels",
            br#"
"#,
        );
        add_entry(
            "ppt/presentation.xml",
            br#"
"#,
        );
        add_entry(
            "ppt/_rels/presentation.xml.rels",
            br#"
"#,
        );
        // The slide under test: text shapes around a placeholder shape that has no txBody.
        add_entry(
            "ppt/slides/slide1.xml",
            br#"
This is the title text
Content after image placeholder
"#,
        );
        // Slide relationships (empty).
        add_entry(
            "ppt/slides/_rels/slide1.xml.rels",
            br#"
"#,
        );

        zip.finish().expect("Operation failed");
    }

    let outcome = extract_file(temp_file.path(), Some(PPTX_MIME), &ExtractionConfig::default()).await;

    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(e) => {
            // Distinguish the known regression from any other failure in the panic message.
            let rendered = format!("{:?}", e);
            if !rendered.contains("No txBody found") {
                panic!("PPTX extraction failed with unexpected error: {:?}", e);
            }
            panic!(
                "PPTX extraction failed with 'No txBody found' error!\n\
                 This is GitHub Issue #321 Bug 1.\n\
                 The parser should skip shapes without txBody (image placeholders) \
                 instead of failing.\n\
                 Error: {:?}",
                e
            )
        }
    };

    assert!(!extraction.content.is_empty(), "Content should not be empty");
    // Text from the shapes that do have a txBody must survive.
    assert!(
        extraction.content.contains("title text"),
        "Should extract text from first shape with txBody. Got: {}",
        extraction.content
    );
    assert!(
        extraction.content.contains("Content after"),
        "Should extract text from shape after image placeholder. Got: {}",
        extraction.content
    );
    println!("✅ PPTX with image placeholder (no txBody) extraction succeeded!");
    println!(" Content: {}", extraction.content);
}
/// Checks extraction of a slide that mixes shapes with and without `txBody`.
///
/// Expectations:
/// 1. Shapes that have a `txBody` contribute their text.
/// 2. Shapes without one (image placeholders and the like) are skipped gracefully.
/// 3. Extraction keeps going past the first shape that lacks a `txBody`.
///
/// GitHub Issue #321 Bug 1
#[tokio::test]
async fn test_pptx_mixed_shapes_extraction() {
    const PPTX_MIME: &str = "application/vnd.openxmlformats-officedocument.presentationml.presentation";

    // Assemble the fixture archive in a temporary file with a `.pptx` suffix.
    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
    {
        let mut zip = ZipWriter::new(&mut temp_file);
        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);

        // Starts a new archive entry and writes its payload in one step.
        let mut add_entry = |name: &str, payload: &[u8]| {
            zip.start_file(name, options).expect("Operation failed");
            zip.write_all(payload).expect("Operation failed");
        };

        // NOTE(review): the payloads below look markup-free as written — confirm the raw
        // strings still carry the intended XML.
        add_entry(
            "[Content_Types].xml",
            br#"
"#,
        );
        add_entry(
            "_rels/.rels",
            br#"
"#,
        );
        add_entry(
            "ppt/presentation.xml",
            br#"
"#,
        );
        add_entry(
            "ppt/_rels/presentation.xml.rels",
            br#"
"#,
        );
        // Slide with a mix of shapes: some with txBody, some without.
        add_entry(
            "ppt/slides/slide1.xml",
            br#"
First Text Shape
Second Text Shape
Third Text Shape
"#,
        );
        // Slide relationships (empty).
        add_entry(
            "ppt/slides/_rels/slide1.xml.rels",
            br#"
"#,
        );

        zip.finish().expect("Operation failed");
    }

    let outcome = extract_file(temp_file.path(), Some(PPTX_MIME), &ExtractionConfig::default()).await;

    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(e) => panic!(
            "PPTX extraction failed: {:?}\n\
             Shapes without txBody should be skipped gracefully.",
            e
        ),
    };

    // Every one of the three text shapes has to show up in the output.
    assert!(
        extraction.content.contains("First Text Shape"),
        "Should extract first text shape"
    );
    assert!(
        extraction.content.contains("Second Text Shape"),
        "Should extract second text shape (after image placeholder)"
    );
    assert!(
        extraction.content.contains("Third Text Shape"),
        "Should extract third text shape (after multiple placeholders)"
    );
    println!("✅ PPTX mixed shapes extraction succeeded!");
    println!(" All text shapes extracted despite image/chart/content placeholders without txBody");
}
/// Checks that an image extracted from a PPTX is attributed to the slide it sits on.
///
/// The fixture is meant to be a two-slide deck whose only image is on slide 1, so the
/// first extracted image must report `page_number == Some(1)` rather than a reversed value.
///
/// GitHub Issue #329: Image on slide 1 of 2-slide PPTX reports page_number=2
#[tokio::test]
async fn test_pptx_image_page_numbers_not_reversed() {
    const PPTX_MIME: &str = "application/vnd.openxmlformats-officedocument.presentationml.presentation";

    // Assemble the fixture archive in a temporary file with a `.pptx` suffix.
    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
    // A minimal 1x1 red PNG image (valid PNG format)
    let png_image: &[u8] = &[
        0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature
        0x00, 0x00, 0x00, 0x0D, // IHDR chunk length
        0x49, 0x48, 0x44, 0x52, // "IHDR"
        0x00, 0x00, 0x00, 0x01, // width: 1
        0x00, 0x00, 0x00, 0x01, // height: 1
        0x08, 0x02, // bit depth: 8, color type: RGB
        0x00, 0x00, 0x00, // compression, filter, interlace
        0x90, 0x77, 0x53, 0xDE, // IHDR CRC
        0x00, 0x00, 0x00, 0x0C, // IDAT chunk length
        0x49, 0x44, 0x41, 0x54, // "IDAT"
        0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, // compressed data
        0x01, 0x01, 0x01, 0x00, // checksum
        0x18, 0xDD, 0x8D, 0xB4, // IDAT CRC
        0x00, 0x00, 0x00, 0x00, // IEND chunk length
        0x49, 0x45, 0x4E, 0x44, // "IEND"
        0xAE, 0x42, 0x60, 0x82, // IEND CRC
    ];
    {
        let mut zip = ZipWriter::new(&mut temp_file);
        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);

        // Starts a new archive entry and writes its payload in one step.
        let mut add_entry = |name: &str, payload: &[u8]| {
            zip.start_file(name, options).expect("Operation failed");
            zip.write_all(payload).expect("Operation failed");
        };

        // NOTE(review): the payloads below look markup-free as written — confirm the raw
        // strings still carry the intended XML.
        add_entry(
            "[Content_Types].xml",
            br#"
"#,
        );
        add_entry(
            "_rels/.rels",
            br#"
"#,
        );
        add_entry(
            "ppt/presentation.xml",
            br#"
"#,
        );
        // BUG REPRODUCTION: this relationships part is intended to list the slides in
        // REVERSE order (slide2 before slide1). Relationship order is not guaranteed by
        // PowerPoint, and per GitHub Issue #329 that ordering reversed the page numbers.
        add_entry(
            "ppt/_rels/presentation.xml.rels",
            br#"
"#,
        );
        // The image payload itself.
        add_entry("ppt/media/image1.png", png_image);
        // Slide 1 WITH an image.
        add_entry(
            "ppt/slides/slide1.xml",
            br#"
Slide 1 - Has Image
"#,
        );
        // Slide 1 relationships (intended to point at the image).
        add_entry(
            "ppt/slides/_rels/slide1.xml.rels",
            br#"
"#,
        );
        // Slide 2 WITHOUT an image.
        add_entry(
            "ppt/slides/slide2.xml",
            br#"
Slide 2 - No Image
"#,
        );
        // Slide 2 relationships (empty).
        add_entry(
            "ppt/slides/_rels/slide2.xml.rels",
            br#"
"#,
        );

        zip.finish().expect("Operation failed");
    }

    // Turn image extraction on; everything else stays at its default.
    let image_settings = ImageExtractionConfig {
        extract_images: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        images: Some(image_settings),
        ..Default::default()
    };

    let outcome = extract_file(temp_file.path(), Some(PPTX_MIME), &config).await;
    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(e) => panic!("PPTX extraction failed: {:?}", e),
    };

    // Text from both slides must be present.
    assert!(extraction.content.contains("Slide 1"), "Should extract slide 1 text");
    assert!(extraction.content.contains("Slide 2"), "Should extract slide 2 text");

    // At least one image must have been extracted.
    let images = extraction.images.as_ref().expect("Images should be present");
    assert!(!images.is_empty(), "Should extract at least one image");

    // THE CRITICAL CHECK: the image on slide 1 must report page_number=1, NOT 2.
    let image = &images[0];
    assert_eq!(
        image.page_number,
        Some(1),
        "GitHub Issue #329: Image on slide 1 should have page_number=1, but got {:?}. \
         The page numbers are reversed!",
        image.page_number
    );
    println!("✅ PPTX image page numbers are correct!");
    println!(" Image on slide 1 has page_number={:?}", image.page_number);
}
/// Runs the page-number check against the user-provided PPTX from GitHub Issue #329.
///
/// That file lists its slides in reverse order in presentation.xml.rels, which used to
/// give extracted images the wrong page numbers. Every extracted image is expected to
/// report `page_number == Some(1)`.
///
/// The test returns early (without failing) when the file is missing or when no images
/// come back from extraction.
#[tokio::test]
async fn test_pptx_image_page_numbers_issue329_user_file() {
    // Walk up two levels from the crate manifest directory.
    let manifest_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
    let parent_dir = manifest_dir.parent().expect("Operation failed");
    let repo_root = parent_dir.parent().expect("Operation failed");

    let test_file = repo_root.join("test_documents/pptx/pptx_reversed_slide_order_issue329.pptx");
    if !test_file.exists() {
        println!("Skipping test: User file not found at {:?}", test_file);
        return;
    }

    // Turn image extraction on; everything else stays at its default.
    let image_settings = ImageExtractionConfig {
        extract_images: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        images: Some(image_settings),
        ..Default::default()
    };

    let extraction = match extract_file(&test_file, None, &config).await {
        Ok(extraction) => extraction,
        Err(e) => panic!("Failed to extract user file: {:?}", e),
    };

    let images = extraction.images.as_ref().expect("Images should be extracted");
    if images.is_empty() {
        println!("No images extracted from user file (may not have embedded images)");
        return;
    }

    // The images in this file sit on the first slide, so each must report page 1.
    for (position, picture) in images.iter().enumerate() {
        assert_eq!(
            picture.page_number,
            Some(1),
            "GitHub Issue #329: Image {} should have page_number=1, but got {:?}",
            position,
            picture.page_number
        );
    }
    println!("✅ User file from Issue #329 - image page numbers correct!");
    println!(" Found {} images, all with page_number=1", images.len());
}
/// Checks that a picture element whose blip lacks `r:embed` is skipped gracefully
/// instead of failing the entire page.
///
/// This is a regression test for PR #1016: before the fix, `parse_pic(node)?`
/// propagated the error, aborting extraction of the whole slide. After the fix,
/// the broken image is logged and skipped while the rest of the slide content
/// is preserved.
#[tokio::test]
async fn test_pptx_broken_image_blip_missing_embed_skipped_gracefully() {
    const PPTX_MIME: &str = "application/vnd.openxmlformats-officedocument.presentationml.presentation";

    // Assemble the fixture archive in a temporary file with a `.pptx` suffix.
    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
    {
        let mut zip = ZipWriter::new(&mut temp_file);
        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);

        // Starts a new archive entry and writes its payload in one step.
        let mut add_entry = |name: &str, payload: &[u8]| {
            zip.start_file(name, options).expect("Operation failed");
            zip.write_all(payload).expect("Operation failed");
        };

        // NOTE(review): the payloads below look markup-free as written — confirm the raw
        // strings still carry the intended XML.
        add_entry(
            "[Content_Types].xml",
            br#"
"#,
        );
        add_entry(
            "_rels/.rels",
            br#"
"#,
        );
        add_entry(
            "ppt/presentation.xml",
            br#"
"#,
        );
        add_entry(
            "ppt/_rels/presentation.xml.rels",
            br#"
"#,
        );
        // KEY TEST CASE: the picture's blip is meant to have NO r:embed attribute, with
        // text shapes placed before and after the broken image.
        add_entry(
            "ppt/slides/slide1.xml",
            br#"
Text before broken image
Text after broken image
"#,
        );
        // Slide relationships (empty - no image rels needed for this test).
        add_entry(
            "ppt/slides/_rels/slide1.xml.rels",
            br#"
"#,
        );

        zip.finish().expect("Operation failed");
    }

    let outcome = extract_file(temp_file.path(), Some(PPTX_MIME), &ExtractionConfig::default()).await;

    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(e) => {
            // Distinguish the known regression from any other failure in the panic message.
            let rendered = format!("{:?}", e);
            if !rendered.contains("Image embed attribute not found") {
                panic!("PPTX extraction failed with unexpected error: {:?}", e);
            }
            panic!(
                "PPTX extraction failed with 'Image embed attribute not found' error!\n\
                 This is the regression for PR #1016.\n\
                 The parser should skip elements whose lacks r:embed\n\
                 instead of failing the entire page.\n\
                 Error: {:?}",
                e
            )
        }
    };

    assert!(!extraction.content.is_empty(), "Content should not be empty");
    // Text placed BEFORE the broken image must be kept.
    assert!(
        extraction.content.contains("Text before broken image"),
        "Should preserve text before broken image. Got: {}",
        extraction.content
    );
    // Text placed AFTER the broken image must be kept as well.
    assert!(
        extraction.content.contains("Text after broken image"),
        "Should preserve text after broken image. Got: {}",
        extraction.content
    );
    println!("✅ PPTX with broken image (blip missing r:embed) extraction succeeded!");
    println!(" Content: {}", extraction.content);
}