//! Regression tests for PPTX/PPSX extraction bugs //! //! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported //! GitHub Issue #329: Extracting images from PPTX results in reversed page numbers //! //! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body //! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely //! Bug 3: Image page numbers reversed - image on slide 1 reports page_number=2 #![cfg(feature = "office")] use kreuzberg::{ExtractionConfig, ImageExtractionConfig, extract_file}; use std::io::Write; use tempfile::NamedTempFile; use zip::CompressionMethod; use zip::write::{FileOptions, ZipWriter}; /// Test that PPSX (PowerPoint Show) files are extracted correctly. /// /// PPSX files use MIME type `application/vnd.openxmlformats-officedocument.presentationml.slideshow` /// instead of PPTX's `application/vnd.openxmlformats-officedocument.presentationml.presentation`. /// /// The internal structure is identical to PPTX - same slide XML format. 
///
/// GitHub Issue #321 Bug 2
///
/// The test returns early (without failing) when the sample file is absent.
/// It panics when extraction returns an error or yields empty content.
#[tokio::test]
async fn test_ppsx_slideshow_extraction() {
    // The sample document is looked up two directories above this crate's manifest.
    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .expect("Operation failed")
        .parent()
        .expect("Operation failed");
    let test_file = workspace_root.join("test_documents/pptx/sample.ppsx");

    if !test_file.exists() {
        println!("Skipping test: PPSX test file not found at {:?}", test_file);
        return;
    }

    // `None` for the MIME type: the extension-based detection is what is under test.
    let result = extract_file(&test_file, None, &ExtractionConfig::default()).await;

    match result {
        Ok(extraction) => {
            assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
            println!("✅ PPSX extraction succeeded!");
            // Count characters, not bytes, so the number matches the "chars" label.
            println!(" Content length: {} chars", extraction.content.chars().count());
            // Take the preview per character: slicing the string at byte 200 panics
            // whenever that offset lands inside a multi-byte UTF-8 sequence.
            let preview: String = extraction.content.chars().take(200).collect();
            println!(" Content preview: {}", preview);
        }
        Err(e) => {
            panic!(
                "PPSX extraction failed with error: {:?}\n\
                 This is GitHub Issue #321 Bug 2: PPSX files should be supported.\n\
                 PPSX MIME type (application/vnd.openxmlformats-officedocument.presentationml.slideshow) \
                 needs to be added to extension-to-MIME mapping.",
                e
            );
        }
    }
}

/// Test that PPSX files can be extracted when MIME type is explicitly provided.
///
/// This validates that the PPTX extractor can handle PPSX content correctly
/// (the XML structure is identical), even if MIME detection fails.
///
/// GitHub Issue #321 Bug 2
#[tokio::test]
async fn test_ppsx_with_explicit_mime_type() {
    // Resolve the sample relative to the directory two levels above the manifest.
    let manifest_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
    let workspace_root = manifest_dir
        .parent()
        .and_then(|dir| dir.parent())
        .expect("Operation failed");
    let test_file = workspace_root.join("test_documents/pptx/sample.ppsx");

    if !test_file.exists() {
        println!("Skipping test: PPSX test file not found at {:?}", test_file);
        return;
    }

    // Pass the slideshow MIME type explicitly instead of `None`.
    let ppsx_mime = "application/vnd.openxmlformats-officedocument.presentationml.slideshow";
    let extraction = extract_file(&test_file, Some(ppsx_mime), &ExtractionConfig::default())
        .await
        .unwrap_or_else(|e| {
            panic!(
                "PPSX extraction with explicit MIME type failed: {:?}\n\
                 The PPTX extractor should handle PPSX content (identical XML structure).",
                e
            )
        });

    assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
    println!("✅ PPSX extraction with explicit MIME type succeeded!");
}

/// Test that PPTX files with image placeholder shapes (no txBody) are extracted correctly.
///
/// Some shapes in PPTX files, like image placeholders, don't have
/// `txBody` children because they're designed to hold images, not text.
///
/// The parser should skip shapes without txBody gracefully instead of failing.
/// /// GitHub Issue #321 Bug 1 #[tokio::test] async fn test_pptx_with_image_placeholder_no_txbody() { // Create a minimal PPTX with a shape that has no txBody (image placeholder) let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file"); { let mut zip = ZipWriter::new(&mut temp_file); let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored); // Add [Content_Types].xml zip.start_file("[Content_Types].xml", options) .expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add _rels/.rels zip.start_file("_rels/.rels", options).expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add ppt/presentation.xml zip.start_file("ppt/presentation.xml", options) .expect("Operation failed"); zip.write_all( br#" "#, ) .expect("Operation failed"); // Add ppt/_rels/presentation.xml.rels zip.start_file("ppt/_rels/presentation.xml.rels", options) .expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add ppt/slides/slide1.xml with a shape WITHOUT txBody (image placeholder) // This is the critical test case - a element with no zip.start_file("ppt/slides/slide1.xml", options) .expect("Operation failed"); zip.write_all( br#" This is the title text Content after image placeholder "#, ) .expect("Operation failed"); // Add ppt/slides/_rels/slide1.xml.rels (empty) zip.start_file("ppt/slides/_rels/slide1.xml.rels", options) .expect("Operation failed"); zip.write_all( br#" "#, ) .expect("Operation failed"); zip.finish().expect("Operation failed"); } // Extract the PPTX file let result = extract_file( temp_file.path(), Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"), &ExtractionConfig::default(), ) .await; match result { Ok(extraction) => { assert!(!extraction.content.is_empty(), "Content should not be empty"); // Verify we extracted text from shapes that DO have txBody assert!( 
extraction.content.contains("title text"), "Should extract text from first shape with txBody. Got: {}", extraction.content ); assert!( extraction.content.contains("Content after"), "Should extract text from shape after image placeholder. Got: {}", extraction.content ); println!("✅ PPTX with image placeholder (no txBody) extraction succeeded!"); println!(" Content: {}", extraction.content); } Err(e) => { let error_msg = format!("{:?}", e); if error_msg.contains("No txBody found") { panic!( "PPTX extraction failed with 'No txBody found' error!\n\ This is GitHub Issue #321 Bug 1.\n\ The parser should skip shapes without txBody (image placeholders) \ instead of failing.\n\ Error: {:?}", e ); } else { panic!("PPTX extraction failed with unexpected error: {:?}", e); } } } } /// Test extraction of PPTX with multiple shapes, some with txBody, some without. /// /// This test verifies that: /// 1. Shapes WITH txBody are extracted /// 2. Shapes WITHOUT txBody (image placeholders, etc.) are skipped gracefully /// 3. 
The extraction continues and doesn't fail on the first shape without txBody /// /// GitHub Issue #321 Bug 1 #[tokio::test] async fn test_pptx_mixed_shapes_extraction() { // Create a PPTX with multiple slides, each containing mixed shapes let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file"); { let mut zip = ZipWriter::new(&mut temp_file); let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored); // Add [Content_Types].xml zip.start_file("[Content_Types].xml", options) .expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add _rels/.rels zip.start_file("_rels/.rels", options).expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add ppt/presentation.xml zip.start_file("ppt/presentation.xml", options) .expect("Operation failed"); zip.write_all( br#" "#, ) .expect("Operation failed"); // Add ppt/_rels/presentation.xml.rels zip.start_file("ppt/_rels/presentation.xml.rels", options) .expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add slide with various shapes - some with txBody, some without zip.start_file("ppt/slides/slide1.xml", options) .expect("Operation failed"); zip.write_all( br#" First Text Shape Second Text Shape Third Text Shape "#, ) .expect("Operation failed"); // Add empty rels zip.start_file("ppt/slides/_rels/slide1.xml.rels", options) .expect("Operation failed"); zip.write_all( br#" "#, ) .expect("Operation failed"); zip.finish().expect("Operation failed"); } let result = extract_file( temp_file.path(), Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"), &ExtractionConfig::default(), ) .await; match result { Ok(extraction) => { // All three text shapes should be extracted assert!( extraction.content.contains("First Text Shape"), "Should extract first text shape" ); assert!( extraction.content.contains("Second Text Shape"), "Should extract second text 
shape (after image placeholder)" ); assert!( extraction.content.contains("Third Text Shape"), "Should extract third text shape (after multiple placeholders)" ); println!("✅ PPTX mixed shapes extraction succeeded!"); println!(" All text shapes extracted despite image/chart/content placeholders without txBody"); } Err(e) => { panic!( "PPTX extraction failed: {:?}\n\ Shapes without txBody should be skipped gracefully.", e ); } } } /// Test that images extracted from PPTX have correct page numbers. /// /// When a PPTX has multiple slides and an image on slide 1, the extracted image /// should have page_number=1 (not reversed). /// /// GitHub Issue #329: Image on slide 1 of 2-slide PPTX reports page_number=2 #[tokio::test] async fn test_pptx_image_page_numbers_not_reversed() { // Create a PPTX with 2 slides, image on slide 1 let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file"); // A minimal 1x1 red PNG image (valid PNG format) let png_image: &[u8] = &[ 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature 0x00, 0x00, 0x00, 0x0D, // IHDR chunk length 0x49, 0x48, 0x44, 0x52, // "IHDR" 0x00, 0x00, 0x00, 0x01, // width: 1 0x00, 0x00, 0x00, 0x01, // height: 1 0x08, 0x02, // bit depth: 8, color type: RGB 0x00, 0x00, 0x00, // compression, filter, interlace 0x90, 0x77, 0x53, 0xDE, // IHDR CRC 0x00, 0x00, 0x00, 0x0C, // IDAT chunk length 0x49, 0x44, 0x41, 0x54, // "IDAT" 0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, // compressed data 0x01, 0x01, 0x01, 0x00, // checksum 0x18, 0xDD, 0x8D, 0xB4, // IDAT CRC 0x00, 0x00, 0x00, 0x00, // IEND chunk length 0x49, 0x45, 0x4E, 0x44, // "IEND" 0xAE, 0x42, 0x60, 0x82, // IEND CRC ]; { let mut zip = ZipWriter::new(&mut temp_file); let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored); // Add [Content_Types].xml zip.start_file("[Content_Types].xml", options) .expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add 
_rels/.rels zip.start_file("_rels/.rels", options).expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add ppt/presentation.xml zip.start_file("ppt/presentation.xml", options) .expect("Operation failed"); zip.write_all( br#" "#, ) .expect("Operation failed"); // Add ppt/_rels/presentation.xml.rels // BUG REPRODUCTION: Slides listed in REVERSE order in XML (slide2 before slide1) // This is valid XML - PowerPoint doesn't guarantee order in rels files // GitHub Issue #329: This causes page numbers to be reversed zip.start_file("ppt/_rels/presentation.xml.rels", options) .expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add the image file zip.start_file("ppt/media/image1.png", options) .expect("Operation failed"); zip.write_all(png_image).expect("Operation failed"); // Add slide 1 WITH an image zip.start_file("ppt/slides/slide1.xml", options) .expect("Operation failed"); zip.write_all( br#" Slide 1 - Has Image "#, ) .expect("Operation failed"); // Add slide 1 relationships (points to the image) zip.start_file("ppt/slides/_rels/slide1.xml.rels", options) .expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add slide 2 WITHOUT an image zip.start_file("ppt/slides/slide2.xml", options) .expect("Operation failed"); zip.write_all( br#" Slide 2 - No Image "#, ) .expect("Operation failed"); // Add empty slide 2 relationships zip.start_file("ppt/slides/_rels/slide2.xml.rels", options) .expect("Operation failed"); zip.write_all( br#" "#, ) .expect("Operation failed"); zip.finish().expect("Operation failed"); } // Extract with images enabled let config = ExtractionConfig { images: Some(ImageExtractionConfig { extract_images: true, ..Default::default() }), ..Default::default() }; let result = extract_file( temp_file.path(), Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"), &config, ) .await; match result { Ok(extraction) => { // Verify text extraction works 
assert!(extraction.content.contains("Slide 1"), "Should extract slide 1 text"); assert!(extraction.content.contains("Slide 2"), "Should extract slide 2 text"); // Verify we got an image let images = extraction.images.as_ref().expect("Images should be present"); assert!(!images.is_empty(), "Should extract at least one image"); // THE CRITICAL TEST: Image on slide 1 should have page_number=1, NOT 2 let image = &images[0]; assert_eq!( image.page_number, Some(1), "GitHub Issue #329: Image on slide 1 should have page_number=1, but got {:?}. \ The page numbers are reversed!", image.page_number ); println!("✅ PPTX image page numbers are correct!"); println!(" Image on slide 1 has page_number={:?}", image.page_number); } Err(e) => { panic!("PPTX extraction failed: {:?}", e); } } } /// Test with actual user-provided PPTX file from GitHub Issue #329. /// /// The user's file has slides listed in reverse order in presentation.xml.rels, /// which caused images to have incorrect page numbers. #[tokio::test] async fn test_pptx_image_page_numbers_issue329_user_file() { let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) .parent() .expect("Operation failed") .parent() .expect("Operation failed"); let test_file = workspace_root.join("test_documents/pptx/pptx_reversed_slide_order_issue329.pptx"); if !test_file.exists() { println!("Skipping test: User file not found at {:?}", test_file); return; } // Extract with images enabled let config = ExtractionConfig { images: Some(ImageExtractionConfig { extract_images: true, ..Default::default() }), ..Default::default() }; let result = extract_file(&test_file, None, &config).await; match result { Ok(extraction) => { // The user's file has an image on slide 1 let images = extraction.images.as_ref().expect("Images should be extracted"); if images.is_empty() { println!("No images extracted from user file (may not have embedded images)"); return; } // All images should have page_number = 1 since they're on the first slide for 
(idx, image) in images.iter().enumerate() { assert_eq!( image.page_number, Some(1), "GitHub Issue #329: Image {} should have page_number=1, but got {:?}", idx, image.page_number ); } println!("✅ User file from Issue #329 - image page numbers correct!"); println!(" Found {} images, all with page_number=1", images.len()); } Err(e) => { panic!("Failed to extract user file: {:?}", e); } } } /// Test that a `` element whose `` lacks `r:embed` is skipped /// gracefully instead of failing the entire page. /// /// This is a regression test for PR #1016: before the fix, `parse_pic(node)?` /// propagated the error, aborting extraction of the whole slide. After the fix, /// the broken image is logged and skipped while the rest of the slide content /// is preserved. #[tokio::test] async fn test_pptx_broken_image_blip_missing_embed_skipped_gracefully() { let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file"); { let mut zip = ZipWriter::new(&mut temp_file); let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored); // Add [Content_Types].xml zip.start_file("[Content_Types].xml", options) .expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add _rels/.rels zip.start_file("_rels/.rels", options).expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add ppt/presentation.xml zip.start_file("ppt/presentation.xml", options) .expect("Operation failed"); zip.write_all( br#" "#, ) .expect("Operation failed"); // Add ppt/_rels/presentation.xml.rels zip.start_file("ppt/_rels/presentation.xml.rels", options) .expect("Operation failed"); zip.write_all(br#" "#).expect("Operation failed"); // Add ppt/slides/slide1.xml // KEY TEST CASE: inside does NOT have r:embed attribute. // The slide also has text shapes before and after the broken image. 
zip.start_file("ppt/slides/slide1.xml", options) .expect("Operation failed"); zip.write_all( br#" Text before broken image Text after broken image "#, ) .expect("Operation failed"); // Add ppt/slides/_rels/slide1.xml.rels (empty - no image rels needed for this test) zip.start_file("ppt/slides/_rels/slide1.xml.rels", options) .expect("Operation failed"); zip.write_all( br#" "#, ) .expect("Operation failed"); zip.finish().expect("Operation failed"); } let result = extract_file( temp_file.path(), Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"), &ExtractionConfig::default(), ) .await; match result { Ok(extraction) => { assert!(!extraction.content.is_empty(), "Content should not be empty"); // Verify text BEFORE the broken image is extracted assert!( extraction.content.contains("Text before broken image"), "Should preserve text before broken image. Got: {}", extraction.content ); // Verify text AFTER the broken image is extracted assert!( extraction.content.contains("Text after broken image"), "Should preserve text after broken image. Got: {}", extraction.content ); println!("✅ PPTX with broken image (blip missing r:embed) extraction succeeded!"); println!(" Content: {}", extraction.content); } Err(e) => { let error_msg = format!("{:?}", e); if error_msg.contains("Image embed attribute not found") { panic!( "PPTX extraction failed with 'Image embed attribute not found' error!\n\ This is the regression for PR #1016.\n\ The parser should skip elements whose lacks r:embed\n\ instead of failing the entire page.\n\ Error: {:?}", e ); } else { panic!("PPTX extraction failed with unexpected error: {:?}", e); } } } }