Files
fil/crates/kreuzberg/tests/pptx_regression_tests.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

1017 lines
38 KiB
Rust

//! Regression tests for PPTX/PPSX extraction bugs
//!
//! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
//! GitHub Issue #329: Extracting images from PPTX results in reversed page numbers
//! PR #1016: A `<p:pic>` whose `<a:blip>` lacks `r:embed` aborted extraction of the whole slide
//!
//! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
//! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
//! Bug 3: Image page numbers reversed - image on slide 1 reports page_number=2
//! Bug 4: "Image embed attribute not found" - one broken image fails extraction of the entire slide
#![cfg(feature = "office")]
use kreuzberg::{ExtractionConfig, ImageExtractionConfig, extract_file};
use std::io::Write;
use tempfile::NamedTempFile;
use zip::CompressionMethod;
use zip::write::{FileOptions, ZipWriter};
/// Test that PPSX (PowerPoint Show) files are extracted correctly when no MIME type
/// is supplied (`None` is passed to `extract_file`).
///
/// PPSX files use MIME type `application/vnd.openxmlformats-officedocument.presentationml.slideshow`
/// instead of PPTX's `application/vnd.openxmlformats-officedocument.presentationml.presentation`.
///
/// The internal structure is identical to PPTX - same slide XML format.
///
/// The test returns early (and therefore passes) when the `sample.ppsx` fixture is absent.
///
/// GitHub Issue #321 Bug 2
#[tokio::test]
async fn test_ppsx_slideshow_extraction() {
    // The fixture directory sits two levels above this crate's manifest directory.
    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .expect("Operation failed")
        .parent()
        .expect("Operation failed");
    let test_file = workspace_root.join("test_documents/pptx/sample.ppsx");
    if !test_file.exists() {
        println!("Skipping test: PPSX test file not found at {:?}", test_file);
        return;
    }
    let result = extract_file(&test_file, None, &ExtractionConfig::default()).await;
    match result {
        Ok(extraction) => {
            assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
            println!("✅ PPSX extraction succeeded!");
            // Note: `len()` is the byte length of the extracted text.
            println!(" Content length: {} chars", extraction.content.len());
            // Cap the preview at 200 bytes, but back up to the nearest UTF-8 character
            // boundary: slicing a `str` in the middle of a multi-byte character panics,
            // which would fail this test for reasons unrelated to extraction.
            let mut preview_end = extraction.content.len().min(200);
            while !extraction.content.is_char_boundary(preview_end) {
                // Terminates: offset 0 is always a character boundary.
                preview_end -= 1;
            }
            println!(" Content preview: {}", &extraction.content[..preview_end]);
        }
        Err(e) => {
            panic!(
                "PPSX extraction failed with error: {:?}\n\
                 This is GitHub Issue #321 Bug 2: PPSX files should be supported.\n\
                 PPSX MIME type (application/vnd.openxmlformats-officedocument.presentationml.slideshow) \
                 needs to be added to extension-to-MIME mapping.",
                e
            );
        }
    }
}
/// Checks PPSX extraction when the caller names the slideshow MIME type up front.
///
/// Supplying the MIME type explicitly exercises the extractor on PPSX content
/// (whose XML structure is identical to PPTX) without depending on MIME detection.
///
/// The test returns early when the `sample.ppsx` fixture is not present.
///
/// GitHub Issue #321 Bug 2
#[tokio::test]
async fn test_ppsx_with_explicit_mime_type() {
    // Grandparent of the crate manifest directory (index 0 is the directory itself).
    let repo_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .ancestors()
        .nth(2)
        .expect("Operation failed");
    let ppsx_path = repo_root.join("test_documents/pptx/sample.ppsx");
    if !ppsx_path.exists() {
        println!("Skipping test: PPSX test file not found at {:?}", ppsx_path);
        return;
    }

    // Hand the PPSX MIME type to the extractor instead of letting it be detected.
    let outcome = extract_file(
        &ppsx_path,
        Some("application/vnd.openxmlformats-officedocument.presentationml.slideshow"),
        &ExtractionConfig::default(),
    )
    .await;

    let extraction = outcome.unwrap_or_else(|e| {
        panic!(
            "PPSX extraction with explicit MIME type failed: {:?}\n\
             The PPTX extractor should handle PPSX content (identical XML structure).",
            e
        )
    });
    assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
    println!("✅ PPSX extraction with explicit MIME type succeeded!");
}
/// Regression test for GitHub Issue #321, Bug 1: a `<p:sp>` shape that has no
/// `<p:txBody>` child must not make PPTX extraction fail.
///
/// Image placeholders (`<p:ph type="pic"/>`) are meant to hold images rather than text,
/// so they carry no `<p:txBody>`. The slide built here puts such a placeholder between
/// two ordinary text shapes; the test passes only when the text of both surrounding
/// shapes appears in the extracted content.
#[tokio::test]
async fn test_pptx_with_image_placeholder_no_txbody() {
    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");

    // Parts of the minimal PPTX package as (archive path, bytes), in write order.
    let package_parts: [(&str, &[u8]); 6] = [
        (
            "[Content_Types].xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
</Types>"#,
        ),
        (
            "_rels/.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
        ),
        (
            "ppt/presentation.xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
<p:sldIdLst>
<p:sldId id="256" r:id="rId2"/>
</p:sldIdLst>
</p:presentation>"#,
        ),
        (
            "ppt/_rels/presentation.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#,
        ),
        // The case under test: the middle <p:sp> element has no <p:txBody>.
        (
            "ppt/slides/slide1.xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:cSld>
<p:spTree>
<p:nvGrpSpPr>
<p:cNvPr id="1" name=""/>
<p:cNvGrpSpPr/>
<p:nvPr/>
</p:nvGrpSpPr>
<p:grpSpPr>
<a:xfrm>
<a:off x="0" y="0"/>
<a:ext cx="0" cy="0"/>
<a:chOff x="0" y="0"/>
<a:chExt cx="0" cy="0"/>
</a:xfrm>
</p:grpSpPr>
<!-- Normal text shape WITH txBody - this should be extracted -->
<p:sp>
<p:nvSpPr>
<p:cNvPr id="2" name="Title"/>
<p:cNvSpPr/>
<p:nvPr/>
</p:nvSpPr>
<p:spPr>
<a:xfrm>
<a:off x="0" y="0"/>
<a:ext cx="100000" cy="100000"/>
</a:xfrm>
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
</p:spPr>
<p:txBody>
<a:bodyPr/>
<a:lstStyle/>
<a:p>
<a:r>
<a:rPr lang="en-US"/>
<a:t>This is the title text</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
<!-- IMAGE PLACEHOLDER shape WITHOUT txBody - this caused the "No txBody found" error -->
<!-- This is a valid PPTX structure - image placeholders don't contain text -->
<p:sp>
<p:nvSpPr>
<p:cNvPr id="99" name="Image Placeholder"/>
<p:cNvSpPr>
<a:spLocks noGrp="1"/>
</p:cNvSpPr>
<p:nvPr>
<p:ph type="pic" idx="1"/>
</p:nvPr>
</p:nvSpPr>
<p:spPr>
<a:xfrm>
<a:off x="0" y="0"/>
<a:ext cx="100000" cy="100000"/>
</a:xfrm>
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
</p:spPr>
<!-- NOTE: No <p:txBody> here - this is valid for image placeholders -->
</p:sp>
<!-- Another normal text shape - should also be extracted -->
<p:sp>
<p:nvSpPr>
<p:cNvPr id="3" name="Content"/>
<p:cNvSpPr/>
<p:nvPr/>
</p:nvSpPr>
<p:spPr>
<a:xfrm>
<a:off x="0" y="200000"/>
<a:ext cx="100000" cy="100000"/>
</a:xfrm>
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
</p:spPr>
<p:txBody>
<a:bodyPr/>
<a:lstStyle/>
<a:p>
<a:r>
<a:rPr lang="en-US"/>
<a:t>Content after image placeholder</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>"#,
        ),
        // The slide has no relationships of its own.
        (
            "ppt/slides/_rels/slide1.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#,
        ),
    ];

    // Scope the writer so its mutable borrow of the temp file ends before extraction.
    {
        let mut zip = ZipWriter::new(&mut temp_file);
        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
        for &(part_name, payload) in package_parts.iter() {
            zip.start_file(part_name, options).expect("Operation failed");
            zip.write_all(payload).expect("Operation failed");
        }
        zip.finish().expect("Operation failed");
    }

    // Run the extractor on the generated file, naming the PPTX MIME type explicitly.
    let outcome = extract_file(
        temp_file.path(),
        Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        &ExtractionConfig::default(),
    )
    .await;

    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(e) => {
            // Tell the known regression apart from any other failure.
            let error_msg = format!("{:?}", e);
            if error_msg.contains("No txBody found") {
                panic!(
                    "PPTX extraction failed with 'No txBody found' error!\n\
                     This is GitHub Issue #321 Bug 1.\n\
                     The parser should skip shapes without txBody (image placeholders) \
                     instead of failing.\n\
                     Error: {:?}",
                    e
                )
            } else {
                panic!("PPTX extraction failed with unexpected error: {:?}", e)
            }
        }
    };

    assert!(!extraction.content.is_empty(), "Content should not be empty");
    // Text from the shapes on either side of the placeholder must be present.
    assert!(
        extraction.content.contains("title text"),
        "Should extract text from first shape with txBody. Got: {}",
        extraction.content
    );
    assert!(
        extraction.content.contains("Content after"),
        "Should extract text from shape after image placeholder. Got: {}",
        extraction.content
    );
    println!("✅ PPTX with image placeholder (no txBody) extraction succeeded!");
    println!(" Content: {}", extraction.content);
}
/// Extraction of a slide that interleaves text shapes with placeholder shapes lacking `txBody`.
///
/// The single slide written here contains six `<p:sp>` elements: three with a `<p:txBody>`
/// and three placeholders (`pic`, `chart`, `body`) without one. The test verifies that:
/// 1. every shape WITH a txBody contributes its text,
/// 2. shapes WITHOUT a txBody do not cause an error,
/// 3. extraction keeps going past each txBody-less shape instead of stopping at the first.
///
/// GitHub Issue #321 Bug 1
#[tokio::test]
async fn test_pptx_mixed_shapes_extraction() {
    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");

    // Archive members as (path, bytes), listed in the order they are written.
    let package_parts: [(&str, &[u8]); 6] = [
        (
            "[Content_Types].xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
</Types>"#,
        ),
        (
            "_rels/.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
        ),
        (
            "ppt/presentation.xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
<p:sldIdLst>
<p:sldId id="256" r:id="rId2"/>
</p:sldIdLst>
</p:presentation>"#,
        ),
        (
            "ppt/_rels/presentation.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#,
        ),
        // One slide mixing shapes with and without txBody.
        (
            "ppt/slides/slide1.xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:cSld>
<p:spTree>
<p:nvGrpSpPr>
<p:cNvPr id="1" name=""/>
<p:cNvGrpSpPr/>
<p:nvPr/>
</p:nvGrpSpPr>
<p:grpSpPr/>
<!-- Shape 1: Normal text -->
<p:sp>
<p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
<p:spPr/>
<p:txBody>
<a:bodyPr/><a:lstStyle/>
<a:p><a:r><a:t>First Text Shape</a:t></a:r></a:p>
</p:txBody>
</p:sp>
<!-- Shape 2: Image placeholder (NO txBody) -->
<p:sp>
<p:nvSpPr>
<p:cNvPr id="10" name="Picture Placeholder"/>
<p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
<p:nvPr><p:ph type="pic"/></p:nvPr>
</p:nvSpPr>
<p:spPr/>
</p:sp>
<!-- Shape 3: Another text shape -->
<p:sp>
<p:nvSpPr><p:cNvPr id="3" name="Body"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
<p:spPr/>
<p:txBody>
<a:bodyPr/><a:lstStyle/>
<a:p><a:r><a:t>Second Text Shape</a:t></a:r></a:p>
</p:txBody>
</p:sp>
<!-- Shape 4: Chart placeholder (NO txBody) -->
<p:sp>
<p:nvSpPr>
<p:cNvPr id="11" name="Chart Placeholder"/>
<p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
<p:nvPr><p:ph type="chart"/></p:nvPr>
</p:nvSpPr>
<p:spPr/>
</p:sp>
<!-- Shape 5: Content placeholder (NO txBody - empty) -->
<p:sp>
<p:nvSpPr>
<p:cNvPr id="12" name="Content Placeholder"/>
<p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
<p:nvPr><p:ph type="body"/></p:nvPr>
</p:nvSpPr>
<p:spPr/>
</p:sp>
<!-- Shape 6: Final text shape -->
<p:sp>
<p:nvSpPr><p:cNvPr id="4" name="Footer"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
<p:spPr/>
<p:txBody>
<a:bodyPr/><a:lstStyle/>
<a:p><a:r><a:t>Third Text Shape</a:t></a:r></a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>"#,
        ),
        // Empty relationships part for the slide.
        (
            "ppt/slides/_rels/slide1.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#,
        ),
    ];

    // The writer borrows the temp file mutably; the inner scope releases that borrow.
    {
        let mut zip = ZipWriter::new(&mut temp_file);
        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
        for &(part_name, payload) in package_parts.iter() {
            zip.start_file(part_name, options).expect("Operation failed");
            zip.write_all(payload).expect("Operation failed");
        }
        zip.finish().expect("Operation failed");
    }

    let outcome = extract_file(
        temp_file.path(),
        Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        &ExtractionConfig::default(),
    )
    .await;

    let extraction = outcome.unwrap_or_else(|e| {
        panic!(
            "PPTX extraction failed: {:?}\n\
             Shapes without txBody should be skipped gracefully.",
            e
        )
    });

    // Each of the three text-bearing shapes must appear in the output.
    assert!(
        extraction.content.contains("First Text Shape"),
        "Should extract first text shape"
    );
    assert!(
        extraction.content.contains("Second Text Shape"),
        "Should extract second text shape (after image placeholder)"
    );
    assert!(
        extraction.content.contains("Third Text Shape"),
        "Should extract third text shape (after multiple placeholders)"
    );
    println!("✅ PPTX mixed shapes extraction succeeded!");
    println!(" All text shapes extracted despite image/chart/content placeholders without txBody");
}
/// Images extracted from a PPTX must report the slide they actually sit on.
///
/// The generated deck has two slides with a single picture on slide 1, and its
/// `presentation.xml.rels` lists the slide relationships in reverse order (slide 2 first).
/// The extracted image is expected to carry `page_number == Some(1)`.
///
/// GitHub Issue #329: Image on slide 1 of 2-slide PPTX reports page_number=2
#[tokio::test]
async fn test_pptx_image_page_numbers_not_reversed() {
    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");

    // Bytes laid out as a minimal 1x1 PNG, stored as the deck's only media part.
    let png_image: &[u8] = &[
        0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature
        0x00, 0x00, 0x00, 0x0D, // IHDR chunk length
        0x49, 0x48, 0x44, 0x52, // "IHDR"
        0x00, 0x00, 0x00, 0x01, // width: 1
        0x00, 0x00, 0x00, 0x01, // height: 1
        0x08, 0x02, // bit depth: 8, color type: RGB
        0x00, 0x00, 0x00, // compression, filter, interlace
        0x90, 0x77, 0x53, 0xDE, // IHDR CRC
        0x00, 0x00, 0x00, 0x0C, // IDAT chunk length
        0x49, 0x44, 0x41, 0x54, // "IDAT"
        0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, // compressed data
        0x01, 0x01, 0x01, 0x00, // checksum
        0x18, 0xDD, 0x8D, 0xB4, // IDAT CRC
        0x00, 0x00, 0x00, 0x00, // IEND chunk length
        0x49, 0x45, 0x4E, 0x44, // "IEND"
        0xAE, 0x42, 0x60, 0x82, // IEND CRC
    ];

    // Archive members as (path, bytes), in the order they are written.
    let package_parts: [(&str, &[u8]); 9] = [
        (
            "[Content_Types].xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Default Extension="png" ContentType="image/png"/>
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
<Override PartName="/ppt/slides/slide2.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
</Types>"#,
        ),
        (
            "_rels/.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
        ),
        (
            "ppt/presentation.xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:sldIdLst>
<p:sldId id="256" r:id="rId2"/>
<p:sldId id="257" r:id="rId3"/>
</p:sldIdLst>
</p:presentation>"#,
        ),
        // Reproduces GitHub Issue #329: the relationships appear in REVERSE order
        // (slide2 before slide1), which led to reversed page numbers.
        (
            "ppt/_rels/presentation.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide2.xml"/>
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#,
        ),
        // The picture referenced from slide 1.
        ("ppt/media/image1.png", png_image),
        // Slide 1: a title plus the picture.
        (
            "ppt/slides/slide1.xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:cSld>
<p:spTree>
<p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
<p:grpSpPr/>
<p:sp>
<p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
<p:spPr/>
<p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 1 - Has Image</a:t></a:r></a:p></p:txBody>
</p:sp>
<p:pic>
<p:nvPicPr>
<p:cNvPr id="3" name="Picture 1"/>
<p:cNvPicPr><a:picLocks noChangeAspect="1"/></p:cNvPicPr>
<p:nvPr/>
</p:nvPicPr>
<p:blipFill>
<a:blip r:embed="rId2"/>
<a:stretch><a:fillRect/></a:stretch>
</p:blipFill>
<p:spPr>
<a:xfrm><a:off x="0" y="0"/><a:ext cx="100000" cy="100000"/></a:xfrm>
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
</p:spPr>
</p:pic>
</p:spTree>
</p:cSld>
</p:sld>"#,
        ),
        // Slide 1 relationships: rId2 resolves to the image part.
        (
            "ppt/slides/_rels/slide1.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
</Relationships>"#,
        ),
        // Slide 2: text only, no picture.
        (
            "ppt/slides/slide2.xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:cSld>
<p:spTree>
<p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
<p:grpSpPr/>
<p:sp>
<p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
<p:spPr/>
<p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 2 - No Image</a:t></a:r></a:p></p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>"#,
        ),
        // Slide 2 has an empty relationships part.
        (
            "ppt/slides/_rels/slide2.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#,
        ),
    ];

    // Inner scope so the writer's mutable borrow of the temp file ends before extraction.
    {
        let mut zip = ZipWriter::new(&mut temp_file);
        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
        for &(part_name, payload) in package_parts.iter() {
            zip.start_file(part_name, options).expect("Operation failed");
            zip.write_all(payload).expect("Operation failed");
        }
        zip.finish().expect("Operation failed");
    }

    // Turn image extraction on; every other setting keeps its default.
    let image_settings = ImageExtractionConfig {
        extract_images: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        images: Some(image_settings),
        ..Default::default()
    };

    let outcome = extract_file(
        temp_file.path(),
        Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        &config,
    )
    .await;
    let extraction = outcome.unwrap_or_else(|e| panic!("PPTX extraction failed: {:?}", e));

    // Text from both slides must be present.
    assert!(extraction.content.contains("Slide 1"), "Should extract slide 1 text");
    assert!(extraction.content.contains("Slide 2"), "Should extract slide 2 text");

    // At least one image must have been extracted.
    let images = extraction.images.as_ref().expect("Images should be present");
    assert!(!images.is_empty(), "Should extract at least one image");

    // The decisive check: the picture lives on slide 1, so page_number must be 1, not 2.
    let first_image = &images[0];
    assert_eq!(
        first_image.page_number,
        Some(1),
        "GitHub Issue #329: Image on slide 1 should have page_number=1, but got {:?}. \
         The page numbers are reversed!",
        first_image.page_number
    );
    println!("✅ PPTX image page numbers are correct!");
    println!(" Image on slide 1 has page_number={:?}", first_image.page_number);
}
/// Runs the page-number check against the real file attached to GitHub Issue #329.
///
/// That file lists its slides in reverse order in `presentation.xml.rels`, which is
/// what produced wrong image page numbers. Every extracted image is expected to
/// report `page_number == Some(1)`.
///
/// The test returns early when the fixture is missing or when no images are extracted.
#[tokio::test]
async fn test_pptx_image_page_numbers_issue329_user_file() {
    // Grandparent of the crate manifest directory (index 0 is the directory itself).
    let repo_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .ancestors()
        .nth(2)
        .expect("Operation failed");
    let fixture = repo_root.join("test_documents/pptx/pptx_reversed_slide_order_issue329.pptx");
    if !fixture.exists() {
        println!("Skipping test: User file not found at {:?}", fixture);
        return;
    }

    // Turn image extraction on; every other setting keeps its default.
    let image_settings = ImageExtractionConfig {
        extract_images: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        images: Some(image_settings),
        ..Default::default()
    };

    let extraction = extract_file(&fixture, None, &config)
        .await
        .unwrap_or_else(|e| panic!("Failed to extract user file: {:?}", e));

    // The user's file carries its image on slide 1.
    let images = extraction.images.as_ref().expect("Images should be extracted");
    if images.is_empty() {
        println!("No images extracted from user file (may not have embedded images)");
        return;
    }

    // Each image is on the first slide, so each must report page 1.
    images.iter().enumerate().for_each(|(idx, image)| {
        assert_eq!(
            image.page_number,
            Some(1),
            "GitHub Issue #329: Image {} should have page_number=1, but got {:?}",
            idx,
            image.page_number
        );
    });
    println!("✅ User file from Issue #329 - image page numbers correct!");
    println!(" Found {} images, all with page_number=1", images.len());
}
/// A `<pic>` element whose `<a:blip>` has no `r:embed` attribute must be skipped
/// rather than failing the whole page.
///
/// Regression test for PR #1016: before the fix, `parse_pic(node)?` propagated the
/// error and aborted extraction of the entire slide. After the fix, the broken image
/// is logged and skipped while the remaining slide content is kept. The slide built
/// here places the broken picture between two text shapes and expects both texts in
/// the output.
#[tokio::test]
async fn test_pptx_broken_image_blip_missing_embed_skipped_gracefully() {
    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");

    // Archive members as (path, bytes), in the order they are written.
    let package_parts: [(&str, &[u8]); 6] = [
        (
            "[Content_Types].xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
</Types>"#,
        ),
        (
            "_rels/.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
        ),
        (
            "ppt/presentation.xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
<p:sldIdLst>
<p:sldId id="256" r:id="rId2"/>
</p:sldIdLst>
</p:presentation>"#,
        ),
        (
            "ppt/_rels/presentation.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#,
        ),
        // The case under test: the <a:blip> inside <p:pic> has no r:embed attribute,
        // and text shapes sit both before and after the broken picture.
        (
            "ppt/slides/slide1.xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:cSld>
<p:spTree>
<p:nvGrpSpPr>
<p:cNvPr id="1" name=""/>
<p:cNvGrpSpPr/>
<p:nvPr/>
</p:nvGrpSpPr>
<p:grpSpPr>
<a:xfrm>
<a:off x="0" y="0"/>
<a:ext cx="0" cy="0"/>
<a:chOff x="0" y="0"/>
<a:chExt cx="0" cy="0"/>
</a:xfrm>
</p:grpSpPr>
<!-- Normal text shape BEFORE the broken image - must be preserved -->
<p:sp>
<p:nvSpPr>
<p:cNvPr id="2" name="Title"/>
<p:cNvSpPr/>
<p:nvPr/>
</p:nvSpPr>
<p:spPr>
<a:xfrm>
<a:off x="0" y="0"/>
<a:ext cx="100000" cy="100000"/>
</a:xfrm>
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
</p:spPr>
<p:txBody>
<a:bodyPr/>
<a:lstStyle/>
<a:p>
<a:r>
<a:rPr lang="en-US"/>
<a:t>Text before broken image</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
<!-- BROKEN IMAGE: <a:blip> has NO r:embed attribute -->
<!-- Before the fix, this would abort the entire page with:
"Image embed attribute not found" -->
<p:pic>
<p:nvPicPr>
<p:cNvPr id="4" name="Broken Image"/>
<p:cNvPicPr>
<a:picLocks noChangeAspect="1"/>
</p:cNvPicPr>
<p:nvPr/>
</p:nvPicPr>
<p:blipFill>
<a:blip/>
<a:stretch>
<a:fillRect/>
</a:stretch>
</p:blipFill>
<p:spPr>
<a:xfrm>
<a:off x="0" y="200000"/>
<a:ext cx="100000" cy="100000"/>
</a:xfrm>
<a:prstGeom prst="rect">
<a:avLst/>
</a:prstGeom>
</p:spPr>
</p:pic>
<!-- Normal text shape AFTER the broken image - must be preserved -->
<p:sp>
<p:nvSpPr>
<p:cNvPr id="3" name="Content"/>
<p:cNvSpPr/>
<p:nvPr/>
</p:nvSpPr>
<p:spPr>
<a:xfrm>
<a:off x="0" y="400000"/>
<a:ext cx="100000" cy="100000"/>
</a:xfrm>
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
</p:spPr>
<p:txBody>
<a:bodyPr/>
<a:lstStyle/>
<a:p>
<a:r>
<a:rPr lang="en-US"/>
<a:t>Text after broken image</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>"#,
        ),
        // Empty slide relationships - this test needs no image relationship.
        (
            "ppt/slides/_rels/slide1.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#,
        ),
    ];

    // Inner scope so the writer's mutable borrow of the temp file ends before extraction.
    {
        let mut zip = ZipWriter::new(&mut temp_file);
        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
        for &(part_name, payload) in package_parts.iter() {
            zip.start_file(part_name, options).expect("Operation failed");
            zip.write_all(payload).expect("Operation failed");
        }
        zip.finish().expect("Operation failed");
    }

    let outcome = extract_file(
        temp_file.path(),
        Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        &ExtractionConfig::default(),
    )
    .await;

    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(e) => {
            // Tell the known regression apart from any other failure.
            let error_msg = format!("{:?}", e);
            if error_msg.contains("Image embed attribute not found") {
                panic!(
                    "PPTX extraction failed with 'Image embed attribute not found' error!\n\
                     This is the regression for PR #1016.\n\
                     The parser should skip <pic> elements whose <a:blip> lacks r:embed\n\
                     instead of failing the entire page.\n\
                     Error: {:?}",
                    e
                )
            } else {
                panic!("PPTX extraction failed with unexpected error: {:?}", e)
            }
        }
    };

    assert!(!extraction.content.is_empty(), "Content should not be empty");
    // Text located BEFORE the broken picture must be kept.
    assert!(
        extraction.content.contains("Text before broken image"),
        "Should preserve text before broken image. Got: {}",
        extraction.content
    );
    // Text located AFTER the broken picture must be kept as well.
    assert!(
        extraction.content.contains("Text after broken image"),
        "Should preserve text after broken image. Got: {}",
        extraction.content
    );
    println!("✅ PPTX with broken image (blip missing r:embed) extraction succeeded!");
    println!(" Content: {}", extraction.content);
}