fil/crates/kreuzberg/tests/pptx_regression_tests.rs

//! Regression tests for PPTX/PPSX extraction bugs
//!
//! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
//! GitHub Issue #329: Extracting images from PPTX results in reversed page numbers
//!
//! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
//! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
//! Bug 3: Image page numbers reversed - image on slide 1 reports page_number=2

#![cfg(feature = "office")]

use kreuzberg::{ExtractionConfig, ImageExtractionConfig, extract_file};
use std::io::Write;
use tempfile::NamedTempFile;
use zip::CompressionMethod;
use zip::write::{FileOptions, ZipWriter};

/// Test that PPSX (PowerPoint Show) files are extracted correctly when the MIME
/// type is not supplied by the caller (`None` is passed to `extract_file`).
///
/// PPSX files use MIME type `application/vnd.openxmlformats-officedocument.presentationml.slideshow`
/// instead of PPTX's `application/vnd.openxmlformats-officedocument.presentationml.presentation`.
///
/// The internal structure is identical to PPTX - same slide XML format.
///
/// The fixture is optional: when `test_documents/pptx/sample.ppsx` does not exist
/// the test prints a notice and returns without asserting anything.
///
/// GitHub Issue #321 Bug 2
#[tokio::test]
async fn test_ppsx_slideshow_extraction() {
    // The fixture directory lives two levels above this crate's manifest directory.
    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .expect("Operation failed")
        .parent()
        .expect("Operation failed");
    let test_file = workspace_root.join("test_documents/pptx/sample.ppsx");

    if !test_file.exists() {
        println!("Skipping test: PPSX test file not found at {:?}", test_file);
        return;
    }

    let result = extract_file(&test_file, None, &ExtractionConfig::default()).await;

    match result {
        Ok(extraction) => {
            assert!(!extraction.content.is_empty(), "PPSX content should not be empty");

            // Truncate by characters rather than bytes: a byte-offset slice such as
            // `content[..200]` panics when offset 200 lands inside a multi-byte
            // UTF-8 sequence, which would fail the test for unrelated reasons.
            let preview: String = extraction.content.chars().take(200).collect();

            println!("✅ PPSX extraction succeeded!");
            // Count characters so the number matches the "chars" label.
            println!("   Content length: {} chars", extraction.content.chars().count());
            println!("   Content preview: {}", preview);
        }
        Err(e) => {
            panic!(
                "PPSX extraction failed with error: {:?}\n\
                 This is GitHub Issue #321 Bug 2: PPSX files should be supported.\n\
                 PPSX MIME type (application/vnd.openxmlformats-officedocument.presentationml.slideshow) \
                 needs to be added to extension-to-MIME mapping.",
                e
            );
        }
    }
}

/// Test that a PPSX file is extracted when the caller passes the slideshow MIME
/// type explicitly instead of relying on detection.
///
/// The PPSX XML structure is identical to PPTX, so extraction is expected to
/// succeed and yield non-empty content.
///
/// The fixture is optional: when `test_documents/pptx/sample.ppsx` does not exist
/// the test prints a notice and returns without asserting anything.
///
/// GitHub Issue #321 Bug 2
#[tokio::test]
async fn test_ppsx_with_explicit_mime_type() {
    // MIME type that distinguishes a PowerPoint Show from a regular presentation.
    const PPSX_MIME: &str = "application/vnd.openxmlformats-officedocument.presentationml.slideshow";

    // ancestors() yields the path itself, then its parent, then its grandparent;
    // the grandparent of the manifest directory is the workspace root.
    let repo_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .ancestors()
        .nth(2)
        .expect("Operation failed");
    let ppsx_path = repo_root.join("test_documents/pptx/sample.ppsx");

    if !ppsx_path.exists() {
        println!("Skipping test: PPSX test file not found at {:?}", ppsx_path);
        return;
    }

    let outcome = extract_file(&ppsx_path, Some(PPSX_MIME), &ExtractionConfig::default()).await;

    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(e) => panic!(
            "PPSX extraction with explicit MIME type failed: {:?}\n\
             The PPTX extractor should handle PPSX content (identical XML structure).",
            e
        ),
    };

    assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
    println!("✅ PPSX extraction with explicit MIME type succeeded!");
}

/// Test that a PPTX whose slide contains an image placeholder shape without a
/// `<p:txBody>` child is still extracted.
///
/// Image placeholders (`<p:ph type="pic"/>`) hold images rather than text, so they
/// carry no `<p:txBody>`. The fixture built here places such a shape between two
/// ordinary text shapes; both text runs must appear in the extracted content.
///
/// GitHub Issue #321 Bug 1
#[tokio::test]
async fn test_pptx_with_image_placeholder_no_txbody() {
    let mut pptx_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");

    {
        let mut archive = ZipWriter::new(&mut pptx_file);
        let stored: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);

        // Every package part as (zip entry name, entry bytes), in write order.
        let parts: [(&str, &[u8]); 6] = [
            // Content type declarations
            (
                "[Content_Types].xml",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
  <Default Extension="xml" ContentType="application/xml"/>
  <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
  <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
</Types>"#,
            ),
            // Package-level relationships
            (
                "_rels/.rels",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
            ),
            // Presentation part listing a single slide id
            (
                "ppt/presentation.xml",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
                xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
                xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
  <p:sldIdLst>
    <p:sldId id="256" r:id="rId2"/>
  </p:sldIdLst>
</p:presentation>"#,
            ),
            // Presentation relationships (rId2 -> slide1)
            (
                "ppt/_rels/presentation.xml.rels",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#,
            ),
            // The slide under test: the middle <p:sp> has no <p:txBody>
            (
                "ppt/slides/slide1.xml",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
       xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
  <p:cSld>
    <p:spTree>
      <p:nvGrpSpPr>
        <p:cNvPr id="1" name=""/>
        <p:cNvGrpSpPr/>
        <p:nvPr/>
      </p:nvGrpSpPr>
      <p:grpSpPr>
        <a:xfrm>
          <a:off x="0" y="0"/>
          <a:ext cx="0" cy="0"/>
          <a:chOff x="0" y="0"/>
          <a:chExt cx="0" cy="0"/>
        </a:xfrm>
      </p:grpSpPr>

      <!-- Normal text shape WITH txBody - this should be extracted -->
      <p:sp>
        <p:nvSpPr>
          <p:cNvPr id="2" name="Title"/>
          <p:cNvSpPr/>
          <p:nvPr/>
        </p:nvSpPr>
        <p:spPr>
          <a:xfrm>
            <a:off x="0" y="0"/>
            <a:ext cx="100000" cy="100000"/>
          </a:xfrm>
          <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
        </p:spPr>
        <p:txBody>
          <a:bodyPr/>
          <a:lstStyle/>
          <a:p>
            <a:r>
              <a:rPr lang="en-US"/>
              <a:t>This is the title text</a:t>
            </a:r>
          </a:p>
        </p:txBody>
      </p:sp>

      <!-- IMAGE PLACEHOLDER shape WITHOUT txBody - this caused the "No txBody found" error -->
      <!-- This is a valid PPTX structure - image placeholders don't contain text -->
      <p:sp>
        <p:nvSpPr>
          <p:cNvPr id="99" name="Image Placeholder"/>
          <p:cNvSpPr>
            <a:spLocks noGrp="1"/>
          </p:cNvSpPr>
          <p:nvPr>
            <p:ph type="pic" idx="1"/>
          </p:nvPr>
        </p:nvSpPr>
        <p:spPr>
          <a:xfrm>
            <a:off x="0" y="0"/>
            <a:ext cx="100000" cy="100000"/>
          </a:xfrm>
          <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
        </p:spPr>
        <!-- NOTE: No <p:txBody> here - this is valid for image placeholders -->
      </p:sp>

      <!-- Another normal text shape - should also be extracted -->
      <p:sp>
        <p:nvSpPr>
          <p:cNvPr id="3" name="Content"/>
          <p:cNvSpPr/>
          <p:nvPr/>
        </p:nvSpPr>
        <p:spPr>
          <a:xfrm>
            <a:off x="0" y="200000"/>
            <a:ext cx="100000" cy="100000"/>
          </a:xfrm>
          <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
        </p:spPr>
        <p:txBody>
          <a:bodyPr/>
          <a:lstStyle/>
          <a:p>
            <a:r>
              <a:rPr lang="en-US"/>
              <a:t>Content after image placeholder</a:t>
            </a:r>
          </a:p>
        </p:txBody>
      </p:sp>

    </p:spTree>
  </p:cSld>
</p:sld>"#,
            ),
            // Slide relationships (none)
            (
                "ppt/slides/_rels/slide1.xml.rels",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#,
            ),
        ];

        for (part_name, part_bytes) in parts {
            archive.start_file(part_name, stored).expect("Operation failed");
            archive.write_all(part_bytes).expect("Operation failed");
        }

        archive.finish().expect("Operation failed");
    }

    // Run the extractor on the generated file with the PPTX MIME type given explicitly.
    let outcome = extract_file(
        pptx_file.path(),
        Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        &ExtractionConfig::default(),
    )
    .await;

    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(e) => {
            // Distinguish the regression being guarded against from any other failure.
            let rendered = format!("{:?}", e);
            if !rendered.contains("No txBody found") {
                panic!("PPTX extraction failed with unexpected error: {:?}", e);
            }
            panic!(
                "PPTX extraction failed with 'No txBody found' error!\n\
                 This is GitHub Issue #321 Bug 1.\n\
                 The parser should skip shapes without txBody (image placeholders) \
                 instead of failing.\n\
                 Error: {:?}",
                e
            );
        }
    };

    let text = &extraction.content;
    assert!(!text.is_empty(), "Content should not be empty");

    // Text from the shapes on either side of the placeholder must both be present.
    assert!(
        text.contains("title text"),
        "Should extract text from first shape with txBody. Got: {}",
        text
    );
    assert!(
        text.contains("Content after"),
        "Should extract text from shape after image placeholder. Got: {}",
        text
    );

    println!("✅ PPTX with image placeholder (no txBody) extraction succeeded!");
    println!("   Content: {}", text);
}

/// Test extraction of a PPTX slide that interleaves text shapes with placeholder
/// shapes lacking `<p:txBody>`.
///
/// The single generated slide contains three text shapes separated by picture,
/// chart and body placeholders that have no text body. The test asserts that:
/// 1. Extraction succeeds rather than failing on the first shape without txBody
/// 2. The text of all three shapes WITH txBody appears in the extracted content
///
/// GitHub Issue #321 Bug 1
#[tokio::test]
async fn test_pptx_mixed_shapes_extraction() {
    let mut pptx_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");

    {
        let mut archive = ZipWriter::new(&mut pptx_file);
        let stored: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);

        // Every package part as (zip entry name, entry bytes), in write order.
        let parts: [(&str, &[u8]); 6] = [
            // Content type declarations
            (
                "[Content_Types].xml",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
  <Default Extension="xml" ContentType="application/xml"/>
  <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
  <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
</Types>"#,
            ),
            // Package-level relationships
            (
                "_rels/.rels",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
            ),
            // Presentation part listing a single slide id
            (
                "ppt/presentation.xml",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
                xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
                xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
  <p:sldIdLst>
    <p:sldId id="256" r:id="rId2"/>
  </p:sldIdLst>
</p:presentation>"#,
            ),
            // Presentation relationships (rId2 -> slide1)
            (
                "ppt/_rels/presentation.xml.rels",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#,
            ),
            // Slide mixing shapes with and without txBody
            (
                "ppt/slides/slide1.xml",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
       xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
  <p:cSld>
    <p:spTree>
      <p:nvGrpSpPr>
        <p:cNvPr id="1" name=""/>
        <p:cNvGrpSpPr/>
        <p:nvPr/>
      </p:nvGrpSpPr>
      <p:grpSpPr/>

      <!-- Shape 1: Normal text -->
      <p:sp>
        <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
        <p:spPr/>
        <p:txBody>
          <a:bodyPr/><a:lstStyle/>
          <a:p><a:r><a:t>First Text Shape</a:t></a:r></a:p>
        </p:txBody>
      </p:sp>

      <!-- Shape 2: Image placeholder (NO txBody) -->
      <p:sp>
        <p:nvSpPr>
          <p:cNvPr id="10" name="Picture Placeholder"/>
          <p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
          <p:nvPr><p:ph type="pic"/></p:nvPr>
        </p:nvSpPr>
        <p:spPr/>
      </p:sp>

      <!-- Shape 3: Another text shape -->
      <p:sp>
        <p:nvSpPr><p:cNvPr id="3" name="Body"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
        <p:spPr/>
        <p:txBody>
          <a:bodyPr/><a:lstStyle/>
          <a:p><a:r><a:t>Second Text Shape</a:t></a:r></a:p>
        </p:txBody>
      </p:sp>

      <!-- Shape 4: Chart placeholder (NO txBody) -->
      <p:sp>
        <p:nvSpPr>
          <p:cNvPr id="11" name="Chart Placeholder"/>
          <p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
          <p:nvPr><p:ph type="chart"/></p:nvPr>
        </p:nvSpPr>
        <p:spPr/>
      </p:sp>

      <!-- Shape 5: Content placeholder (NO txBody - empty) -->
      <p:sp>
        <p:nvSpPr>
          <p:cNvPr id="12" name="Content Placeholder"/>
          <p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
          <p:nvPr><p:ph type="body"/></p:nvPr>
        </p:nvSpPr>
        <p:spPr/>
      </p:sp>

      <!-- Shape 6: Final text shape -->
      <p:sp>
        <p:nvSpPr><p:cNvPr id="4" name="Footer"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
        <p:spPr/>
        <p:txBody>
          <a:bodyPr/><a:lstStyle/>
          <a:p><a:r><a:t>Third Text Shape</a:t></a:r></a:p>
        </p:txBody>
      </p:sp>

    </p:spTree>
  </p:cSld>
</p:sld>"#,
            ),
            // Slide relationships (none)
            (
                "ppt/slides/_rels/slide1.xml.rels",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#,
            ),
        ];

        for (part_name, part_bytes) in parts {
            archive.start_file(part_name, stored).expect("Operation failed");
            archive.write_all(part_bytes).expect("Operation failed");
        }

        archive.finish().expect("Operation failed");
    }

    let outcome = extract_file(
        pptx_file.path(),
        Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        &ExtractionConfig::default(),
    )
    .await;

    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(e) => panic!(
            "PPTX extraction failed: {:?}\n\
             Shapes without txBody should be skipped gracefully.",
            e
        ),
    };

    // Each of the three text shapes must have contributed its text.
    let text = &extraction.content;
    assert!(text.contains("First Text Shape"), "Should extract first text shape");
    assert!(
        text.contains("Second Text Shape"),
        "Should extract second text shape (after image placeholder)"
    );
    assert!(
        text.contains("Third Text Shape"),
        "Should extract third text shape (after multiple placeholders)"
    );

    println!("✅ PPTX mixed shapes extraction succeeded!");
    println!("   All text shapes extracted despite image/chart/content placeholders without txBody");
}

/// Test that an image extracted from a PPTX reports the page number of the slide
/// it sits on.
///
/// The generated deck has two slides with a picture only on slide 1. In
/// `presentation.xml` the slide id list names slide 1 first, while
/// `presentation.xml.rels` lists the slide 2 relationship before the slide 1
/// relationship. The first extracted image must report `page_number == Some(1)`.
///
/// GitHub Issue #329: Image on slide 1 of 2-slide PPTX reports page_number=2
#[tokio::test]
async fn test_pptx_image_page_numbers_not_reversed() {
    let mut pptx_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");

    // Bytes laid out as a minimal 1x1 PNG: signature, IHDR, IDAT, IEND.
    // NOTE(review): the values are used as-is; this test does not itself check
    // that they decode as an image.
    let png_bytes: &[u8] = &[
        0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature
        0x00, 0x00, 0x00, 0x0D, // IHDR chunk length (13)
        0x49, 0x48, 0x44, 0x52, // "IHDR"
        0x00, 0x00, 0x00, 0x01, // width: 1
        0x00, 0x00, 0x00, 0x01, // height: 1
        0x08, 0x02, // bit depth: 8, color type: 2
        0x00, 0x00, 0x00, // compression, filter, interlace
        0x90, 0x77, 0x53, 0xDE, // IHDR CRC
        0x00, 0x00, 0x00, 0x0C, // IDAT chunk length (12)
        0x49, 0x44, 0x41, 0x54, // "IDAT"
        0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, // IDAT payload, first 8 bytes
        0x01, 0x01, 0x01, 0x00, // IDAT payload, last 4 bytes
        0x18, 0xDD, 0x8D, 0xB4, // IDAT CRC
        0x00, 0x00, 0x00, 0x00, // IEND chunk length (0)
        0x49, 0x45, 0x4E, 0x44, // "IEND"
        0xAE, 0x42, 0x60, 0x82, // IEND CRC
    ];

    {
        let mut archive = ZipWriter::new(&mut pptx_file);
        let stored: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);

        // Every package part as (zip entry name, entry bytes), in write order.
        let parts: [(&str, &[u8]); 9] = [
            // Content type declarations (two slides plus the png default)
            (
                "[Content_Types].xml",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
  <Default Extension="xml" ContentType="application/xml"/>
  <Default Extension="png" ContentType="image/png"/>
  <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
  <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
  <Override PartName="/ppt/slides/slide2.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
</Types>"#,
            ),
            // Package-level relationships
            (
                "_rels/.rels",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
            ),
            // Presentation part: slide id list is rId2 (slide1) then rId3 (slide2)
            (
                "ppt/presentation.xml",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
                xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
  <p:sldIdLst>
    <p:sldId id="256" r:id="rId2"/>
    <p:sldId id="257" r:id="rId3"/>
  </p:sldIdLst>
</p:presentation>"#,
            ),
            // BUG REPRODUCTION (GitHub Issue #329): the relationships deliberately
            // list slide2 before slide1, the opposite of the slide id list above.
            (
                "ppt/_rels/presentation.xml.rels",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide2.xml"/>
  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#,
            ),
            // Embedded image referenced from slide 1
            ("ppt/media/image1.png", png_bytes),
            // Slide 1: a title plus a <p:pic> embedding rId2
            (
                "ppt/slides/slide1.xml",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
       xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
  <p:cSld>
    <p:spTree>
      <p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
      <p:grpSpPr/>
      <p:sp>
        <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
        <p:spPr/>
        <p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 1 - Has Image</a:t></a:r></a:p></p:txBody>
      </p:sp>
      <p:pic>
        <p:nvPicPr>
          <p:cNvPr id="3" name="Picture 1"/>
          <p:cNvPicPr><a:picLocks noChangeAspect="1"/></p:cNvPicPr>
          <p:nvPr/>
        </p:nvPicPr>
        <p:blipFill>
          <a:blip r:embed="rId2"/>
          <a:stretch><a:fillRect/></a:stretch>
        </p:blipFill>
        <p:spPr>
          <a:xfrm><a:off x="0" y="0"/><a:ext cx="100000" cy="100000"/></a:xfrm>
          <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
        </p:spPr>
      </p:pic>
    </p:spTree>
  </p:cSld>
</p:sld>"#,
            ),
            // Slide 1 relationships (rId2 -> the image)
            (
                "ppt/slides/_rels/slide1.xml.rels",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
</Relationships>"#,
            ),
            // Slide 2: text only, no picture
            (
                "ppt/slides/slide2.xml",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
       xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
  <p:cSld>
    <p:spTree>
      <p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
      <p:grpSpPr/>
      <p:sp>
        <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
        <p:spPr/>
        <p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 2 - No Image</a:t></a:r></a:p></p:txBody>
      </p:sp>
    </p:spTree>
  </p:cSld>
</p:sld>"#,
            ),
            // Slide 2 relationships (none)
            (
                "ppt/slides/_rels/slide2.xml.rels",
                br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#,
            ),
        ];

        for (part_name, part_bytes) in parts {
            archive.start_file(part_name, stored).expect("Operation failed");
            archive.write_all(part_bytes).expect("Operation failed");
        }

        archive.finish().expect("Operation failed");
    }

    // Turn image extraction on; every other setting stays at its default.
    let image_settings = ImageExtractionConfig {
        extract_images: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        images: Some(image_settings),
        ..Default::default()
    };

    let outcome = extract_file(
        pptx_file.path(),
        Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        &config,
    )
    .await;

    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(e) => panic!("PPTX extraction failed: {:?}", e),
    };

    // Text from both slides must be present.
    assert!(extraction.content.contains("Slide 1"), "Should extract slide 1 text");
    assert!(extraction.content.contains("Slide 2"), "Should extract slide 2 text");

    // At least one image must have been extracted.
    let images = extraction.images.as_ref().expect("Images should be present");
    assert!(!images.is_empty(), "Should extract at least one image");

    // THE CRITICAL TEST: the image lives on slide 1, so page_number must be 1, NOT 2.
    let first_image = &images[0];
    assert_eq!(
        first_image.page_number,
        Some(1),
        "GitHub Issue #329: Image on slide 1 should have page_number=1, but got {:?}. \
         The page numbers are reversed!",
        first_image.page_number
    );

    println!("✅ PPTX image page numbers are correct!");
    println!("   Image on slide 1 has page_number={:?}", first_image.page_number);
}

/// Test image page numbers against the user-provided PPTX from GitHub Issue #329.
///
/// Per the issue, the file lists its slides in reverse order in
/// presentation.xml.rels, which caused images to get incorrect page numbers.
/// Every extracted image is expected to report `page_number == Some(1)`.
///
/// The fixture is optional: if the file is missing, or extraction yields no
/// images, the test prints a notice and returns without failing.
#[tokio::test]
async fn test_pptx_image_page_numbers_issue329_user_file() {
    // ancestors() yields the path itself, then its parent, then its grandparent;
    // the grandparent of the manifest directory is the workspace root.
    let repo_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .ancestors()
        .nth(2)
        .expect("Operation failed");
    let user_file = repo_root.join("test_documents/pptx/pptx_reversed_slide_order_issue329.pptx");

    if !user_file.exists() {
        println!("Skipping test: User file not found at {:?}", user_file);
        return;
    }

    // Turn image extraction on; every other setting stays at its default.
    let image_settings = ImageExtractionConfig {
        extract_images: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        images: Some(image_settings),
        ..Default::default()
    };

    let extraction = match extract_file(&user_file, None, &config).await {
        Ok(extraction) => extraction,
        Err(e) => panic!("Failed to extract user file: {:?}", e),
    };

    let images = extraction.images.as_ref().expect("Images should be extracted");

    if images.is_empty() {
        println!("No images extracted from user file (may not have embedded images)");
        return;
    }

    // All images in this file are expected on the first slide.
    images.iter().enumerate().for_each(|(idx, image)| {
        assert_eq!(
            image.page_number,
            Some(1),
            "GitHub Issue #329: Image {} should have page_number=1, but got {:?}",
            idx,
            image.page_number
        );
    });

    println!("✅ User file from Issue #329 - image page numbers correct!");
    println!("   Found {} images, all with page_number=1", images.len());
}

/// Regression guard for PR #1016: a `<p:pic>` whose `<a:blip>` carries no
/// `r:embed` attribute must be skipped rather than failing the whole slide.
///
/// Prior to that fix, `parse_pic(node)?` bubbled its error up and extraction
/// of the entire slide was abandoned. With the fix in place the broken image
/// is logged and skipped, and the remaining slide content survives.
///
/// The fixture is a minimal one-slide PPTX written to a temporary file. Its
/// slide contains a text shape, then a picture with a bare `<a:blip/>`, then a
/// second text shape; both text runs must show up in the extracted content.
#[tokio::test]
async fn test_pptx_broken_image_blip_missing_embed_skipped_gracefully() {
    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");

    {
        let mut archive = ZipWriter::new(&mut temp_file);
        let stored: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);

        // Opens a new archive entry and writes its full payload in one step.
        let mut add_part = |part_name: &str, part_bytes: &[u8]| {
            archive.start_file(part_name, stored).expect("Operation failed");
            archive.write_all(part_bytes).expect("Operation failed");
        };

        // Content-type manifest
        add_part(
            "[Content_Types].xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
  <Default Extension="xml" ContentType="application/xml"/>
  <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
  <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
</Types>"#,
        );

        // Package-level relationships
        add_part(
            "_rels/.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
        );

        // Presentation part listing the single slide
        add_part(
            "ppt/presentation.xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
                xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
                xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
  <p:sldIdLst>
    <p:sldId id="256" r:id="rId2"/>
  </p:sldIdLst>
</p:presentation>"#,
        );

        // Presentation relationships (rId2 -> slide1)
        add_part(
            "ppt/_rels/presentation.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#,
        );

        // The slide under test: the <a:blip> inside <p:pic> deliberately has
        // no r:embed attribute, and it sits between two ordinary text shapes.
        add_part(
            "ppt/slides/slide1.xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
       xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
  <p:cSld>
    <p:spTree>
      <p:nvGrpSpPr>
        <p:cNvPr id="1" name=""/>
        <p:cNvGrpSpPr/>
        <p:nvPr/>
      </p:nvGrpSpPr>
      <p:grpSpPr>
        <a:xfrm>
          <a:off x="0" y="0"/>
          <a:ext cx="0" cy="0"/>
          <a:chOff x="0" y="0"/>
          <a:chExt cx="0" cy="0"/>
        </a:xfrm>
      </p:grpSpPr>

      <!-- Normal text shape BEFORE the broken image - must be preserved -->
      <p:sp>
        <p:nvSpPr>
          <p:cNvPr id="2" name="Title"/>
          <p:cNvSpPr/>
          <p:nvPr/>
        </p:nvSpPr>
        <p:spPr>
          <a:xfrm>
            <a:off x="0" y="0"/>
            <a:ext cx="100000" cy="100000"/>
          </a:xfrm>
          <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
        </p:spPr>
        <p:txBody>
          <a:bodyPr/>
          <a:lstStyle/>
          <a:p>
            <a:r>
              <a:rPr lang="en-US"/>
              <a:t>Text before broken image</a:t>
            </a:r>
          </a:p>
        </p:txBody>
      </p:sp>

      <!-- BROKEN IMAGE: <a:blip> has NO r:embed attribute -->
      <!-- Before the fix, this would abort the entire page with:
           "Image embed attribute not found" -->
      <p:pic>
        <p:nvPicPr>
          <p:cNvPr id="4" name="Broken Image"/>
          <p:cNvPicPr>
            <a:picLocks noChangeAspect="1"/>
          </p:cNvPicPr>
          <p:nvPr/>
        </p:nvPicPr>
        <p:blipFill>
          <a:blip/>
          <a:stretch>
            <a:fillRect/>
          </a:stretch>
        </p:blipFill>
        <p:spPr>
          <a:xfrm>
            <a:off x="0" y="200000"/>
            <a:ext cx="100000" cy="100000"/>
          </a:xfrm>
          <a:prstGeom prst="rect">
            <a:avLst/>
          </a:prstGeom>
        </p:spPr>
      </p:pic>

      <!-- Normal text shape AFTER the broken image - must be preserved -->
      <p:sp>
        <p:nvSpPr>
          <p:cNvPr id="3" name="Content"/>
          <p:cNvSpPr/>
          <p:nvPr/>
        </p:nvSpPr>
        <p:spPr>
          <a:xfrm>
            <a:off x="0" y="400000"/>
            <a:ext cx="100000" cy="100000"/>
          </a:xfrm>
          <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
        </p:spPr>
        <p:txBody>
          <a:bodyPr/>
          <a:lstStyle/>
          <a:p>
            <a:r>
              <a:rPr lang="en-US"/>
              <a:t>Text after broken image</a:t>
            </a:r>
          </a:p>
        </p:txBody>
      </p:sp>

    </p:spTree>
  </p:cSld>
</p:sld>"#,
        );

        // Slide relationships: intentionally empty, the test needs no image rels.
        add_part(
            "ppt/slides/_rels/slide1.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#,
        );

        archive.finish().expect("Operation failed");
    }

    let outcome = extract_file(
        temp_file.path(),
        Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        &ExtractionConfig::default(),
    )
    .await;

    // Any extraction error fails the test; the PR #1016 regression gets a
    // dedicated, more explanatory panic message.
    let extraction = match outcome {
        Ok(extraction) => extraction,
        Err(e) => {
            let error_msg = format!("{:?}", e);
            if !error_msg.contains("Image embed attribute not found") {
                panic!("PPTX extraction failed with unexpected error: {:?}", e);
            }
            panic!(
                "PPTX extraction failed with 'Image embed attribute not found' error!\n\
                 This is the regression for PR #1016.\n\
                 The parser should skip <pic> elements whose <a:blip> lacks r:embed\n\
                 instead of failing the entire page.\n\
                 Error: {:?}",
                e
            );
        }
    };

    let text = &extraction.content;
    assert!(!text.is_empty(), "Content should not be empty");

    // The shape that precedes the broken picture must still be present.
    assert!(
        text.contains("Text before broken image"),
        "Should preserve text before broken image. Got: {}",
        text
    );

    // The shape that follows the broken picture must still be present.
    assert!(
        text.contains("Text after broken image"),
        "Should preserve text after broken image. Got: {}",
        text
    );

    println!("✅ PPTX with broken image (blip missing r:embed) extraction succeeded!");
    println!("   Content: {}", text);
}