// fil/crates/kreuzberg/tests/jats_extractor_tests.rs

//! Comprehensive test suite for the JATS (Journal Article Tag Suite) extractor.
//!
//! This test suite validates all aspects of the JATS extractor including:
//! - Metadata extraction (title, authors, affiliations, DOI, keywords, dates)
//! - Article content extraction (sections, paragraphs)
//! - Table extraction with proper structure
//! - Citation handling
//! - Edge cases and error handling

#[cfg(all(test, feature = "xml"))]
mod jats_extractor_tests {
    use kreuzberg::core::config::ExtractionConfig;
    use kreuzberg::extraction::derive::derive_extraction_result;
    use kreuzberg::extractors::JatsExtractor;
    use kreuzberg::plugins::{DocumentExtractor, Plugin};
    use std::path::PathBuf;

    /// Builds the path to the JATS fixture `name`, resolved against
    /// `<CARGO_MANIFEST_DIR>/../../test_documents/jats`.
    fn jats_fixture(name: &str) -> PathBuf {
        let mut fixture_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        fixture_path.push("../../test_documents/jats");
        fixture_path.push(name);
        fixture_path
    }

    /// Verifies extraction of a JATS article that carries all key metadata fields.
    ///
    /// The fixture contains a title, subtitle, two authors, an affiliation, DOI and PII
    /// identifiers, an `epub` publication date, volume/issue/page data, keywords, a
    /// structured abstract and one body section. The test checks that the title and the
    /// body section heading appear in the plain-text content, and that the `subject`
    /// metadata mentions the title, the DOI prefix, and either the `caffeine` keyword or a
    /// `Keywords` label.
    #[tokio::test]
    async fn test_extract_complete_jats_article() {
        let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
  <front>
    <article-meta>
      <article-title>Effects of Caffeine on Human Health</article-title>
      <subtitle>A Systematic Review and Meta-Analysis</subtitle>
      <contrib-group>
        <contrib contrib-type="author">
          <name>
            <surname>Smith</surname>
            <given-names>John A.</given-names>
          </name>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Johnson</surname>
            <given-names>Jane B.</given-names>
          </name>
        </contrib>
      </contrib-group>
      <aff id="aff1">Department of Medicine, Harvard University, Cambridge, MA</aff>
      <article-id pub-id-type="doi">10.1371/journal.pmed.0020124</article-id>
      <article-id pub-id-type="pii">05-PLME-RA-0071R2</article-id>
      <pub-date pub-type="epub">
        <day>18</day>
        <month>04</month>
        <year>2005</year>
      </pub-date>
      <volume>2</volume>
      <issue>4</issue>
      <fpage>e124</fpage>
      <lpage>e132</lpage>
      <kwd-group>
        <kwd>caffeine</kwd>
        <kwd>meta-analysis</kwd>
        <kwd>systematic review</kwd>
      </kwd-group>
      <abstract>
        <sec>
          <title>Background</title>
          <p>Caffeine is one of the most widely consumed psychoactive substances.</p>
        </sec>
      </abstract>
    </article-meta>
  </front>
  <body>
    <sec id="s1">
      <title>Introduction</title>
      <p>This review examines the evidence for effects of caffeine.</p>
    </sec>
  </body>
</article>"#;

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // A single `expect` reports the extractor's error on failure; a preceding
        // `assert!(result.is_ok())` would panic first and discard it.
        let document = extractor
            .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
            .await
            .expect("extracting a well-formed JATS article should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        assert!(
            extraction.content.contains("Effects of Caffeine"),
            "content should include the article title"
        );
        assert!(
            extraction.content.contains("Introduction"),
            "content should include the body section heading"
        );

        let subject = extraction
            .metadata
            .subject
            .expect("subject metadata should be populated for a complete article");
        assert!(
            subject.contains("Effects of Caffeine"),
            "subject should mention the article title, got: {:?}",
            subject
        );
        assert!(
            subject.contains("10.1371"),
            "subject should mention the DOI, got: {:?}",
            subject
        );
        assert!(
            subject.contains("caffeine") || subject.contains("Keywords"),
            "subject should mention the keywords, got: {:?}",
            subject
        );
    }

    /// Verifies that author and affiliation data reach the `subject` metadata.
    ///
    /// The fixture lists three authors, each cross-referenced to one or more of three
    /// affiliations. The test checks that all three surnames and the text of the first
    /// affiliation appear in `subject`.
    #[tokio::test]
    async fn test_extract_rich_author_affiliation_metadata() {
        let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
  <front>
    <article-meta>
      <article-title>Advanced Study</article-title>
      <contrib-group>
        <contrib contrib-type="author">
          <name>
            <surname>Alpha</surname>
            <given-names>First</given-names>
          </name>
          <xref ref-type="aff" rid="aff1">1</xref>
          <xref ref-type="aff" rid="aff2">2</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Beta</surname>
            <given-names>Second</given-names>
          </name>
          <xref ref-type="aff" rid="aff1">1</xref>
        </contrib>
        <contrib contrib-type="author">
          <name>
            <surname>Gamma</surname>
            <given-names>Third</given-names>
          </name>
          <xref ref-type="aff" rid="aff3">3</xref>
          <role>Correspondence</role>
        </contrib>
      </contrib-group>
      <aff id="aff1"><label>1</label>Department of Science, University A, City A</aff>
      <aff id="aff2"><label>2</label>Research Institute, City B</aff>
      <aff id="aff3"><label>3</label>Medical Center, City C</aff>
    </article-meta>
  </front>
</article>"#;

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // `expect` alone keeps the extractor's error visible if extraction fails.
        let document = extractor
            .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
            .await
            .expect("extracting author/affiliation metadata should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        let subject = extraction
            .metadata
            .subject
            .expect("subject metadata should be populated when authors are present");
        for expected in ["Alpha", "Beta", "Gamma", "Department of Science"] {
            assert!(
                subject.contains(expected),
                "subject should mention {:?}, got: {:?}",
                expected,
                subject
            );
        }
    }

    /// Verifies that top-level and nested section titles appear in the extracted content.
    ///
    /// The fixture body has five top-level sections; `Methods` contains two nested
    /// subsections (`Study Design`, `Participants`). The test only checks that each title
    /// is present in the plain-text content, not their order or nesting.
    #[tokio::test]
    async fn test_extract_section_hierarchy() {
        let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
  <front>
    <article-meta>
      <article-title>Research Article</article-title>
    </article-meta>
  </front>
  <body>
    <sec id="s1">
      <title>Introduction</title>
      <p>Introduction content here.</p>
    </sec>
    <sec id="s2">
      <title>Methods</title>
      <sec id="s2a">
        <title>Study Design</title>
        <p>Design content here.</p>
      </sec>
      <sec id="s2b">
        <title>Participants</title>
        <p>Participant content here.</p>
      </sec>
    </sec>
    <sec id="s3">
      <title>Results</title>
      <p>Results content here.</p>
    </sec>
    <sec id="s4">
      <title>Discussion</title>
      <p>Discussion content here.</p>
    </sec>
    <sec id="s5">
      <title>Conclusions</title>
      <p>Conclusion content here.</p>
    </sec>
  </body>
</article>"#;

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // `expect` alone keeps the extractor's error visible if extraction fails.
        let document = extractor
            .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
            .await
            .expect("extracting a sectioned JATS body should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        let expected_titles = [
            "Introduction",
            "Methods",
            "Results",
            "Discussion",
            "Conclusions",
            "Study Design",
            "Participants",
        ];
        for title in expected_titles {
            assert!(
                extraction.content.contains(title),
                "content should include the section title {:?}",
                title
            );
        }
    }

    /// Verifies extraction of a captioned table with a header row and data rows.
    ///
    /// The fixture wraps one table (one `<thead>` row, three `<tbody>` rows, four columns)
    /// in a `<table-wrap>` with a label and caption. The test checks that exactly one table
    /// is returned, that it has at least three rows with four cells in the first row, and
    /// that the first cell of the first three rows matches the fixture. The caption itself
    /// is not asserted on.
    #[tokio::test]
    async fn test_extract_tables_with_captions() {
        let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
  <front>
    <article-meta>
      <article-title>Study Results</article-title>
    </article-meta>
  </front>
  <body>
    <sec id="s1">
      <title>Results</title>
      <table-wrap id="tbl1">
        <label>Table 1</label>
        <caption>
          <title>Characteristics of Study Population</title>
          <p>Baseline characteristics of enrolled subjects.</p>
        </caption>
        <table>
          <thead>
            <tr>
              <th>Parameter</th>
              <th>Group A (n=50)</th>
              <th>Group B (n=50)</th>
              <th>P-value</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td>Age (years)</td>
              <td>45.3 ± 8.2</td>
              <td>44.9 ± 7.8</td>
              <td>0.45</td>
            </tr>
            <tr>
              <td>Sex (M/F)</td>
              <td>28/22</td>
              <td>26/24</td>
              <td>0.58</td>
            </tr>
            <tr>
              <td>BMI (kg/m²)</td>
              <td>25.1 ± 3.2</td>
              <td>24.8 ± 2.9</td>
              <td>0.62</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
    </sec>
  </body>
</article>"#;

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // `expect` alone keeps the extractor's error visible if extraction fails.
        let document = extractor
            .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
            .await
            .expect("extracting a JATS article with a table should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        assert_eq!(extraction.tables.len(), 1, "exactly one table should be extracted");
        let table = &extraction.tables[0];

        // Guard the row count before indexing so a short table fails with a clear message.
        assert!(
            table.cells.len() >= 3,
            "table should have at least three rows, got {}",
            table.cells.len()
        );
        assert_eq!(table.cells[0].len(), 4, "first row should have four cells");

        assert!(
            table.cells[0][0].contains("Parameter"),
            "first row should start with the `Parameter` header cell"
        );
        assert!(
            table.cells[1][0].contains("Age"),
            "second row should start with the `Age` cell"
        );
        assert!(
            table.cells[2][0].contains("Sex"),
            "third row should start with the `Sex` cell"
        );
    }

    /// Verifies that several tables in one article are extracted separately.
    ///
    /// The fixture holds a two-column table followed by a three-column table. The test
    /// checks that two tables are returned and that the first row of each has the
    /// matching number of cells.
    #[tokio::test]
    async fn test_extract_multiple_tables() {
        let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
  <front>
    <article-meta>
      <article-title>Multi-Table Article</article-title>
    </article-meta>
  </front>
  <body>
    <table-wrap id="tbl1">
      <table>
        <thead>
          <tr><th>A</th><th>B</th></tr>
        </thead>
        <tbody>
          <tr><td>1</td><td>2</td></tr>
        </tbody>
      </table>
    </table-wrap>
    <table-wrap id="tbl2">
      <table>
        <thead>
          <tr><th>X</th><th>Y</th><th>Z</th></tr>
        </thead>
        <tbody>
          <tr><td>a</td><td>b</td><td>c</td></tr>
        </tbody>
      </table>
    </table-wrap>
  </body>
</article>"#;

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // `expect` alone keeps the extractor's error visible if extraction fails.
        let document = extractor
            .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
            .await
            .expect("extracting a JATS article with two tables should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        assert_eq!(extraction.tables.len(), 2, "both tables should be extracted");
        assert_eq!(
            extraction.tables[0].cells[0].len(),
            2,
            "first table should have two columns"
        );
        assert_eq!(
            extraction.tables[1].cells[0].len(),
            3,
            "second table should have three columns"
        );
    }

    /// Verifies that paragraph text containing inline `<xref>` citations is preserved.
    ///
    /// The fixture has one paragraph with two bibliographic cross-references and a
    /// reference list in `<back>`. The test only checks that the prose around both
    /// citations appears in the content; the reference list itself is not asserted on.
    #[tokio::test]
    async fn test_extract_citations_in_text() {
        let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
  <front>
    <article-meta>
      <article-title>Citation Study</article-title>
    </article-meta>
  </front>
  <body>
    <sec id="s1">
      <title>Introduction</title>
      <p>Previous research has shown effectiveness <xref ref-type="bibr" rid="ref1">1</xref>.
         Other studies confirm this finding <xref ref-type="bibr" rid="ref2">2</xref>.</p>
    </sec>
  </body>
  <back>
    <ref-list>
      <ref id="ref1">
        <element-citation publication-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Author</surname>
              <given-names>First</given-names>
            </name>
          </person-group>
          <article-title>Original Research</article-title>
          <source>Journal Name</source>
          <year>2020</year>
        </element-citation>
      </ref>
      <ref id="ref2">
        <element-citation publication-type="journal">
          <person-group person-group-type="author">
            <name>
              <surname>Researcher</surname>
              <given-names>Second</given-names>
            </name>
          </person-group>
          <article-title>Confirmatory Study</article-title>
          <source>Other Journal</source>
          <year>2021</year>
        </element-citation>
      </ref>
    </ref-list>
  </back>
</article>"#;

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // `expect` alone keeps the extractor's error visible if extraction fails.
        let document = extractor
            .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
            .await
            .expect("extracting a JATS article with citations should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        assert!(
            extraction.content.contains("Previous research"),
            "content should keep the text before the first citation"
        );
        assert!(
            extraction.content.contains("Other studies"),
            "content should keep the text before the second citation"
        );
    }

    /// Verifies that a structured abstract is reflected in the `subject` metadata.
    ///
    /// The fixture's abstract has three titled sections. The test accepts any of
    /// `background`, `Background` or `Abstract` appearing in `subject`.
    #[tokio::test]
    async fn test_extract_structured_abstract() {
        let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
  <front>
    <article-meta>
      <article-title>Abstract Study</article-title>
      <abstract>
        <sec>
          <title>Background</title>
          <p>This is the background information of the study.</p>
        </sec>
        <sec>
          <title>Methods and Findings</title>
          <p>We used quantitative analysis to evaluate the hypothesis.</p>
        </sec>
        <sec>
          <title>Conclusions</title>
          <p>The study provides evidence that the hypothesis is correct.</p>
        </sec>
      </abstract>
    </article-meta>
  </front>
</article>"#;

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // `expect` alone keeps the extractor's error visible if extraction fails.
        let document = extractor
            .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
            .await
            .expect("extracting a JATS article with a structured abstract should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        let subject = extraction
            .metadata
            .subject
            .expect("subject metadata should be populated when an abstract is present");
        assert!(
            subject.contains("background") || subject.contains("Background") || subject.contains("Abstract"),
            "subject should reflect the abstract, got: {:?}",
            subject
        );
    }

    /// Verifies that an article with corresponding-author notes extracts successfully.
    ///
    /// The fixture contains a `<corresp>` note with an e-mail address. The test only
    /// checks that `subject` metadata is present; it does not check that the
    /// correspondence text itself was captured.
    #[tokio::test]
    async fn test_extract_corresponding_author() {
        let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
  <front>
    <article-meta>
      <article-title>Study</article-title>
      <author-notes>
        <corresp id="cor1"><label>*</label>Corresponding author. E-mail: john.smith@example.com</corresp>
      </author-notes>
    </article-meta>
  </front>
</article>"#;

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // `expect` alone keeps the extractor's error visible if extraction fails.
        let document = extractor
            .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
            .await
            .expect("extracting a JATS article with author notes should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        assert!(
            extraction.metadata.subject.is_some(),
            "subject metadata should be populated"
        );
    }

    /// Verifies that a publication date populates the `created_at` metadata.
    ///
    /// The fixture has a single `epub` `<pub-date>` with day, month and year elements.
    /// The test only checks that `created_at` is set, not its value or format.
    #[tokio::test]
    async fn test_extract_publication_date() {
        let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
  <front>
    <article-meta>
      <article-title>Publication Test</article-title>
      <pub-date pub-type="epub">
        <day>15</day>
        <month>06</month>
        <year>2023</year>
      </pub-date>
    </article-meta>
  </front>
</article>"#;

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // `expect` alone keeps the extractor's error visible if extraction fails.
        let document = extractor
            .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
            .await
            .expect("extracting a JATS article with a publication date should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        assert!(
            extraction.metadata.created_at.is_some(),
            "created_at should be populated from the publication date"
        );
    }

    /// Verifies that a JATS document with empty metadata and body extracts without error.
    ///
    /// The resulting content must be empty or whitespace-only.
    #[tokio::test]
    async fn test_extract_minimal_jats() {
        let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
  <front>
    <article-meta>
    </article-meta>
  </front>
  <body>
  </body>
</article>"#;

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // `expect` alone keeps the extractor's error visible if extraction fails.
        let document = extractor
            .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
            .await
            .expect("extracting a minimal JATS document should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        // `trim().is_empty()` already covers the plain empty-string case.
        assert!(
            extraction.content.trim().is_empty(),
            "content of an empty article should be blank, got: {:?}",
            extraction.content
        );
    }

    /// Checks that the extractor advertises both JATS MIME types.
    #[test]
    fn test_jats_supported_mime_types() {
        let extractor = JatsExtractor;
        let advertised = extractor.supported_mime_types();

        for expected in ["application/x-jats+xml", "text/jats"] {
            assert!(advertised.contains(&expected));
        }
    }

    /// Checks that the extractor reports a priority of 50.
    #[test]
    fn test_jats_extractor_priority() {
        let jats = JatsExtractor;
        let reported_priority = jats.priority();
        assert_eq!(reported_priority, 50);
    }

    /// Checks the plugin-facing surface of the extractor: its name, a non-empty
    /// version, and that `initialize` and `shutdown` both return `Ok`.
    #[test]
    fn test_jats_plugin_interface() {
        let plugin = JatsExtractor;

        let plugin_name = plugin.name();
        assert_eq!(plugin_name, "jats-extractor");

        let plugin_version = plugin.version();
        assert!(!plugin_version.is_empty());

        let init_outcome = plugin.initialize();
        assert!(init_outcome.is_ok());

        let shutdown_outcome = plugin.shutdown();
        assert!(shutdown_outcome.is_ok());
    }

    /// Verifies extraction of a section that interleaves paragraphs with a table.
    ///
    /// The fixture places one table between two paragraphs. The test checks that both
    /// paragraphs appear in the content and that exactly one table is returned.
    #[tokio::test]
    async fn test_extract_mixed_content() {
        let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
  <front>
    <article-meta>
      <article-title>Mixed Content</article-title>
    </article-meta>
  </front>
  <body>
    <sec id="s1">
      <title>Analysis</title>
      <p>First paragraph with data.</p>
      <table-wrap id="tbl1">
        <table>
          <thead>
            <tr><th>Data</th><th>Value</th></tr>
          </thead>
          <tbody>
            <tr><td>Sample</td><td>100</td></tr>
          </tbody>
        </table>
      </table-wrap>
      <p>Second paragraph after table.</p>
    </sec>
  </body>
</article>"#;

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // `expect` alone keeps the extractor's error visible if extraction fails.
        let document = extractor
            .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
            .await
            .expect("extracting a JATS article with mixed content should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        assert!(
            extraction.content.contains("First paragraph"),
            "content should include the paragraph before the table"
        );
        assert!(
            extraction.content.contains("Second paragraph"),
            "content should include the paragraph after the table"
        );
        assert_eq!(extraction.tables.len(), 1, "exactly one table should be extracted");
    }

    /// Verifies that keywords from multiple `<kwd-group>` elements reach `subject`.
    ///
    /// The fixture has an English and a Spanish keyword group. The test accepts either
    /// `keyword` or `Keyword` appearing in `subject`; the Spanish keyword is not asserted on.
    #[tokio::test]
    async fn test_extract_multiple_keywords() {
        let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
  <front>
    <article-meta>
      <article-title>Keyword Test</article-title>
      <kwd-group xml:lang="en">
        <kwd>primary keyword</kwd>
        <kwd>secondary keyword</kwd>
      </kwd-group>
      <kwd-group xml:lang="es">
        <kwd>palabra clave</kwd>
      </kwd-group>
    </article-meta>
  </front>
</article>"#;

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // `expect` alone keeps the extractor's error visible if extraction fails.
        let document = extractor
            .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
            .await
            .expect("extracting a JATS article with keyword groups should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        let subject = extraction
            .metadata
            .subject
            .expect("subject metadata should be populated when keywords are present");
        assert!(
            subject.contains("keyword") || subject.contains("Keyword"),
            "subject should mention the keywords, got: {:?}",
            subject
        );
    }

    /// Verifies file-based extraction against the `sample_article.jats` fixture.
    ///
    /// The fixture is optional: when it is missing the test prints a skip notice and
    /// returns without failing. When it is present, the test checks that extraction
    /// yields non-empty content and `subject` metadata.
    #[tokio::test]
    async fn test_extract_jats_file() {
        let test_file = jats_fixture("sample_article.jats");
        if !test_file.exists() {
            // Make the skip visible (e.g. with `--nocapture`) instead of passing silently.
            eprintln!(
                "skipping test_extract_jats_file: fixture {} not found",
                test_file.display()
            );
            return;
        }

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        // `expect` alone keeps the extractor's error visible if extraction fails.
        let document = extractor
            .extract_file(&test_file, "application/x-jats+xml", &config)
            .await
            .expect("extracting the sample JATS fixture file should succeed");
        let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        assert!(
            !extraction.content.is_empty(),
            "content extracted from the fixture file should not be empty"
        );
        assert!(
            extraction.metadata.subject.is_some(),
            "subject metadata should be populated for the fixture file"
        );
    }
}