Files
fil/crates/kreuzberg/tests/jats_extractor_tests.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

693 lines
21 KiB
Rust

//! Comprehensive test suite for the JATS (Journal Article Tag Suite) extractor.
//!
//! This test suite validates all aspects of the JATS extractor including:
//! - Metadata extraction (title, authors, affiliations, DOI, keywords, dates)
//! - Article content extraction (sections, paragraphs)
//! - Table extraction with proper structure
//! - Citation handling
//! - Edge cases and error handling
#[cfg(all(test, feature = "xml"))]
mod jats_extractor_tests {
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::extraction::derive::derive_extraction_result;
use kreuzberg::extractors::JatsExtractor;
use kreuzberg::plugins::{DocumentExtractor, Plugin};
use std::path::PathBuf;
/// Builds the path of the JATS fixture called `name`.
///
/// The path is formed by appending `../../test_documents/jats` and then
/// `name` to this crate's manifest directory (`CARGO_MANIFEST_DIR`, read at
/// compile time). The file is not checked for existence here.
fn jats_fixture(name: &str) -> PathBuf {
    let mut fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    fixture.push("../../test_documents/jats");
    fixture.push(name);
    fixture
}
/// Extracts a fully populated JATS article and checks both the plain-text
/// content and the `subject` metadata (title, DOI and keywords).
#[tokio::test]
async fn test_extract_complete_jats_article() {
    let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<article-meta>
<article-title>Effects of Caffeine on Human Health</article-title>
<subtitle>A Systematic Review and Meta-Analysis</subtitle>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Smith</surname>
<given-names>John A.</given-names>
</name>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Johnson</surname>
<given-names>Jane B.</given-names>
</name>
</contrib>
</contrib-group>
<aff id="aff1">Department of Medicine, Harvard University, Cambridge, MA</aff>
<article-id pub-id-type="doi">10.1371/journal.pmed.0020124</article-id>
<article-id pub-id-type="pii">05-PLME-RA-0071R2</article-id>
<pub-date pub-type="epub">
<day>18</day>
<month>04</month>
<year>2005</year>
</pub-date>
<volume>2</volume>
<issue>4</issue>
<fpage>e124</fpage>
<lpage>e132</lpage>
<kwd-group>
<kwd>caffeine</kwd>
<kwd>meta-analysis</kwd>
<kwd>systematic review</kwd>
</kwd-group>
<abstract>
<sec>
<title>Background</title>
<p>Caffeine is one of the most widely consumed psychoactive substances.</p>
</sec>
</abstract>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>This review examines the evidence for effects of caffeine.</p>
</sec>
</body>
</article>"#;
    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // A single `expect` prints the extractor's error on failure; the previous
    // `assert!(.. .is_ok())` threw that information away.
    let document = extractor
        .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from bytes should succeed");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    assert!(
        extraction.content.contains("Effects of Caffeine"),
        "article title missing from content"
    );
    assert!(
        extraction.content.contains("Introduction"),
        "body section title missing from content"
    );

    let subject = extraction
        .metadata
        .subject
        .expect("subject metadata should be populated for a complete article");
    assert!(subject.contains("Effects of Caffeine"), "title missing from subject: {:?}", subject);
    assert!(subject.contains("10.1371"), "DOI missing from subject: {:?}", subject);
    assert!(
        subject.contains("caffeine") || subject.contains("Keywords"),
        "keywords missing from subject: {:?}",
        subject
    );
}
/// Extracts an article with three authors and three affiliations and checks
/// that every author surname and the first affiliation appear in `subject`.
#[tokio::test]
async fn test_extract_rich_author_affiliation_metadata() {
    let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Advanced Study</article-title>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Alpha</surname>
<given-names>First</given-names>
</name>
<xref ref-type="aff" rid="aff1">1</xref>
<xref ref-type="aff" rid="aff2">2</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Beta</surname>
<given-names>Second</given-names>
</name>
<xref ref-type="aff" rid="aff1">1</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gamma</surname>
<given-names>Third</given-names>
</name>
<xref ref-type="aff" rid="aff3">3</xref>
<role>Correspondence</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label>Department of Science, University A, City A</aff>
<aff id="aff2"><label>2</label>Research Institute, City B</aff>
<aff id="aff3"><label>3</label>Medical Center, City C</aff>
</article-meta>
</front>
</article>"#;
    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // `expect` surfaces the extractor's error value if extraction fails.
    let document = extractor
        .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from bytes should succeed");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let subject = extraction
        .metadata
        .subject
        .expect("subject metadata should be populated when authors are present");
    // Name the missing value on failure instead of a bare "assertion failed".
    for expected in ["Alpha", "Beta", "Gamma", "Department of Science"] {
        assert!(
            subject.contains(expected),
            "subject is missing {:?}: {:?}",
            expected,
            subject
        );
    }
}
/// Extracts an article body with top-level and nested `<sec>` elements and
/// checks that every section title appears in the plain-text content.
#[tokio::test]
async fn test_extract_section_hierarchy() {
    let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Research Article</article-title>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>Introduction content here.</p>
</sec>
<sec id="s2">
<title>Methods</title>
<sec id="s2a">
<title>Study Design</title>
<p>Design content here.</p>
</sec>
<sec id="s2b">
<title>Participants</title>
<p>Participant content here.</p>
</sec>
</sec>
<sec id="s3">
<title>Results</title>
<p>Results content here.</p>
</sec>
<sec id="s4">
<title>Discussion</title>
<p>Discussion content here.</p>
</sec>
<sec id="s5">
<title>Conclusions</title>
<p>Conclusion content here.</p>
</sec>
</body>
</article>"#;
    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // `expect` surfaces the extractor's error value if extraction fails.
    let document = extractor
        .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from bytes should succeed");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    // Top-level titles first, then the two titles nested under "Methods".
    let headings = [
        "Introduction",
        "Methods",
        "Results",
        "Discussion",
        "Conclusions",
        "Study Design",
        "Participants",
    ];
    for heading in headings {
        assert!(
            extraction.content.contains(heading),
            "section title {:?} missing from content",
            heading
        );
    }
}
/// Extracts a captioned `<table-wrap>` and checks the shape of the resulting
/// table: one table, a four-column header row and the first cells of the
/// first two data rows.
///
/// The caption text itself is not asserted here.
#[tokio::test]
async fn test_extract_tables_with_captions() {
    let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Study Results</article-title>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Results</title>
<table-wrap id="tbl1">
<label>Table 1</label>
<caption>
<title>Characteristics of Study Population</title>
<p>Baseline characteristics of enrolled subjects.</p>
</caption>
<table>
<thead>
<tr>
<th>Parameter</th>
<th>Group A (n=50)</th>
<th>Group B (n=50)</th>
<th>P-value</th>
</tr>
</thead>
<tbody>
<tr>
<td>Age (years)</td>
<td>45.3 ± 8.2</td>
<td>44.9 ± 7.8</td>
<td>0.45</td>
</tr>
<tr>
<td>Sex (M/F)</td>
<td>28/22</td>
<td>26/24</td>
<td>0.58</td>
</tr>
<tr>
<td>BMI (kg/m²)</td>
<td>25.1 ± 3.2</td>
<td>24.8 ± 2.9</td>
<td>0.62</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</body>
</article>"#;
    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // `expect` surfaces the extractor's error value if extraction fails.
    let document = extractor
        .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from bytes should succeed");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    assert_eq!(extraction.tables.len(), 1, "expected exactly one extracted table");
    let table = &extraction.tables[0];
    // The row-count guard must run before the indexed accesses below.
    assert!(
        table.cells.len() >= 3,
        "expected at least 3 rows, got {}",
        table.cells.len()
    );
    assert_eq!(table.cells[0].len(), 4, "header row should have four columns");
    assert!(table.cells[0][0].contains("Parameter"));
    assert!(table.cells[1][0].contains("Age"));
    assert!(table.cells[2][0].contains("Sex"));
}
/// Extracts two sibling `<table-wrap>` elements and checks that both tables
/// are returned with their own column counts (2 and 3).
#[tokio::test]
async fn test_extract_multiple_tables() {
    let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Multi-Table Article</article-title>
</article-meta>
</front>
<body>
<table-wrap id="tbl1">
<table>
<thead>
<tr><th>A</th><th>B</th></tr>
</thead>
<tbody>
<tr><td>1</td><td>2</td></tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="tbl2">
<table>
<thead>
<tr><th>X</th><th>Y</th><th>Z</th></tr>
</thead>
<tbody>
<tr><td>a</td><td>b</td><td>c</td></tr>
</tbody>
</table>
</table-wrap>
</body>
</article>"#;
    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // `expect` surfaces the extractor's error value if extraction fails.
    let document = extractor
        .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from bytes should succeed");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    assert_eq!(extraction.tables.len(), 2, "expected both tables to be extracted");
    assert_eq!(
        extraction.tables[0].cells[0].len(),
        2,
        "first table should have two columns"
    );
    assert_eq!(
        extraction.tables[1].cells[0].len(),
        3,
        "second table should have three columns"
    );
}
/// Extracts a paragraph containing inline `<xref>` citations and checks that
/// the surrounding sentence text is kept in the content.
///
/// Only the body text is asserted; the `<ref-list>` entries are not checked.
#[tokio::test]
async fn test_extract_citations_in_text() {
    let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Citation Study</article-title>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>Previous research has shown effectiveness <xref ref-type="bibr" rid="ref1">1</xref>.
Other studies confirm this finding <xref ref-type="bibr" rid="ref2">2</xref>.</p>
</sec>
</body>
<back>
<ref-list>
<ref id="ref1">
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Author</surname>
<given-names>First</given-names>
</name>
</person-group>
<article-title>Original Research</article-title>
<source>Journal Name</source>
<year>2020</year>
</element-citation>
</ref>
<ref id="ref2">
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Researcher</surname>
<given-names>Second</given-names>
</name>
</person-group>
<article-title>Confirmatory Study</article-title>
<source>Other Journal</source>
<year>2021</year>
</element-citation>
</ref>
</ref-list>
</back>
</article>"#;
    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // `expect` surfaces the extractor's error value if extraction fails.
    let document = extractor
        .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from bytes should succeed");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    for sentence_start in ["Previous research", "Other studies"] {
        assert!(
            extraction.content.contains(sentence_start),
            "text around the citation is missing: {:?}",
            sentence_start
        );
    }
}
/// Extracts an article whose `<abstract>` is split into titled sections and
/// checks that abstract-related text appears in the `subject` metadata.
#[tokio::test]
async fn test_extract_structured_abstract() {
    let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Abstract Study</article-title>
<abstract>
<sec>
<title>Background</title>
<p>This is the background information of the study.</p>
</sec>
<sec>
<title>Methods and Findings</title>
<p>We used quantitative analysis to evaluate the hypothesis.</p>
</sec>
<sec>
<title>Conclusions</title>
<p>The study provides evidence that the hypothesis is correct.</p>
</sec>
</abstract>
</article-meta>
</front>
</article>"#;
    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // `expect` surfaces the extractor's error value if extraction fails.
    let document = extractor
        .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from bytes should succeed");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let subject = extraction
        .metadata
        .subject
        .expect("subject metadata should be populated when an abstract is present");
    // NOTE(review): the article title ("Abstract Study") also contains
    // "Abstract". If the subject embeds the title, this check passes even
    // when no abstract text was extracted -- consider tightening it.
    let mentions_abstract = ["background", "Background", "Abstract"]
        .iter()
        .any(|needle| subject.contains(*needle));
    assert!(mentions_abstract, "abstract text missing from subject: {:?}", subject);
}
/// Extracts an article with a `<corresp>` author note and checks that
/// `subject` metadata is produced.
///
/// Only the presence of `subject` is asserted; the e-mail address itself is
/// not checked.
#[tokio::test]
async fn test_extract_corresponding_author() {
    let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Study</article-title>
<author-notes>
<corresp id="cor1"><label>*</label>Corresponding author. E-mail: john.smith@example.com</corresp>
</author-notes>
</article-meta>
</front>
</article>"#;
    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // `expect` surfaces the extractor's error value if extraction fails.
    let document = extractor
        .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from bytes should succeed");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    assert!(
        extraction.metadata.subject.is_some(),
        "subject metadata should be populated"
    );
}
/// Extracts an article with a day/month/year `<pub-date>` and checks that
/// `created_at` metadata is populated.
///
/// Only the presence of the value is asserted, not its format.
#[tokio::test]
async fn test_extract_publication_date() {
    let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Publication Test</article-title>
<pub-date pub-type="epub">
<day>15</day>
<month>06</month>
<year>2023</year>
</pub-date>
</article-meta>
</front>
</article>"#;
    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // `expect` surfaces the extractor's error value if extraction fails.
    let document = extractor
        .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from bytes should succeed");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    assert!(
        extraction.metadata.created_at.is_some(),
        "created_at should be derived from <pub-date>"
    );
}
/// Extracts a JATS skeleton with empty `<article-meta>` and `<body>` and
/// checks that extraction succeeds with no content beyond whitespace.
#[tokio::test]
async fn test_extract_minimal_jats() {
    let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
</article-meta>
</front>
<body>
</body>
</article>"#;
    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // `expect` surfaces the extractor's error value if extraction fails.
    let document = extractor
        .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from bytes should succeed for an empty article");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    // An empty string is also empty after trimming, so the trimmed check
    // covers both cases on its own.
    assert!(
        extraction.content.trim().is_empty(),
        "expected no content, got {:?}",
        extraction.content
    );
}
/// Checks that the extractor advertises both JATS MIME types.
#[test]
fn test_jats_supported_mime_types() {
    let jats = JatsExtractor;
    let advertised = jats.supported_mime_types();
    for expected in ["application/x-jats+xml", "text/jats"] {
        assert!(advertised.contains(&expected));
    }
}
/// Checks that the extractor reports a priority of 50.
#[test]
fn test_jats_extractor_priority() {
    let jats = JatsExtractor;
    let priority = jats.priority();
    assert_eq!(priority, 50);
}
/// Checks the plugin-facing surface of the extractor: its name, a non-empty
/// version string, and successful `initialize` / `shutdown` calls.
#[test]
fn test_jats_plugin_interface() {
    let jats = JatsExtractor;

    let plugin_name = jats.name();
    assert_eq!(plugin_name, "jats-extractor");

    let has_version = !jats.version().is_empty();
    assert!(has_version);

    // Lifecycle calls stay in their original order: initialize, then shutdown.
    let initialized = jats.initialize().is_ok();
    assert!(initialized);
    let shut_down = jats.shutdown().is_ok();
    assert!(shut_down);
}
/// Extracts a section where a table sits between two paragraphs and checks
/// that both paragraphs reach the content and that one table is returned.
#[tokio::test]
async fn test_extract_mixed_content() {
    let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Mixed Content</article-title>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Analysis</title>
<p>First paragraph with data.</p>
<table-wrap id="tbl1">
<table>
<thead>
<tr><th>Data</th><th>Value</th></tr>
</thead>
<tbody>
<tr><td>Sample</td><td>100</td></tr>
</tbody>
</table>
</table-wrap>
<p>Second paragraph after table.</p>
</sec>
</body>
</article>"#;
    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // `expect` surfaces the extractor's error value if extraction fails.
    let document = extractor
        .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from bytes should succeed");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    for paragraph_start in ["First paragraph", "Second paragraph"] {
        assert!(
            extraction.content.contains(paragraph_start),
            "paragraph text missing from content: {:?}",
            paragraph_start
        );
    }
    assert_eq!(extraction.tables.len(), 1, "expected exactly one extracted table");
}
/// Extracts an article with two `<kwd-group>` elements (English and Spanish)
/// and checks that keyword text appears in the `subject` metadata.
#[tokio::test]
async fn test_extract_multiple_keywords() {
    let jats_content = r#"<?xml version="1.0" encoding="UTF-8"?>
<article>
<front>
<article-meta>
<article-title>Keyword Test</article-title>
<kwd-group xml:lang="en">
<kwd>primary keyword</kwd>
<kwd>secondary keyword</kwd>
</kwd-group>
<kwd-group xml:lang="es">
<kwd>palabra clave</kwd>
</kwd-group>
</article-meta>
</front>
</article>"#;
    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // `expect` surfaces the extractor's error value if extraction fails.
    let document = extractor
        .extract_bytes(jats_content.as_bytes(), "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from bytes should succeed");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let subject = extraction
        .metadata
        .subject
        .expect("subject metadata should be populated when keywords are present");
    // NOTE(review): the article title ("Keyword Test") already contains
    // "Keyword". If the subject embeds the title, this check passes even
    // when no <kwd> was extracted -- consider asserting on "primary keyword".
    assert!(
        subject.contains("keyword") || subject.contains("Keyword"),
        "keywords missing from subject: {:?}",
        subject
    );
}
/// Runs the file-based extraction path against the `sample_article.jats`
/// fixture and checks that content and `subject` metadata are produced.
///
/// If the fixture is missing the test is skipped, with a note on stderr so
/// the skip shows up in test output instead of looking like a real pass.
#[tokio::test]
async fn test_extract_jats_file() {
    let test_file = jats_fixture("sample_article.jats");
    if !test_file.exists() {
        eprintln!(
            "skipping test_extract_jats_file: fixture not found at {}",
            test_file.display()
        );
        return;
    }

    let extractor = JatsExtractor;
    let config = ExtractionConfig::default();
    // `expect` surfaces the extractor's error value if extraction fails.
    let document = extractor
        .extract_file(&test_file, "application/x-jats+xml", &config)
        .await
        .expect("JATS extraction from the fixture file should succeed");
    let extraction = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    assert!(!extraction.content.is_empty(), "fixture should yield content");
    assert!(
        extraction.metadata.subject.is_some(),
        "fixture should yield subject metadata"
    );
}
}