//! Comprehensive test suite for the JATS (Journal Article Tag Suite) extractor.
//!
//! This test suite validates all aspects of the JATS extractor including:
//! - Metadata extraction (title, authors, affiliations, DOI, keywords, dates)
//! - Article content extraction (sections, paragraphs)
//! - Table extraction with proper structure
//! - Citation handling
//! - Edge cases and error handling

// NOTE(review): the inline fixtures below show no XML markup in this copy of the
// file. Confirm the JATS tags are intact in the checked-in source; the table and
// metadata assertions presumably depend on them.

#[cfg(all(test, feature = "xml"))]
mod jats_extractor_tests {
    use kreuzberg::core::config::ExtractionConfig;
    use kreuzberg::extraction::derive::derive_extraction_result;
    use kreuzberg::extractors::JatsExtractor;
    use kreuzberg::plugins::{DocumentExtractor, Plugin};
    use std::path::PathBuf;

    /// MIME type passed to the extractor by every extraction test in this module.
    const JATS_MIME: &str = "application/x-jats+xml";

    /// Resolves `name` inside the shared `test_documents/jats` fixture directory,
    /// relative to this crate's manifest directory.
    fn jats_fixture(name: &str) -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("../../test_documents/jats")
            .join(name)
    }

    /// Extracts an inline fixture with the default configuration and derives the
    /// plain-format extraction result from it.
    ///
    /// Must be invoked inside an `async` context because it awaits the extractor.
    /// Panics through `expect` when extraction fails, so the underlying error is
    /// part of the test failure output.
    ///
    /// This is a macro rather than a function so that the helper does not have to
    /// name the type returned by `derive_extraction_result`, which this file does
    /// not import.
    macro_rules! extract_plain {
        ($content:expr) => {{
            let extractor = JatsExtractor;
            let config = ExtractionConfig::default();
            let document = extractor
                .extract_bytes($content.as_bytes(), JATS_MIME, &config)
                .await
                .expect("JATS extraction should succeed");
            derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain)
        }};
    }

    /// Test basic JATS article extraction with all key metadata fields
    #[tokio::test]
    async fn test_extract_complete_jats_article() {
        let jats_content = r#"
Effects of Caffeine on Human Health A Systematic Review and Meta-Analysis Smith John A. Johnson Jane B. Department of Medicine, Harvard University, Cambridge, MA 10.1371/journal.pmed.0020124 05-PLME-RA-0071R2 18 04 2005 2 4 e124 e132 caffeine meta-analysis systematic review Background

Caffeine is one of the most widely consumed psychoactive substances.

Introduction

This review examines the evidence for effects of caffeine.

"#;

        let extraction = extract_plain!(jats_content);

        assert!(extraction.content.contains("Effects of Caffeine"));
        assert!(extraction.content.contains("Introduction"));

        let subject = extraction
            .metadata
            .subject
            .expect("subject metadata should be populated");
        assert!(subject.contains("Effects of Caffeine"));
        assert!(subject.contains("10.1371"));
        assert!(subject.contains("caffeine") || subject.contains("Keywords"));
    }

    /// Test extraction of rich metadata including all author and affiliation data
    #[tokio::test]
    async fn test_extract_rich_author_affiliation_metadata() {
        let jats_content = r#"
Advanced Study Alpha First 1 2 Beta Second 1 Gamma Third 3 Correspondence Department of Science, University A, City A Research Institute, City B Medical Center, City C
"#;

        let extraction = extract_plain!(jats_content);

        let subject = extraction
            .metadata
            .subject
            .expect("subject metadata should be populated");
        assert!(subject.contains("Alpha"));
        assert!(subject.contains("Beta"));
        assert!(subject.contains("Gamma"));
        assert!(subject.contains("Department of Science"));
    }

    /// Test section hierarchy extraction in article body
    #[tokio::test]
    async fn test_extract_section_hierarchy() {
        let jats_content = r#"
Research Article Introduction

Introduction content here.

Methods Study Design

Design content here.

Participants

Participant content here.

Results

Results content here.

Discussion

Discussion content here.

Conclusions

Conclusion content here.

"#;

        let extraction = extract_plain!(jats_content);

        assert!(extraction.content.contains("Introduction"));
        assert!(extraction.content.contains("Methods"));
        assert!(extraction.content.contains("Results"));
        assert!(extraction.content.contains("Discussion"));
        assert!(extraction.content.contains("Conclusions"));
        assert!(extraction.content.contains("Study Design"));
        assert!(extraction.content.contains("Participants"));
    }

    /// Test table extraction with headers and data rows
    #[tokio::test]
    async fn test_extract_tables_with_captions() {
        let jats_content = r#"
Study Results Results Characteristics of Study Population

Baseline characteristics of enrolled subjects.

Parameter Group A (n=50) Group B (n=50) P-value
Age (years) 45.3 ± 8.2 44.9 ± 7.8 0.45
Sex (M/F) 28/22 26/24 0.58
BMI (kg/m²) 25.1 ± 3.2 24.8 ± 2.9 0.62
"#;

        let extraction = extract_plain!(jats_content);

        assert_eq!(extraction.tables.len(), 1);

        let table = &extraction.tables[0];
        assert!(table.cells.len() >= 3);
        assert_eq!(table.cells[0].len(), 4);
        assert!(table.cells[0][0].contains("Parameter"));
        assert!(table.cells[1][0].contains("Age"));
        assert!(table.cells[2][0].contains("Sex"));
    }

    /// Test multiple tables extraction
    #[tokio::test]
    async fn test_extract_multiple_tables() {
        let jats_content = r#"
Multi-Table Article
AB
12
XYZ
abc
"#;

        let extraction = extract_plain!(jats_content);

        assert_eq!(extraction.tables.len(), 2);
        assert_eq!(extraction.tables[0].cells[0].len(), 2);
        assert_eq!(extraction.tables[1].cells[0].len(), 3);
    }

    /// Test citation extraction in text with xref elements
    #[tokio::test]
    async fn test_extract_citations_in_text() {
        let jats_content = r#"
Citation Study Introduction

Previous research has shown effectiveness 1. Other studies confirm this finding 2.

Author First Original Research Journal Name 2020 Researcher Second Confirmatory Study Other Journal 2021
"#;

        let extraction = extract_plain!(jats_content);

        assert!(extraction.content.contains("Previous research"));
        assert!(extraction.content.contains("Other studies"));
    }

    /// Test abstract extraction with structured sections
    #[tokio::test]
    async fn test_extract_structured_abstract() {
        let jats_content = r#"
Abstract Study Background

This is the background information of the study.

Methods and Findings

We used quantitative analysis to evaluate the hypothesis.

Conclusions

The study provides evidence that the hypothesis is correct.

"#;

        let extraction = extract_plain!(jats_content);

        let subject = extraction
            .metadata
            .subject
            .expect("subject metadata should be populated");
        assert!(
            subject.contains("background")
                || subject.contains("Background")
                || subject.contains("Abstract")
        );
    }

    /// Test corresponding author extraction
    #[tokio::test]
    async fn test_extract_corresponding_author() {
        let jats_content = r#"
Study Corresponding author. E-mail: john.smith@example.com
"#;

        let extraction = extract_plain!(jats_content);

        assert!(extraction.metadata.subject.is_some());
    }

    /// Test publication date extraction in various formats
    #[tokio::test]
    async fn test_extract_publication_date() {
        let jats_content = r#"
Publication Test 15 06 2023
"#;

        let extraction = extract_plain!(jats_content);

        assert!(extraction.metadata.created_at.is_some());
    }

    /// Test handling of empty/minimal JATS documents
    #[tokio::test]
    async fn test_extract_minimal_jats() {
        let jats_content = r#"
"#;

        let extraction = extract_plain!(jats_content);

        // `trim().is_empty()` already covers the completely empty string.
        assert!(extraction.content.trim().is_empty());
    }

    /// Test MIME type support
    #[test]
    fn test_jats_supported_mime_types() {
        let extractor = JatsExtractor;
        let mime_types = extractor.supported_mime_types();

        assert!(mime_types.contains(&"application/x-jats+xml"));
        assert!(mime_types.contains(&"text/jats"));
    }

    /// Test extractor priority value
    #[test]
    fn test_jats_extractor_priority() {
        let extractor = JatsExtractor;

        assert_eq!(extractor.priority(), 50);
    }

    /// Test plugin interface compliance
    #[test]
    fn test_jats_plugin_interface() {
        let extractor = JatsExtractor;

        assert_eq!(extractor.name(), "jats-extractor");
        assert!(!extractor.version().is_empty());
        assert!(extractor.initialize().is_ok());
        assert!(extractor.shutdown().is_ok());
    }

    /// Test mixed content with tables and paragraphs
    #[tokio::test]
    async fn test_extract_mixed_content() {
        let jats_content = r#"
Mixed Content Analysis

First paragraph with data.

DataValue
Sample100

Second paragraph after table.

"#;

        let extraction = extract_plain!(jats_content);

        assert!(extraction.content.contains("First paragraph"));
        assert!(extraction.content.contains("Second paragraph"));
        assert_eq!(extraction.tables.len(), 1);
    }

    /// Test extraction with multiple keyword groups
    #[tokio::test]
    async fn test_extract_multiple_keywords() {
        let jats_content = r#"
Keyword Test primary keyword secondary keyword palabra clave
"#;

        let extraction = extract_plain!(jats_content);

        let subject = extraction
            .metadata
            .subject
            .expect("subject metadata should be populated");
        assert!(subject.contains("keyword") || subject.contains("Keyword"));
    }

    /// Test full extraction workflow on file
    ///
    /// The test is skipped (with a notice on stderr) when the fixture file is
    /// not present, so checkouts without the shared test documents still pass.
    #[tokio::test]
    async fn test_extract_jats_file() {
        let test_file = jats_fixture("sample_article.jats");
        if !test_file.exists() {
            eprintln!(
                "skipping test_extract_jats_file: fixture not found at {}",
                test_file.display()
            );
            return;
        }

        let extractor = JatsExtractor;
        let config = ExtractionConfig::default();
        let document = extractor
            .extract_file(&test_file, JATS_MIME, &config)
            .await
            .expect("JATS file extraction should succeed");
        let extraction =
            derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

        assert!(!extraction.content.is_empty());
        assert!(extraction.metadata.subject.is_some());
    }
}