This commit is contained in:
456
crates/kreuzberg/tests/bibtex_parity_test.rs
Normal file
456
crates/kreuzberg/tests/bibtex_parity_test.rs
Normal file
@@ -0,0 +1,456 @@
|
||||
#![cfg(feature = "office")]
|
||||
//! Comprehensive test for BibTeX extractor parity with Pandoc
|
||||
|
||||
use kreuzberg::core::config::ExtractionConfig;
|
||||
use kreuzberg::extraction::derive::derive_extraction_result;
|
||||
use kreuzberg::extractors::BibtexExtractor;
|
||||
use kreuzberg::plugins::DocumentExtractor;
|
||||
use kreuzberg::types::metadata::FormatMetadata;
|
||||
|
||||
mod helpers;
|
||||
use helpers::get_test_file_path;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_all_entry_types() {
|
||||
let extractor = BibtexExtractor;
|
||||
|
||||
let test_cases = vec![
|
||||
(
|
||||
"@article{test, author={John Doe}, title={Test}, journal={Journal}, year={2023}}",
|
||||
"article",
|
||||
),
|
||||
(
|
||||
"@book{test, author={John Doe}, title={Test}, publisher={Publisher}, year={2023}}",
|
||||
"book",
|
||||
),
|
||||
(
|
||||
"@inproceedings{test, author={John Doe}, title={Test}, booktitle={Conference}, year={2023}}",
|
||||
"inproceedings",
|
||||
),
|
||||
(
|
||||
"@phdthesis{test, author={John Doe}, title={Test}, school={University}, year={2023}}",
|
||||
"phdthesis",
|
||||
),
|
||||
(
|
||||
"@mastersthesis{test, author={John Doe}, title={Test}, school={University}, year={2023}}",
|
||||
"mastersthesis",
|
||||
),
|
||||
(
|
||||
"@techreport{test, author={John Doe}, title={Test}, institution={Institute}, year={2023}}",
|
||||
"techreport",
|
||||
),
|
||||
("@manual{test, title={Test Manual}, year={2023}}", "manual"),
|
||||
("@misc{test, author={John Doe}, title={Test}, year={2023}}", "misc"),
|
||||
(
|
||||
"@unpublished{test, author={John Doe}, title={Test}, note={Unpublished}, year={2023}}",
|
||||
"unpublished",
|
||||
),
|
||||
(
|
||||
"@incollection{test, author={John Doe}, title={Test}, booktitle={Book}, publisher={Pub}, year={2023}}",
|
||||
"incollection",
|
||||
),
|
||||
(
|
||||
"@inbook{test, author={John Doe}, title={Test}, chapter={5}, publisher={Pub}, year={2023}}",
|
||||
"inbook",
|
||||
),
|
||||
(
|
||||
"@proceedings{test, title={Conference Proceedings}, year={2023}}",
|
||||
"proceedings",
|
||||
),
|
||||
("@booklet{test, title={Booklet}, year={2023}}", "booklet"),
|
||||
];
|
||||
|
||||
for (bibtex_content, expected_type) in test_cases {
|
||||
let config = ExtractionConfig::default();
|
||||
let doc_result = extractor
|
||||
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
||||
.await;
|
||||
|
||||
assert!(doc_result.is_ok(), "Failed to parse {} entry", expected_type);
|
||||
let result = derive_extraction_result(
|
||||
doc_result.expect("Operation failed"),
|
||||
false,
|
||||
kreuzberg::OutputFormat::Plain,
|
||||
);
|
||||
|
||||
if let Some(FormatMetadata::Bibtex(ref bibtex)) = result.metadata.format
|
||||
&& let Some(ref entry_types) = bibtex.entry_types
|
||||
{
|
||||
assert!(!entry_types.is_empty(), "Entry types should not be empty");
|
||||
println!("Entry type '{}' extracted successfully", expected_type);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_all_common_fields() {
|
||||
let extractor = BibtexExtractor;
|
||||
|
||||
let bibtex_content = r#"
|
||||
@article{comprehensive,
|
||||
author = {Smith, John and Doe, Jane},
|
||||
title = {Comprehensive Test},
|
||||
journal = {Test Journal},
|
||||
year = {2023},
|
||||
volume = {42},
|
||||
number = {3},
|
||||
pages = {123--145},
|
||||
month = {June},
|
||||
doi = {10.1234/test.001},
|
||||
url = {https://example.com},
|
||||
issn = {1234-5678},
|
||||
isbn = {978-0-12-345678-9},
|
||||
abstract = {This is an abstract},
|
||||
keywords = {test, bibtex},
|
||||
note = {Additional notes},
|
||||
publisher = {Test Publisher},
|
||||
address = {Test City},
|
||||
edition = {2nd},
|
||||
editor = {Editor Name},
|
||||
series = {Test Series},
|
||||
organization = {Test Org},
|
||||
institution = {Test Institute},
|
||||
school = {Test School},
|
||||
howpublished = {Online},
|
||||
type = {Research Article},
|
||||
chapter = {5},
|
||||
booktitle = {Book Title}
|
||||
}
|
||||
"#;
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let doc_result = extractor
|
||||
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
||||
.await;
|
||||
|
||||
assert!(doc_result.is_ok());
|
||||
let result = derive_extraction_result(
|
||||
doc_result.expect("Operation failed"),
|
||||
false,
|
||||
kreuzberg::OutputFormat::Plain,
|
||||
);
|
||||
|
||||
let content = &result.content;
|
||||
|
||||
let expected_fields = vec![
|
||||
"author",
|
||||
"title",
|
||||
"journal",
|
||||
"year",
|
||||
"volume",
|
||||
"number",
|
||||
"pages",
|
||||
"month",
|
||||
"doi",
|
||||
"url",
|
||||
"issn",
|
||||
"isbn",
|
||||
"abstract",
|
||||
"keywords",
|
||||
"note",
|
||||
"publisher",
|
||||
"address",
|
||||
"edition",
|
||||
"editor",
|
||||
"series",
|
||||
"organization",
|
||||
"institution",
|
||||
"school",
|
||||
"howpublished",
|
||||
"type",
|
||||
"chapter",
|
||||
"booktitle",
|
||||
];
|
||||
|
||||
let num_fields = expected_fields.len();
|
||||
for field in expected_fields {
|
||||
assert!(content.contains(field), "Field '{}' should be present in output", field);
|
||||
}
|
||||
|
||||
println!("All {} fields were extracted", num_fields);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_author_parsing() {
|
||||
let extractor = BibtexExtractor;
|
||||
|
||||
let test_cases = vec![
|
||||
("author = {John Doe}", vec!["John Doe"]),
|
||||
("author = {John Doe and Jane Smith}", vec!["John Doe", "Jane Smith"]),
|
||||
("author = {Smith, John and Doe, Jane}", vec!["Smith, John", "Doe, Jane"]),
|
||||
(
|
||||
"author = {John Doe and Jane Smith and Bob Jones}",
|
||||
vec!["John Doe", "Jane Smith", "Bob Jones"],
|
||||
),
|
||||
("author = {van der Berg, Hans}", vec!["van der Berg, Hans"]),
|
||||
("author = {Smith, Jr., John}", vec!["Smith, Jr., John"]),
|
||||
];
|
||||
|
||||
for (author_field, expected_authors) in test_cases {
|
||||
let bibtex = format!("@article{{test, {}, title={{Test}}, year={{2023}}}}", author_field);
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let doc_result = extractor
|
||||
.extract_bytes(bibtex.as_bytes(), "application/x-bibtex", &config)
|
||||
.await;
|
||||
|
||||
assert!(doc_result.is_ok());
|
||||
let result = derive_extraction_result(
|
||||
doc_result.expect("Operation failed"),
|
||||
false,
|
||||
kreuzberg::OutputFormat::Plain,
|
||||
);
|
||||
|
||||
if let Some(authors) = &result.metadata.authors {
|
||||
for expected_author in &expected_authors {
|
||||
let found = authors.iter().any(|a| a.contains(expected_author));
|
||||
assert!(
|
||||
found,
|
||||
"Expected author '{}' not found in {:?}",
|
||||
expected_author, authors
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_special_characters() {
|
||||
let extractor = BibtexExtractor;
|
||||
|
||||
let bibtex_content = r#"
|
||||
@article{special,
|
||||
author = {M{\"u}ller, Hans and Sch{\"o}n, Anna and Garc{\'\i}a, Jos{\'e}},
|
||||
title = {Special Characters in {BibTeX}: {\"O}berblick},
|
||||
journal = {Test Journal},
|
||||
year = {2022}
|
||||
}
|
||||
"#;
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let doc_result = extractor
|
||||
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
||||
.await;
|
||||
|
||||
assert!(doc_result.is_ok());
|
||||
let result = derive_extraction_result(
|
||||
doc_result.expect("Operation failed"),
|
||||
false,
|
||||
kreuzberg::OutputFormat::Plain,
|
||||
);
|
||||
|
||||
if let Some(FormatMetadata::Bibtex(ref bibtex)) = result.metadata.format {
|
||||
assert_eq!(bibtex.entry_count, 1);
|
||||
} else {
|
||||
panic!("Expected BibtexMetadata in format");
|
||||
}
|
||||
|
||||
if let Some(authors) = &result.metadata.authors {
|
||||
assert!(authors.len() >= 3, "Should have 3 authors");
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_year_range_extraction() {
|
||||
let extractor = BibtexExtractor;
|
||||
|
||||
let bibtex_content = r#"
|
||||
@article{old, author={A}, title={Old}, year={1990}}
|
||||
@article{mid, author={B}, title={Mid}, year={2005}}
|
||||
@article{new, author={C}, title={New}, year={2023}}
|
||||
"#;
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let doc_result = extractor
|
||||
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
||||
.await;
|
||||
|
||||
assert!(doc_result.is_ok());
|
||||
let result = derive_extraction_result(
|
||||
doc_result.expect("Operation failed"),
|
||||
false,
|
||||
kreuzberg::OutputFormat::Plain,
|
||||
);
|
||||
|
||||
if let Some(FormatMetadata::Bibtex(ref bibtex)) = result.metadata.format {
|
||||
let year_range = bibtex.year_range.as_ref().expect("Year range not extracted");
|
||||
assert_eq!(year_range.min, Some(1990));
|
||||
assert_eq!(year_range.max, Some(2023));
|
||||
assert_eq!(year_range.years.len(), 3, "Should have 3 unique years");
|
||||
} else {
|
||||
panic!("Expected BibtexMetadata in format");
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_citation_keys_extraction() {
|
||||
let extractor = BibtexExtractor;
|
||||
|
||||
let bibtex_content = r#"
|
||||
@article{key1, author={A}, title={T1}, year={2023}}
|
||||
@book{key2, author={B}, title={T2}, year={2023}}
|
||||
@inproceedings{key3, author={C}, title={T3}, year={2023}}
|
||||
"#;
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let doc_result = extractor
|
||||
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
||||
.await;
|
||||
|
||||
assert!(doc_result.is_ok());
|
||||
let result = derive_extraction_result(
|
||||
doc_result.expect("Operation failed"),
|
||||
false,
|
||||
kreuzberg::OutputFormat::Plain,
|
||||
);
|
||||
|
||||
if let Some(FormatMetadata::Bibtex(ref bibtex)) = result.metadata.format {
|
||||
assert_eq!(bibtex.citation_keys.len(), 3);
|
||||
|
||||
let expected_keys = vec!["key1", "key2", "key3"];
|
||||
for expected_key in expected_keys {
|
||||
let found = bibtex.citation_keys.iter().any(|k| k == expected_key);
|
||||
assert!(found, "Citation key '{}' not found", expected_key);
|
||||
}
|
||||
} else {
|
||||
panic!("Expected BibtexMetadata in format");
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_entry_type_distribution() {
|
||||
let extractor = BibtexExtractor;
|
||||
|
||||
let bibtex_content = r#"
|
||||
@article{a1, author={A}, title={T1}, year={2023}}
|
||||
@article{a2, author={B}, title={T2}, year={2023}}
|
||||
@book{b1, author={C}, title={T3}, year={2023}}
|
||||
@inproceedings{c1, author={D}, title={T4}, year={2023}}
|
||||
@inproceedings{c2, author={E}, title={T5}, year={2023}}
|
||||
@inproceedings{c3, author={F}, title={T6}, year={2023}}
|
||||
"#;
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let doc_result = extractor
|
||||
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
||||
.await;
|
||||
|
||||
assert!(doc_result.is_ok());
|
||||
let result = derive_extraction_result(
|
||||
doc_result.expect("Operation failed"),
|
||||
false,
|
||||
kreuzberg::OutputFormat::Plain,
|
||||
);
|
||||
|
||||
if let Some(FormatMetadata::Bibtex(ref bibtex)) = result.metadata.format {
|
||||
let entry_types = bibtex.entry_types.as_ref().expect("Entry types not extracted");
|
||||
|
||||
assert_eq!(entry_types.get("article"), Some(&2));
|
||||
assert_eq!(entry_types.get("book"), Some(&1));
|
||||
assert_eq!(entry_types.get("inproceedings"), Some(&3));
|
||||
} else {
|
||||
panic!("Expected BibtexMetadata in format");
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_unicode_support() {
|
||||
let extractor = BibtexExtractor;
|
||||
|
||||
let bibtex_content = r#"
|
||||
@article{unicode,
|
||||
author = {Müller, Hans and Søren, Kierkegård},
|
||||
title = {Unicode in BibTeX: A Global Perspective},
|
||||
journal = {International Journal},
|
||||
year = {2023}
|
||||
}
|
||||
"#;
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let doc_result = extractor
|
||||
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
||||
.await;
|
||||
|
||||
assert!(doc_result.is_ok());
|
||||
let result = derive_extraction_result(
|
||||
doc_result.expect("Operation failed"),
|
||||
false,
|
||||
kreuzberg::OutputFormat::Plain,
|
||||
);
|
||||
|
||||
if let Some(FormatMetadata::Bibtex(ref bibtex)) = result.metadata.format {
|
||||
assert_eq!(bibtex.entry_count, 1);
|
||||
} else {
|
||||
panic!("Expected BibtexMetadata in format");
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_empty_fields() {
|
||||
let extractor = BibtexExtractor;
|
||||
|
||||
let bibtex_content = r#"
|
||||
@article{empty,
|
||||
author = {Smith, John},
|
||||
title = {Test},
|
||||
journal = {},
|
||||
year = {2023},
|
||||
volume = {}
|
||||
}
|
||||
"#;
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let doc_result = extractor
|
||||
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
||||
.await;
|
||||
|
||||
assert!(doc_result.is_ok());
|
||||
let result = derive_extraction_result(
|
||||
doc_result.expect("Operation failed"),
|
||||
false,
|
||||
kreuzberg::OutputFormat::Plain,
|
||||
);
|
||||
if let Some(FormatMetadata::Bibtex(ref bibtex)) = result.metadata.format {
|
||||
assert_eq!(bibtex.entry_count, 1);
|
||||
} else {
|
||||
panic!("Expected BibtexMetadata in format");
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_comprehensive_file() {
|
||||
let extractor = BibtexExtractor;
|
||||
|
||||
let fixture_path = get_test_file_path("bibtex/comprehensive.bib");
|
||||
let bibtex_content = std::fs::read(&fixture_path)
|
||||
.unwrap_or_else(|err| panic!("Failed to read test file at {}: {}", fixture_path.display(), err));
|
||||
|
||||
let config = ExtractionConfig::default();
|
||||
let doc_result = extractor
|
||||
.extract_bytes(&bibtex_content, "application/x-bibtex", &config)
|
||||
.await;
|
||||
|
||||
assert!(doc_result.is_ok());
|
||||
let result = derive_extraction_result(
|
||||
doc_result.expect("Operation failed"),
|
||||
false,
|
||||
kreuzberg::OutputFormat::Plain,
|
||||
);
|
||||
|
||||
if let Some(FormatMetadata::Bibtex(ref bibtex)) = result.metadata.format {
|
||||
assert_eq!(bibtex.entry_count, 20);
|
||||
|
||||
let entry_types = bibtex.entry_types.as_ref().expect("Entry types not extracted");
|
||||
assert!(entry_types.len() >= 10, "Should have at least 10 different entry types");
|
||||
|
||||
if let Some(authors) = &result.metadata.authors {
|
||||
assert!(authors.len() > 10, "Should have many unique authors");
|
||||
}
|
||||
|
||||
let year_range = bibtex.year_range.as_ref().expect("Year range not extracted");
|
||||
assert!(year_range.min.is_some());
|
||||
assert!(year_range.max.is_some());
|
||||
} else {
|
||||
panic!("Expected BibtexMetadata in format");
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user