Files
fil/crates/kreuzberg/tests/docbook_extractor_tests.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

503 lines
16 KiB
Rust

//! Comprehensive tests for DocBook extractor supporting both 4.x and 5.x versions.
#![cfg(feature = "xml")]
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::extraction::derive::derive_extraction_result;
use kreuzberg::plugins::{DocumentExtractor, Plugin};
use kreuzberg::types::ExtractionResult;
use std::path::PathBuf;
/// Builds the path to a DocBook fixture file.
///
/// The fixture directory is `test_documents/docbook`, located two directory
/// levels above this crate's manifest directory (`CARGO_MANIFEST_DIR`).
///
/// # Panics
///
/// Panics if the manifest directory has fewer than two parent directories.
fn test_file_path(filename: &str) -> PathBuf {
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    // Walk up two levels in one step so a single message states the invariant.
    let base_dir = manifest_dir
        .parent()
        .and_then(|parent| parent.parent())
        .expect("CARGO_MANIFEST_DIR should have at least two parent directories");
    base_dir.join("test_documents").join("docbook").join(filename)
}
/// Test helper for DocBook 4.x fixture files.
///
/// Resolves `filename` with `test_file_path`, runs `DocbookExtractor::extract_file`
/// using the `application/docbook+xml` MIME type and a default `ExtractionConfig`,
/// and converts the outcome with `derive_extraction_result` using
/// `OutputFormat::Plain`. Extraction errors are returned to the caller.
async fn extract_docbook4_file(filename: &str) -> kreuzberg::Result<ExtractionResult> {
    let fixture = test_file_path(filename);
    let default_config = ExtractionConfig::default();
    let docbook_extractor = kreuzberg::extractors::DocbookExtractor;
    let extracted = docbook_extractor
        .extract_file(&fixture, "application/docbook+xml", &default_config)
        .await?;
    let derived = derive_extraction_result(extracted, true, kreuzberg::OutputFormat::Plain);
    Ok(derived)
}
/// Test helper for DocBook 5.x fixture files.
///
/// Resolves `filename` with `test_file_path`, runs `DocbookExtractor::extract_file`
/// using the `application/docbook+xml` MIME type and a default `ExtractionConfig`,
/// and converts the outcome with `derive_extraction_result` using
/// `OutputFormat::Plain`. No version-specific option is passed to the extractor.
/// Extraction errors are returned to the caller.
async fn extract_docbook5_file(filename: &str) -> kreuzberg::Result<ExtractionResult> {
    let default_config = ExtractionConfig::default();
    let fixture = test_file_path(filename);
    let docbook_extractor = kreuzberg::extractors::DocbookExtractor;
    let extracted = docbook_extractor
        .extract_file(&fixture, "application/docbook+xml", &default_config)
        .await?;
    let derived = derive_extraction_result(extracted, true, kreuzberg::OutputFormat::Plain);
    Ok(derived)
}
/// Test helper that extracts an in-memory DocBook document.
///
/// Passes `content` and `mime_type` to `DocbookExtractor::extract_bytes` with a
/// default `ExtractionConfig`, then converts the outcome with
/// `derive_extraction_result` using `OutputFormat::Plain`. Extraction errors
/// are returned to the caller.
async fn extract_docbook_bytes(content: &[u8], mime_type: &str) -> kreuzberg::Result<ExtractionResult> {
    let default_config = ExtractionConfig::default();
    let docbook_extractor = kreuzberg::extractors::DocbookExtractor;
    let extracted = docbook_extractor
        .extract_bytes(content, mime_type, &default_config)
        .await?;
    let derived = derive_extraction_result(extracted, true, kreuzberg::OutputFormat::Plain);
    Ok(derived)
}
/// Checks the extractor's plugin name and that `initialize` and `shutdown`
/// both return `Ok`, in that order.
#[test]
fn test_docbook_extractor_plugin_interface() {
    let plugin = kreuzberg::extractors::DocbookExtractor;

    let reported_name = plugin.name();
    assert_eq!(reported_name, "docbook-extractor");

    let init_outcome = plugin.initialize();
    assert!(init_outcome.is_ok());

    let shutdown_outcome = plugin.shutdown();
    assert!(shutdown_outcome.is_ok());
}
/// Checks that both DocBook MIME types are listed as supported.
#[test]
fn test_docbook_extractor_supported_mime_types() {
    let supported = kreuzberg::extractors::DocbookExtractor.supported_mime_types();
    for expected in ["application/docbook+xml", "text/docbook"] {
        assert!(supported.contains(&expected));
    }
}
/// Checks that the extractor reports a priority of 50.
#[test]
fn test_docbook_extractor_priority() {
    let reported_priority = kreuzberg::extractors::DocbookExtractor.priority();
    assert_eq!(reported_priority, 50);
}
/// Extracts the `docbook-chapter.docbook` fixture and checks that the chapter
/// title and a section title appear in the extracted content.
#[tokio::test]
async fn test_docbook4_chapter_extraction() {
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook4_file("docbook-chapter.docbook")
        .await
        .expect("Failed to extract DocBook 4 chapter");
    assert!(!result.content.is_empty(), "Content should not be empty");
    assert!(
        result.content.contains("Test Chapter"),
        "Content should contain chapter title"
    );
    assert!(
        result.content.contains("Like a Sect1"),
        "Content should contain section titles"
    );
}
/// Extracts the `docbook-reader.docbook` fixture and checks that the article
/// title appears in the extracted content.
#[tokio::test]
async fn test_docbook5_reader_extraction() {
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook5_file("docbook-reader.docbook")
        .await
        .expect("Failed to extract DocBook 5 file");
    assert!(!result.content.is_empty(), "Content should not be empty");
    assert!(
        result.content.contains("Pandoc Test Suite"),
        "Content should contain article title"
    );
}
/// Extracts the `docbook-xref.docbook` fixture and checks that the book title
/// and the xref chapter title appear in the extracted content.
#[tokio::test]
async fn test_docbook_xref_extraction() {
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook4_file("docbook-xref.docbook")
        .await
        .expect("Failed to extract DocBook with xref elements");
    assert!(!result.content.is_empty(), "Content should not be empty");
    assert!(
        result.content.contains("An Example Book"),
        "Content should contain book title"
    );
    assert!(
        result.content.contains("XRef Samples"),
        "Content should contain xref chapter"
    );
}
/// Extracts the `tables.docbook4` fixture and checks that both content and at
/// least one table are produced.
#[tokio::test]
async fn test_docbook_tables_extraction() {
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook4_file("tables.docbook4")
        .await
        .expect("Failed to extract DocBook with tables");
    assert!(!result.content.is_empty(), "Content should not be empty");
    assert!(!result.tables.is_empty(), "Should extract tables from DocBook");
}
/// Extracts the `tables.docbook5` fixture and checks that both content and at
/// least one table are produced.
#[tokio::test]
async fn test_docbook5_tables_extraction() {
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook5_file("tables.docbook5")
        .await
        .expect("Failed to extract DocBook 5 with tables");
    assert!(!result.content.is_empty(), "Content should not be empty");
    assert!(!result.tables.is_empty(), "Should extract tables from DocBook 5");
}
/// Extracts the `docbook-reader.docbook` fixture and checks that the content
/// is not empty.
///
/// NOTE(review): despite the test name, no metadata field is inspected here;
/// only non-empty content is asserted.
#[tokio::test]
async fn test_docbook_metadata_extraction() {
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook5_file("docbook-reader.docbook")
        .await
        .expect("Failed to extract docbook-reader.docbook");
    assert!(!result.content.is_empty(), "Content should not be empty");
}
/// Extracts the `docbook-chapter.docbook` fixture and checks that the titles
/// of all four section levels appear in the extracted content.
#[tokio::test]
async fn test_docbook_section_hierarchy() {
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook4_file("docbook-chapter.docbook")
        .await
        .expect("Failed to extract docbook-chapter.docbook");
    let content = &result.content;
    for title in ["Like a Sect1", "Like a Sect2", "Like a Sect3", "Like a Sect4"] {
        assert!(
            content.contains(title),
            "Content should contain section title {:?}",
            title
        );
    }
}
/// Extracts the `docbook-chapter.docbook` fixture and checks that paragraph
/// text appears in the extracted content.
#[tokio::test]
async fn test_docbook_paragraph_extraction() {
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook4_file("docbook-chapter.docbook")
        .await
        .expect("Failed to extract docbook-chapter.docbook");
    assert!(
        result.content.contains("This chapter uses recursive sections"),
        "Should extract paragraph content"
    );
}
/// Extracts an inline DocBook 4.4 article and checks that the title and both
/// paragraphs appear in the extracted content.
#[tokio::test]
async fn test_docbook_paragraph_content() {
    let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<article>
<title>Test Article</title>
<para>This is a test paragraph.</para>
<para>This is another paragraph with <emphasis>emphasized</emphasis> text.</para>
</article>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract inline DocBook article");
    for expected in ["Test Article", "This is a test paragraph", "another paragraph"] {
        assert!(
            result.content.contains(expected),
            "Content should contain {:?}",
            expected
        );
    }
}
/// Extracts an inline article containing a `<programlisting>` and checks that
/// the listing's text appears in the extracted content.
#[tokio::test]
async fn test_docbook_code_block_extraction() {
    let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<article>
<para>Here is code:</para>
<programlisting>
def hello():
print("world")
</programlisting>
</article>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract DocBook with a programlisting");
    for expected in ["def hello", "print"] {
        assert!(
            result.content.contains(expected),
            "Content should contain {:?}",
            expected
        );
    }
}
/// Extracts an inline book with a chapter and nested section and checks that
/// every title and the paragraph appear in the extracted content.
#[tokio::test]
async fn test_docbook_mixed_content() {
    let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<book>
<title>Test Book</title>
<chapter>
<title>Chapter 1</title>
<section>
<title>Section 1.1</title>
<para>Paragraph in section.</para>
</section>
</chapter>
</book>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract inline DocBook book");
    for expected in ["Test Book", "Chapter 1", "Section 1.1", "Paragraph in section"] {
        assert!(
            result.content.contains(expected),
            "Content should contain {:?}",
            expected
        );
    }
}
/// Extracts an inline article that declares the DocBook 5 default namespace
/// and checks that the title and paragraph appear in the extracted content.
///
/// The fixture also carries author and date elements; they are not asserted.
#[tokio::test]
async fn test_docbook_namespaced_5x_parsing() {
    let docbook5 = r#"<?xml version="1.0" encoding="UTF-8"?>
<article xmlns="http://docbook.org/ns/docbook">
<info>
<title>DocBook 5 Article</title>
<author>
<personname>
<firstname>John</firstname>
<surname>Doe</surname>
</personname>
</author>
<date>2024-01-01</date>
</info>
<section>
<title>Introduction</title>
<para>Welcome to DocBook 5.</para>
</section>
</article>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook5.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract namespaced DocBook 5 article");
    for expected in ["DocBook 5 Article", "Welcome to DocBook 5"] {
        assert!(
            result.content.contains(expected),
            "Content should contain {:?}",
            expected
        );
    }
}
/// Extracts an inline article containing a `<link>` element and checks that
/// the string "example" appears in the extracted content.
///
/// NOTE(review): "example" occurs in both the link text and the `xlink:href`
/// URL, so the assertion passes if either one is emitted. The fixture also
/// uses the `xlink` prefix without declaring it — confirm this is intended.
#[tokio::test]
async fn test_docbook_link_handling() {
    let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<article>
<title>Links Test</title>
<para>See <link xlink:href="http://example.com">example site</link>.</para>
</article>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract DocBook with a link");
    assert!(
        result.content.contains("example"),
        "Content should contain the link text or target"
    );
}
/// Checks that the same inline article extracts successfully under both
/// supported MIME type strings.
#[tokio::test]
async fn test_docbook_mime_type_detection() {
    let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<article>
<title>Test</title>
</article>"#;
    for mime_type in ["application/docbook+xml", "text/docbook"] {
        let result = extract_docbook_bytes(docbook.as_bytes(), mime_type).await;
        // Report which MIME type failed and why, not just that something failed.
        assert!(
            result.is_ok(),
            "Extraction should succeed for MIME type {:?}: {:?}",
            mime_type,
            result.as_ref().err()
        );
    }
}
/// Extracts an inline article with one title-only section and one section
/// with a paragraph, and checks that both titles and the paragraph appear.
#[tokio::test]
async fn test_docbook_empty_sections() {
    let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<article>
<title>Empty Sections</title>
<section>
<title>Empty Section</title>
</section>
<section>
<title>Section with Content</title>
<para>Content here</para>
</section>
</article>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract DocBook with empty sections");
    for expected in ["Empty Section", "Section with Content", "Content here"] {
        assert!(
            result.content.contains(expected),
            "Content should contain {:?}",
            expected
        );
    }
}
/// Extracts an inline article with an `<itemizedlist>` and checks that the
/// text of all three items appears in the extracted content.
#[tokio::test]
async fn test_docbook_itemized_list() {
    let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<article>
<title>List Test</title>
<itemizedlist>
<listitem>
<para>First item</para>
</listitem>
<listitem>
<para>Second item</para>
</listitem>
<listitem>
<para>Third item</para>
</listitem>
</itemizedlist>
</article>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract DocBook with an itemized list");
    for expected in ["First item", "Second item", "Third item"] {
        assert!(
            result.content.contains(expected),
            "Content should contain {:?}",
            expected
        );
    }
}
/// Extracts an inline article with an `<orderedlist>` and checks that the
/// text of all three items appears in the extracted content.
#[tokio::test]
async fn test_docbook_ordered_list() {
    let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<article>
<title>Ordered List Test</title>
<orderedlist>
<listitem>
<para>First step</para>
</listitem>
<listitem>
<para>Second step</para>
</listitem>
<listitem>
<para>Third step</para>
</listitem>
</orderedlist>
</article>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract DocBook with an ordered list");
    for expected in ["First step", "Second step", "Third step"] {
        assert!(
            result.content.contains(expected),
            "Content should contain {:?}",
            expected
        );
    }
}
/// Extracts an inline article with a `<blockquote>` and checks that the quoted
/// paragraph's text appears in the extracted content.
#[tokio::test]
async fn test_docbook_blockquote() {
    let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<article>
<title>Blockquote Test</title>
<blockquote>
<para>This is a quoted passage.</para>
</blockquote>
</article>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract DocBook with a blockquote");
    assert!(
        result.content.contains("quoted passage"),
        "Content should contain the blockquote text"
    );
}
/// Extracts an inline article with a `<figure>` and checks that the string
/// "Figure" appears in the extracted content.
///
/// NOTE(review): "Figure" also occurs in the article title ("Figure Test"), so
/// this assertion alone does not prove the `<figure>` element was extracted.
#[tokio::test]
async fn test_docbook_figure() {
    let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<article>
<title>Figure Test</title>
<figure>
<title>Sample Figure</title>
<para>This is a figure description.</para>
</figure>
</article>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract DocBook with a figure");
    assert!(
        result.content.contains("Figure"),
        "Content should contain \"Figure\""
    );
}
/// Extracts an inline article with an inline `<footnote>` and checks that both
/// the surrounding paragraph text and the footnote text appear.
#[tokio::test]
async fn test_docbook_footnote() {
    let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<article>
<title>Footnote Test</title>
<para>Here is some text with a footnote<footnote><para>This is the footnote content</para></footnote>.</para>
</article>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract DocBook with a footnote");
    for expected in ["text with a footnote", "footnote content"] {
        assert!(
            result.content.contains(expected),
            "Content should contain {:?}",
            expected
        );
    }
}
/// Extracts an inline article mixing paragraphs, an itemized list and a
/// `<programlisting>`, and checks that text from each part appears.
#[tokio::test]
async fn test_docbook_mixed_content_with_lists() {
    let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
<article>
<title>Mixed Content</title>
<para>Introduction paragraph.</para>
<itemizedlist>
<listitem>
<para>List item 1</para>
</listitem>
<listitem>
<para>List item 2</para>
</listitem>
</itemizedlist>
<para>Conclusion paragraph.</para>
<programlisting>
code example
</programlisting>
</article>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract DocBook with mixed content");
    for expected in [
        "Introduction paragraph",
        "List item 1",
        "List item 2",
        "Conclusion paragraph",
        "code example",
    ] {
        assert!(
            result.content.contains(expected),
            "Content should contain {:?}",
            expected
        );
    }
}
/// Extracts an inline article that declares the DocBook 5 default namespace
/// and contains an itemized list, and checks that both items appear.
#[tokio::test]
async fn test_docbook_namespaced_lists() {
    let docbook5 = r#"<?xml version="1.0" encoding="UTF-8"?>
<article xmlns="http://docbook.org/ns/docbook">
<info>
<title>Lists in DocBook 5</title>
</info>
<itemizedlist>
<listitem>
<para>Namespaced item 1</para>
</listitem>
<listitem>
<para>Namespaced item 2</para>
</listitem>
</itemizedlist>
</article>"#;
    // Unwrap with `expect` so a failure reports the underlying extraction error.
    let result = extract_docbook_bytes(docbook5.as_bytes(), "application/docbook+xml")
        .await
        .expect("Failed to extract namespaced DocBook 5 list");
    for expected in ["Namespaced item 1", "Namespaced item 2"] {
        assert!(
            result.content.contains(expected),
            "Content should contain {:?}",
            expected
        );
    }
}