183 lines
5.9 KiB
Rust
183 lines
5.9 KiB
Rust
//! Regression tests: EPUB headings should be preserved for Markdown/Djot output.
//!
//! The native EPUB extractor historically returned plain text only, flattening
//! `<h1>`–`<h6>` into regular lines. When `ExtractionConfig.output_format` is set
//! to Markdown (or Djot), we should run the XHTML through the HTML→Markdown
//! converter so headings become `#` / `##` etc.
|
||
|
||
#![cfg(feature = "office")]
|
||
|
||
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
|
||
use kreuzberg::extraction::derive::derive_extraction_result;
|
||
use kreuzberg::extractors::EpubExtractor;
|
||
use kreuzberg::plugins::DocumentExtractor;
|
||
use std::io::{Cursor, Write};
|
||
use zip::write::FileOptions;
|
||
|
||
fn build_minimal_epub_bytes() -> Vec<u8> {
|
||
let container_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
||
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
|
||
<rootfiles>
|
||
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
|
||
</rootfiles>
|
||
</container>"#;
|
||
|
||
let opf_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
||
<package version="3.0" unique-identifier="bookid" xmlns="http://www.idpf.org/2007/opf">
|
||
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||
<dc:title>Test Book</dc:title>
|
||
<dc:language>en</dc:language>
|
||
</metadata>
|
||
<manifest>
|
||
<item id="c1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
|
||
</manifest>
|
||
<spine>
|
||
<itemref idref="c1"/>
|
||
</spine>
|
||
</package>"#;
|
||
|
||
let chapter_xhtml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||
<head><title>Chapter One</title></head>
|
||
<body>
|
||
<h1>Chapter One</h1>
|
||
<p>Some text.</p>
|
||
</body>
|
||
</html>"#;
|
||
|
||
let mut cursor = Cursor::new(Vec::<u8>::new());
|
||
let mut writer = zip::ZipWriter::new(&mut cursor);
|
||
let options = FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);
|
||
|
||
writer.start_file("mimetype", options).expect("zip start_file failed");
|
||
writer
|
||
.write_all(b"application/epub+zip")
|
||
.expect("zip write mimetype failed");
|
||
|
||
writer
|
||
.add_directory("META-INF/", options)
|
||
.expect("zip add_directory failed");
|
||
writer
|
||
.add_directory("OEBPS/", options)
|
||
.expect("zip add_directory failed");
|
||
|
||
writer
|
||
.start_file("META-INF/container.xml", options)
|
||
.expect("zip start_file failed");
|
||
writer
|
||
.write_all(container_xml.as_bytes())
|
||
.expect("zip write container.xml failed");
|
||
|
||
writer
|
||
.start_file("OEBPS/content.opf", options)
|
||
.expect("zip start_file failed");
|
||
writer
|
||
.write_all(opf_xml.as_bytes())
|
||
.expect("zip write content.opf failed");
|
||
|
||
writer
|
||
.start_file("OEBPS/chapter1.xhtml", options)
|
||
.expect("zip start_file failed");
|
||
writer
|
||
.write_all(chapter_xhtml.as_bytes())
|
||
.expect("zip write chapter1.xhtml failed");
|
||
|
||
writer.finish().expect("zip finish failed");
|
||
cursor.into_inner()
|
||
}
|
||
|
||
#[tokio::test]
|
||
async fn test_epub_markdown_output_keeps_headings() {
|
||
let bytes = build_minimal_epub_bytes();
|
||
let extractor = EpubExtractor;
|
||
|
||
let config = ExtractionConfig {
|
||
output_format: OutputFormat::Markdown,
|
||
..Default::default()
|
||
};
|
||
|
||
let doc = extractor
|
||
.extract_bytes(&bytes, "application/epub+zip", &config)
|
||
.await
|
||
.expect("EPUB extraction should succeed");
|
||
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Markdown);
|
||
assert!(
|
||
result.processing_warnings.is_empty(),
|
||
"Expected no warnings, got: {:?}",
|
||
result.processing_warnings
|
||
);
|
||
let content_to_check = result.formatted_content.as_deref().unwrap_or(&result.content);
|
||
assert!(
|
||
content_to_check.contains("# Chapter One"),
|
||
"Expected Markdown heading, got:\n{}",
|
||
content_to_check
|
||
);
|
||
assert!(
|
||
!content_to_check.starts_with("---"),
|
||
"Expected no YAML frontmatter injection, got:\n{}",
|
||
content_to_check
|
||
);
|
||
assert!(content_to_check.contains("Some text."), "Expected body text");
|
||
}
|
||
|
||
#[tokio::test]
|
||
async fn test_epub_djot_output_keeps_headings() {
|
||
let bytes = build_minimal_epub_bytes();
|
||
let extractor = EpubExtractor;
|
||
|
||
let config = ExtractionConfig {
|
||
output_format: OutputFormat::Djot,
|
||
..Default::default()
|
||
};
|
||
|
||
let doc = extractor
|
||
.extract_bytes(&bytes, "application/epub+zip", &config)
|
||
.await
|
||
.expect("EPUB extraction should succeed");
|
||
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Djot);
|
||
|
||
assert!(
|
||
result.processing_warnings.is_empty(),
|
||
"Expected no warnings, got: {:?}",
|
||
result.processing_warnings
|
||
);
|
||
let content_to_check = result.formatted_content.as_deref().unwrap_or(&result.content);
|
||
assert!(
|
||
content_to_check.contains("# Chapter One"),
|
||
"Expected Djot heading, got:\n{}",
|
||
content_to_check
|
||
);
|
||
assert!(
|
||
!content_to_check.starts_with("---"),
|
||
"Expected no YAML frontmatter injection, got:\n{}",
|
||
content_to_check
|
||
);
|
||
assert!(content_to_check.contains("Some text."), "Expected body text");
|
||
}
|
||
|
||
#[tokio::test]
|
||
async fn test_epub_plain_output_does_not_inject_markdown_headings() {
|
||
let bytes = build_minimal_epub_bytes();
|
||
let extractor = EpubExtractor;
|
||
|
||
let config = ExtractionConfig::default();
|
||
|
||
let doc = extractor
|
||
.extract_bytes(&bytes, "application/epub+zip", &config)
|
||
.await
|
||
.expect("EPUB extraction should succeed");
|
||
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
||
assert!(
|
||
result.processing_warnings.is_empty(),
|
||
"Expected no warnings, got: {:?}",
|
||
result.processing_warnings
|
||
);
|
||
assert!(
|
||
!result.content.contains("# Chapter One"),
|
||
"Plain output should not contain Markdown heading markers, got:\n{}",
|
||
result.content
|
||
);
|
||
assert!(result.content.contains("Chapter One"), "Expected heading text");
|
||
}
|