183 lines
5.9 KiB
Rust
183 lines
5.9 KiB
Rust
|
|
//! Regression tests: EPUB headings should be preserved for Markdown/Djot output.
|
|||
|
|
//!
|
|||
|
|
//! The native EPUB extractor historically returned plain text only, flattening
|
|||
|
|
//! `<h1>`–`<h6>` into regular lines. When `ExtractionConfig.output_format` is set
|
|||
|
|
//! to Markdown (or Djot), we should run the XHTML through the HTML→Markdown
|
|||
|
|
//! converter so headings become `#` / `##` etc.
|
|||
|
|
|
|||
|
|
#![cfg(feature = "office")]
|
|||
|
|
|
|||
|
|
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
|
|||
|
|
use kreuzberg::extraction::derive::derive_extraction_result;
|
|||
|
|
use kreuzberg::extractors::EpubExtractor;
|
|||
|
|
use kreuzberg::plugins::DocumentExtractor;
|
|||
|
|
use std::io::{Cursor, Write};
|
|||
|
|
use zip::write::FileOptions;
|
|||
|
|
|
|||
|
|
fn build_minimal_epub_bytes() -> Vec<u8> {
|
|||
|
|
let container_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|||
|
|
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
|
|||
|
|
<rootfiles>
|
|||
|
|
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
|
|||
|
|
</rootfiles>
|
|||
|
|
</container>"#;
|
|||
|
|
|
|||
|
|
let opf_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|||
|
|
<package version="3.0" unique-identifier="bookid" xmlns="http://www.idpf.org/2007/opf">
|
|||
|
|
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
|
|||
|
|
<dc:title>Test Book</dc:title>
|
|||
|
|
<dc:language>en</dc:language>
|
|||
|
|
</metadata>
|
|||
|
|
<manifest>
|
|||
|
|
<item id="c1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
|
|||
|
|
</manifest>
|
|||
|
|
<spine>
|
|||
|
|
<itemref idref="c1"/>
|
|||
|
|
</spine>
|
|||
|
|
</package>"#;
|
|||
|
|
|
|||
|
|
let chapter_xhtml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|||
|
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
|||
|
|
<head><title>Chapter One</title></head>
|
|||
|
|
<body>
|
|||
|
|
<h1>Chapter One</h1>
|
|||
|
|
<p>Some text.</p>
|
|||
|
|
</body>
|
|||
|
|
</html>"#;
|
|||
|
|
|
|||
|
|
let mut cursor = Cursor::new(Vec::<u8>::new());
|
|||
|
|
let mut writer = zip::ZipWriter::new(&mut cursor);
|
|||
|
|
let options = FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);
|
|||
|
|
|
|||
|
|
writer.start_file("mimetype", options).expect("zip start_file failed");
|
|||
|
|
writer
|
|||
|
|
.write_all(b"application/epub+zip")
|
|||
|
|
.expect("zip write mimetype failed");
|
|||
|
|
|
|||
|
|
writer
|
|||
|
|
.add_directory("META-INF/", options)
|
|||
|
|
.expect("zip add_directory failed");
|
|||
|
|
writer
|
|||
|
|
.add_directory("OEBPS/", options)
|
|||
|
|
.expect("zip add_directory failed");
|
|||
|
|
|
|||
|
|
writer
|
|||
|
|
.start_file("META-INF/container.xml", options)
|
|||
|
|
.expect("zip start_file failed");
|
|||
|
|
writer
|
|||
|
|
.write_all(container_xml.as_bytes())
|
|||
|
|
.expect("zip write container.xml failed");
|
|||
|
|
|
|||
|
|
writer
|
|||
|
|
.start_file("OEBPS/content.opf", options)
|
|||
|
|
.expect("zip start_file failed");
|
|||
|
|
writer
|
|||
|
|
.write_all(opf_xml.as_bytes())
|
|||
|
|
.expect("zip write content.opf failed");
|
|||
|
|
|
|||
|
|
writer
|
|||
|
|
.start_file("OEBPS/chapter1.xhtml", options)
|
|||
|
|
.expect("zip start_file failed");
|
|||
|
|
writer
|
|||
|
|
.write_all(chapter_xhtml.as_bytes())
|
|||
|
|
.expect("zip write chapter1.xhtml failed");
|
|||
|
|
|
|||
|
|
writer.finish().expect("zip finish failed");
|
|||
|
|
cursor.into_inner()
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
#[tokio::test]
|
|||
|
|
async fn test_epub_markdown_output_keeps_headings() {
|
|||
|
|
let bytes = build_minimal_epub_bytes();
|
|||
|
|
let extractor = EpubExtractor;
|
|||
|
|
|
|||
|
|
let config = ExtractionConfig {
|
|||
|
|
output_format: OutputFormat::Markdown,
|
|||
|
|
..Default::default()
|
|||
|
|
};
|
|||
|
|
|
|||
|
|
let doc = extractor
|
|||
|
|
.extract_bytes(&bytes, "application/epub+zip", &config)
|
|||
|
|
.await
|
|||
|
|
.expect("EPUB extraction should succeed");
|
|||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Markdown);
|
|||
|
|
assert!(
|
|||
|
|
result.processing_warnings.is_empty(),
|
|||
|
|
"Expected no warnings, got: {:?}",
|
|||
|
|
result.processing_warnings
|
|||
|
|
);
|
|||
|
|
let content_to_check = result.formatted_content.as_deref().unwrap_or(&result.content);
|
|||
|
|
assert!(
|
|||
|
|
content_to_check.contains("# Chapter One"),
|
|||
|
|
"Expected Markdown heading, got:\n{}",
|
|||
|
|
content_to_check
|
|||
|
|
);
|
|||
|
|
assert!(
|
|||
|
|
!content_to_check.starts_with("---"),
|
|||
|
|
"Expected no YAML frontmatter injection, got:\n{}",
|
|||
|
|
content_to_check
|
|||
|
|
);
|
|||
|
|
assert!(content_to_check.contains("Some text."), "Expected body text");
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
#[tokio::test]
|
|||
|
|
async fn test_epub_djot_output_keeps_headings() {
|
|||
|
|
let bytes = build_minimal_epub_bytes();
|
|||
|
|
let extractor = EpubExtractor;
|
|||
|
|
|
|||
|
|
let config = ExtractionConfig {
|
|||
|
|
output_format: OutputFormat::Djot,
|
|||
|
|
..Default::default()
|
|||
|
|
};
|
|||
|
|
|
|||
|
|
let doc = extractor
|
|||
|
|
.extract_bytes(&bytes, "application/epub+zip", &config)
|
|||
|
|
.await
|
|||
|
|
.expect("EPUB extraction should succeed");
|
|||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Djot);
|
|||
|
|
|
|||
|
|
assert!(
|
|||
|
|
result.processing_warnings.is_empty(),
|
|||
|
|
"Expected no warnings, got: {:?}",
|
|||
|
|
result.processing_warnings
|
|||
|
|
);
|
|||
|
|
let content_to_check = result.formatted_content.as_deref().unwrap_or(&result.content);
|
|||
|
|
assert!(
|
|||
|
|
content_to_check.contains("# Chapter One"),
|
|||
|
|
"Expected Djot heading, got:\n{}",
|
|||
|
|
content_to_check
|
|||
|
|
);
|
|||
|
|
assert!(
|
|||
|
|
!content_to_check.starts_with("---"),
|
|||
|
|
"Expected no YAML frontmatter injection, got:\n{}",
|
|||
|
|
content_to_check
|
|||
|
|
);
|
|||
|
|
assert!(content_to_check.contains("Some text."), "Expected body text");
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
#[tokio::test]
|
|||
|
|
async fn test_epub_plain_output_does_not_inject_markdown_headings() {
|
|||
|
|
let bytes = build_minimal_epub_bytes();
|
|||
|
|
let extractor = EpubExtractor;
|
|||
|
|
|
|||
|
|
let config = ExtractionConfig::default();
|
|||
|
|
|
|||
|
|
let doc = extractor
|
|||
|
|
.extract_bytes(&bytes, "application/epub+zip", &config)
|
|||
|
|
.await
|
|||
|
|
.expect("EPUB extraction should succeed");
|
|||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
|||
|
|
|
|||
|
|
assert!(
|
|||
|
|
result.processing_warnings.is_empty(),
|
|||
|
|
"Expected no warnings, got: {:?}",
|
|||
|
|
result.processing_warnings
|
|||
|
|
);
|
|||
|
|
assert!(
|
|||
|
|
!result.content.contains("# Chapter One"),
|
|||
|
|
"Plain output should not contain Markdown heading markers, got:\n{}",
|
|||
|
|
result.content
|
|||
|
|
);
|
|||
|
|
assert!(result.content.contains("Chapter One"), "Expected heading text");
|
|||
|
|
}
|