//! Regression tests: EPUB headings should be preserved for Markdown/Djot output.
//!
//! The native EPUB extractor historically returned plain text only, flattening
//! `<h1>`–`<h6>` into regular lines. When `ExtractionConfig.output_format` is set
//! to Markdown (or Djot), we should run the XHTML through the HTML→Markdown
//! converter so headings become `#` / `##` etc.
#![cfg(feature = "office")]
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
use kreuzberg::extraction::derive::derive_extraction_result;
use kreuzberg::extractors::EpubExtractor;
use kreuzberg::plugins::DocumentExtractor;
use std::io::{Cursor, Write};
use zip::write::FileOptions;
fn build_minimal_epub_bytes() -> Vec {
let container_xml = r#"
"#;
let opf_xml = r#"
Test Book
en
"#;
let chapter_xhtml = r#"
Chapter One
Chapter One
Some text.
"#;
let mut cursor = Cursor::new(Vec::::new());
let mut writer = zip::ZipWriter::new(&mut cursor);
let options = FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);
writer.start_file("mimetype", options).expect("zip start_file failed");
writer
.write_all(b"application/epub+zip")
.expect("zip write mimetype failed");
writer
.add_directory("META-INF/", options)
.expect("zip add_directory failed");
writer
.add_directory("OEBPS/", options)
.expect("zip add_directory failed");
writer
.start_file("META-INF/container.xml", options)
.expect("zip start_file failed");
writer
.write_all(container_xml.as_bytes())
.expect("zip write container.xml failed");
writer
.start_file("OEBPS/content.opf", options)
.expect("zip start_file failed");
writer
.write_all(opf_xml.as_bytes())
.expect("zip write content.opf failed");
writer
.start_file("OEBPS/chapter1.xhtml", options)
.expect("zip start_file failed");
writer
.write_all(chapter_xhtml.as_bytes())
.expect("zip write chapter1.xhtml failed");
writer.finish().expect("zip finish failed");
cursor.into_inner()
}
/// Extracts the fixture EPUB with Markdown output requested and checks that
/// the result has no processing warnings, contains `# Chapter One` and the
/// body text, and does not start with `---`.
#[tokio::test]
async fn test_epub_markdown_output_keeps_headings() {
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let epub_bytes = build_minimal_epub_bytes();
    let epub_extractor = EpubExtractor;

    let extracted = epub_extractor
        .extract_bytes(&epub_bytes, "application/epub+zip", &config)
        .await
        .expect("EPUB extraction should succeed");
    let result = derive_extraction_result(extracted, false, kreuzberg::OutputFormat::Markdown);

    assert!(
        result.processing_warnings.is_empty(),
        "Expected no warnings, got: {:?}",
        result.processing_warnings
    );

    // Prefer the formatted rendering when present; otherwise inspect the raw content.
    let fallback: &str = &result.content;
    let rendered = result.formatted_content.as_deref().unwrap_or(fallback);

    assert!(
        rendered.contains("# Chapter One"),
        "Expected Markdown heading, got:\n{}",
        rendered
    );
    assert!(
        !rendered.starts_with("---"),
        "Expected no YAML frontmatter injection, got:\n{}",
        rendered
    );
    assert!(rendered.contains("Some text."), "Expected body text");
}
/// Extracts the fixture EPUB with Djot output requested and checks that the
/// result has no processing warnings, contains `# Chapter One` and the body
/// text, and does not start with `---`.
#[tokio::test]
async fn test_epub_djot_output_keeps_headings() {
    let config = ExtractionConfig {
        output_format: OutputFormat::Djot,
        ..Default::default()
    };
    let epub_bytes = build_minimal_epub_bytes();
    let epub_extractor = EpubExtractor;

    let extracted = epub_extractor
        .extract_bytes(&epub_bytes, "application/epub+zip", &config)
        .await
        .expect("EPUB extraction should succeed");
    let result = derive_extraction_result(extracted, false, kreuzberg::OutputFormat::Djot);

    assert!(
        result.processing_warnings.is_empty(),
        "Expected no warnings, got: {:?}",
        result.processing_warnings
    );

    // Prefer the formatted rendering when present; otherwise inspect the raw content.
    let fallback: &str = &result.content;
    let rendered = result.formatted_content.as_deref().unwrap_or(fallback);

    assert!(
        rendered.contains("# Chapter One"),
        "Expected Djot heading, got:\n{}",
        rendered
    );
    assert!(
        !rendered.starts_with("---"),
        "Expected no YAML frontmatter injection, got:\n{}",
        rendered
    );
    assert!(rendered.contains("Some text."), "Expected body text");
}
/// Extracts the fixture EPUB with the default configuration and checks that
/// the plain content has no processing warnings, keeps the heading text, and
/// does not contain the `# Chapter One` heading marker.
#[tokio::test]
async fn test_epub_plain_output_does_not_inject_markdown_headings() {
    let config = ExtractionConfig::default();
    let epub_bytes = build_minimal_epub_bytes();
    let epub_extractor = EpubExtractor;

    let extracted = epub_extractor
        .extract_bytes(&epub_bytes, "application/epub+zip", &config)
        .await
        .expect("EPUB extraction should succeed");
    let result = derive_extraction_result(extracted, false, kreuzberg::OutputFormat::Plain);

    assert!(
        result.processing_warnings.is_empty(),
        "Expected no warnings, got: {:?}",
        result.processing_warnings
    );

    let has_markdown_heading = result.content.contains("# Chapter One");
    assert!(
        !has_markdown_heading,
        "Plain output should not contain Markdown heading markers, got:\n{}",
        result.content
    );
    assert!(result.content.contains("Chapter One"), "Expected heading text");
}