//! Regression tests: EPUB headings should be preserved for Markdown/Djot output. //! //! The native EPUB extractor historically returned plain text only, flattening //! `

`–`

` into regular lines. When `ExtractionConfig.output_format` is set //! to Markdown (or Djot), we should run the XHTML through the HTML→Markdown //! converter so headings become `#` / `##` etc. #![cfg(feature = "office")] use kreuzberg::core::config::{ExtractionConfig, OutputFormat}; use kreuzberg::extraction::derive::derive_extraction_result; use kreuzberg::extractors::EpubExtractor; use kreuzberg::plugins::DocumentExtractor; use std::io::{Cursor, Write}; use zip::write::FileOptions; fn build_minimal_epub_bytes() -> Vec { let container_xml = r#" "#; let opf_xml = r#" Test Book en "#; let chapter_xhtml = r#" Chapter One
Chapter One

Some text.
"#; let mut cursor = Cursor::new(Vec::::new()); let mut writer = zip::ZipWriter::new(&mut cursor); let options = FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored); writer.start_file("mimetype", options).expect("zip start_file failed"); writer .write_all(b"application/epub+zip") .expect("zip write mimetype failed"); writer .add_directory("META-INF/", options) .expect("zip add_directory failed"); writer .add_directory("OEBPS/", options) .expect("zip add_directory failed"); writer .start_file("META-INF/container.xml", options) .expect("zip start_file failed"); writer .write_all(container_xml.as_bytes()) .expect("zip write container.xml failed"); writer .start_file("OEBPS/content.opf", options) .expect("zip start_file failed"); writer .write_all(opf_xml.as_bytes()) .expect("zip write content.opf failed"); writer .start_file("OEBPS/chapter1.xhtml", options) .expect("zip start_file failed"); writer .write_all(chapter_xhtml.as_bytes()) .expect("zip write chapter1.xhtml failed"); writer.finish().expect("zip finish failed"); cursor.into_inner() } #[tokio::test] async fn test_epub_markdown_output_keeps_headings() { let bytes = build_minimal_epub_bytes(); let extractor = EpubExtractor; let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let doc = extractor .extract_bytes(&bytes, "application/epub+zip", &config) .await .expect("EPUB extraction should succeed"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Markdown); assert!( result.processing_warnings.is_empty(), "Expected no warnings, got: {:?}", result.processing_warnings ); let content_to_check = result.formatted_content.as_deref().unwrap_or(&result.content); assert!( content_to_check.contains("# Chapter One"), "Expected Markdown heading, got:\n{}", content_to_check ); assert!( !content_to_check.starts_with("---"), "Expected no YAML frontmatter injection, got:\n{}", content_to_check ); assert!(content_to_check.contains("Some text."), "Expected body text"); } #[tokio::test] async fn test_epub_djot_output_keeps_headings() { let bytes = build_minimal_epub_bytes(); let extractor = EpubExtractor; let config = ExtractionConfig { output_format: OutputFormat::Djot, ..Default::default() }; let doc = extractor .extract_bytes(&bytes, "application/epub+zip", &config) .await .expect("EPUB extraction should succeed"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Djot); assert!( result.processing_warnings.is_empty(), "Expected no warnings, got: {:?}", result.processing_warnings ); let content_to_check = result.formatted_content.as_deref().unwrap_or(&result.content); assert!( content_to_check.contains("# Chapter One"), "Expected Djot heading, got:\n{}", content_to_check ); assert!( !content_to_check.starts_with("---"), "Expected no YAML frontmatter injection, got:\n{}", content_to_check ); assert!(content_to_check.contains("Some text."), "Expected body text"); } #[tokio::test] async fn test_epub_plain_output_does_not_inject_markdown_headings() { let bytes = build_minimal_epub_bytes(); let extractor = EpubExtractor; let config = ExtractionConfig::default(); let doc = extractor .extract_bytes(&bytes, "application/epub+zip", &config) .await .expect("EPUB extraction should succeed"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!( result.processing_warnings.is_empty(), "Expected no warnings, got: {:?}", result.processing_warnings ); assert!( !result.content.contains("# Chapter One"), "Plain output should not contain Markdown heading markers, got:\n{}", result.content ); assert!(result.content.contains("Chapter One"), "Expected heading text"); }