//! EPUB integration tests.
//!
//! These tests validate EPUB-specific spine and navigation semantics.

// NOTE(review): this file was recovered from a copy whose line structure had been
// collapsed and from which every `<...>` span had been removed. Generic arguments
// (`Vec<_>`, `Vec<u8>`, `Cursor<Vec<u8>>`) have been restored because they are
// unambiguous from usage. The XML/XHTML/SVG fixture strings below are reproduced
// exactly as received: they contain only text, no markup. Presumably the markup
// was stripped together with the generics -- restore the fixtures from version
// control before relying on these tests (TODO confirm). Two assertion messages
// carry a mid-sentence line break from the same copy; it is kept as `\n`.

#![cfg(feature = "office")]

use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
use kreuzberg::extractors::EpubExtractor;
use kreuzberg::plugins::DocumentExtractor;
use kreuzberg::types::internal::{ElementKind, InternalDocument};
use std::io::{Cursor, Write};
use zip::write::FileOptions;

/// Returns the text of `document` that the assertions in this file inspect.
///
/// If the document carries `pre_rendered_content`, a clone of it is returned
/// unchanged. Otherwise the text of every element is joined with `\n`, skipping
/// list/quote/group start and end markers, page breaks, images and tables, and
/// skipping any element whose text is empty or whitespace-only.
fn content(document: &InternalDocument) -> String {
    if let Some(content) = &document.pre_rendered_content {
        return content.clone();
    }

    document
        .elements
        .iter()
        .filter(|element| {
            !matches!(
                element.kind,
                ElementKind::ListStart { .. }
                    | ElementKind::ListEnd
                    | ElementKind::QuoteStart
                    | ElementKind::QuoteEnd
                    | ElementKind::GroupStart
                    | ElementKind::GroupEnd
                    | ElementKind::PageBreak
                    | ElementKind::Image { .. }
                    | ElementKind::Table { .. }
            )
        })
        .map(|element| element.text.as_str())
        .filter(|text| !text.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}

/// Starts a zip archive on `cursor` with the two entries every fixture shares.
///
/// Writes a stored (uncompressed) `mimetype` entry containing
/// `application/epub+zip`, then adds the `META-INF/` directory, and returns the
/// still-open writer so the caller can append further entries.
///
/// # Panics
///
/// Panics if any zip operation fails.
fn start_epub_writer(cursor: &mut Cursor<Vec<u8>>) -> zip::ZipWriter<&mut Cursor<Vec<u8>>> {
    let mut writer = zip::ZipWriter::new(cursor);
    let options = FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);

    writer.start_file("mimetype", options).expect("zip start_file failed");
    writer
        .write_all(b"application/epub+zip")
        .expect("zip write mimetype failed");
    writer
        .add_directory("META-INF/", options)
        .expect("zip add_directory failed");

    writer
}

/// Assembles a complete in-memory EPUB archive and returns its bytes.
///
/// After the common prefix written by [`start_epub_writer`], every path in
/// `directories` is added as a directory entry (in order), then every
/// `(path, contents)` pair in `files` is written as a stored entry (in order).
///
/// # Panics
///
/// Panics if any zip operation fails.
fn write_epub_archive(directories: &[&str], files: &[(&str, &str)]) -> Vec<u8> {
    let mut cursor = Cursor::new(Vec::<u8>::new());
    let mut writer = start_epub_writer(&mut cursor);
    let options = FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);

    for directory in directories {
        writer
            .add_directory(*directory, options)
            .expect("zip add_directory failed");
    }

    for (path, contents) in files {
        writer.start_file(*path, options).expect("zip start_file failed");
        writer.write_all(contents.as_bytes()).expect("zip write file failed");
    }

    writer.finish().expect("zip finish failed");
    cursor.into_inner()
}

/// Builds the archive shared by the EPUB 3 navigation tests: a container file, a
/// package document, and intro, nav, chapter and appendix documents under `OEBPS/`.
///
/// NOTE(review): fixture markup is missing (see the note at the top of the file).
fn build_epub3_with_navigation_and_auxiliary_spine_items() -> Vec<u8> {
    let container_xml = r#" "#;
    let opf_xml = r#" Spine Semantics Test Book en "#;
    let intro_xhtml = r#" Intro

Intro

Opening paragraph.

"#;
    let nav_xhtml = r#" Table of Contents

Reading note outside navigation.

"#;
    let chapter_xhtml = r#" Chapter One

Chapter One

Main chapter text.

"#;
    let appendix_xhtml = r#" Appendix

Appendix

Auxiliary back matter.

"#;

    write_epub_archive(
        &["OEBPS/"],
        &[
            ("META-INF/container.xml", container_xml),
            ("OEBPS/content.opf", opf_xml),
            ("OEBPS/intro.xhtml", intro_xhtml),
            ("OEBPS/nav.xhtml", nav_xhtml),
            ("OEBPS/chapter1.xhtml", chapter_xhtml),
            ("OEBPS/appendix.xhtml", appendix_xhtml),
        ],
    )
}

/// Builds the archive used by the EPUB 2 test: a container file, a package
/// document, TOC, chapter and appendix documents, and an NCX file under `OEBPS/`.
///
/// NOTE(review): fixture markup is missing (see the note at the top of the file).
fn build_epub2_with_guide_toc_in_spine() -> Vec<u8> {
    let container_xml = r#" "#;
    let opf_xml = r#" EPUB 2 TOC Test en "#;
    let toc_xhtml = r#" Contents

Contents

  1. Chapter One
  2. Appendix
"#;
    let chapter_xhtml = r#" Chapter One

Chapter One

Main chapter text.

"#;
    let appendix_xhtml = r#" Appendix

Appendix

Supplemental material.

"#;
    let ncx = r#" Chapter One "#;

    write_epub_archive(
        &["OEBPS/"],
        &[
            ("META-INF/container.xml", container_xml),
            ("OEBPS/content.opf", opf_xml),
            ("OEBPS/toc.xhtml", toc_xhtml),
            ("OEBPS/chapter1.xhtml", chapter_xhtml),
            ("OEBPS/appendix.xhtml", appendix_xhtml),
            ("OEBPS/toc.ncx", ncx),
        ],
    )
}

/// Builds the archive used by the manifest-fallback test: the package document
/// lives in `OEBPS/package/`, an XHTML chapter in `OEBPS/text/`, and an SVG entry
/// in `OEBPS/art/`.
///
/// NOTE(review): fixture markup is missing (see the note at the top of the file);
/// the SVG entry is an empty string in the received copy.
fn build_epub_with_fallback_content_document() -> Vec<u8> {
    let container_xml = r#" "#;
    let opf_xml = r#" Fallback Test en "#;
    let chapter_xhtml = r#" Fallback Chapter

Fallback Chapter

Resolved through manifest fallback.

"#;

    write_epub_archive(
        &["OEBPS/", "OEBPS/package/", "OEBPS/text/", "OEBPS/art/"],
        &[
            ("META-INF/container.xml", container_xml),
            ("OEBPS/package/content.opf", opf_xml),
            ("OEBPS/text/chapter.xhtml", chapter_xhtml),
            ("OEBPS/art/chapter.svg", ""),
        ],
    )
}

/// Builds the archive used by the root-escape test: only a container file and a
/// package document in `OEBPS/package/`, with no content documents.
///
/// NOTE(review): fixture markup is missing (see the note at the top of the file).
fn build_epub_with_root_escaping_manifest_href() -> Vec<u8> {
    let container_xml = r#" "#;
    let opf_xml = r#" Invalid Path Test en "#;

    write_epub_archive(
        &["OEBPS/", "OEBPS/package/"],
        &[
            ("META-INF/container.xml", container_xml),
            ("OEBPS/package/content.opf", opf_xml),
        ],
    )
}

/// Builds the archive used by the unused-invalid-asset test: a container file, a
/// package document in `OEBPS/package/`, and one XHTML chapter in `OEBPS/text/`.
///
/// NOTE(review): fixture markup is missing (see the note at the top of the file).
fn build_epub_with_unused_invalid_manifest_asset() -> Vec<u8> {
    let container_xml = r#" "#;
    let opf_xml = r#" Unused Asset Test en "#;
    let chapter_xhtml = r#" Chapter One

Chapter One

Main chapter text.

"#;

    write_epub_archive(
        &["OEBPS/", "OEBPS/package/", "OEBPS/text/"],
        &[
            ("META-INF/container.xml", container_xml),
            ("OEBPS/package/content.opf", opf_xml),
            ("OEBPS/text/chapter.xhtml", chapter_xhtml),
        ],
    )
}

/// Markdown extraction of the EPUB 3 fixture must produce no warnings, must not
/// contain an XML declaration or the "Table of Contents" text, and must contain
/// the reading note plus the intro, chapter and appendix content.
#[tokio::test]
async fn test_epub3_excludes_navigation_but_keeps_non_linear_spine_content() {
    let bytes = build_epub3_with_navigation_and_auxiliary_spine_items();
    let extractor = EpubExtractor;
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };

    let result = extractor
        .extract_bytes(&bytes, "application/epub+zip", &config)
        .await
        .expect("EPUB extraction should succeed");

    assert!(
        result.processing_warnings.is_empty(),
        "Expected no warnings, got: {:?}",
        result.processing_warnings
    );

    // Render once; every assertion below inspects the same text.
    let text = content(&result);

    assert!(
        !text.contains("?xml version"),
        "XML declarations should not leak into Markdown output:\n{}",
        text
    );
    assert!(
        !text.contains("Table of Contents"),
        "Navigation documents should not be rendered as body content:\n{}",
        text
    );
    assert!(
        text.contains("Reading note outside navigation."),
        "Expected prose outside specialized nav content to be preserved:\n{}",
        text
    );
    assert!(text.contains("# Intro"), "Expected intro heading");
    assert!(text.contains("# Chapter One"), "Expected chapter heading");
    assert!(
        text.contains("# Appendix"),
        "Non-linear spine \ncontent should still be extracted:\n{}",
        text
    );
    assert!(
        text.contains("Auxiliary back matter."),
        "Expected non-linear appendix text in extracted content"
    );
}

/// Default-config extraction of the EPUB 3 fixture must keep the reading note,
/// chapter text and appendix text while omitting "Table of Contents".
#[tokio::test]
async fn test_epub3_plain_output_excludes_specialized_navigation_but_keeps_body_prose() {
    let bytes = build_epub3_with_navigation_and_auxiliary_spine_items();
    let extractor = EpubExtractor;

    let result = extractor
        .extract_bytes(&bytes, "application/epub+zip", &ExtractionConfig::default())
        .await
        .expect("EPUB extraction should succeed");
    let text = content(&result);

    assert!(
        text.contains("Reading note outside navigation."),
        "Expected prose outside specialized nav content to be preserved:\n{}",
        text
    );
    assert!(
        !text.contains("Table of Contents"),
        "Specialized navigation content should stay out of plain-text extraction:\n{}",
        text
    );
    assert!(
        text.contains("Main chapter text."),
        "Expected real chapter body content in plain-text extraction:\n{}",
        text
    );
    assert!(
        text.contains("Auxiliary back matter."),
        "Expected non-linear spine prose to remain in plain-text extraction:\n{}",
        text
    );
}

/// Same expectations as the Markdown test, with `include_document_structure`
/// enabled in the extraction config.
#[tokio::test]
async fn test_epub_document_structure_excludes_navigation_but_keeps_non_linear_spine_content() {
    let bytes = build_epub3_with_navigation_and_auxiliary_spine_items();
    let extractor = EpubExtractor;
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        include_document_structure: true,
        ..Default::default()
    };

    let result = extractor
        .extract_bytes(&bytes, "application/epub+zip", &config)
        .await
        .expect("EPUB extraction should succeed");
    let all_text = content(&result);

    assert!(
        all_text.contains("Intro"),
        "Expected intro content in document structure"
    );
    assert!(
        all_text.contains("Chapter One"),
        "Expected chapter content in document structure"
    );
    assert!(
        all_text.contains("Reading note outside navigation."),
        "Expected non-nav prose from the navigation document in document structure"
    );
    assert!(
        all_text.contains("Appendix"),
        "Expected non-linear appendix content in document structure"
    );
    assert!(
        !all_text.contains("Table of Contents"),
        "Navigation documents should be excluded from document structure:\n{}",
        all_text
    );
}

/// Extraction of the EPUB 2 fixture must omit the "Contents" text while keeping
/// the chapter and the supplemental appendix text.
#[tokio::test]
async fn test_epub2_guide_toc_document_is_excluded_but_auxiliary_content_remains() {
    let bytes = build_epub2_with_guide_toc_in_spine();
    let extractor = EpubExtractor;

    let result = extractor
        .extract_bytes(&bytes, "application/epub+zip", &ExtractionConfig::default())
        .await
        .expect("EPUB extraction should succeed");
    let text = content(&result);

    assert!(
        !text.contains("Contents"),
        "EPUB 2 guide TOC document should not be rendered as body content:\n{}",
        text
    );
    assert!(
        text.contains("Chapter One"),
        "Expected main chapter content in EPUB 2 extraction"
    );
    assert!(
        text.contains("Supplemental material."),
        "Expected non-linear appendix content in EPUB 2 extraction"
    );
}

/// Extraction must succeed and return the chapter content for the
/// unused-invalid-asset fixture.
#[tokio::test]
async fn test_epub_ignores_invalid_unused_manifest_assets_when_body_content_is_valid() {
    let bytes = build_epub_with_unused_invalid_manifest_asset();
    let extractor = EpubExtractor;

    let result = extractor
        .extract_bytes(&bytes, "application/epub+zip", &ExtractionConfig::default())
        .await
        .expect("EPUB extraction should succeed");
    let text = content(&result);

    assert!(
        text.contains("Chapter One"),
        "Expected valid spine content to be extracted"
    );
    assert!(
        text.contains("Main chapter text."),
        "Expected chapter body text to be extracted"
    );
}

/// Markdown extraction of the fallback fixture must produce no warnings and must
/// contain the heading and body text of the XHTML chapter.
#[tokio::test]
async fn test_epub_manifest_fallback_resolves_renderable_body_document() {
    let bytes = build_epub_with_fallback_content_document();
    let extractor = EpubExtractor;
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };

    let result = extractor
        .extract_bytes(&bytes, "application/epub+zip", &config)
        .await
        .expect("EPUB extraction should succeed");

    assert!(
        result.processing_warnings.is_empty(),
        "Expected fallback resolution without \nwarnings, got: {:?}",
        result.processing_warnings
    );

    let text = content(&result);

    assert!(
        text.contains("# Fallback Chapter"),
        "Expected heading from fallback XHTML document:\n{}",
        text
    );
    assert!(
        text.contains("Resolved through manifest fallback."),
        "Expected body text from fallback XHTML document"
    );
}

/// Extraction of the root-escape fixture must fail with an error whose message
/// contains "escapes the package root".
#[tokio::test]
async fn test_epub_rejects_manifest_paths_that_escape_package_root() {
    let bytes = build_epub_with_root_escaping_manifest_href();
    let extractor = EpubExtractor;

    let err = extractor
        .extract_bytes(&bytes, "application/epub+zip", &ExtractionConfig::default())
        .await
        .expect_err("EPUB extraction should reject root-escaping manifest paths");

    assert!(
        err.to_string().contains("escapes the package root"),
        "Expected root-escape validation error, got: {err}"
    );
}

/// Builds the archive used by the fallback-cycle test: a container file, a
/// package document, and two SVG entries (`a.svg`, `b.svg`) under `OEBPS/`.
///
/// NOTE(review): fixture markup is missing (see the note at the top of the file);
/// both SVG entries are empty strings in the received copy.
fn build_epub_with_manifest_fallback_cycle() -> Vec<u8> {
    let container_xml = r#" "#;
    let opf_xml = r#" Fallback Cycle Test en "#;

    write_epub_archive(
        &["OEBPS/"],
        &[
            ("META-INF/container.xml", container_xml),
            ("OEBPS/content.opf", opf_xml),
            ("OEBPS/a.svg", ""),
            ("OEBPS/b.svg", ""),
        ],
    )
}

/// Builds the archive used by the empty-spine test: a container file, a package
/// document, and one XHTML chapter under `OEBPS/`.
///
/// NOTE(review): fixture markup is missing (see the note at the top of the file).
fn build_epub_with_empty_spine() -> Vec<u8> {
    let container_xml = r#" "#;
    let opf_xml = r#" Empty Spine Test en "#;
    let chapter_xhtml = r#" Chapter One

Chapter One

This content is in the manifest but not in the spine.

"#;

    write_epub_archive(
        &["OEBPS/"],
        &[
            ("META-INF/container.xml", container_xml),
            ("OEBPS/content.opf", opf_xml),
            ("OEBPS/chapter1.xhtml", chapter_xhtml),
        ],
    )
}

/// Extraction of the fallback-cycle fixture must succeed and report at least one
/// warning whose message contains "fallback cycle".
#[tokio::test]
async fn test_epub_manifest_fallback_cycle_produces_warning_without_panic() {
    let bytes = build_epub_with_manifest_fallback_cycle();
    let extractor = EpubExtractor;

    let result = extractor
        .extract_bytes(&bytes, "application/epub+zip", &ExtractionConfig::default())
        .await
        .expect("EPUB extraction should not panic on fallback cycles");

    let has_cycle_warning = result
        .processing_warnings
        .iter()
        .any(|w| w.message.contains("fallback cycle"));

    assert!(
        has_cycle_warning,
        "Expected a warning about fallback cycle, got warnings: {:?}",
        result.processing_warnings
    );
}

/// Extraction of the empty-spine fixture must succeed and yield empty or
/// whitespace-only content.
#[tokio::test]
async fn test_epub_empty_spine_produces_empty_content_without_error() {
    let bytes = build_epub_with_empty_spine();
    let extractor = EpubExtractor;

    let result = extractor
        .extract_bytes(&bytes, "application/epub+zip", &ExtractionConfig::default())
        .await
        .expect("EPUB extraction should succeed with empty spine");
    let text = content(&result);

    assert!(
        text.trim().is_empty(),
        "Expected empty or whitespace-only content for empty spine, got: '{}'",
        text
    );
}