Files
fil/crates/kreuzberg/tests/epub_markdown_headings_tests.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

183 lines
5.9 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Regression tests: EPUB headings should be preserved for Markdown/Djot output.
//!
//! The native EPUB extractor historically returned plain text only, flattening
//! `<h1>` through `<h6>` into regular lines. When `ExtractionConfig.output_format` is set
//! to Markdown (or Djot), we should run the XHTML through the HTML→Markdown
//! converter so headings become `#` / `##` etc.
#![cfg(feature = "office")]
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
use kreuzberg::extraction::derive::derive_extraction_result;
use kreuzberg::extractors::EpubExtractor;
use kreuzberg::plugins::DocumentExtractor;
use std::io::{Cursor, Write};
use zip::write::FileOptions;
/// Assembles a minimal single-chapter EPUB archive entirely in memory.
///
/// The archive holds, in this order: the `mimetype` entry, the `META-INF/`
/// and `OEBPS/` directory entries, `META-INF/container.xml`,
/// `OEBPS/content.opf` and `OEBPS/chapter1.xhtml`. The chapter body is one
/// `<h1>Chapter One</h1>` heading followed by a `<p>Some text.</p>` paragraph.
///
/// Every entry is written with the `Stored` (uncompressed) method.
///
/// # Panics
///
/// Panics if any zip operation fails; this is a test fixture, so a failure
/// here should abort the test immediately.
fn build_minimal_epub_bytes() -> Vec<u8> {
    // Points at the package document inside the archive.
    const CONTAINER_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>"#;

    // Package document: metadata plus a one-item manifest and spine.
    const PACKAGE_OPF: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<package version="3.0" unique-identifier="bookid" xmlns="http://www.idpf.org/2007/opf">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>Test Book</dc:title>
<dc:language>en</dc:language>
</metadata>
<manifest>
<item id="c1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="c1"/>
</spine>
</package>"#;

    // The only chapter: a single heading and a single paragraph.
    const CHAPTER_XHTML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Chapter One</title></head>
<body>
<h1>Chapter One</h1>
<p>Some text.</p>
</body>
</html>"#;

    let stored = FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);
    let mut archive = zip::ZipWriter::new(Cursor::new(Vec::<u8>::new()));

    // The `mimetype` entry goes in before anything else.
    archive.start_file("mimetype", stored).expect("zip start_file failed");
    archive
        .write_all(b"application/epub+zip")
        .expect("zip write mimetype failed");

    for &directory in &["META-INF/", "OEBPS/"] {
        archive
            .add_directory(directory, stored)
            .expect("zip add_directory failed");
    }

    // (archive path, contents, panic message used if writing the contents fails)
    let documents: [(&str, &str, &str); 3] = [
        (
            "META-INF/container.xml",
            CONTAINER_XML,
            "zip write container.xml failed",
        ),
        (
            "OEBPS/content.opf",
            PACKAGE_OPF,
            "zip write content.opf failed",
        ),
        (
            "OEBPS/chapter1.xhtml",
            CHAPTER_XHTML,
            "zip write chapter1.xhtml failed",
        ),
    ];
    for &(path, contents, write_failure) in &documents {
        archive.start_file(path, stored).expect("zip start_file failed");
        archive.write_all(contents.as_bytes()).expect(write_failure);
    }

    // Finishing hands back the underlying cursor, which owns the bytes.
    let finished = archive.finish().expect("zip finish failed");
    finished.into_inner()
}
/// With Markdown output requested, the chapter's `<h1>` must come out as a
/// `# Chapter One` heading.
///
/// Also checks that no processing warnings are reported, that the output does
/// not begin with a `---` frontmatter fence, and that the paragraph text is
/// still present.
#[tokio::test]
async fn test_epub_markdown_output_keeps_headings() {
    let epub = build_minimal_epub_bytes();
    let markdown_config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };

    let extracted = EpubExtractor
        .extract_bytes(&epub, "application/epub+zip", &markdown_config)
        .await
        .expect("EPUB extraction should succeed");
    let outcome = derive_extraction_result(extracted, false, kreuzberg::OutputFormat::Markdown);

    let warnings = &outcome.processing_warnings;
    assert!(warnings.is_empty(), "Expected no warnings, got: {:?}", warnings);

    // Inspect the formatted rendering when there is one; otherwise the raw content.
    let rendered: &str = match outcome.formatted_content.as_deref() {
        Some(formatted) => formatted,
        None => &outcome.content,
    };

    assert!(
        rendered.contains("# Chapter One"),
        "Expected Markdown heading, got:\n{}",
        rendered
    );
    assert!(
        !rendered.starts_with("---"),
        "Expected no YAML frontmatter injection, got:\n{}",
        rendered
    );
    assert!(rendered.contains("Some text."), "Expected body text");
}
/// With Djot output requested, the chapter's `<h1>` must come out as a
/// `# Chapter One` heading.
///
/// Also checks that no processing warnings are reported, that the output does
/// not begin with a `---` frontmatter fence, and that the paragraph text is
/// still present.
#[tokio::test]
async fn test_epub_djot_output_keeps_headings() {
    let epub = build_minimal_epub_bytes();
    let djot_config = ExtractionConfig {
        output_format: OutputFormat::Djot,
        ..Default::default()
    };

    let extracted = EpubExtractor
        .extract_bytes(&epub, "application/epub+zip", &djot_config)
        .await
        .expect("EPUB extraction should succeed");
    let outcome = derive_extraction_result(extracted, false, kreuzberg::OutputFormat::Djot);

    let warnings = &outcome.processing_warnings;
    assert!(warnings.is_empty(), "Expected no warnings, got: {:?}", warnings);

    // Inspect the formatted rendering when there is one; otherwise the raw content.
    let rendered: &str = match outcome.formatted_content.as_deref() {
        Some(formatted) => formatted,
        None => &outcome.content,
    };

    assert!(
        rendered.contains("# Chapter One"),
        "Expected Djot heading, got:\n{}",
        rendered
    );
    assert!(
        !rendered.starts_with("---"),
        "Expected no YAML frontmatter injection, got:\n{}",
        rendered
    );
    assert!(rendered.contains("Some text."), "Expected body text");
}
/// With the default configuration and a plain-text derivation, the heading
/// text must survive but must not carry a Markdown `#` marker.
///
/// Only `content` is inspected here; the test also requires that no
/// processing warnings are reported.
// NOTE(review): relies on `ExtractionConfig::default()` not selecting
// Markdown/Djot output — confirm against the config definition.
#[tokio::test]
async fn test_epub_plain_output_does_not_inject_markdown_headings() {
    let epub = build_minimal_epub_bytes();
    let default_config = ExtractionConfig::default();

    let extracted = EpubExtractor
        .extract_bytes(&epub, "application/epub+zip", &default_config)
        .await
        .expect("EPUB extraction should succeed");
    let outcome = derive_extraction_result(extracted, false, kreuzberg::OutputFormat::Plain);

    let warnings = &outcome.processing_warnings;
    assert!(warnings.is_empty(), "Expected no warnings, got: {:?}", warnings);

    let text = &outcome.content;
    assert!(
        !text.contains("# Chapter One"),
        "Plain output should not contain Markdown heading markers, got:\n{}",
        text
    );
    assert!(text.contains("Chapter One"), "Expected heading text");
}