Files
fil/crates/kreuzberg/tests/rtf_extractor_tests.rs

787 lines
27 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
//! Comprehensive TDD test suite for RTF extraction.
//!
//! This test suite validates RTF extraction capabilities.
//! Tests cover:
//! - Accent and Unicode handling
//! - Bookmarks and internal links
//! - Footnotes and references
//! - Text formatting (bold, italic, underline, strikeout, superscript, subscript, small caps)
//! - Headings and structure
//! - Image extraction
//! - External hyperlinks
//! - List extraction (simple and complex nested lists)
//! - Table extraction (simple and complex with special formatting)
//! - Unicode characters and special symbols
//!
//! Test Organization:
//! - Basic Content Extraction (unicode, accent)
//! - Structure Preservation (heading, list_simple, list_complex)
//! - Table Extraction (table_simple, table_error_codes)
//! - Formatting Detection (formatting)
//! - Special Features (footnote, bookmark, link)
//! - Integration Tests (deterministic extraction, no content loss)
//!
//! Success Criteria:
//! - All tests passing (100%)
//! - No content loss (should extract meaningful text from all files)
//! - Deterministic extraction (same input = same output)
//!
//! Note: These tests require the `office` feature to be enabled.
#![cfg(feature = "office")]
#![allow(clippy::doc_suspicious_footnotes)]
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_file;
use std::path::PathBuf;
mod helpers;
/// Helper function to get path to RTF test document
fn get_rtf_path(filename: &str) -> PathBuf {
let manifest_dir = env!("CARGO_MANIFEST_DIR");
PathBuf::from(manifest_dir)
.parent()
.expect("kreuzberg crate should have a parent")
.parent()
.expect("parent should have a parent")
.join("test_documents")
.join("rtf")
.join(filename)
}
/// Helper for reaching the workspace root from the kreuzberg crate
fn workspace_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.expect("kreuzberg crate should have a parent")
.parent()
.expect("workspace root exists")
.to_path_buf()
}
/// Test extraction of RTF file with accent characters (accented vowels).
///
/// File: accent.rtf
/// Content: "le café où on ne fume pas"
/// Expected: Correctly extracts French text with accented characters (é, ù)
/// Pandoc baseline: le café où on ne fume pas
///
/// Each word check accepts either the accented spelling (matched against the
/// raw content) or an unaccented fallback (matched against the lowercased
/// content).
#[tokio::test]
async fn test_rtf_accent_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("accent.rtf");
    let result = extract_file(&path, Some("application/rtf"), &config).await;
    assert!(result.is_ok(), "RTF extraction should succeed for accent.rtf");
    let extraction = result.expect("Operation failed");
    assert_eq!(extraction.mime_type, "application/rtf");
    assert!(!extraction.content.is_empty(), "Content should not be empty");
    let content = extraction.content.to_lowercase();
    assert!(
        extraction.content.contains("café") || content.contains("cafe"),
        "Should extract French word 'café' or 'cafe'"
    );
    // The needle must be the actual word: `contains("")` is true for every
    // string and would make this assertion unable to fail.
    assert!(
        extraction.content.contains("où") || content.contains("ou"),
        "Should extract French word 'où' or 'ou'"
    );
    assert!(
        content.contains("fume") || content.contains("smoking"),
        "Should extract content about smoking"
    );
}
/// Checks that bookmark-related text survives RTF extraction.
///
/// Fixture: `bookmark.rtf`, described as holding an anchor labeled
/// "Bookmark_1" and link text "click me".
/// Pandoc baseline: [Bookmark_1]{#bookmark_1} and [click me](#bookmark_1)
///
/// The test passes when the lowercased content mentions any of "bookmark",
/// "click" or "me".
#[tokio::test]
async fn test_rtf_bookmark_extraction() {
    let settings = ExtractionConfig::default();
    let fixture = get_rtf_path("bookmark.rtf");
    let outcome = extract_file(&fixture, Some("application/rtf"), &settings).await;
    assert!(outcome.is_ok(), "RTF extraction should succeed for bookmark.rtf");
    let extracted = outcome.expect("Operation failed");
    let lowered = extracted.content.to_lowercase();
    let mentions_bookmark_or_link = ["bookmark", "click", "me"]
        .iter()
        .any(|needle| lowered.contains(*needle));
    assert!(
        mentions_bookmark_or_link,
        "Should extract bookmark or link text (found: {})",
        extracted.content
    );
}
/// Checks that both body text and footnote material are extracted.
///
/// Fixture: `footnote.rtf`, described as academic text with footnote
/// references and their content.
/// Pandoc baseline: Uses [^1] and [^2] syntax for footnotes
///
/// Three independent keyword checks are made: main text, footnote/reference
/// content, and some footnote indicator.
#[tokio::test]
async fn test_rtf_footnote_extraction() {
    let settings = ExtractionConfig::default();
    let fixture = get_rtf_path("footnote.rtf");
    let outcome = extract_file(&fixture, Some("application/rtf"), &settings).await;
    assert!(outcome.is_ok(), "RTF extraction should succeed for footnote.rtf");
    let extracted = outcome.expect("Operation failed");
    assert!(!extracted.content.is_empty(), "Content should not be empty");
    let lowered = extracted.content.to_lowercase();

    let has_main_text = lowered.contains("mead") || lowered.contains("landmark");
    assert!(has_main_text, "Should extract main text about Mead's study");

    let has_reference_text = ["note", "annotated", "bibliography", "sahlins"]
        .iter()
        .any(|needle| lowered.contains(*needle));
    assert!(has_reference_text, "Should extract footnote content or references");

    // The "[^" marker is matched against the raw content; the keyword
    // fallbacks use the lowercased copy.
    let has_indicator =
        lowered.contains("footnote") || extracted.content.contains("[^") || lowered.contains("annotated");
    assert!(has_indicator, "Should contain footnote indicators");
}
/// Checks that formatted text is extracted and formatting is indicated.
///
/// Fixture: `formatting.rtf`, described as text using bold, italic,
/// underline, strikeout, superscript, subscript and small caps.
/// Pandoc baseline: Detailed formatting in markdown syntax
///
/// Each check accepts either a markdown-style marker in the raw content or
/// the plain keyword in the lowercased content.
#[tokio::test]
async fn test_rtf_formatting_extraction() {
    let settings = ExtractionConfig::default();
    let fixture = get_rtf_path("formatting.rtf");
    let outcome = extract_file(&fixture, Some("application/rtf"), &settings).await;
    assert!(outcome.is_ok(), "RTF extraction should succeed for formatting.rtf");
    let extracted = outcome.expect("Operation failed");
    assert!(!extracted.content.is_empty(), "Content should not be empty");
    let raw = extracted.content.as_str();
    let lowered = raw.to_lowercase();

    let has_topic_text = ["formatting", "test", "bold"]
        .iter()
        .any(|needle| lowered.contains(*needle));
    assert!(has_topic_text, "Should extract formatting-related content");

    let has_bold = raw.contains("**bold**") || lowered.contains("bold");
    assert!(has_bold, "Should preserve or indicate bold text");

    let has_italic = raw.contains("*italic") || lowered.contains("italic");
    assert!(has_italic, "Should preserve or indicate italic text");

    // Any one marker in the raw content counts, as does the word "bold".
    let has_formatting = ["**", "*", "__", "_", "~~", "^", "~"]
        .iter()
        .any(|marker| raw.contains(*marker))
        || lowered.contains("bold");
    assert!(has_formatting, "Should preserve or indicate text formatting");
}
/// Checks that every heading level and the trailing paragraph are extracted.
///
/// Fixture: `heading.rtf`, described as three heading levels (H1, H2, H3)
/// followed by a paragraph.
/// Pandoc baseline: Markdown heading syntax (# ## ###)
///
/// Heading and paragraph text is matched case-sensitively; the hierarchy
/// check accepts either a `#` marker or the first two heading texts.
#[tokio::test]
async fn test_rtf_heading_extraction() {
    let settings = ExtractionConfig::default();
    let fixture = get_rtf_path("heading.rtf");
    let outcome = extract_file(&fixture, Some("application/rtf"), &settings).await;
    assert!(outcome.is_ok(), "RTF extraction should succeed for heading.rtf");
    let extracted = outcome.expect("Operation failed");
    assert!(!extracted.content.is_empty(), "Content should not be empty");

    // Checked in document order so the first missing item is the one reported.
    for (needle, message) in [
        ("Heading 1", "Should extract Heading 1 text"),
        ("Heading 2", "Should extract Heading 2 text"),
        ("Heading 3", "Should extract Heading 3 text"),
        ("Paragraph", "Should extract paragraph text"),
    ] {
        assert!(extracted.content.contains(needle), "{}", message);
    }

    let lowered = extracted.content.to_lowercase();
    let has_hierarchy =
        extracted.content.contains("#") || (lowered.contains("heading 1") && lowered.contains("heading 2"));
    assert!(has_hierarchy, "Should preserve heading hierarchy");
}
/// Checks that some trace of an image survives RTF extraction.
///
/// Fixture: `image.rtf`, described as an image reference with dimensions
/// (2.0in x 2.0in).
/// Pandoc baseline: Markdown image syntax with dimensions
///
/// Any one of several loose indicators is enough: a `!` or `.jpg` in the raw
/// content, or "image", "2.0", "width" or "height" in the lowercased content.
#[tokio::test]
async fn test_rtf_image_extraction() {
    let settings = ExtractionConfig::default();
    let fixture = get_rtf_path("image.rtf");
    let outcome = extract_file(&fixture, Some("application/rtf"), &settings).await;
    assert!(outcome.is_ok(), "RTF extraction should succeed for image.rtf");
    let extracted = outcome.expect("Operation failed");
    assert!(!extracted.content.is_empty(), "Content should not be empty");
    let lowered = extracted.content.to_lowercase();

    let raw_hit = extracted.content.contains("!") || extracted.content.contains(".jpg");
    let keyword_hit = ["image", "2.0", "width", "height"]
        .iter()
        .any(|needle| lowered.contains(*needle));
    assert!(
        raw_hit || keyword_hit,
        "Should contain image reference or dimension information (found: {})",
        extracted.content
    );
}
/// Checks that an external hyperlink leaves its text or URL in the output.
///
/// Fixture: `link.rtf`, described as a link to the pandoc.org website.
/// Pandoc baseline: Markdown link syntax [pandoc](http://pandoc.org)
///
/// Passes when the lowercased content contains "pandoc" or "http".
#[tokio::test]
async fn test_rtf_link_extraction() {
    let settings = ExtractionConfig::default();
    let fixture = get_rtf_path("link.rtf");
    let outcome = extract_file(&fixture, Some("application/rtf"), &settings).await;
    assert!(outcome.is_ok(), "RTF extraction should succeed for link.rtf");
    let extracted = outcome.expect("Operation failed");
    assert!(!extracted.content.is_empty(), "Content should not be empty");
    let lowered = extracted.content.to_lowercase();
    let has_link_text = ["pandoc", "http"].iter().any(|needle| lowered.contains(*needle));
    assert!(
        has_link_text,
        "Should extract link-related content (found: {})",
        extracted.content
    );
}
/// Test extraction of RTF file with complex nested list structure.
///
/// File: list_complex.rtf
/// Content: Multi-level nested list with various numbering (numeric, alphabetic, roman)
/// Expected: Extracts all list items preserving or indicating hierarchy
/// Pandoc baseline: Markdown nested list with mixed numbering schemes
///
/// Item text is matched case-insensitively; list-marker characters are
/// matched against the raw content.
#[tokio::test]
async fn test_rtf_list_complex_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("list_complex.rtf");
    let result = extract_file(&path, Some("application/rtf"), &config).await;
    assert!(result.is_ok(), "RTF extraction should succeed for list_complex.rtf");
    let extraction = result.expect("Operation failed");
    assert!(!extraction.content.is_empty(), "Content should not be empty");
    let content = extraction.content.to_lowercase();
    assert!(content.contains("one"), "Should extract list item 'One'");
    assert!(content.contains("two"), "Should extract list item 'Two'");
    assert!(content.contains("three"), "Should extract nested list item 'Three'");
    assert!(
        content.contains("five") || content.contains("six"),
        "Should extract deeply nested list items"
    );
    // NOTE(review): the third needle used to be an empty string, which matches
    // every input. "•" is presumed to be the intended bullet glyph — confirm
    // against the fixture/extractor output.
    assert!(
        extraction.content.contains("1")
            || extraction.content.contains("-")
            || extraction.content.contains("•")
            || content.contains("one"),
        "Should preserve list structure indicators"
    );
    assert!(
        content.contains("out of list") || content.contains("out"),
        "Should extract separator text 'Out of list'"
    );
    assert!(
        content.contains("seven") || content.contains("eight") || content.contains("7") || content.contains("8"),
        "Should extract restarted list numbering (7, 8)"
    );
}
/// Checks that every item of a simple bulleted list is extracted.
///
/// Fixture: `list_simple.rtf`, described as a bullet list with one nested
/// item and a list break.
/// Pandoc baseline: Simple markdown bullet list with nesting
///
/// All matching is done on the lowercased content.
#[tokio::test]
async fn test_rtf_list_simple_extraction() {
    let settings = ExtractionConfig::default();
    let fixture = get_rtf_path("list_simple.rtf");
    let outcome = extract_file(&fixture, Some("application/rtf"), &settings).await;
    assert!(outcome.is_ok(), "RTF extraction should succeed for list_simple.rtf");
    let extracted = outcome.expect("Operation failed");
    assert!(!extracted.content.is_empty(), "Content should not be empty");
    let lowered = extracted.content.to_lowercase();

    for (needle, message) in [
        ("one", "Should extract list item 'one'"),
        ("two", "Should extract list item 'two'"),
        ("sub", "Should extract nested list item 'sub'"),
        ("new", "Should extract 'new list' text"),
    ] {
        assert!(lowered.contains(needle), "{}", message);
    }

    // Combined check that the three list items are all present together.
    let all_items_present = ["one", "two", "sub"].iter().all(|item| lowered.contains(*item));
    assert!(all_items_present, "Should extract all list items");
}
/// Test extraction of RTF file with table containing error codes.
///
/// File: table_error_codes.rtf
/// Content: Table with Code and Error columns, 23 rows of Pandoc error codes
/// Pandoc baseline: Markdown table format with 2 columns and 23 rows
///
/// Only two things are asserted: extraction succeeds and the MIME type is
/// reported as `application/rtf`. Table content is deliberately not checked;
/// the original note on this test says RTF table extraction may yield empty
/// text content.
#[tokio::test]
async fn test_rtf_table_error_codes_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("table_error_codes.rtf");
    let result = extract_file(&path, Some("application/rtf"), &config).await;
    assert!(
        result.is_ok(),
        "RTF extraction should succeed for table_error_codes.rtf"
    );
    let extraction = result.expect("Operation failed");
    // `assert_eq!` prints the actual MIME type on failure, matching the other
    // MIME checks in this file.
    assert_eq!(
        extraction.mime_type, "application/rtf",
        "MIME type should be preserved"
    );
}
/// Test extraction of RTF file with simple 4-column, 2-row table.
///
/// File: table_simple.rtf
/// Content: Table with headers A, B, C, D and data row E, F, G, H
/// Pandoc baseline: Markdown table format
///
/// Only two things are asserted: extraction succeeds and the MIME type is
/// reported as `application/rtf`. Cell content is deliberately not checked;
/// the original note on this test says RTF table extraction may yield empty
/// text content.
#[tokio::test]
async fn test_rtf_table_simple_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("table_simple.rtf");
    let result = extract_file(&path, Some("application/rtf"), &config).await;
    assert!(result.is_ok(), "RTF extraction should succeed for table_simple.rtf");
    let extraction = result.expect("Operation failed");
    // `assert_eq!` prints the actual MIME type on failure, matching the other
    // MIME checks in this file.
    assert_eq!(
        extraction.mime_type, "application/rtf",
        "MIME type should be preserved"
    );
}
/// Checks that non-ASCII characters come through RTF extraction.
///
/// Fixture: `unicode.rtf`, described as containing smart quotes, the Greek
/// letter α and the Latin letter ä.
/// Pandoc baseline: "hi"'hi'αä
///
/// Passes when the raw content contains "hi", "α" or "ä".
#[tokio::test]
async fn test_rtf_unicode_extraction() {
    let settings = ExtractionConfig::default();
    let fixture = get_rtf_path("unicode.rtf");
    let outcome = extract_file(&fixture, Some("application/rtf"), &settings).await;
    assert!(outcome.is_ok(), "RTF extraction should succeed for unicode.rtf");
    let extracted = outcome.expect("Operation failed");
    assert!(!extracted.content.is_empty(), "Content should not be empty");
    let has_expected_text = ["hi", "α", "ä"]
        .iter()
        .any(|needle| extracted.content.contains(*needle));
    assert!(
        has_expected_text,
        "Should extract unicode content (found: {})",
        extracted.content
    );
}
/// Extracts `unicode.rtf` twice with the same configuration and requires the
/// two content strings to be identical.
#[tokio::test]
async fn test_rtf_extraction_deterministic_unicode() {
    let settings = ExtractionConfig::default();
    let fixture = get_rtf_path("unicode.rtf");
    let first_run = extract_file(&fixture, Some("application/rtf"), &settings).await;
    let second_run = extract_file(&fixture, Some("application/rtf"), &settings).await;
    let both_succeeded = first_run.is_ok() && second_run.is_ok();
    assert!(both_succeeded, "Both extractions should succeed");
    let first = first_run.expect("Operation failed");
    let second = second_run.expect("Operation failed");
    assert_eq!(
        first.content, second.content,
        "FAIL: Extraction is not deterministic. Same input produced different outputs."
    );
}
/// Extracts `list_complex.rtf` twice with the same configuration and requires
/// the two content strings to be identical.
#[tokio::test]
async fn test_rtf_extraction_deterministic_list_complex() {
    let settings = ExtractionConfig::default();
    let fixture = get_rtf_path("list_complex.rtf");
    let first_run = extract_file(&fixture, Some("application/rtf"), &settings).await;
    let second_run = extract_file(&fixture, Some("application/rtf"), &settings).await;
    let both_succeeded = first_run.is_ok() && second_run.is_ok();
    assert!(both_succeeded, "Both extractions should succeed");
    let first = first_run.expect("Operation failed");
    let second = second_run.expect("Operation failed");
    assert_eq!(
        first.content, second.content,
        "FAIL: Extraction is not deterministic. Same input produced different outputs."
    );
}
/// Test no critical content loss
///
/// Every fixture in the list below must extract successfully, produce
/// non-empty content, and yield at least 5 bytes of content
/// (`String::len` counts bytes). Table and image fixtures are not in the list.
#[tokio::test]
async fn test_rtf_no_critical_content_loss() {
    let config = ExtractionConfig::default();
    // A fixed-size array is enough here: the list is only iterated, so a
    // heap-allocated `Vec` buys nothing.
    let must_extract = [
        "unicode.rtf",
        "accent.rtf",
        "heading.rtf",
        "list_simple.rtf",
        "list_complex.rtf",
        "formatting.rtf",
        "footnote.rtf",
        "bookmark.rtf",
        "link.rtf",
    ];
    for filename in must_extract {
        let path = get_rtf_path(filename);
        let result = extract_file(&path, Some("application/rtf"), &config).await;
        assert!(
            result.is_ok(),
            "FAIL: Extraction failed for {} (critical file)",
            filename
        );
        let extraction = result.expect("Operation failed");
        assert!(
            !extraction.content.is_empty(),
            "FAIL: CRITICAL - Extracted 0 bytes from {}. RTF extractor lost all content.",
            filename
        );
        assert!(
            extraction.content.len() >= 5,
            "FAIL: Extracted only {} bytes from {} (expected at least 5 characters). Content: '{}'",
            extraction.content.len(),
            filename,
            extraction.content
        );
    }
}
/// Test MIME type preservation
///
/// For each fixture in the list below, extraction must succeed and the
/// result's `mime_type` must equal `application/rtf`.
#[tokio::test]
async fn test_rtf_mime_type_preservation() {
    let config = ExtractionConfig::default();
    // A fixed-size array is enough here: the list is only iterated, so a
    // heap-allocated `Vec` buys nothing.
    let test_files = ["unicode.rtf", "accent.rtf", "heading.rtf", "list_simple.rtf"];
    for filename in test_files {
        let path = get_rtf_path(filename);
        let result = extract_file(&path, Some("application/rtf"), &config).await;
        assert!(result.is_ok(), "Extraction should succeed for {}", filename);
        let extraction = result.expect("Operation failed");
        assert_eq!(
            extraction.mime_type, "application/rtf",
            "FAIL: MIME type not preserved for {}",
            filename
        );
    }
}
/// Parity check between `word_sample.rtf` and `word_sample.docx`.
///
/// The RTF content must contain "swim" (case-insensitive); the author,
/// last-editor, creation-time and revision metadata must equal what the DOCX
/// extraction reports; and the document statistics must match the fixed
/// values listed below.
///
/// The test prints a notice and returns early when either fixture is missing.
#[tokio::test]
async fn test_rtf_word_sample_matches_docx_metadata_and_content() {
    let root = workspace_root();
    let rtf_path = root.join("test_documents/rtf/word_sample.rtf");
    let docx_path = root.join("test_documents/docx/word_sample.docx");
    if !(rtf_path.exists() && docx_path.exists()) {
        println!("Skipping word_sample parity test: fixtures missing");
        return;
    }

    let settings = ExtractionConfig::default();
    let rtf = extract_file(&rtf_path, Some("application/rtf"), &settings)
        .await
        .expect("RTF extraction should succeed for word_sample");
    let docx = extract_file(&docx_path, None, &settings)
        .await
        .expect("DOCX extraction should succeed for word_sample");

    let has_body_text = rtf.content.to_lowercase().contains("swim");
    assert!(has_body_text, "RTF content should include the same body text as DOCX");

    // Typed metadata fields are compared directly against the DOCX result.
    let (rtf_meta, docx_meta) = (&rtf.metadata, &docx.metadata);
    assert_eq!(
        rtf_meta.created_by.as_deref(),
        docx_meta.created_by.as_deref(),
        "Metadata field created_by should align with DOCX"
    );
    assert_eq!(
        rtf_meta.modified_by.as_deref(),
        docx_meta.modified_by.as_deref(),
        "Metadata field modified_by should align with DOCX"
    );
    assert_eq!(
        rtf_meta.created_at.as_deref(),
        docx_meta.created_at.as_deref(),
        "Metadata field created_at should align with DOCX"
    );
    assert_eq!(
        rtf_meta.additional.get("revision").and_then(|v| v.as_str()),
        docx_meta.additional.get("revision").and_then(|v| v.as_str()),
        "Metadata field revision should align with DOCX"
    );

    // Document statistics are pinned to fixed expected values.
    let expected_stats = [
        ("page_count", 2),
        ("word_count", 108),
        ("character_count", 620),
        ("line_count", 5),
        ("paragraph_count", 1),
    ];
    for (key, expected) in expected_stats {
        let actual = rtf_meta.additional.get(key).and_then(|v| v.as_i64());
        assert_eq!(
            actual,
            Some(expected),
            "Metadata field {} should match DOCX values",
            key
        );
    }
}
/// Checks the content and document statistics extracted from
/// `lorem_ipsum.rtf`.
///
/// The content must contain "lorem ipsum" (case-insensitive) and the
/// statistics must match the fixed values listed below.
///
/// The test prints a notice and returns early when the fixture is missing.
#[tokio::test]
async fn test_rtf_lorem_ipsum_metadata_alignment() {
    let rtf_path = workspace_root().join("test_documents/rtf/lorem_ipsum.rtf");
    if !rtf_path.exists() {
        println!("Skipping lorem_ipsum metadata test: fixture missing");
        return;
    }

    let settings = ExtractionConfig::default();
    let extracted = extract_file(&rtf_path, Some("application/rtf"), &settings)
        .await
        .expect("RTF extraction should succeed for lorem_ipsum");

    let has_lorem = extracted.content.to_lowercase().contains("lorem ipsum");
    assert!(has_lorem, "Content should contain lorem ipsum text");

    let expected_stats = [
        ("page_count", 1),
        ("word_count", 520),
        ("character_count", 2967),
        ("line_count", 24),
        ("paragraph_count", 6),
    ];
    for (key, expected) in expected_stats {
        let actual = extracted.metadata.additional.get(key).and_then(|v| v.as_i64());
        assert_eq!(
            actual,
            Some(expected),
            "Metadata field {} should match DOCX values",
            key
        );
    }
}
/// Checks `extraction_test.rtf` against its DOCX and ODT companions.
///
/// Asserted on the RTF result: the document and section headings, several
/// table cell texts, at least one structured table whose markdown contains a
/// known cell, a table count no smaller than the DOCX and ODT results, and a
/// fixed set of document statistics.
///
/// The test prints a notice and returns early when the RTF fixture or either
/// companion document is missing.
#[tokio::test]
async fn test_rtf_comprehensive_extraction_alignment() {
    let root = workspace_root();
    let rtf_path = root.join("test_documents/rtf/extraction_test.rtf");
    let docx_path = root.join("test_documents/docx/extraction_test.docx");
    let odt_path = root.join("test_documents/odt/extraction_test.odt");
    if !rtf_path.exists() {
        println!("⚠️ Test document not found at {:?}, skipping", rtf_path);
        return;
    }
    if !(docx_path.exists() && odt_path.exists()) {
        println!(
            "⚠️ Companion DOCX/ODT documents missing (docx: {}, odt: {}), skipping",
            docx_path.exists(),
            odt_path.exists()
        );
        return;
    }

    let settings = ExtractionConfig::default();
    let rtf = extract_file(&rtf_path, Some("application/rtf"), &settings)
        .await
        .expect("RTF extraction should succeed for extraction_test.rtf");
    let docx = extract_file(&docx_path, None, &settings)
        .await
        .expect("DOCX extraction should succeed for extraction_test.docx");
    let odt = extract_file(&odt_path, None, &settings)
        .await
        .expect("ODT extraction should succeed for extraction_test.odt");

    // Headings are checked in document order.
    for (heading, message) in [
        ("Comprehensive Extraction Test Document", "Should include document heading"),
        ("First Section", "Should include first section heading"),
        ("Second Section", "Should include second section heading"),
        ("Third Section", "Should include third section heading"),
    ] {
        assert!(rtf.content.contains(heading), "{}", message);
    }

    for cell_text in ["Header 1", "Cell 1A", "Product", "Apple"] {
        assert!(
            rtf.content.contains(cell_text),
            "Should include table content '{}'",
            cell_text
        );
    }

    // Tables must also be exposed as structured entries, not only as text.
    assert!(!rtf.tables.is_empty(), "Should extract structured tables from RTF");
    let table_has_known_cell = rtf
        .tables
        .iter()
        .any(|table| table.markdown.contains("Header 1") || table.markdown.contains("Cell 1A"));
    assert!(table_has_known_cell, "Table markdown should include header/data cells");

    let rtf_table_count = rtf.tables.len();
    assert!(
        rtf_table_count >= docx.tables.len() && rtf_table_count >= odt.tables.len(),
        "RTF should capture at least as many tables as DOCX/ODT"
    );

    let expected_stats = [
        ("page_count", 1),
        ("word_count", 83),
        ("character_count", 475),
        ("line_count", 12),
        ("paragraph_count", 8),
    ];
    for (key, expected) in expected_stats {
        let actual = rtf.metadata.additional.get(key).and_then(|v| v.as_i64());
        assert_eq!(
            actual,
            Some(expected),
            "Metadata field {} should be populated",
            key
        );
    }
}