crates/kreuzberg/tests/rtf_extractor_tests.rs

//! Comprehensive TDD test suite for RTF extraction.
//!
//! This test suite validates RTF extraction capabilities.
//! Tests cover:
//! - Accent and Unicode handling
//! - Bookmarks and internal links
//! - Footnotes and references
//! - Text formatting (bold, italic, underline, strikeout, superscript, subscript, small caps)
//! - Headings and structure
//! - Image extraction
//! - External hyperlinks
//! - List extraction (simple and complex nested lists)
//! - Table extraction (simple and complex with special formatting)
//! - Unicode characters and special symbols
//!
//! Test Organization:
//! - Basic Content Extraction (unicode, accent)
//! - Structure Preservation (heading, list_simple, list_complex)
//! - Table Extraction (table_simple, table_error_codes)
//! - Formatting Detection (formatting)
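//! - Special Features (footnote, bookmark, link, image)
//! - Integration Tests (deterministic extraction, no content loss, MIME type preservation)
//! - Parity Tests (RTF content/metadata alignment with the DOCX/ODT fixtures)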
//!
//! Success Criteria:
//! - All tests passing (100%)
//! - No content loss (should extract meaningful text from all files)
//! - Deterministic extraction (same input = same output)
//!
//! Note: These tests require the `office` feature to be enabled.

#![cfg(feature = "office")]
#![allow(clippy::doc_suspicious_footnotes)]

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_file;
use std::path::PathBuf;

mod helpers;

/// Helper function to get path to RTF test document
fn get_rtf_path(filename: &str) -> PathBuf {
    let manifest_dir = env!("CARGO_MANIFEST_DIR");
    PathBuf::from(manifest_dir)
        .parent()
        .expect("kreuzberg crate should have a parent")
        .parent()
        .expect("parent should have a parent")
        .join("test_documents")
        .join("rtf")
        .join(filename)
}

/// Returns the directory two levels above `CARGO_MANIFEST_DIR`.
///
/// Panics if either of the two parent lookups yields nothing.
fn workspace_root() -> PathBuf {
    let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let crates_dir = crate_dir.parent().expect("kreuzberg crate should have a parent");
    let root = crates_dir.parent().expect("workspace root exists");
    root.to_path_buf()
}

/// Test extraction of RTF file with accent characters (accented vowels).
///
/// File: accent.rtf
/// Content: "le café où on ne fume pas"
/// Expected: Correctly extracts French text with accented characters (é, ù)
/// Pandoc baseline: le café où on ne fume pas
///
/// The content checks are lenient: the unaccented fallbacks "cafe" and "ou" are
/// accepted alongside the accented forms.
#[tokio::test]
async fn test_rtf_accent_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("accent.rtf");

    // `expect` carries the descriptive message and also prints the underlying error,
    // which a separate `assert!(result.is_ok())` would discard.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for accent.rtf");

    assert_eq!(extraction.mime_type, "application/rtf");

    assert!(!extraction.content.is_empty(), "Content should not be empty");

    // Lowercased copy for the case-insensitive checks below.
    let content = extraction.content.to_lowercase();

    assert!(
        extraction.content.contains("café") || content.contains("cafe"),
        "Should extract French word 'café' or 'cafe'"
    );

    assert!(
        extraction.content.contains("où") || content.contains("ou"),
        "Should extract French word 'où' or 'ou'"
    );

    assert!(
        content.contains("fume") || content.contains("smoking"),
        "Should extract content about smoking"
    );
}

/// Test extraction of RTF file with bookmarks (internal anchors/references).
///
/// File: bookmark.rtf
/// Content: Bookmark anchor labeled "Bookmark_1" and link text "click me"
/// Expected: Extracts bookmark definition and link text
/// Pandoc baseline: [Bookmark_1]{#bookmark_1} and [click me](#bookmark_1)
///
/// The assertion passes if any one of "bookmark", "click" or "me" appears in the
/// lowercased content.
#[tokio::test]
async fn test_rtf_bookmark_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("bookmark.rtf");

    // A single `expect` reports the underlying error together with the message.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for bookmark.rtf");

    let content = extraction.content.to_lowercase();

    assert!(
        content.contains("bookmark") || content.contains("click") || content.contains("me"),
        "Should extract bookmark or link text (found: {})",
        extraction.content
    );
}

/// Test extraction of RTF file with footnotes.
///
/// File: footnote.rtf
/// Content: Academic text with footnote references and their content
/// Expected: Extracts both main text and footnote content
/// Pandoc baseline: Uses [^1] and [^2] syntax for footnotes
///
/// Three keyword groups are checked against the lowercased content: main text,
/// footnote/reference text, and footnote indicators.
#[tokio::test]
async fn test_rtf_footnote_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("footnote.rtf");

    // A single `expect` reports the underlying error together with the message.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for footnote.rtf");

    assert!(!extraction.content.is_empty(), "Content should not be empty");

    let content = extraction.content.to_lowercase();

    assert!(
        content.contains("mead") || content.contains("landmark"),
        "Should extract main text about Mead's study"
    );

    assert!(
        content.contains("note")
            || content.contains("annotated")
            || content.contains("bibliography")
            || content.contains("sahlins"),
        "Should extract footnote content or references"
    );

    // "[^" is matched against the original-case content.
    assert!(
        content.contains("footnote") || extraction.content.contains("[^") || content.contains("annotated"),
        "Should contain footnote indicators"
    );
}

/// Test extraction of RTF file with various text formatting.
///
/// File: formatting.rtf
/// Content: Text with bold, italic, underline, strikeout, superscript, subscript, small caps
/// Expected: Preserves or indicates all formatting types
/// Pandoc baseline: Detailed formatting in markdown syntax
///
/// The checks accept either markdown markers or the plain words "bold" / "italic",
/// so they pass whether or not the extractor emits markup.
#[tokio::test]
async fn test_rtf_formatting_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("formatting.rtf");

    // A single `expect` reports the underlying error together with the message.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for formatting.rtf");

    assert!(!extraction.content.is_empty(), "Content should not be empty");

    let content = extraction.content.to_lowercase();

    assert!(
        content.contains("formatting") || content.contains("test") || content.contains("bold"),
        "Should extract formatting-related content"
    );

    assert!(
        extraction.content.contains("**bold**") || content.contains("bold"),
        "Should preserve or indicate bold text"
    );

    assert!(
        extraction.content.contains("*italic") || content.contains("italic"),
        "Should preserve or indicate italic text"
    );

    // One marker character is enough: the two-character markers "**", "__" and "~~"
    // are already covered by their one-character forms.
    let has_markers = extraction
        .content
        .chars()
        .any(|c| matches!(c, '*' | '_' | '~' | '^'));
    let has_formatting = has_markers || content.contains("bold");

    assert!(has_formatting, "Should preserve or indicate text formatting");
}

/// Test extraction of RTF file with heading hierarchy.
///
/// File: heading.rtf
/// Content: Three levels of headings (H1, H2, H3) followed by paragraph
/// Expected: Extracts all headings and paragraph text
/// Pandoc baseline: Markdown heading syntax (# ## ###)
///
/// The heading and paragraph checks are case-sensitive. The hierarchy check accepts
/// either a `#` marker or the presence of both "heading 1" and "heading 2".
#[tokio::test]
async fn test_rtf_heading_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("heading.rtf");

    // A single `expect` reports the underlying error together with the message.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for heading.rtf");

    assert!(!extraction.content.is_empty(), "Content should not be empty");

    assert!(
        extraction.content.contains("Heading 1"),
        "Should extract Heading 1 text"
    );

    assert!(
        extraction.content.contains("Heading 2"),
        "Should extract Heading 2 text"
    );

    assert!(
        extraction.content.contains("Heading 3"),
        "Should extract Heading 3 text"
    );

    assert!(
        extraction.content.contains("Paragraph"),
        "Should extract paragraph text"
    );

    let content_lower = extraction.content.to_lowercase();
    assert!(
        extraction.content.contains("#")
            || (content_lower.contains("heading 1") && content_lower.contains("heading 2")),
        "Should preserve heading hierarchy"
    );
}

/// Test extraction of RTF file with embedded or referenced image.
///
/// File: image.rtf
/// Content: Image reference with dimensions (2.0in x 2.0in)
/// Expected: Extracts image reference and/or dimensions
/// Pandoc baseline: Markdown image syntax with dimensions
///
/// Any one of several image or dimension hints in the content satisfies the check.
#[tokio::test]
async fn test_rtf_image_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("image.rtf");

    // A single `expect` reports the underlying error together with the message.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for image.rtf");

    assert!(!extraction.content.is_empty(), "Content should not be empty");

    let content = extraction.content.to_lowercase();

    assert!(
        extraction.content.contains("!")
            || content.contains("image")
            || extraction.content.contains(".jpg")
            || content.contains("2.0")
            || content.contains("width")
            || content.contains("height"),
        "Should contain image reference or dimension information (found: {})",
        extraction.content
    );
}

/// Test extraction of RTF file with external hyperlink.
///
/// File: link.rtf
/// Content: Link to pandoc.org website
/// Expected: Extracts link text and/or URL
/// Pandoc baseline: Markdown link syntax [pandoc](http://pandoc.org)
///
/// The check passes if the lowercased content contains "pandoc" or "http".
#[tokio::test]
async fn test_rtf_link_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("link.rtf");

    // A single `expect` reports the underlying error together with the message.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for link.rtf");

    assert!(!extraction.content.is_empty(), "Content should not be empty");

    let content = extraction.content.to_lowercase();

    assert!(
        content.contains("pandoc") || content.contains("http"),
        "Should extract link-related content (found: {})",
        extraction.content
    );
}

/// Test extraction of RTF file with complex nested list structure.
///
/// File: list_complex.rtf
/// Content: Multi-level nested list with various numbering (numeric, alphabetic, roman)
/// Expected: Extracts all list items preserving or indicating hierarchy
/// Pandoc baseline: Markdown nested list with mixed numbering schemes
///
/// All item checks run against the lowercased content.
#[tokio::test]
async fn test_rtf_list_complex_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("list_complex.rtf");

    // A single `expect` reports the underlying error together with the message.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for list_complex.rtf");

    assert!(!extraction.content.is_empty(), "Content should not be empty");

    let content = extraction.content.to_lowercase();

    assert!(content.contains("one"), "Should extract list item 'One'");

    assert!(content.contains("two"), "Should extract list item 'Two'");

    // The original condition repeated the same operand on both sides of `||`.
    assert!(content.contains("three"), "Should extract nested list item 'Three'");

    assert!(
        content.contains("five") || content.contains("six"),
        "Should extract deeply nested list items"
    );

    assert!(
        extraction.content.contains("1")
            || extraction.content.contains("-")
            || extraction.content.contains("•")
            || content.contains("one"),
        "Should preserve list structure indicators"
    );

    assert!(
        content.contains("out of list") || content.contains("out"),
        "Should extract separator text 'Out of list'"
    );

    assert!(
        content.contains("seven") || content.contains("eight") || content.contains("7") || content.contains("8"),
        "Should extract restarted list numbering (7, 8)"
    );
}

/// Test extraction of RTF file with simple bulleted list.
///
/// File: list_simple.rtf
/// Content: Simple bullet list with one nested item and list break
/// Expected: Extracts all list items and indicates nesting
/// Pandoc baseline: Simple markdown bullet list with nesting
///
/// Only the presence of the item text is asserted; list markers and nesting are
/// not checked.
#[tokio::test]
async fn test_rtf_list_simple_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("list_simple.rtf");

    // A single `expect` reports the underlying error together with the message.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for list_simple.rtf");

    assert!(!extraction.content.is_empty(), "Content should not be empty");

    let content = extraction.content.to_lowercase();

    assert!(content.contains("one"), "Should extract list item 'one'");

    assert!(content.contains("two"), "Should extract list item 'two'");

    assert!(content.contains("sub"), "Should extract nested list item 'sub'");

    assert!(content.contains("new"), "Should extract 'new list' text");

    // Verify list items are extracted (list structure preserved as separate text)
    assert!(
        content.contains("one") && content.contains("two") && content.contains("sub"),
        "Should extract all list items"
    );
}

/// Test extraction of RTF file with table containing error codes.
///
/// File: table_error_codes.rtf
/// Content: Table with Code and Error columns, 23 rows of Pandoc error codes
/// Expected: Extracts table structure and all data cells
/// Pandoc baseline: Markdown table format with 2 columns and 23 rows
///
/// Note: RTF table extraction via Pandoc markdown output may result in empty content
/// due to limitations in Pandoc's markdown table rendering. Tables are present
/// in Pandoc's internal JSON representation but may not render in text format.
///
/// Because of that, this test only asserts that extraction succeeds and that the
/// MIME type is `application/rtf`; table content is not checked.
#[tokio::test]
async fn test_rtf_table_error_codes_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("table_error_codes.rtf");

    // A single `expect` reports the underlying error together with the message.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for table_error_codes.rtf");

    // `assert_eq!` prints both values on mismatch, unlike `assert!(a == b)`.
    assert_eq!(
        extraction.mime_type, "application/rtf",
        "MIME type should be preserved"
    );
}

/// Test extraction of RTF file with simple 4-column, 2-row table.
///
/// File: table_simple.rtf
/// Content: Table with headers A, B, C, D and data row E, F, G, H
/// Expected: Extracts all cells in correct table structure
/// Pandoc baseline: Markdown table format
///
/// Note: RTF table extraction via Pandoc markdown output may result in empty content
/// due to limitations in Pandoc's markdown table rendering. Tables are present
/// in Pandoc's internal JSON representation but may not render in text format.
///
/// Because of that, this test only asserts that extraction succeeds and that the
/// MIME type is `application/rtf`; cell content is not checked.
#[tokio::test]
async fn test_rtf_table_simple_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("table_simple.rtf");

    // A single `expect` reports the underlying error together with the message.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for table_simple.rtf");

    // `assert_eq!` prints both values on mismatch, unlike `assert!(a == b)`.
    assert_eq!(
        extraction.mime_type, "application/rtf",
        "MIME type should be preserved"
    );
}

/// Test extraction of RTF file with various Unicode characters.
///
/// File: unicode.rtf
/// Content: Smart quotes, Greek letters (α, ä)
/// Expected: Correctly extracts and preserves Unicode characters
/// Pandoc baseline: "hi"'hi'αä
///
/// The check passes if the content contains any one of "hi", "α" or "ä".
#[tokio::test]
async fn test_rtf_unicode_extraction() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("unicode.rtf");

    // A single `expect` reports the underlying error together with the message.
    let extraction = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for unicode.rtf");

    assert!(!extraction.content.is_empty(), "Content should not be empty");

    assert!(
        extraction.content.contains("hi") || extraction.content.contains("α") || extraction.content.contains("ä"),
        "Should extract unicode content (found: {})",
        extraction.content
    );
}

/// Test that RTF extraction is deterministic
/// Same input should produce identical output
///
/// Extracts `unicode.rtf` twice with the same configuration and compares the
/// resulting content strings.
#[tokio::test]
async fn test_rtf_extraction_deterministic_unicode() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("unicode.rtf");

    // Separate `expect`s show which run failed and why; a combined
    // `is_ok() && is_ok()` assertion reports neither.
    let extraction1 = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("First extraction of unicode.rtf should succeed");
    let extraction2 = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("Second extraction of unicode.rtf should succeed");

    assert_eq!(
        extraction1.content, extraction2.content,
        "FAIL: Extraction is not deterministic. Same input produced different outputs."
    );
}

/// Test that RTF extraction is deterministic for complex files
/// Same input should produce identical output
///
/// Extracts `list_complex.rtf` twice with the same configuration and compares the
/// resulting content strings.
#[tokio::test]
async fn test_rtf_extraction_deterministic_list_complex() {
    let config = ExtractionConfig::default();
    let path = get_rtf_path("list_complex.rtf");

    // Separate `expect`s show which run failed and why; a combined
    // `is_ok() && is_ok()` assertion reports neither.
    let extraction1 = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("First extraction of list_complex.rtf should succeed");
    let extraction2 = extract_file(&path, Some("application/rtf"), &config)
        .await
        .expect("Second extraction of list_complex.rtf should succeed");

    assert_eq!(
        extraction1.content, extraction2.content,
        "FAIL: Extraction is not deterministic. Same input produced different outputs."
    );
}

/// Test no critical content loss
/// All RTF files should extract non-empty content (except possibly image-only files)
///
/// Each listed fixture must extract successfully and yield at least 5 bytes of
/// content. The image and table fixtures are not in the list.
#[tokio::test]
async fn test_rtf_no_critical_content_loss() {
    let config = ExtractionConfig::default();

    // A fixed-size array is enough here; no heap-allocated `Vec` is needed.
    let must_extract = [
        "unicode.rtf",
        "accent.rtf",
        "heading.rtf",
        "list_simple.rtf",
        "list_complex.rtf",
        "formatting.rtf",
        "footnote.rtf",
        "bookmark.rtf",
        "link.rtf",
    ];

    for filename in must_extract {
        let path = get_rtf_path(filename);

        // Include the underlying error in the failure message rather than dropping it.
        let extraction = extract_file(&path, Some("application/rtf"), &config)
            .await
            .unwrap_or_else(|err| {
                panic!(
                    "FAIL: Extraction failed for {} (critical file): {:?}",
                    filename, err
                )
            });

        assert!(
            !extraction.content.is_empty(),
            "FAIL: CRITICAL - Extracted 0 bytes from {}. RTF extractor lost all content.",
            filename
        );

        // `len()` counts bytes, so the message states the threshold in bytes as well.
        assert!(
            extraction.content.len() >= 5,
            "FAIL: Extracted only {} bytes from {} (expected at least 5 bytes). Content: '{}'",
            extraction.content.len(),
            filename,
            extraction.content
        );
    }
}

/// Test MIME type preservation
/// All RTF extractions should preserve the application/rtf MIME type
///
/// Checked against four fixtures: unicode, accent, heading and list_simple.
#[tokio::test]
async fn test_rtf_mime_type_preservation() {
    let config = ExtractionConfig::default();

    // A fixed-size array is enough here; no heap-allocated `Vec` is needed.
    let test_files = ["unicode.rtf", "accent.rtf", "heading.rtf", "list_simple.rtf"];

    for filename in test_files {
        let path = get_rtf_path(filename);

        // Include the underlying error in the failure message rather than dropping it.
        let extraction = extract_file(&path, Some("application/rtf"), &config)
            .await
            .unwrap_or_else(|err| panic!("Extraction should succeed for {}: {:?}", filename, err));

        assert_eq!(
            extraction.mime_type, "application/rtf",
            "FAIL: MIME type not preserved for {}",
            filename
        );
    }
}

/// Parity check between `word_sample.rtf` and `word_sample.docx`.
///
/// Extracts both fixtures, checks that the RTF content mentions "swim", compares the
/// `created_by`, `modified_by`, `created_at` and `revision` metadata between the two
/// results, and checks the RTF document statistics against fixed expected values.
/// The test returns early (after printing a notice) when either fixture is missing.
#[tokio::test]
async fn test_rtf_word_sample_matches_docx_metadata_and_content() {
    let workspace = workspace_root();
    let rtf_fixture = workspace.join("test_documents/rtf/word_sample.rtf");
    let docx_fixture = workspace.join("test_documents/docx/word_sample.docx");

    // Both fixtures are needed for a comparison.
    if !(rtf_fixture.exists() && docx_fixture.exists()) {
        println!("Skipping word_sample parity test: fixtures missing");
        return;
    }

    let config = ExtractionConfig::default();
    let from_rtf = extract_file(&rtf_fixture, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for word_sample");
    let from_docx = extract_file(&docx_fixture, None, &config)
        .await
        .expect("DOCX extraction should succeed for word_sample");

    assert!(
        from_rtf.content.to_lowercase().contains("swim"),
        "RTF content should include the same body text as DOCX"
    );

    let rtf_meta = &from_rtf.metadata;
    let docx_meta = &from_docx.metadata;

    // Typed metadata fields are compared directly between the two extractions.
    assert_eq!(
        rtf_meta.created_by.as_deref(),
        docx_meta.created_by.as_deref(),
        "Metadata field created_by should align with DOCX"
    );
    assert_eq!(
        rtf_meta.modified_by.as_deref(),
        docx_meta.modified_by.as_deref(),
        "Metadata field modified_by should align with DOCX"
    );
    assert_eq!(
        rtf_meta.created_at.as_deref(),
        docx_meta.created_at.as_deref(),
        "Metadata field created_at should align with DOCX"
    );
    assert_eq!(
        rtf_meta.additional.get("revision").and_then(|v| v.as_str()),
        docx_meta.additional.get("revision").and_then(|v| v.as_str()),
        "Metadata field revision should align with DOCX"
    );

    // Document statistics are checked against fixed values, not against the DOCX result.
    let expected_stats = [
        ("page_count", 2),
        ("word_count", 108),
        ("character_count", 620),
        ("line_count", 5),
        ("paragraph_count", 1),
    ];
    for (field, want) in expected_stats {
        assert_eq!(
            rtf_meta.additional.get(field).and_then(|v| v.as_i64()),
            Some(want),
            "Metadata field {} should match DOCX values",
            field
        );
    }
}

/// Checks the content and document statistics extracted from `lorem_ipsum.rtf`.
///
/// The content must contain "lorem ipsum" (case-insensitive), and the page, word,
/// character, line and paragraph counts in the additional metadata must equal the
/// fixed values below. The test returns early (after printing a notice) when the
/// fixture is missing.
#[tokio::test]
async fn test_rtf_lorem_ipsum_metadata_alignment() {
    let rtf_fixture = workspace_root().join("test_documents/rtf/lorem_ipsum.rtf");

    if !rtf_fixture.exists() {
        println!("Skipping lorem_ipsum metadata test: fixture missing");
        return;
    }

    let config = ExtractionConfig::default();
    let extraction = extract_file(&rtf_fixture, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for lorem_ipsum");

    let lowered = extraction.content.to_lowercase();
    assert!(
        lowered.contains("lorem ipsum"),
        "Content should contain lorem ipsum text"
    );

    let stats = &extraction.metadata.additional;
    let expected_stats = [
        ("page_count", 1),
        ("word_count", 520),
        ("character_count", 2967),
        ("line_count", 24),
        ("paragraph_count", 6),
    ];
    for (field, want) in expected_stats {
        assert_eq!(
            stats.get(field).and_then(|v| v.as_i64()),
            Some(want),
            "Metadata field {} should match DOCX values",
            field
        );
    }
}

/// Checks `extraction_test.rtf` against its DOCX and ODT companion fixtures.
///
/// Asserts that the RTF content contains the document and section headings plus
/// sample table cells, that at least one structured table was extracted, that the RTF
/// table count is not lower than the DOCX or ODT counts, and that the document
/// statistics equal fixed values. The test returns early (after printing a notice)
/// when any of the three fixtures is missing.
#[tokio::test]
async fn test_rtf_comprehensive_extraction_alignment() {
    let workspace = workspace_root();
    let rtf_fixture = workspace.join("test_documents/rtf/extraction_test.rtf");
    let docx_fixture = workspace.join("test_documents/docx/extraction_test.docx");
    let odt_fixture = workspace.join("test_documents/odt/extraction_test.odt");

    if !rtf_fixture.exists() {
        println!("⚠️  Test document not found at {:?}, skipping", rtf_fixture);
        return;
    }
    if !(docx_fixture.exists() && odt_fixture.exists()) {
        println!(
            "⚠️  Companion DOCX/ODT documents missing (docx: {}, odt: {}), skipping",
            docx_fixture.exists(),
            odt_fixture.exists()
        );
        return;
    }

    let config = ExtractionConfig::default();
    let from_rtf = extract_file(&rtf_fixture, Some("application/rtf"), &config)
        .await
        .expect("RTF extraction should succeed for extraction_test.rtf");
    let from_docx = extract_file(&docx_fixture, None, &config)
        .await
        .expect("DOCX extraction should succeed for extraction_test.docx");
    let from_odt = extract_file(&odt_fixture, None, &config)
        .await
        .expect("ODT extraction should succeed for extraction_test.odt");

    // Heading text paired with the failure message used when it is absent.
    let heading_checks = [
        ("Comprehensive Extraction Test Document", "Should include document heading"),
        ("First Section", "Should include first section heading"),
        ("Second Section", "Should include second section heading"),
        ("Third Section", "Should include third section heading"),
    ];
    for (heading, failure) in heading_checks {
        assert!(from_rtf.content.contains(heading), "{}", failure);
    }

    for cell in ["Header 1", "Cell 1A", "Product", "Apple"] {
        assert!(
            from_rtf.content.contains(cell),
            "Should include table content '{}'",
            cell
        );
    }

    // Tables must also be present as structured entries, not only as text.
    let rtf_tables = &from_rtf.tables;
    assert!(!rtf_tables.is_empty(), "Should extract structured tables from RTF");
    assert!(
        rtf_tables
            .iter()
            .any(|table| ["Header 1", "Cell 1A"].iter().any(|cell| table.markdown.contains(*cell))),
        "Table markdown should include header/data cells"
    );

    let rtf_table_count = rtf_tables.len();
    assert!(
        rtf_table_count >= from_docx.tables.len() && rtf_table_count >= from_odt.tables.len(),
        "RTF should capture at least as many tables as DOCX/ODT"
    );

    let stats = &from_rtf.metadata.additional;
    let expected_stats = [
        ("page_count", 1),
        ("word_count", 83),
        ("character_count", 475),
        ("line_count", 12),
        ("paragraph_count", 8),
    ];
    for (field, want) in expected_stats {
        assert_eq!(
            stats.get(field).and_then(|v| v.as_i64()),
            Some(want),
            "Metadata field {} should be populated",
            field
        );
    }
}