// fil/crates/kreuzberg/tests/markdown_extractor_tests.rs

//! Comprehensive Markdown Extractor Tests
//!
//! This test suite uses Pandoc as a baseline for validating markdown extraction capabilities.
//! It tests:
//! - YAML frontmatter metadata extraction (both standard and extended fields)
//! - Table extraction from various markdown table formats
//! - Complex formatting and structure preservation
//! - Comparison with Pandoc's metadata extraction capabilities

use std::path::PathBuf;

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::extraction::derive::derive_extraction_result;
use kreuzberg::extractors::markdown::MarkdownExtractor;
use kreuzberg::plugins::DocumentExtractor;

/// Builds the path of a markdown fixture.
///
/// `relative` is appended to `../../test_documents/markdown`, itself taken
/// relative to this crate's `CARGO_MANIFEST_DIR`. The path is only assembled
/// here; nothing checks that it exists.
fn markdown_fixture_path(relative: &str) -> PathBuf {
    let mut fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    fixture.push("../../test_documents/markdown");
    fixture.push(relative);
    fixture
}

/// Loads the raw bytes of the markdown fixture named by `relative`.
///
/// # Panics
///
/// Panics with the resolved fixture path and the underlying I/O error when
/// the file cannot be read.
fn read_markdown_fixture(relative: &str) -> Vec<u8> {
    let fixture = markdown_fixture_path(relative);
    match std::fs::read(&fixture) {
        Ok(bytes) => bytes,
        Err(err) => panic!("Failed to read markdown fixture {}: {}", fixture.display(), err),
    }
}

/// Test comprehensive YAML frontmatter with all Pandoc-recognized fields.
///
/// The frontmatter sets title, author, date, keywords, description, abstract,
/// subject, category, tags, language and version. Each is checked against the
/// corresponding field of the derived result's `metadata`.
#[tokio::test]
async fn test_pandoc_baseline_yaml_fields() {
    let markdown_with_yaml = b"---\ntitle: Test Document\nauthor: John Doe\ndate: 2024-01-15\nkeywords:\n  - markdown\n  - testing\n  - rust\ndescription: A comprehensive test document\nabstract: This is an abstract\nsubject: Testing Subject\ncategory: Documentation\ntags:\n  - important\n  - draft\nlanguage: en\nversion: 1.0.0\n---\n\n# Content\n\nThis is the main content.";

    let extractor = MarkdownExtractor;
    let doc = extractor
        .extract_bytes(markdown_with_yaml, "text/markdown", &ExtractionConfig::default())
        .await
        .expect("Should extract markdown with frontmatter");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
    let metadata = &result.metadata;

    assert_eq!(metadata.title.as_deref(), Some("Test Document"));
    assert_eq!(metadata.created_by.as_deref(), Some("John Doe"));
    assert_eq!(metadata.created_at, Some("2024-01-15".to_string()));

    let keywords = metadata
        .keywords
        .as_ref()
        .expect("keywords should be extracted from frontmatter");
    for expected in ["markdown", "testing", "rust"] {
        assert!(
            keywords.contains(&expected.to_string()),
            "missing keyword {:?}",
            expected
        );
    }

    assert_eq!(metadata.abstract_text.as_deref(), Some("This is an abstract"));

    // The frontmatter carries both `description` and `subject`; this pins that
    // the explicit `subject` value is the one reported.
    assert_eq!(metadata.subject, Some("Testing Subject".to_string()));

    assert_eq!(metadata.category.as_deref(), Some("Documentation"));

    let tags = metadata
        .tags
        .as_ref()
        .expect("tags should be extracted from frontmatter");
    for expected in ["important", "draft"] {
        assert!(tags.contains(&expected.to_string()), "missing tag {:?}", expected);
    }

    assert_eq!(metadata.language.as_deref(), Some("en"));

    assert_eq!(metadata.document_version.as_deref(), Some("1.0.0"));
}

/// Verifies that a pipe-format markdown table is surfaced as a structured table.
///
/// The input holds one header row and two data rows of three columns each. The
/// test checks the grid dimensions, the header cells, the first cell of each
/// data row, and that the table's `markdown` rendering still contains the
/// header, a data cell and a `---` separator.
#[tokio::test]
async fn test_extract_simple_pipe_tables() {
    let source = b"# Tables Example\n\n| Header 1 | Header 2 | Header 3 |\n|----------|----------|----------|\n| Row1Col1 | Row1Col2 | Row1Col3 |\n| Row2Col1 | Row2Col2 | Row2Col3 |";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract tables");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    assert!(!extracted.tables.is_empty(), "Should extract at least one table");
    let first_table = &extracted.tables[0];
    let rows = &first_table.cells;

    assert_eq!(rows.len(), 3, "Should have 3 rows (header + 2 data rows)");
    assert_eq!(rows[0].len(), 3, "Should have 3 columns");

    // Header row, left to right.
    let header = &rows[0];
    assert_eq!(header[0], "Header 1");
    assert_eq!(header[1], "Header 2");
    assert_eq!(header[2], "Header 3");

    // Leading cell of each data row.
    assert_eq!(rows[1][0], "Row1Col1");
    assert_eq!(rows[2][0], "Row2Col1");

    let rendered = &first_table.markdown;
    assert!(rendered.contains("Header 1"));
    assert!(rendered.contains("Row1Col1"));
    assert!(rendered.contains("---"));
}

/// Smoke test for grid-table input (as found in comprehensive.md).
///
/// Only successful extraction is required; nothing is asserted about the
/// tables that come back.
#[tokio::test]
async fn test_extract_grid_tables() {
    let source = b"# Grid Table Example\n\n+--------+--------+\n| Cell 1 | Cell 2 |\n+========+========+\n| Cell 3 | Cell 4 |\n+--------+--------+\n| Cell 5 | Cell 6 |\n+--------+--------+";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract grid tables");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    // Deliberately unchecked: the test passes whatever `tables` contains.
    let _ = extracted.tables;
}

/// Checks a pipe table whose cells carry inline markup.
///
/// Cells contain `<br/>`-separated bullets, bold/italic emphasis and inline
/// code. The test requires at least one extracted table and non-empty content.
#[tokio::test]
async fn test_extract_complex_table_cells() {
    let source = b"# Complex Table\n\n| Header 1 | Header 2 |\n|----------|----------|\n| - bullet 1<br/>- bullet 2 | Simple text |\n| **Bold** *italic* | `code` |";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract tables with complex formatting");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let found_table = !extracted.tables.is_empty();
    assert!(found_table);
    let has_content = !extracted.content.is_empty();
    assert!(has_content);
}

/// Checks a Pandoc-style whitespace-aligned table with a caption line
/// (input modelled on tables.markdown).
///
/// The test passes when the extracted content contains either the cell value
/// `12` or the caption word `Demonstration`.
#[tokio::test]
async fn test_pandoc_style_multiline_table() {
    let source = b"Simple table with caption:\n\n    Right Left    Center  Default\n  ------- ------ -------- ---------\n       12 12        12    12\n      123 123      123    123\n        1 1         1     1\n\n  : Demonstration of simple table syntax.";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract pandoc-style tables");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let text = &extracted.content;
    let keeps_cell_value = text.contains("12");
    let keeps_caption = text.contains("Demonstration");
    assert!(keeps_cell_value || keeps_caption);
}

/// Checks a document that opens with Pandoc `%` title-block lines.
///
/// The input has a `% Title` line followed by a `%` line listing three
/// semicolon-separated authors. Only non-empty extracted content is asserted;
/// no metadata field is checked.
#[tokio::test]
async fn test_pandoc_author_list() {
    let source = b"% Title\n% Author One; Author Two; Author Three\n\n# Content\n\nBody text.";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract markdown");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let has_content = !extracted.content.is_empty();
    assert!(has_content);
}

/// Test YAML with array keywords field (Pandoc format).
///
/// The frontmatter declares `keywords` as a three-item YAML sequence; every
/// item must be present in `metadata.keywords`.
#[tokio::test]
async fn test_keywords_array_extraction() {
    let markdown =
        b"---\ntitle: Document\nkeywords:\n  - rust\n  - markdown\n  - pandoc\n---\n\n# Main Content\n\nText here.";

    let extractor = MarkdownExtractor;
    let doc = extractor
        .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
        .await
        .expect("Should extract keywords array");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    let keywords = result
        .metadata
        .keywords
        .as_ref()
        .expect("keywords should be extracted from frontmatter");
    for expected in ["rust", "markdown", "pandoc"] {
        assert!(
            keywords.contains(&expected.to_string()),
            "missing keyword {:?}",
            expected
        );
    }
}

/// Checks that text survives extraction from inline and block formatting.
///
/// The input mixes a link, inline code, an emphasised bullet list and a
/// fenced Rust code block. One fragment from each (link text, inline code,
/// bold item, code body) must appear in the extracted content.
#[tokio::test]
async fn test_complex_markdown_formatting() {
    let source = b"# Document\n\nThis is a paragraph with [links](http://example.com) and `code blocks`.\n\n## Subsection\n\n- **Bold text**\n- *Italic text*\n- ***Bold italic***\n\n```rust\nfn main() {\n    println!(\"Hello, world!\");\n}\n```";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract complex markdown");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let text = &extracted.content;
    for fragment in ["links", "code blocks", "Bold text", "println"] {
        assert!(text.contains(fragment));
    }
}

/// Checks that raw HTML and a LaTeX-like block do not break extraction.
///
/// Only non-empty content is asserted.
#[tokio::test]
async fn test_raw_content_extraction() {
    // NOTE(review): `\\\\` in the literal yields two backslashes before
    // `begin`/`end` — confirm whether a single-backslash LaTeX command was intended.
    let source = b"# Document\n\nSome text.\n\n<div>Raw HTML</div>\n\nMore text.\n\n\\\\begin{equation}\nx = y\n\\\\end{equation}";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract raw content");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let has_content = !extracted.content.is_empty();
    assert!(has_content);
}

/// Test comprehensive.md from test_documents.
///
/// Reads the `comprehensive.md` fixture and checks that extraction succeeds,
/// that the content is non-empty, and that it contains either the phrase
/// "Additional markdown reader tests" or the word "markdown". Metadata and
/// tables are intentionally not asserted.
#[tokio::test]
async fn test_comprehensive_md_extraction() {
    let markdown = read_markdown_fixture("comprehensive.md");

    let extractor = MarkdownExtractor;
    let doc = extractor
        .extract_bytes(&markdown, "text/markdown", &ExtractionConfig::default())
        .await
        .expect("Should extract comprehensive.md");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    assert!(!result.content.is_empty());

    assert!(result.content.contains("Additional markdown reader tests") || result.content.contains("markdown"));
}

/// Checks the `tables.markdown` fixture from test_documents.
///
/// The content must be non-empty and mention at least one of "Right", "Left"
/// or "table".
#[tokio::test]
async fn test_tables_markdown_extraction() {
    let fixture_bytes = read_markdown_fixture("tables.markdown");
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(&fixture_bytes, "text/markdown", &config)
        .await
        .expect("Should extract tables.markdown");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let text = &extracted.content;
    assert!(!text.is_empty());

    let mentions_table_text = text.contains("Right") || text.contains("Left") || text.contains("table");
    assert!(mentions_table_text);
}

/// Checks a document whose YAML frontmatter block is empty (`---` then `---`).
///
/// Extraction must succeed, and the heading and body text after the
/// frontmatter must appear in the content.
#[tokio::test]
async fn test_empty_frontmatter() {
    let source = b"---\n---\n\n# Main Title\n\nContent here.";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should handle empty frontmatter");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let text = &extracted.content;
    assert!(text.contains("Main Title"));
    assert!(text.contains("Content here"));
}

/// Checks that syntactically invalid frontmatter does not fail extraction.
///
/// The frontmatter line chains several `key:` segments. The call must still
/// return `Ok` and the content must be non-empty.
#[tokio::test]
async fn test_malformed_frontmatter_graceful_fallback() {
    let source = b"---\ninvalid: yaml: syntax: here:\n---\n\nContent here.";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should handle malformed YAML gracefully");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let has_content = !extracted.content.is_empty();
    assert!(has_content);
}

/// Checks the three standard frontmatter fields: title, author and date.
///
/// They are expected in `metadata.title`, `metadata.created_by` and
/// `metadata.created_at` respectively.
#[tokio::test]
async fn test_standard_yaml_metadata_fields() {
    let source =
        b"---\ntitle: Standard Fields Test\nauthor: Test Author\ndate: 2024-12-06\n---\n\n# Content\n\nTest body.";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract standard fields");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let meta = &extracted.metadata;
    assert_eq!(meta.title.as_deref(), Some("Standard Fields Test"));
    assert_eq!(meta.created_by.as_deref(), Some("Test Author"));
    assert_eq!(meta.created_at, Some("2024-12-06".to_string()));
}

/// Checks that a frontmatter `description` is reported as `metadata.subject`.
///
/// The input has no `subject` key, so the description text is the only
/// candidate value.
#[tokio::test]
async fn test_description_to_subject_mapping() {
    let source = b"---\ntitle: Test\ndescription: This is the document description\n---\n\nContent.";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract description");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let expected_subject = Some("This is the document description".to_string());
    assert_eq!(extracted.metadata.subject, expected_subject);
}

/// Checks a frontmatter title written as a YAML literal block scalar (`|`).
///
/// Only the presence of a title is asserted; its exact text is not checked.
#[tokio::test]
async fn test_multiline_title_in_yaml() {
    let source = b"---\ntitle: |\n  This is a\n  multi-line title\nauthor: Test\n---\n\n# Content\n\nBody.";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract multiline title");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let has_title = extracted.metadata.title.as_deref().is_some();
    assert!(has_title);
}

/// Checks page numbers on tables from a document with two pipe tables.
///
/// Exactly two tables must be extracted, each reporting `page_number` 1.
#[tokio::test]
async fn test_table_page_numbering() {
    let source = b"# Document\n\n| A | B |\n|---|---|\n| 1 | 2 |\n\nSome text between tables.\n\n| X | Y |\n|---|---|\n| 3 | 4 |";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract multiple tables");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let tables = &extracted.tables;
    assert_eq!(tables.len(), 2);
    for table in tables {
        assert_eq!(table.page_number, 1);
    }
}

/// Checks extraction of non-ASCII text.
///
/// The input has Japanese text in the frontmatter author and the heading, and
/// list items in Chinese, Japanese, Russian and Arabic. The test passes when
/// the content contains either the Japanese heading or the word "Chinese".
#[tokio::test]
async fn test_unicode_markdown_extraction() {
    let source: &[u8] = "---\ntitle: Unicode Test\nauthor: 日本人\n---\n\n# こんにちは\n\nThis document has:\n- 中文 (Chinese)\n- 日本語 (Japanese)\n- Русский (Russian)\n- العربية (Arabic)".as_bytes();
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract unicode content");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let text = &extracted.content;
    let keeps_heading = text.contains("こんにちは");
    let keeps_list_item = text.contains("Chinese");
    assert!(keeps_heading || keeps_list_item);
}

/// Test that a YAML keyword list with no other frontmatter fields is extracted.
///
/// Despite the test name, no comma-separated string is checked: the assertions
/// only require each list item to be present in `metadata.keywords`.
#[tokio::test]
async fn test_keywords_list_comma_separation() {
    let markdown = b"---\nkeywords:\n  - first\n  - second\n  - third\n---\n\nContent.";

    let extractor = MarkdownExtractor;
    let doc = extractor
        .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
        .await
        .expect("Should extract keywords list");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    let keywords = result
        .metadata
        .keywords
        .as_ref()
        .expect("keywords should be extracted from frontmatter");
    for expected in ["first", "second", "third"] {
        assert!(
            keywords.contains(&expected.to_string()),
            "missing keyword {:?}",
            expected
        );
    }
}

/// Checks a document with no frontmatter at all.
///
/// The content must contain the heading text and either "document" or
/// "Section". `metadata.title` is expected to equal the first heading,
/// "Document Title".
#[tokio::test]
async fn test_no_frontmatter_extraction() {
    let source = b"# Document Title\n\nJust a document without frontmatter.\n\n## Section\n\nWith content.";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract markdown without frontmatter");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let text = &extracted.content;
    assert!(text.contains("Document Title"));
    let keeps_body = text.contains("document") || text.contains("Section");
    assert!(keeps_body);

    assert_eq!(extracted.metadata.title.as_deref(), Some("Document Title"));
}

/// Checks that the bodies of fenced code blocks reach the extracted content.
///
/// The input has one Rust and one Python fence. The content must contain
/// "add" (present in both) and "return" (present only in the Python block).
#[tokio::test]
async fn test_code_block_extraction() {
    let source = b"# Code Examples\n\n```rust\nfn add(a: i32, b: i32) -> i32 {\n    a + b\n}\n```\n\n```python\ndef add(a, b):\n    return a + b\n```";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract code blocks");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    let text = &extracted.content;
    for fragment in ["add", "return"] {
        assert!(text.contains(fragment));
    }
}

/// Test extraction with various mime types.
///
/// Runs the same small document through the extractor once per markdown MIME
/// type. Each run must succeed, echo the requested MIME type in
/// `result.mime_type`, and contain the heading text.
///
/// # Panics
///
/// Panics with the MIME type and the extraction error when a type is rejected.
#[tokio::test]
async fn test_supported_mime_types() {
    let markdown = b"# Test\n\nContent.";
    let extractor = MarkdownExtractor;
    // The configuration does not vary per MIME type, so build it once.
    let config = ExtractionConfig::default();

    for mime_type in &["text/markdown", "text/x-markdown", "text/x-gfm", "text/x-commonmark"] {
        let doc = extractor
            .extract_bytes(markdown, mime_type, &config)
            .await
            // Keep the underlying error in the message so a rejected type is diagnosable.
            .unwrap_or_else(|err| panic!("Should support {}: {:?}", mime_type, err));
        let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

        assert_eq!(result.mime_type, *mime_type);
        assert!(
            result.content.contains("Test"),
            "content for {} should contain the heading text",
            mime_type
        );
    }
}

/// Checks that nested YAML mappings in frontmatter do not disturb the title.
///
/// The frontmatter has a two-level `metadata` mapping next to `title`. Only
/// the title is asserted; the nested values themselves are not checked.
/// (Currently not fully supported - documents what's missing)
#[tokio::test]
async fn test_nested_yaml_awareness() {
    let source = b"---\ntitle: Test\nmetadata:\n  organization: Test Corp\n  location:\n    city: San Francisco\n    state: CA\n---\n\nContent.";
    let config = ExtractionConfig::default();

    let document = MarkdownExtractor
        .extract_bytes(source, "text/markdown", &config)
        .await
        .expect("Should extract document");
    let extracted = derive_extraction_result(document, false, kreuzberg::OutputFormat::Plain);

    assert_eq!(extracted.metadata.title.as_deref(), Some("Test"));
}

/// Test extraction with special characters in metadata.
///
/// The frontmatter has a quoted title containing `:` and `&`, an author with
/// an apostrophe, and a quoted keywords string with `+` and `#`. Only the
/// title is asserted: it must be present and contain either `&` or "Part".
#[tokio::test]
async fn test_special_characters_in_metadata() {
    let markdown = b"---\ntitle: \"Document: Part 1 & 2\"\nauthor: O'Brien\nkeywords: \"C++, C#, F#\"\n---\n\nContent.";

    let extractor = MarkdownExtractor;
    let doc = extractor
        .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
        .await
        .expect("Should extract with special characters");
    let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);

    let title = result
        .metadata
        .title
        .as_deref()
        .expect("title with special characters should be extracted");
    assert!(
        title.contains('&') || title.contains("Part"),
        "unexpected title: {:?}",
        title
    );
}