//! Comprehensive Markdown Extractor Tests //! //! This test suite uses Pandoc as a baseline for validating markdown extraction capabilities. //! It tests: //! - YAML frontmatter metadata extraction (both standard and extended fields) //! - Table extraction from various markdown table formats //! - Complex formatting and structure preservation //! - Comparison with Pandoc's metadata extraction capabilities use std::path::PathBuf; use kreuzberg::core::config::ExtractionConfig; use kreuzberg::extraction::derive::derive_extraction_result; use kreuzberg::extractors::markdown::MarkdownExtractor; use kreuzberg::plugins::DocumentExtractor; fn markdown_fixture_path(relative: &str) -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("../../test_documents/markdown") .join(relative) } fn read_markdown_fixture(relative: &str) -> Vec { let path = markdown_fixture_path(relative); std::fs::read(&path).unwrap_or_else(|err| panic!("Failed to read markdown fixture {}: {}", path.display(), err)) } /// Test comprehensive YAML frontmatter with all Pandoc-recognized fields #[tokio::test] async fn test_pandoc_baseline_yaml_fields() { let markdown_with_yaml = b"---\ntitle: Test Document\nauthor: John Doe\ndate: 2024-01-15\nkeywords:\n - markdown\n - testing\n - rust\ndescription: A comprehensive test document\nabstract: This is an abstract\nsubject: Testing Subject\ncategory: Documentation\ntags:\n - important\n - draft\nlanguage: en\nversion: 1.0.0\n---\n\n# Content\n\nThis is the main content."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown_with_yaml, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract markdown with frontmatter"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert_eq!(result.metadata.title.as_deref(), Some("Test Document")); assert_eq!(result.metadata.created_by.as_deref(), Some("John Doe")); assert_eq!(result.metadata.created_at, Some("2024-01-15".to_string())); assert!(result.metadata.keywords.is_some()); let keywords = result.metadata.keywords.as_ref().unwrap(); assert!(keywords.contains(&"markdown".to_string())); assert!(keywords.contains(&"testing".to_string())); assert!(keywords.contains(&"rust".to_string())); assert_eq!(result.metadata.abstract_text.as_deref(), Some("This is an abstract")); assert_eq!(result.metadata.subject, Some("Testing Subject".to_string())); assert_eq!(result.metadata.category.as_deref(), Some("Documentation")); assert!(result.metadata.tags.is_some()); let tags = result.metadata.tags.as_ref().unwrap(); assert!(tags.contains(&"important".to_string())); assert!(tags.contains(&"draft".to_string())); assert_eq!(result.metadata.language.as_deref(), Some("en")); assert_eq!(result.metadata.document_version.as_deref(), Some("1.0.0")); } /// Test table extraction from pipe-format markdown tables #[tokio::test] async fn test_extract_simple_pipe_tables() { let markdown = b"# Tables Example\n\n| Header 1 | Header 2 | Header 3 |\n|----------|----------|----------|\n| Row1Col1 | Row1Col2 | Row1Col3 |\n| Row2Col1 | Row2Col2 | Row2Col3 |"; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract tables"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(!result.tables.is_empty(), "Should extract at least one table"); let table = &result.tables[0]; assert_eq!(table.cells.len(), 3, "Should have 3 rows (header + 2 data rows)"); assert_eq!(table.cells[0].len(), 3, "Should have 3 columns"); assert_eq!(table.cells[0][0], "Header 1"); assert_eq!(table.cells[0][1], "Header 2"); assert_eq!(table.cells[0][2], "Header 3"); assert_eq!(table.cells[1][0], "Row1Col1"); assert_eq!(table.cells[2][0], "Row2Col1"); assert!(table.markdown.contains("Header 1")); assert!(table.markdown.contains("Row1Col1")); assert!(table.markdown.contains("---")); } /// Test extraction of grid tables (as found in comprehensive.md) #[tokio::test] async fn test_extract_grid_tables() { let markdown = b"# Grid Table Example\n\n+--------+--------+\n| Cell 1 | Cell 2 |\n+========+========+\n| Cell 3 | Cell 4 |\n+--------+--------+\n| Cell 5 | Cell 6 |\n+--------+--------+"; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract grid tables"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); let _ = result.tables; } /// Test extraction of tables with multiple blocks in cells #[tokio::test] async fn test_extract_complex_table_cells() { let markdown = b"# Complex Table\n\n| Header 1 | Header 2 |\n|----------|----------|\n| - bullet 1
- bullet 2 | Simple text |\n| **Bold** *italic* | `code` |"; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract tables with complex formatting"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(!result.tables.is_empty()); assert!(!result.content.is_empty()); } /// Test multiline table from tables.markdown #[tokio::test] async fn test_pandoc_style_multiline_table() { let markdown = b"Simple table with caption:\n\n Right Left Center Default\n ------- ------ -------- ---------\n 12 12 12 12\n 123 123 123 123\n 1 1 1 1\n\n : Demonstration of simple table syntax."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract pandoc-style tables"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(result.content.contains("12") || result.content.contains("Demonstration")); } /// Test YAML frontmatter with author as list (Pandoc style) #[tokio::test] async fn test_pandoc_author_list() { let markdown = b"% Title\n% Author One; Author Two; Author Three\n\n# Content\n\nBody text."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract markdown"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(!result.content.is_empty()); } /// Test YAML with array keywords field (Pandoc format) #[tokio::test] async fn test_keywords_array_extraction() { let markdown = b"---\ntitle: Document\nkeywords:\n - rust\n - markdown\n - pandoc\n---\n\n# Main Content\n\nText here."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract keywords array"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(result.metadata.keywords.is_some()); let keywords = result.metadata.keywords.as_ref().unwrap(); assert!(keywords.contains(&"rust".to_string())); assert!(keywords.contains(&"markdown".to_string())); assert!(keywords.contains(&"pandoc".to_string())); } /// Test complex formatting in content (links, code, emphasis) #[tokio::test] async fn test_complex_markdown_formatting() { let markdown = b"# Document\n\nThis is a paragraph with [links](http://example.com) and `code blocks`.\n\n## Subsection\n\n- **Bold text**\n- *Italic text*\n- ***Bold italic***\n\n```rust\nfn main() {\n println!(\"Hello, world!\");\n}\n```"; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract complex markdown"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(result.content.contains("links")); assert!(result.content.contains("code blocks")); assert!(result.content.contains("Bold text")); assert!(result.content.contains("println")); } /// Test extraction of raw HTML and LaTeX in markdown #[tokio::test] async fn test_raw_content_extraction() { let markdown = b"# Document\n\nSome text.\n\n
Raw HTML
\n\nMore text.\n\n\\\\begin{equation}\nx = y\n\\\\end{equation}"; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract raw content"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(!result.content.is_empty()); } /// Test comprehensive.md from test_documents #[tokio::test] async fn test_comprehensive_md_extraction() { let markdown = read_markdown_fixture("comprehensive.md"); let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(&markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract comprehensive.md"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(!result.content.is_empty()); let _has_title_or_author = result.metadata.additional.contains_key("title") || result.metadata.additional.contains_key("author"); assert!(result.content.contains("Additional markdown reader tests") || result.content.contains("markdown")); let _ = result.tables; } /// Test tables.markdown from test_documents #[tokio::test] async fn test_tables_markdown_extraction() { let markdown = read_markdown_fixture("tables.markdown"); let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(&markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract tables.markdown"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(!result.content.is_empty()); assert!(result.content.contains("Right") || result.content.contains("Left") || result.content.contains("table")); } /// Test empty YAML frontmatter handling #[tokio::test] async fn test_empty_frontmatter() { let markdown = b"---\n---\n\n# Main Title\n\nContent here."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should handle empty frontmatter"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(result.content.contains("Main Title")); assert!(result.content.contains("Content here")); } /// Test malformed YAML frontmatter fallback #[tokio::test] async fn test_malformed_frontmatter_graceful_fallback() { let markdown = b"---\ninvalid: yaml: syntax: here:\n---\n\nContent here."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should handle malformed YAML gracefully"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(!result.content.is_empty()); } /// Test metadata field extraction for standard YAML fields #[tokio::test] async fn test_standard_yaml_metadata_fields() { let markdown = b"---\ntitle: Standard Fields Test\nauthor: Test Author\ndate: 2024-12-06\n---\n\n# Content\n\nTest body."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract standard fields"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert_eq!(result.metadata.title.as_deref(), Some("Standard Fields Test")); assert_eq!(result.metadata.created_by.as_deref(), Some("Test Author")); assert_eq!(result.metadata.created_at, Some("2024-12-06".to_string())); } /// Test extraction of description field (maps to subject) #[tokio::test] async fn test_description_to_subject_mapping() { let markdown = b"---\ntitle: Test\ndescription: This is the document description\n---\n\nContent."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract description"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert_eq!( result.metadata.subject, Some("This is the document description".to_string()) ); } /// Test multi-line title extraction from YAML #[tokio::test] async fn test_multiline_title_in_yaml() { let markdown = b"---\ntitle: |\n This is a\n multi-line title\nauthor: Test\n---\n\n# Content\n\nBody."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract multiline title"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); let title = result.metadata.title.as_deref(); assert!(title.is_some()); } /// Test table page numbering #[tokio::test] async fn test_table_page_numbering() { let markdown = b"# Document\n\n| A | B |\n|---|---|\n| 1 | 2 |\n\nSome text between tables.\n\n| X | Y |\n|---|---|\n| 3 | 4 |"; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract multiple tables"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert_eq!(result.tables.len(), 2); assert_eq!(result.tables[0].page_number, 1); assert_eq!(result.tables[1].page_number, 1); } /// Test unicode content extraction #[tokio::test] async fn test_unicode_markdown_extraction() { let markdown = "---\ntitle: Unicode Test\nauthor: 日本人\n---\n\n# こんにちは\n\nThis document has:\n- 中文 (Chinese)\n- 日本語 (Japanese)\n- Русский (Russian)\n- العربية (Arabic)".as_bytes(); let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract unicode content"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(result.content.contains("こんにちは") || result.content.contains("Chinese")); } /// Test YAML list to comma-separated conversion for keywords #[tokio::test] async fn test_keywords_list_comma_separation() { let markdown = b"---\nkeywords:\n - first\n - second\n - third\n---\n\nContent."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract keywords list"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(result.metadata.keywords.is_some()); let keywords = result.metadata.keywords.as_ref().unwrap(); assert!(keywords.contains(&"first".to_string())); assert!(keywords.contains(&"second".to_string())); assert!(keywords.contains(&"third".to_string())); } /// Test extraction without any frontmatter #[tokio::test] async fn test_no_frontmatter_extraction() { let markdown = b"# Document Title\n\nJust a document without frontmatter.\n\n## Section\n\nWith content."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract markdown without frontmatter"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(result.content.contains("Document Title")); assert!(result.content.contains("document") || result.content.contains("Section")); let title = result.metadata.title.as_deref(); assert_eq!(title, Some("Document Title")); } /// Test code block extraction #[tokio::test] async fn test_code_block_extraction() { let markdown = b"# Code Examples\n\n```rust\nfn add(a: i32, b: i32) -> i32 {\n a + b\n}\n```\n\n```python\ndef add(a, b):\n return a + b\n```"; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract code blocks"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert!(result.content.contains("add")); assert!(result.content.contains("return")); } /// Test extraction with various mime types #[tokio::test] async fn test_supported_mime_types() { let markdown = b"# Test\n\nContent."; let extractor = MarkdownExtractor; for mime_type in &["text/markdown", "text/x-markdown", "text/x-gfm", "text/x-commonmark"] { let doc = extractor .extract_bytes(markdown, mime_type, &ExtractionConfig::default()) .await .unwrap_or_else(|_| panic!("Should support {}", mime_type)); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); assert_eq!(result.mime_type, *mime_type); assert!(result.content.contains("Test")); } } /// Test that metadata extraction handles nested YAML structures /// (Currently not fully supported - documents what's missing) #[tokio::test] async fn test_nested_yaml_awareness() { let markdown = b"---\ntitle: Test\nmetadata:\n organization: Test Corp\n location:\n city: San Francisco\n state: CA\n---\n\nContent."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract document"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); let title = result.metadata.title.as_deref(); assert_eq!(title, Some("Test")); } /// Test extraction with special characters in metadata #[tokio::test] async fn test_special_characters_in_metadata() { let markdown = b"---\ntitle: \"Document: Part 1 & 2\"\nauthor: O'Brien\nkeywords: \"C++, C#, F#\"\n---\n\nContent."; let extractor = MarkdownExtractor; let doc = extractor .extract_bytes(markdown, "text/markdown", &ExtractionConfig::default()) .await .expect("Should extract with special characters"); let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain); let title = result.metadata.title.as_deref(); assert!(title.is_some()); assert!(title.expect("Operation failed").contains("&") || title.expect("Operation failed").contains("Part")); }