469 lines
19 KiB
Rust
469 lines
19 KiB
Rust
|
|
//! Comprehensive Markdown Extractor Tests
//!
//! This test suite uses Pandoc as a baseline for validating markdown extraction capabilities.
//! It tests:
//! - YAML frontmatter metadata extraction (both standard and extended fields)
//! - Table extraction from various markdown table formats
//! - Complex formatting and structure preservation
//! - Comparison with Pandoc's metadata extraction capabilities
|
||
|
|
|
||
|
|
use std::path::PathBuf;
|
||
|
|
|
||
|
|
use kreuzberg::core::config::ExtractionConfig;
|
||
|
|
use kreuzberg::extraction::derive::derive_extraction_result;
|
||
|
|
use kreuzberg::extractors::markdown::MarkdownExtractor;
|
||
|
|
use kreuzberg::plugins::DocumentExtractor;
|
||
|
|
|
||
|
|
fn markdown_fixture_path(relative: &str) -> PathBuf {
|
||
|
|
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||
|
|
.join("../../test_documents/markdown")
|
||
|
|
.join(relative)
|
||
|
|
}
|
||
|
|
|
||
|
|
fn read_markdown_fixture(relative: &str) -> Vec<u8> {
|
||
|
|
let path = markdown_fixture_path(relative);
|
||
|
|
std::fs::read(&path).unwrap_or_else(|err| panic!("Failed to read markdown fixture {}: {}", path.display(), err))
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test comprehensive YAML frontmatter with all Pandoc-recognized fields
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_pandoc_baseline_yaml_fields() {
|
||
|
|
let markdown_with_yaml = b"---\ntitle: Test Document\nauthor: John Doe\ndate: 2024-01-15\nkeywords:\n - markdown\n - testing\n - rust\ndescription: A comprehensive test document\nabstract: This is an abstract\nsubject: Testing Subject\ncategory: Documentation\ntags:\n - important\n - draft\nlanguage: en\nversion: 1.0.0\n---\n\n# Content\n\nThis is the main content.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown_with_yaml, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract markdown with frontmatter");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert_eq!(result.metadata.title.as_deref(), Some("Test Document"));
|
||
|
|
assert_eq!(result.metadata.created_by.as_deref(), Some("John Doe"));
|
||
|
|
assert_eq!(result.metadata.created_at, Some("2024-01-15".to_string()));
|
||
|
|
|
||
|
|
assert!(result.metadata.keywords.is_some());
|
||
|
|
let keywords = result.metadata.keywords.as_ref().unwrap();
|
||
|
|
assert!(keywords.contains(&"markdown".to_string()));
|
||
|
|
assert!(keywords.contains(&"testing".to_string()));
|
||
|
|
assert!(keywords.contains(&"rust".to_string()));
|
||
|
|
|
||
|
|
assert_eq!(result.metadata.abstract_text.as_deref(), Some("This is an abstract"));
|
||
|
|
|
||
|
|
assert_eq!(result.metadata.subject, Some("Testing Subject".to_string()));
|
||
|
|
|
||
|
|
assert_eq!(result.metadata.category.as_deref(), Some("Documentation"));
|
||
|
|
|
||
|
|
assert!(result.metadata.tags.is_some());
|
||
|
|
let tags = result.metadata.tags.as_ref().unwrap();
|
||
|
|
assert!(tags.contains(&"important".to_string()));
|
||
|
|
assert!(tags.contains(&"draft".to_string()));
|
||
|
|
|
||
|
|
assert_eq!(result.metadata.language.as_deref(), Some("en"));
|
||
|
|
|
||
|
|
assert_eq!(result.metadata.document_version.as_deref(), Some("1.0.0"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test table extraction from pipe-format markdown tables
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_extract_simple_pipe_tables() {
|
||
|
|
let markdown = b"# Tables Example\n\n| Header 1 | Header 2 | Header 3 |\n|----------|----------|----------|\n| Row1Col1 | Row1Col2 | Row1Col3 |\n| Row2Col1 | Row2Col2 | Row2Col3 |";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract tables");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(!result.tables.is_empty(), "Should extract at least one table");
|
||
|
|
let table = &result.tables[0];
|
||
|
|
|
||
|
|
assert_eq!(table.cells.len(), 3, "Should have 3 rows (header + 2 data rows)");
|
||
|
|
assert_eq!(table.cells[0].len(), 3, "Should have 3 columns");
|
||
|
|
|
||
|
|
assert_eq!(table.cells[0][0], "Header 1");
|
||
|
|
assert_eq!(table.cells[0][1], "Header 2");
|
||
|
|
assert_eq!(table.cells[0][2], "Header 3");
|
||
|
|
|
||
|
|
assert_eq!(table.cells[1][0], "Row1Col1");
|
||
|
|
assert_eq!(table.cells[2][0], "Row2Col1");
|
||
|
|
|
||
|
|
assert!(table.markdown.contains("Header 1"));
|
||
|
|
assert!(table.markdown.contains("Row1Col1"));
|
||
|
|
assert!(table.markdown.contains("---"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test extraction of grid tables (as found in comprehensive.md)
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_extract_grid_tables() {
|
||
|
|
let markdown = b"# Grid Table Example\n\n+--------+--------+\n| Cell 1 | Cell 2 |\n+========+========+\n| Cell 3 | Cell 4 |\n+--------+--------+\n| Cell 5 | Cell 6 |\n+--------+--------+";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract grid tables");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
let _ = result.tables;
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test extraction of tables with multiple blocks in cells
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_extract_complex_table_cells() {
|
||
|
|
let markdown = b"# Complex Table\n\n| Header 1 | Header 2 |\n|----------|----------|\n| - bullet 1<br/>- bullet 2 | Simple text |\n| **Bold** *italic* | `code` |";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract tables with complex formatting");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(!result.tables.is_empty());
|
||
|
|
assert!(!result.content.is_empty());
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test multiline table from tables.markdown
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_pandoc_style_multiline_table() {
|
||
|
|
let markdown = b"Simple table with caption:\n\n Right Left Center Default\n ------- ------ -------- ---------\n 12 12 12 12\n 123 123 123 123\n 1 1 1 1\n\n : Demonstration of simple table syntax.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract pandoc-style tables");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(result.content.contains("12") || result.content.contains("Demonstration"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test YAML frontmatter with author as list (Pandoc style)
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_pandoc_author_list() {
|
||
|
|
let markdown = b"% Title\n% Author One; Author Two; Author Three\n\n# Content\n\nBody text.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract markdown");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(!result.content.is_empty());
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test YAML with array keywords field (Pandoc format)
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_keywords_array_extraction() {
|
||
|
|
let markdown =
|
||
|
|
b"---\ntitle: Document\nkeywords:\n - rust\n - markdown\n - pandoc\n---\n\n# Main Content\n\nText here.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract keywords array");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(result.metadata.keywords.is_some());
|
||
|
|
let keywords = result.metadata.keywords.as_ref().unwrap();
|
||
|
|
assert!(keywords.contains(&"rust".to_string()));
|
||
|
|
assert!(keywords.contains(&"markdown".to_string()));
|
||
|
|
assert!(keywords.contains(&"pandoc".to_string()));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test complex formatting in content (links, code, emphasis)
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_complex_markdown_formatting() {
|
||
|
|
let markdown = b"# Document\n\nThis is a paragraph with [links](http://example.com) and `code blocks`.\n\n## Subsection\n\n- **Bold text**\n- *Italic text*\n- ***Bold italic***\n\n```rust\nfn main() {\n println!(\"Hello, world!\");\n}\n```";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract complex markdown");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(result.content.contains("links"));
|
||
|
|
assert!(result.content.contains("code blocks"));
|
||
|
|
assert!(result.content.contains("Bold text"));
|
||
|
|
assert!(result.content.contains("println"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test extraction of raw HTML and LaTeX in markdown
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_raw_content_extraction() {
|
||
|
|
let markdown = b"# Document\n\nSome text.\n\n<div>Raw HTML</div>\n\nMore text.\n\n\\\\begin{equation}\nx = y\n\\\\end{equation}";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract raw content");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(!result.content.is_empty());
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test comprehensive.md from test_documents
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_comprehensive_md_extraction() {
|
||
|
|
let markdown = read_markdown_fixture("comprehensive.md");
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(&markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract comprehensive.md");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(!result.content.is_empty());
|
||
|
|
|
||
|
|
let _has_title_or_author =
|
||
|
|
result.metadata.additional.contains_key("title") || result.metadata.additional.contains_key("author");
|
||
|
|
|
||
|
|
assert!(result.content.contains("Additional markdown reader tests") || result.content.contains("markdown"));
|
||
|
|
|
||
|
|
let _ = result.tables;
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test tables.markdown from test_documents
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_tables_markdown_extraction() {
|
||
|
|
let markdown = read_markdown_fixture("tables.markdown");
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(&markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract tables.markdown");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(!result.content.is_empty());
|
||
|
|
|
||
|
|
assert!(result.content.contains("Right") || result.content.contains("Left") || result.content.contains("table"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test empty YAML frontmatter handling
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_empty_frontmatter() {
|
||
|
|
let markdown = b"---\n---\n\n# Main Title\n\nContent here.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should handle empty frontmatter");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(result.content.contains("Main Title"));
|
||
|
|
assert!(result.content.contains("Content here"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test malformed YAML frontmatter fallback
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_malformed_frontmatter_graceful_fallback() {
|
||
|
|
let markdown = b"---\ninvalid: yaml: syntax: here:\n---\n\nContent here.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should handle malformed YAML gracefully");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(!result.content.is_empty());
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test metadata field extraction for standard YAML fields
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_standard_yaml_metadata_fields() {
|
||
|
|
let markdown =
|
||
|
|
b"---\ntitle: Standard Fields Test\nauthor: Test Author\ndate: 2024-12-06\n---\n\n# Content\n\nTest body.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract standard fields");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert_eq!(result.metadata.title.as_deref(), Some("Standard Fields Test"));
|
||
|
|
assert_eq!(result.metadata.created_by.as_deref(), Some("Test Author"));
|
||
|
|
assert_eq!(result.metadata.created_at, Some("2024-12-06".to_string()));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test extraction of description field (maps to subject)
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_description_to_subject_mapping() {
|
||
|
|
let markdown = b"---\ntitle: Test\ndescription: This is the document description\n---\n\nContent.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract description");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert_eq!(
|
||
|
|
result.metadata.subject,
|
||
|
|
Some("This is the document description".to_string())
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test multi-line title extraction from YAML
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_multiline_title_in_yaml() {
|
||
|
|
let markdown = b"---\ntitle: |\n This is a\n multi-line title\nauthor: Test\n---\n\n# Content\n\nBody.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract multiline title");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
let title = result.metadata.title.as_deref();
|
||
|
|
assert!(title.is_some());
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test table page numbering
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_table_page_numbering() {
|
||
|
|
let markdown = b"# Document\n\n| A | B |\n|---|---|\n| 1 | 2 |\n\nSome text between tables.\n\n| X | Y |\n|---|---|\n| 3 | 4 |";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract multiple tables");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert_eq!(result.tables.len(), 2);
|
||
|
|
assert_eq!(result.tables[0].page_number, 1);
|
||
|
|
assert_eq!(result.tables[1].page_number, 1);
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test unicode content extraction
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_unicode_markdown_extraction() {
|
||
|
|
let markdown = "---\ntitle: Unicode Test\nauthor: 日本人\n---\n\n# こんにちは\n\nThis document has:\n- 中文 (Chinese)\n- 日本語 (Japanese)\n- Русский (Russian)\n- العربية (Arabic)".as_bytes();
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract unicode content");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(result.content.contains("こんにちは") || result.content.contains("Chinese"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test YAML list to comma-separated conversion for keywords
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_keywords_list_comma_separation() {
|
||
|
|
let markdown = b"---\nkeywords:\n - first\n - second\n - third\n---\n\nContent.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract keywords list");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(result.metadata.keywords.is_some());
|
||
|
|
let keywords = result.metadata.keywords.as_ref().unwrap();
|
||
|
|
assert!(keywords.contains(&"first".to_string()));
|
||
|
|
assert!(keywords.contains(&"second".to_string()));
|
||
|
|
assert!(keywords.contains(&"third".to_string()));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test extraction without any frontmatter
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_no_frontmatter_extraction() {
|
||
|
|
let markdown = b"# Document Title\n\nJust a document without frontmatter.\n\n## Section\n\nWith content.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract markdown without frontmatter");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(result.content.contains("Document Title"));
|
||
|
|
assert!(result.content.contains("document") || result.content.contains("Section"));
|
||
|
|
|
||
|
|
let title = result.metadata.title.as_deref();
|
||
|
|
assert_eq!(title, Some("Document Title"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test code block extraction
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_code_block_extraction() {
|
||
|
|
let markdown = b"# Code Examples\n\n```rust\nfn add(a: i32, b: i32) -> i32 {\n a + b\n}\n```\n\n```python\ndef add(a, b):\n return a + b\n```";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract code blocks");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert!(result.content.contains("add"));
|
||
|
|
assert!(result.content.contains("return"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test extraction with various mime types
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_supported_mime_types() {
|
||
|
|
let markdown = b"# Test\n\nContent.";
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
|
||
|
|
for mime_type in &["text/markdown", "text/x-markdown", "text/x-gfm", "text/x-commonmark"] {
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, mime_type, &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.unwrap_or_else(|_| panic!("Should support {}", mime_type));
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
assert_eq!(result.mime_type, *mime_type);
|
||
|
|
assert!(result.content.contains("Test"));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test that metadata extraction handles nested YAML structures
|
||
|
|
/// (Currently not fully supported - documents what's missing)
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_nested_yaml_awareness() {
|
||
|
|
let markdown = b"---\ntitle: Test\nmetadata:\n organization: Test Corp\n location:\n city: San Francisco\n state: CA\n---\n\nContent.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract document");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
let title = result.metadata.title.as_deref();
|
||
|
|
assert_eq!(title, Some("Test"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test extraction with special characters in metadata
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_special_characters_in_metadata() {
|
||
|
|
let markdown = b"---\ntitle: \"Document: Part 1 & 2\"\nauthor: O'Brien\nkeywords: \"C++, C#, F#\"\n---\n\nContent.";
|
||
|
|
|
||
|
|
let extractor = MarkdownExtractor;
|
||
|
|
let doc = extractor
|
||
|
|
.extract_bytes(markdown, "text/markdown", &ExtractionConfig::default())
|
||
|
|
.await
|
||
|
|
.expect("Should extract with special characters");
|
||
|
|
let result = derive_extraction_result(doc, false, kreuzberg::OutputFormat::Plain);
|
||
|
|
|
||
|
|
let title = result.metadata.title.as_deref();
|
||
|
|
assert!(title.is_some());
|
||
|
|
assert!(title.expect("Operation failed").contains("&") || title.expect("Operation failed").contains("Part"));
|
||
|
|
}
|