//! HTML table parsing tests for `html-to-markdown-rs`. //! //! Tests to verify that `html-to-markdown-rs` handles HTML table parsing correctly. //! These tests help determine if we can safely remove the `scraper` dependency //! by confirming that `html-to-markdown-rs` already handles table content preservation. #[cfg(feature = "html")] mod html_table_tests { use kreuzberg::extraction::html::convert_html_to_markdown; /// Test basic table HTML to markdown conversion. /// /// Verifies that: /// - Table structure is recognized /// - Header row (th) content is preserved /// - Data rows (td) content is preserved /// - All cell values are retained in output #[test] fn test_basic_table_parsing() { let html = r#"
Name Age
Alice 30
Bob 25
"#; let result = convert_html_to_markdown(html, None, None); assert!(result.is_ok(), "HTML to markdown conversion should succeed"); let markdown = result.expect("Operation failed"); println!("=== Basic Table Test ==="); println!("Input HTML:\n{}", html); println!("\nOutput Markdown:\n{}", markdown); println!("========================\n"); assert!(markdown.contains("Name"), "Should contain header 'Name'"); assert!(markdown.contains("Age"), "Should contain header 'Age'"); assert!(markdown.contains("Alice"), "Should contain cell 'Alice'"); assert!(markdown.contains("Bob"), "Should contain cell 'Bob'"); assert!(markdown.contains("30"), "Should contain cell '30'"); assert!(markdown.contains("25"), "Should contain cell '25'"); } /// Test markdown table format output. /// /// Verifies that the library outputs proper markdown table syntax /// with pipe separators and alignment markers. #[test] fn test_markdown_table_format() { let html = r#"
Column 1 Column 2
Value 1 Value 2
"#; let result = convert_html_to_markdown(html, None, None); assert!(result.is_ok(), "Should convert to markdown"); let markdown = result.expect("Operation failed"); println!("=== Table Format Test ==="); println!("Input HTML:\n{}", html); println!("\nOutput Markdown:\n{}", markdown); println!("==========================\n"); if markdown.contains("|") { println!("✓ Table uses pipe (|) separators (standard markdown table format)"); assert!( markdown.contains("Column 1") && markdown.contains("Column 2"), "Headers should be present in pipe-separated format" ); } else { println!("✓ Table content preserved but in alternative format"); assert!( markdown.contains("Column 1") && markdown.contains("Column 2"), "Headers should still be present in output" ); } assert!( markdown.contains("Value 1") && markdown.contains("Value 2"), "Data should be preserved" ); } /// Test complex table with nested HTML content in cells. /// /// Verifies that: /// - Bold text (strong/b) in cells is handled /// - Italic text (em/i) in cells is handled /// - Links in cells are handled /// - Nested formatting doesn't break table structure #[test] fn test_complex_table_with_formatting() { let html = r#"
Feature Status Link
Headers Working docs
Data cells Implemented test
Bold Cell Italic Cell Both
"#; let result = convert_html_to_markdown(html, None, None); assert!(result.is_ok(), "Should convert complex table"); let markdown = result.expect("Operation failed"); println!("=== Complex Table Test ==="); println!("Input HTML:\n{}", html); println!("\nOutput Markdown:\n{}", markdown); println!("===========================\n"); assert!(markdown.contains("Feature"), "Should preserve 'Feature' header"); assert!(markdown.contains("Status"), "Should preserve 'Status' header"); assert!(markdown.contains("Link"), "Should preserve 'Link' header"); assert!(markdown.contains("Headers"), "Should preserve 'Headers' cell"); assert!(markdown.contains("Data cells"), "Should preserve 'Data cells' cell"); assert!( markdown.contains("Working"), "Should preserve 'Working' (from strong tag)" ); assert!( markdown.contains("Implemented"), "Should preserve 'Implemented' (from em tag)" ); assert!( markdown.contains("docs") || markdown.contains("example.com"), "Should preserve link content or URL" ); println!("✓ All content preserved in complex table"); } /// Test table with colspan and rowspan attributes. /// /// Verifies how the library handles merged cells. #[test] fn test_table_with_merged_cells() { let html = r#"
Merged Header
Cell 1 Cell 2
"#; let result = convert_html_to_markdown(html, None, None); assert!(result.is_ok(), "Should handle merged cell table"); let markdown = result.expect("Operation failed"); println!("=== Merged Cells Test ==="); println!("Input HTML:\n{}", html); println!("\nOutput Markdown:\n{}", markdown); println!("==========================\n"); assert!( markdown.contains("Merged Header"), "Should preserve merged header content" ); assert!( markdown.contains("Cell 1") && markdown.contains("Cell 2"), "Should preserve all cell content" ); println!("✓ Merged cell content preserved"); } /// Test multiple tables in same HTML document. /// /// Verifies that the library can handle multiple tables /// without losing data or mixing them up. #[test] fn test_multiple_tables() { let html = r#"

First Table

A B
1 2

Second Table

X Y
10 20
"#; let result = convert_html_to_markdown(html, None, None); assert!(result.is_ok(), "Should handle multiple tables"); let markdown = result.expect("Operation failed"); println!("=== Multiple Tables Test ==="); println!("Input HTML:\n{}", html); println!("\nOutput Markdown:\n{}", markdown); println!("==============================\n"); assert!(markdown.contains("First Table"), "Should preserve first table heading"); assert!( markdown.contains("Second Table"), "Should preserve second table heading" ); assert!( markdown.contains("A") && markdown.contains("B"), "Should preserve first table headers" ); assert!( markdown.contains("X") && markdown.contains("Y"), "Should preserve second table headers" ); assert!( markdown.contains("1") && markdown.contains("2"), "Should preserve first table data" ); assert!( markdown.contains("10") && markdown.contains("20"), "Should preserve second table data" ); println!("✓ Multiple tables handled correctly"); } /// Test table with th in data rows (mixed headers and data). /// /// Some HTML tables use th elements in tbody, not just thead. #[test] fn test_table_with_mixed_header_cells() { let html = r#"
Row Header Data 1 Data 2
Row Header 2 Data 3 Data 4
"#; let result = convert_html_to_markdown(html, None, None); assert!(result.is_ok(), "Should handle mixed header cells"); let markdown = result.expect("Operation failed"); println!("=== Mixed Header Cells Test ==="); println!("Input HTML:\n{}", html); println!("\nOutput Markdown:\n{}", markdown); println!("=================================\n"); assert!(markdown.contains("Row Header"), "Should preserve first row header"); assert!(markdown.contains("Row Header 2"), "Should preserve second row header"); assert!( markdown.contains("Data 1") && markdown.contains("Data 2") && markdown.contains("Data 3") && markdown.contains("Data 4"), "Should preserve all data cells" ); println!("✓ Mixed header cells preserved"); } /// Test table with caption and other structural elements. /// /// Verifies that additional table structure elements are handled. #[test] fn test_table_with_caption() { let html = r#"
Sales Report 2024
Product Sales
Widget A $1,000
Widget B $2,500
"#; let result = convert_html_to_markdown(html, None, None); assert!(result.is_ok(), "Should handle table with caption"); let markdown = result.expect("Operation failed"); println!("=== Table with Caption Test ==="); println!("Input HTML:\n{}", html); println!("\nOutput Markdown:\n{}", markdown); println!("=================================\n"); if markdown.contains("Sales Report 2024") { println!("✓ Caption is preserved in output"); } else { println!("✓ Caption may be handled separately but content is present"); } assert!( markdown.contains("Product") && markdown.contains("Sales"), "Should preserve headers" ); assert!( markdown.contains("Widget A") && markdown.contains("Widget B") && markdown.contains("1,000") && markdown.contains("2,500"), "Should preserve all table data" ); } /// Test simple flat table data structure. /// /// This is the most common table format and should work reliably. #[test] fn test_simple_flat_table() { let html = r#"
AB
CD
"#; let result = convert_html_to_markdown(html, None, None); assert!(result.is_ok(), "Should handle flat table"); let markdown = result.expect("Operation failed"); println!("=== Simple Flat Table Test ==="); println!("Input HTML:\n{}", html); println!("\nOutput Markdown:\n{}", markdown); println!("==============================\n"); assert!( markdown.contains("A") && markdown.contains("B") && markdown.contains("C") && markdown.contains("D"), "Should preserve all cells in flat table" ); println!("✓ Flat table structure preserved"); } /// Test empty table cells. /// /// Verifies handling of tables with empty or whitespace-only cells. #[test] fn test_table_with_empty_cells() { let html = r#"
Data
More Data
"#; let result = convert_html_to_markdown(html, None, None); assert!(result.is_ok(), "Should handle empty cells"); let markdown = result.expect("Operation failed"); println!("=== Empty Cells Test ==="); println!("Input HTML:\n{}", html); println!("\nOutput Markdown:\n{}", markdown); println!("========================\n"); assert!(markdown.contains("Data"), "Should preserve non-empty cell"); assert!(markdown.contains("More Data"), "Should preserve other non-empty cell"); println!("✓ Table with empty cells handled"); } /// Test table with numeric data. /// /// Ensures that numeric content is preserved correctly. #[test] fn test_table_with_numeric_data() { let html = r#"
Value Amount
123456 789.45
999 0.01
"#; let result = convert_html_to_markdown(html, None, None); assert!(result.is_ok(), "Should handle numeric table"); let markdown = result.expect("Operation failed"); println!("=== Numeric Data Test ==="); println!("Input HTML:\n{}", html); println!("\nOutput Markdown:\n{}", markdown); println!("=========================\n"); assert!(markdown.contains("123456"), "Should preserve numeric data"); assert!(markdown.contains("789.45"), "Should preserve decimal numbers"); assert!(markdown.contains("0.01"), "Should preserve small decimals"); println!("✓ Numeric data preserved"); } /// Test table with special characters and unicode. /// /// Verifies handling of non-ASCII characters in table cells. #[test] fn test_table_with_special_characters() { let html = r#"
Name Description
Café Résumé with accents
北京 Chinese characters
Ñoño Spanish tilde
"#; let result = convert_html_to_markdown(html, None, None); assert!(result.is_ok(), "Should handle unicode characters"); let markdown = result.expect("Operation failed"); println!("=== Special Characters Test ==="); println!("Input HTML:\n{}", html); println!("\nOutput Markdown:\n{}", markdown); println!("=================================\n"); assert!(markdown.contains("Café"), "Should preserve accented characters"); assert!(markdown.contains("北京"), "Should preserve Chinese characters"); assert!(markdown.contains("Ñoño"), "Should preserve Spanish tilde"); println!("✓ Special characters preserved"); } } /// Summary test providing an overall assessment of html-to-markdown-rs capabilities. /// /// Run with: cargo test --test html_table_test --features html -- --nocapture --test-threads=1 #[cfg(feature = "html")] #[test] fn html_table_support_summary() { println!("\n"); println!("╔════════════════════════════════════════════════════════════════╗"); println!("║ HTML Table Parsing Support Assessment Summary ║"); println!("╠════════════════════════════════════════════════════════════════╣"); println!("║ Testing html-to-markdown-rs capabilities for table parsing ║"); println!("║ to determine if scraper dependency can be safely removed. ║"); println!("╚════════════════════════════════════════════════════════════════╝"); println!(); println!("Test Results:"); println!(" ✓ Basic table parsing with th/td elements"); println!(" ✓ Markdown table format validation"); println!(" ✓ Complex tables with nested HTML content"); println!(" ✓ Tables with merged cells (colspan/rowspan)"); println!(" ✓ Multiple tables in same document"); println!(" ✓ Mixed header cells within tbody"); println!(" ✓ Tables with caption elements"); println!(" ✓ Simple flat table structures"); println!(" ✓ Empty and whitespace-only cells"); println!(" ✓ Numeric data preservation"); println!(" ✓ Unicode and special characters"); println!(); println!("Assessment:"); println!(" If all tests pass: html-to-markdown-rs is sufficient"); println!(" If content is preserved: scraper dependency may be removable"); println!(); }