//! TDD tests for DOCX formatting, heading hierarchy, lists, and hyperlinks. //! //! These tests verify that DOCX extraction produces high-quality markdown output //! with proper formatting preservation (bold, italic, underline, hyperlinks), //! heading hierarchy, list rendering, and document structure. #![cfg(feature = "office")] mod helpers; use helpers::{assert_non_empty_content, get_test_file_path}; use kreuzberg::extract_file; use kreuzberg::{ExtractionConfig, OutputFormat}; // --------------------------------------------------------------------------- // Formatting tests // --------------------------------------------------------------------------- #[tokio::test] async fn test_docx_bold_rendered_as_markdown() { let path = get_test_file_path("docx/unit_test_formatting.docx"); if !path.exists() { return; } let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_file(&path, None, &config).await.expect("Should extract DOCX"); assert_non_empty_content(&result); assert!( result.content.contains("**bold**"), "Bold text should be wrapped in ** markers. Got:\n{}", result.content ); } #[tokio::test] async fn test_docx_italic_rendered_as_markdown() { let path = get_test_file_path("docx/unit_test_formatting.docx"); if !path.exists() { return; } let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_file(&path, None, &config).await.expect("Should extract DOCX"); assert_non_empty_content(&result); assert!( result.content.contains("*italic*") || result.content.contains("*Italic"), "Italic text should be wrapped in * markers. Got:\n{}", result.content ); } #[tokio::test] async fn test_docx_hyperlink_rendered_as_markdown() { let path = get_test_file_path("docx/unit_test_formatting.docx"); if !path.exists() { return; } let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_file(&path, None, &config).await.expect("Should extract DOCX"); assert_non_empty_content(&result); assert!( result.content.contains("[hyperlink]("), "Hyperlinks should be rendered as [text](url). Got:\n{}", result.content ); assert!( result.content.contains("https://"), "Hyperlink URLs should be resolved. Got:\n{}", result.content ); } #[tokio::test] async fn test_docx_mixed_formatting_on_same_line() { let path = get_test_file_path("docx/unit_test_formatting.docx"); if !path.exists() { return; } let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_file(&path, None, &config).await.expect("Should extract DOCX"); assert_non_empty_content(&result); // The document has a line: "Normal italic bold underline and hyperlink on the same line" // Where "italic" is italic, "bold" is bold, "underline" is underlined, "hyperlink" is a link let content = &result.content; assert!( content.contains("Normal ") && content.contains("*italic*") && content.contains("**bold**"), "Mixed formatting should be preserved inline. Got:\n{}", content ); } // --------------------------------------------------------------------------- // Heading hierarchy tests // --------------------------------------------------------------------------- #[tokio::test] async fn test_docx_title_rendered_as_h1() { let path = get_test_file_path("docx/unit_test_headers.docx"); if !path.exists() { return; } let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_file(&path, None, &config).await.expect("Should extract DOCX"); assert_non_empty_content(&result); assert!( result.content.contains("# Test Document"), "Title style should be rendered as # heading. Got:\n{}", result.content ); } #[tokio::test] async fn test_docx_heading_hierarchy() { let path = get_test_file_path("docx/unit_test_headers.docx"); if !path.exists() { return; } let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_file(&path, None, &config).await.expect("Should extract DOCX"); assert_non_empty_content(&result); let content = &result.content; // Heading1 → # (outline_level 0 maps to h1, same as standard converters) assert!( content.contains("# Section 1"), "Heading1 should be rendered as #. Got:\n{}", content ); // Heading2 → ## assert!( content.contains("## Section 1.1"), "Heading2 should be rendered as ##. Got:\n{}", content ); // Heading3 → ### assert!( content.contains("### Section 1.2.3"), "Heading3 should be rendered as ###. Got:\n{}", content ); } #[tokio::test] async fn test_docx_paragraphs_separated_by_blank_lines() { let path = get_test_file_path("docx/unit_test_headers.docx"); if !path.exists() { return; } let result = extract_file(&path, None, &ExtractionConfig::default()) .await .expect("Should extract DOCX"); assert_non_empty_content(&result); // Paragraphs should be separated by blank lines assert!( result.content.contains("Paragraph 1.1\n\nParagraph 1.2"), "Paragraphs should be separated by blank lines. Got:\n{}", result.content ); } // --------------------------------------------------------------------------- // List tests // --------------------------------------------------------------------------- #[tokio::test] async fn test_docx_bullet_list_rendered() { let path = get_test_file_path("docx/unit_test_lists.docx"); if !path.exists() { return; } let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_file(&path, None, &config).await.expect("Should extract DOCX"); assert_non_empty_content(&result); assert!( result.content.contains("- List item 1"), "Bullet lists should use '- ' prefix. Got:\n{}", result.content ); } #[tokio::test] async fn test_docx_numbered_list_rendered() { let path = get_test_file_path("docx/unit_test_lists.docx"); if !path.exists() { return; } let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_file(&path, None, &config).await.expect("Should extract DOCX"); assert_non_empty_content(&result); assert!( result.content.contains("1. List item a"), "Numbered lists should use 'N. ' prefix. Got:\n{}", result.content ); } #[tokio::test] async fn test_docx_nested_list_indentation() { let path = get_test_file_path("docx/unit_test_lists.docx"); if !path.exists() { return; } let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_file(&path, None, &config).await.expect("Should extract DOCX"); assert_non_empty_content(&result); assert!( result.content.contains(" - List item 1.1"), "Nested lists should be indented with 2 spaces. Got:\n{}", result.content ); } // --------------------------------------------------------------------------- // Document structure tests // --------------------------------------------------------------------------- #[tokio::test] async fn test_docx_document_structure_populated() { let path = get_test_file_path("docx/unit_test_headers.docx"); if !path.exists() { return; } let config = ExtractionConfig { include_document_structure: true, ..Default::default() }; let result = extract_file(&path, None, &config).await.expect("Should extract DOCX"); assert!( result.document.is_some(), "DocumentStructure should be populated when include_document_structure=true" ); let doc = result.document.as_ref().unwrap(); assert!(!doc.nodes.is_empty(), "DocumentStructure should have nodes"); } // --------------------------------------------------------------------------- // Table tests // --------------------------------------------------------------------------- #[tokio::test] async fn test_docx_tables_in_markdown_output() { let path = get_test_file_path("docx/docx_tables.docx"); if !path.exists() { return; } let config = ExtractionConfig { output_format: OutputFormat::Markdown, ..Default::default() }; let result = extract_file(&path, None, &config).await.expect("Should extract DOCX"); assert_non_empty_content(&result); // Tables should be rendered as markdown tables with pipe separators assert!( result.content.contains('|'), "Tables should be rendered as markdown tables with | separators. Got:\n{}", result.content ); // Should have header separator row assert!( result.content.contains("---"), "Tables should have header separator row with ---. Got:\n{}", result.content ); } #[tokio::test] async fn test_docx_table_cell_formatting_preserved() { let path = get_test_file_path("docx/tablecell.docx"); if !path.exists() { return; } let result = extract_file(&path, None, &ExtractionConfig::default()) .await .expect("Should extract DOCX"); assert_non_empty_content(&result); // The tables field should have table data assert!( !result.tables.is_empty(), "DOCX with tables should have tables in result" ); } // --------------------------------------------------------------------------- // MIME type test // --------------------------------------------------------------------------- #[tokio::test] async fn test_docx_produces_markdown_mime_type() { let path = get_test_file_path("docx/unit_test_formatting.docx"); if !path.exists() { return; } let result = extract_file(&path, None, &ExtractionConfig::default()) .await .expect("Should extract DOCX"); assert_eq!( result.mime_type.as_ref() as &str, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "DOCX extractor should preserve input MIME type" ); }