Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,359 @@
//! TDD tests for DOCX formatting, heading hierarchy, lists, hyperlinks, tables, and document structure.
//!
//! These tests verify that DOCX extraction produces high-quality markdown output
//! with proper formatting preservation (bold, italic, underline, hyperlinks),
//! heading hierarchy, list rendering, and document structure.
#![cfg(feature = "office")]
mod helpers;
use helpers::{assert_non_empty_content, get_test_file_path};
use kreuzberg::extract_file;
use kreuzberg::{ExtractionConfig, OutputFormat};
// ---------------------------------------------------------------------------
// Formatting tests
// ---------------------------------------------------------------------------
#[tokio::test]
async fn test_docx_bold_rendered_as_markdown() {
let path = get_test_file_path("docx/unit_test_formatting.docx");
if !path.exists() {
return;
}
let config = ExtractionConfig {
output_format: OutputFormat::Markdown,
..Default::default()
};
let result = extract_file(&path, None, &config).await.expect("Should extract DOCX");
assert_non_empty_content(&result);
assert!(
result.content.contains("**bold**"),
"Bold text should be wrapped in ** markers. Got:\n{}",
result.content
);
}
#[tokio::test]
async fn test_docx_italic_rendered_as_markdown() {
let path = get_test_file_path("docx/unit_test_formatting.docx");
if !path.exists() {
return;
}
let config = ExtractionConfig {
output_format: OutputFormat::Markdown,
..Default::default()
};
let result = extract_file(&path, None, &config).await.expect("Should extract DOCX");
assert_non_empty_content(&result);
assert!(
result.content.contains("*italic*") || result.content.contains("*Italic"),
"Italic text should be wrapped in * markers. Got:\n{}",
result.content
);
}
#[tokio::test]
async fn test_docx_hyperlink_rendered_as_markdown() {
let path = get_test_file_path("docx/unit_test_formatting.docx");
if !path.exists() {
return;
}
let config = ExtractionConfig {
output_format: OutputFormat::Markdown,
..Default::default()
};
let result = extract_file(&path, None, &config).await.expect("Should extract DOCX");
assert_non_empty_content(&result);
assert!(
result.content.contains("[hyperlink]("),
"Hyperlinks should be rendered as [text](url). Got:\n{}",
result.content
);
assert!(
result.content.contains("https://"),
"Hyperlink URLs should be resolved. Got:\n{}",
result.content
);
}
#[tokio::test]
async fn test_docx_mixed_formatting_on_same_line() {
let path = get_test_file_path("docx/unit_test_formatting.docx");
if !path.exists() {
return;
}
let config = ExtractionConfig {
output_format: OutputFormat::Markdown,
..Default::default()
};
let result = extract_file(&path, None, &config).await.expect("Should extract DOCX");
assert_non_empty_content(&result);
// The document has a line: "Normal italic bold underline and hyperlink on the same line"
// Where "italic" is italic, "bold" is bold, "underline" is underlined, "hyperlink" is a link
let content = &result.content;
assert!(
content.contains("Normal ") && content.contains("*italic*") && content.contains("**bold**"),
"Mixed formatting should be preserved inline. Got:\n{}",
content
);
}
// ---------------------------------------------------------------------------
// Heading hierarchy tests
// ---------------------------------------------------------------------------
/// Checks that the headers fixture, extracted as Markdown, has a line starting with
/// `# Test Document`.
///
/// The match is anchored to the start of a line. A plain substring search for
/// `"# Test Document"` would also succeed on `## Test Document` or deeper levels, so it could
/// not detect a title emitted at the wrong heading level.
///
/// The test is a no-op when the fixture file is not present on disk.
#[tokio::test]
async fn test_docx_title_rendered_as_h1() {
    let path = get_test_file_path("docx/unit_test_headers.docx");
    if !path.exists() {
        return;
    }
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let result = extract_file(&path, None, &config)
        .await
        .expect("Should extract DOCX");
    assert_non_empty_content(&result);

    let content = &result.content;
    // Line-anchored so that "## Test Document" does not count as an H1.
    let title_is_h1 = content
        .lines()
        .any(|line| line.starts_with("# Test Document"));
    assert!(
        title_is_h1,
        "Title style should be rendered as # heading. Got:\n{}",
        content
    );
}
/// Checks that the headers fixture, extracted as Markdown, has lines starting with
/// `# Section 1`, `## Section 1.1` and `### Section 1.2.3`.
///
/// Each expectation is anchored to the start of a line. With a plain substring search,
/// `"# Section 1"` is also found inside `"## Section 1.1"`, so the Heading1 check would pass
/// whenever the Heading2 check does and could never fail on its own.
///
/// The test is a no-op when the fixture file is not present on disk.
#[tokio::test]
async fn test_docx_heading_hierarchy() {
    let path = get_test_file_path("docx/unit_test_headers.docx");
    if !path.exists() {
        return;
    }
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let result = extract_file(&path, None, &config)
        .await
        .expect("Should extract DOCX");
    assert_non_empty_content(&result);

    let content = &result.content;
    // True when some output line begins with exactly the given heading prefix.
    let has_heading_line = |prefix: &str| content.lines().any(|line| line.starts_with(prefix));

    // Heading1 → # (outline_level 0 maps to h1, same as standard converters)
    assert!(
        has_heading_line("# Section 1"),
        "Heading1 should be rendered as #. Got:\n{}",
        content
    );
    // Heading2 → ##
    assert!(
        has_heading_line("## Section 1.1"),
        "Heading2 should be rendered as ##. Got:\n{}",
        content
    );
    // Heading3 → ###
    assert!(
        has_heading_line("### Section 1.2.3"),
        "Heading3 should be rendered as ###. Got:\n{}",
        content
    );
}
/// Checks that consecutive paragraphs are separated by a blank line in the extracted content.
///
/// Extraction uses the default `ExtractionConfig`.
/// The test is a no-op when the fixture file is not present on disk.
#[tokio::test]
async fn test_docx_paragraphs_separated_by_blank_lines() {
    let fixture = get_test_file_path("docx/unit_test_headers.docx");
    if fixture.exists() {
        let default_config = ExtractionConfig::default();
        let extraction = extract_file(&fixture, None, &default_config)
            .await
            .expect("Should extract DOCX");
        assert_non_empty_content(&extraction);

        // Two adjacent paragraphs must have exactly one empty line between them.
        let rendered = &extraction.content;
        assert!(
            rendered.contains("Paragraph 1.1\n\nParagraph 1.2"),
            "Paragraphs should be separated by blank lines. Got:\n{}",
            rendered
        );
    }
}
// ---------------------------------------------------------------------------
// List tests
// ---------------------------------------------------------------------------
#[tokio::test]
async fn test_docx_bullet_list_rendered() {
let path = get_test_file_path("docx/unit_test_lists.docx");
if !path.exists() {
return;
}
let config = ExtractionConfig {
output_format: OutputFormat::Markdown,
..Default::default()
};
let result = extract_file(&path, None, &config).await.expect("Should extract DOCX");
assert_non_empty_content(&result);
assert!(
result.content.contains("- List item 1"),
"Bullet lists should use '- ' prefix. Got:\n{}",
result.content
);
}
#[tokio::test]
async fn test_docx_numbered_list_rendered() {
let path = get_test_file_path("docx/unit_test_lists.docx");
if !path.exists() {
return;
}
let config = ExtractionConfig {
output_format: OutputFormat::Markdown,
..Default::default()
};
let result = extract_file(&path, None, &config).await.expect("Should extract DOCX");
assert_non_empty_content(&result);
assert!(
result.content.contains("1. List item a"),
"Numbered lists should use 'N. ' prefix. Got:\n{}",
result.content
);
}
/// Checks that the lists fixture, extracted as Markdown, contains `List item 1.1` as a bullet
/// preceded by two spaces.
///
/// The expected fragment carries two leading spaces, matching the failure message. A fragment
/// with a single leading space would be satisfied by any indentation of one space or more and
/// so would not verify the two-space convention.
///
/// The test is a no-op when the fixture file is not present on disk.
#[tokio::test]
async fn test_docx_nested_list_indentation() {
    let path = get_test_file_path("docx/unit_test_lists.docx");
    if !path.exists() {
        return;
    }
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    let result = extract_file(&path, None, &config)
        .await
        .expect("Should extract DOCX");
    assert_non_empty_content(&result);
    assert!(
        // Two spaces of indentation before the bullet marker.
        result.content.contains("  - List item 1.1"),
        "Nested lists should be indented with 2 spaces. Got:\n{}",
        result.content
    );
}
// ---------------------------------------------------------------------------
// Document structure tests
// ---------------------------------------------------------------------------
/// Checks that `include_document_structure: true` makes the result's `document` field
/// present and its `nodes` non-empty.
///
/// The test is a no-op when the fixture file is not present on disk.
#[tokio::test]
async fn test_docx_document_structure_populated() {
    let path = get_test_file_path("docx/unit_test_headers.docx");
    if !path.exists() {
        return;
    }
    let config = ExtractionConfig {
        include_document_structure: true,
        ..Default::default()
    };
    let result = extract_file(&path, None, &config)
        .await
        .expect("Should extract DOCX");

    // A single `expect` replaces the former `is_some()` assertion followed by `unwrap()`;
    // the panic message on a missing structure is unchanged.
    let doc = result
        .document
        .as_ref()
        .expect("DocumentStructure should be populated when include_document_structure=true");
    assert!(!doc.nodes.is_empty(), "DocumentStructure should have nodes");
}
// ---------------------------------------------------------------------------
// Table tests
// ---------------------------------------------------------------------------
#[tokio::test]
async fn test_docx_tables_in_markdown_output() {
let path = get_test_file_path("docx/docx_tables.docx");
if !path.exists() {
return;
}
let config = ExtractionConfig {
output_format: OutputFormat::Markdown,
..Default::default()
};
let result = extract_file(&path, None, &config).await.expect("Should extract DOCX");
assert_non_empty_content(&result);
// Tables should be rendered as markdown tables with pipe separators
assert!(
result.content.contains('|'),
"Tables should be rendered as markdown tables with | separators. Got:\n{}",
result.content
);
// Should have header separator row
assert!(
result.content.contains("---"),
"Tables should have header separator row with ---. Got:\n{}",
result.content
);
}
/// Checks that extracting the table-cell fixture yields at least one entry in `tables`.
///
/// Extraction uses the default `ExtractionConfig`. Only the presence of table data is
/// asserted here; cell formatting itself is not inspected.
/// The test is a no-op when the fixture file is not present on disk.
#[tokio::test]
async fn test_docx_table_cell_formatting_preserved() {
    let fixture = get_test_file_path("docx/tablecell.docx");
    if fixture.exists() {
        let default_config = ExtractionConfig::default();
        let extraction = extract_file(&fixture, None, &default_config)
            .await
            .expect("Should extract DOCX");
        assert_non_empty_content(&extraction);

        // The structured `tables` field must carry the extracted table data.
        let has_tables = !extraction.tables.is_empty();
        assert!(has_tables, "DOCX with tables should have tables in result");
    }
}
// ---------------------------------------------------------------------------
// MIME type test
// ---------------------------------------------------------------------------
/// Checks that the extraction result reports the DOCX (WordprocessingML) MIME type.
///
/// Despite the function name, the assertion expects the input document's MIME type, not a
/// Markdown one. Extraction uses the default `ExtractionConfig`.
/// The test is a no-op when the fixture file is not present on disk.
#[tokio::test]
async fn test_docx_produces_markdown_mime_type() {
    let fixture = get_test_file_path("docx/unit_test_formatting.docx");
    if fixture.exists() {
        let default_config = ExtractionConfig::default();
        let extraction = extract_file(&fixture, None, &default_config)
            .await
            .expect("Should extract DOCX");

        // View the MIME type as a plain string slice for comparison.
        let reported_mime: &str = extraction.mime_type.as_ref();
        assert_eq!(
            reported_mime,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "DOCX extractor should preserve input MIME type"
        );
    }
}