678 lines
22 KiB
Rust
678 lines
22 KiB
Rust
//! Comprehensive TDD test suite for ODT (OpenDocument Text) extraction
|
|
//!
|
|
//! This test suite validates ODT extraction capabilities using Pandoc's output as the baseline.
|
|
//! It covers:
|
|
//! - Metadata extraction (title, creator, date, keywords from meta.xml)
|
|
//! - Content extraction (text, formatting, structure)
|
|
//! - Table extraction with captions
|
|
//! - Formatting preservation (bold, italic, strikeout)
|
|
//! - Image handling with captions
|
|
//! - Math formula extraction
|
|
//! - Note handling (footnotes, endnotes)
|
|
//! - Citation/reference extraction
|
|
//! - Unicode and special character handling
|
|
//!
|
|
//! Note: These tests require the `office` feature to be enabled and Pandoc to be installed.
|
|
|
|
#![cfg(feature = "office")]
|
|
|
|
use kreuzberg::core::config::ExtractionConfig;
|
|
use kreuzberg::core::extractor::extract_file;
|
|
use std::path::{Path, PathBuf};
|
|
|
|
mod helpers;
|
|
|
|
/// Helper function to get the workspace root and construct test file paths
|
|
fn get_test_file_path(filename: &str) -> PathBuf {
|
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
.parent()
|
|
.expect("Operation failed")
|
|
.parent()
|
|
.expect("Operation failed");
|
|
workspace_root.join(format!("test_documents/odt/{}", filename))
|
|
}
|
|
|
|
/// Helper to verify a test file exists before running test
|
|
fn ensure_test_file_exists(path: &Path) -> bool {
|
|
if !path.exists() {
|
|
println!("Skipping test: Test file not found at {:?}", path);
|
|
false
|
|
} else {
|
|
true
|
|
}
|
|
}
|
|
|
|
/// Tests extraction of document metadata from ODT meta.xml
|
|
/// Validates: title, subject, creator, dates, generator
|
|
#[tokio::test]
|
|
async fn test_odt_metadata_extraction() {
|
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
.parent()
|
|
.expect("Operation failed")
|
|
.parent()
|
|
.expect("Operation failed");
|
|
let test_file = workspace_root.join("test_documents/odt/metadata_test.odt");
|
|
|
|
if !ensure_test_file_exists(&test_file) {
|
|
println!("Skipping metadata test: metadata_test.odt not found");
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract ODT metadata successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
assert!(
|
|
result.content.contains("Test Document"),
|
|
"Should contain document title in content"
|
|
);
|
|
|
|
let metadata = &result.metadata.additional;
|
|
println!("Extracted metadata: {:?}", metadata);
|
|
|
|
if let Some(title) = metadata.get("title") {
|
|
assert_eq!(title.as_str(), Some("Test Metadata Document"), "Title should match");
|
|
}
|
|
|
|
if let Some(subject) = metadata.get("subject") {
|
|
assert_eq!(
|
|
subject.as_str(),
|
|
Some("Testing ODT Metadata Extraction"),
|
|
"Subject should match"
|
|
);
|
|
}
|
|
|
|
if let Some(created_by) = metadata.get("created_by") {
|
|
assert_eq!(created_by.as_str(), Some("John Doe"), "Creator should match");
|
|
}
|
|
|
|
if let Some(authors) = metadata.get("authors") {
|
|
let authors_array = authors.as_array().expect("Authors should be an array");
|
|
assert_eq!(authors_array.len(), 1, "Should have one author");
|
|
assert_eq!(authors_array[0].as_str(), Some("John Doe"), "Author name should match");
|
|
}
|
|
|
|
assert!(result.metadata.created_at.is_some(), "Creation date should be present");
|
|
|
|
assert!(
|
|
result.metadata.modified_at.is_some(),
|
|
"Modification date should be present"
|
|
);
|
|
|
|
if let Some(generator) = metadata.get("generator") {
|
|
let gen_str = generator.as_str().expect("Generator should be a string");
|
|
assert!(gen_str.contains("Pandoc"), "Generator should be Pandoc");
|
|
}
|
|
|
|
println!("✅ ODT metadata extraction test passed!");
|
|
println!(" Metadata fields extracted: {}", metadata.len());
|
|
}
|
|
|
|
/// Tests extraction of tables with captions from ODT
|
|
/// Baseline from Pandoc: simpleTableWithCaption.odt
|
|
/// Expected Pandoc output:
|
|
/// ```
|
|
/// --------- --------------
|
|
/// Content More content
|
|
/// --------- --------------
|
|
/// : Table 1: Some caption for a table
|
|
/// ```
|
|
#[tokio::test]
|
|
async fn test_odt_table_with_caption_extraction() {
|
|
let test_file = get_test_file_path("simpleTableWithCaption.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config).await;
|
|
|
|
if let Ok(result) = result {
|
|
if !result.content.is_empty() {
|
|
let content_lower = result.content.to_lowercase();
|
|
assert!(
|
|
content_lower.contains("content") || content_lower.contains("table") || !result.tables.is_empty(),
|
|
"Should either extract table content or structured tables"
|
|
);
|
|
}
|
|
println!("✅ ODT table with caption extraction test passed!");
|
|
println!(" Extracted {} tables", result.tables.len());
|
|
} else {
|
|
println!("⚠️ ODT table extraction not fully supported yet (Pandoc integration needed)");
|
|
}
|
|
}
|
|
|
|
/// Tests extraction of basic tables without captions
|
|
/// Baseline from Pandoc: simpleTable.odt
|
|
/// Expected: Table with "Content" and "More content" cells
|
|
#[tokio::test]
|
|
async fn test_odt_simple_table_extraction() {
|
|
let test_file = get_test_file_path("simpleTable.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config).await;
|
|
|
|
if let Ok(result) = result {
|
|
if !result.content.is_empty() {
|
|
let content_lower = result.content.to_lowercase();
|
|
assert!(
|
|
content_lower.contains("content") || !result.tables.is_empty(),
|
|
"Table should either contain 'content' text or be in structured tables"
|
|
);
|
|
}
|
|
println!("✅ ODT simple table extraction test passed!");
|
|
} else {
|
|
println!("⚠️ ODT table extraction not fully supported yet");
|
|
}
|
|
}
|
|
|
|
/// Tests extraction of document heading hierarchy
|
|
/// Baseline from Pandoc: headers.odt
|
|
/// Expected:
|
|
/// - H1: "A header (Lv 1)"
|
|
/// - H2: "Another header (Lv 2)"
|
|
/// - H1: "Back to Level 1"
|
|
#[tokio::test]
|
|
async fn test_odt_heading_structure_extraction() {
|
|
let test_file = get_test_file_path("headers.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract heading structure successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
assert!(
|
|
result.content.contains("header") || result.content.contains("Header"),
|
|
"Should contain heading text"
|
|
);
|
|
|
|
assert!(
|
|
result.content.contains("#") || result.content.contains("header"),
|
|
"Should indicate heading structure"
|
|
);
|
|
|
|
println!("✅ ODT heading structure extraction test passed!");
|
|
}
|
|
|
|
/// Tests extraction of bold text formatting
|
|
/// Baseline from Pandoc: bold.odt
|
|
/// Expected Pandoc output: "Here comes **bold** text"
|
|
#[tokio::test]
|
|
async fn test_odt_bold_formatting_extraction() {
|
|
let test_file = get_test_file_path("bold.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract bold formatting successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
let content = result.content.to_lowercase();
|
|
assert!(content.contains("bold"), "Should contain 'bold' text");
|
|
|
|
assert!(
|
|
result.content.contains("**bold**") || result.content.contains("bold"),
|
|
"Should preserve bold text"
|
|
);
|
|
|
|
println!("✅ ODT bold formatting extraction test passed!");
|
|
}
|
|
|
|
/// Tests extraction of italic text formatting
|
|
/// Baseline from Pandoc: italic.odt
|
|
/// Expected Pandoc output: "Here comes *italic* text"
|
|
#[tokio::test]
|
|
async fn test_odt_italic_formatting_extraction() {
|
|
let test_file = get_test_file_path("italic.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract italic formatting successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
let content = result.content.to_lowercase();
|
|
assert!(content.contains("italic"), "Should contain 'italic' text");
|
|
|
|
assert!(
|
|
result.content.contains("*italic*") || result.content.contains("italic"),
|
|
"Should preserve italic text"
|
|
);
|
|
|
|
println!("✅ ODT italic formatting extraction test passed!");
|
|
}
|
|
|
|
/// Tests extraction of strikeout/strikethrough text formatting
|
|
/// Baseline from Pandoc: strikeout.odt
|
|
/// Expected Pandoc output: "Here comes text that was ~~striken out~~."
|
|
#[tokio::test]
|
|
async fn test_odt_strikeout_formatting_extraction() {
|
|
let test_file = get_test_file_path("strikeout.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract strikeout formatting successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
let content = result.content.to_lowercase();
|
|
assert!(
|
|
content.contains("strike") || content.contains("striken"),
|
|
"Should contain strikeout text"
|
|
);
|
|
|
|
println!("✅ ODT strikeout formatting extraction test passed!");
|
|
}
|
|
|
|
/// Tests extraction of images with captions
|
|
/// Baseline from Pandoc: imageWithCaption.odt
|
|
/// Expected: Image reference with caption
|
|
/// Expected Pandoc output:
|
|
/// ```
|
|
/// 
|
|
/// {alt="Abbildung 1: Image caption" width="5.292cm" height="5.292cm"}
|
|
/// ```
|
|
#[tokio::test]
|
|
async fn test_odt_image_with_caption_extraction() {
|
|
let test_file = get_test_file_path("imageWithCaption.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config).await;
|
|
|
|
if let Ok(result) = result {
|
|
if !result.content.is_empty() {
|
|
let content_lower = result.content.to_lowercase();
|
|
assert!(
|
|
content_lower.contains("image")
|
|
|| content_lower.contains("caption")
|
|
|| content_lower.contains("!")
|
|
|| result.images.is_some(),
|
|
"Should reference image or caption or have extracted images"
|
|
);
|
|
}
|
|
println!("✅ ODT image with caption extraction test passed!");
|
|
} else {
|
|
println!("⚠️ ODT image extraction not fully supported yet");
|
|
}
|
|
}
|
|
|
|
/// Tests extraction of mathematical formulas
|
|
/// Baseline from Pandoc: formula.odt
|
|
/// Expected Pandoc output: "$$E = {m \\cdot c^{2}}$$"
|
|
#[tokio::test]
|
|
async fn test_odt_formula_extraction() {
|
|
let test_file = get_test_file_path("formula.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract formula successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
let content = &result.content;
|
|
assert!(
|
|
content.contains("E") && (content.contains("m") || content.contains("$")),
|
|
"Should extract formula content"
|
|
);
|
|
|
|
println!("✅ ODT formula extraction test passed!");
|
|
}
|
|
|
|
/// Tests extraction of footnotes
|
|
/// Baseline from Pandoc: footnote.odt
|
|
/// Expected Pandoc output:
|
|
/// ```
|
|
/// Some text[^1] with a footnote.
|
|
///
|
|
/// [^1]: Footnote text
|
|
/// ```
|
|
#[tokio::test]
|
|
async fn test_odt_footnote_extraction() {
|
|
let test_file = get_test_file_path("footnote.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract footnote successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
let content_lower = result.content.to_lowercase();
|
|
assert!(
|
|
content_lower.contains("footnote") || content_lower.contains("[^"),
|
|
"Should extract footnote"
|
|
);
|
|
|
|
println!("✅ ODT footnote extraction test passed!");
|
|
}
|
|
|
|
/// Tests extraction of endnotes
|
|
/// Baseline from Pandoc: endnote.odt
|
|
/// Expected: Endnote content with reference (similar to footnotes)
|
|
#[tokio::test]
|
|
async fn test_odt_endnote_extraction() {
|
|
let test_file = get_test_file_path("endnote.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract endnote successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
let content_lower = result.content.to_lowercase();
|
|
assert!(
|
|
content_lower.contains("endnote") || content_lower.contains("[^"),
|
|
"Should extract endnote"
|
|
);
|
|
|
|
println!("✅ ODT endnote extraction test passed!");
|
|
}
|
|
|
|
/// Tests extraction of citations and references
|
|
/// Baseline from Pandoc: citation.odt
|
|
/// Expected Pandoc output: "Some text[@Ex] with a citation."
|
|
#[tokio::test]
|
|
async fn test_odt_citation_extraction() {
|
|
let test_file = get_test_file_path("citation.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract citation successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
let content_lower = result.content.to_lowercase();
|
|
assert!(
|
|
content_lower.contains("citation") || content_lower.contains("text") || content_lower.contains("@"),
|
|
"Should extract citation"
|
|
);
|
|
|
|
println!("✅ ODT citation extraction test passed!");
|
|
}
|
|
|
|
/// Tests extraction of unicode characters and special symbols
|
|
/// Baseline from Pandoc: unicode.odt
|
|
/// Expected: Proper preservation of unicode characters
|
|
/// Expected Pandoc output: ""'çӨ©¼вбФШöɵ"
|
|
#[tokio::test]
|
|
async fn test_odt_unicode_extraction() {
|
|
let test_file = get_test_file_path("unicode.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract unicode successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
assert!(!result.content.is_empty(), "Should extract unicode content (not empty)");
|
|
|
|
println!("✅ ODT unicode extraction test passed!");
|
|
println!(" Extracted unicode content: {:?}", result.content);
|
|
}
|
|
|
|
/// Tests extraction of inline code formatting
|
|
/// Baseline from Pandoc: inlinedCode.odt
|
|
/// Expected Pandoc output: "Here comes `inlined code` text and `an another` one."
|
|
#[tokio::test]
|
|
async fn test_odt_inlined_code_extraction() {
|
|
let test_file = get_test_file_path("inlinedCode.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract inline code successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
let content_lower = result.content.to_lowercase();
|
|
assert!(
|
|
content_lower.contains("code") || content_lower.contains("`"),
|
|
"Should extract inline code"
|
|
);
|
|
|
|
println!("✅ ODT inline code extraction test passed!");
|
|
}
|
|
|
|
/// Tests extraction of paragraph structure and content
|
|
/// Baseline from Pandoc: paragraph.odt
|
|
/// Expected: Multiple paragraphs separated by blank lines
|
|
#[tokio::test]
|
|
async fn test_odt_paragraph_structure_extraction() {
|
|
let test_file = get_test_file_path("paragraph.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract paragraph structure successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
let content_lower = result.content.to_lowercase();
|
|
assert!(content_lower.contains("paragraph"), "Should contain paragraph text");
|
|
|
|
let paragraph_count = result.content.split('\n').filter(|l| !l.is_empty()).count();
|
|
assert!(paragraph_count >= 2, "Should extract multiple paragraphs");
|
|
|
|
println!("✅ ODT paragraph structure extraction test passed!");
|
|
println!(" Extracted {} paragraph segments", paragraph_count);
|
|
}
|
|
|
|
/// Integration test: Verify ODT extraction works with standard API
|
|
#[tokio::test]
|
|
async fn test_odt_extraction_api_integration() {
|
|
let test_file = get_test_file_path("bold.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract via standard API");
|
|
|
|
assert!(!result.content.is_empty(), "Should have content");
|
|
assert_eq!(result.mime_type, "application/vnd.oasis.opendocument.text");
|
|
|
|
println!("✅ ODT extraction API integration test passed!");
|
|
}
|
|
|
|
/// Test error handling for non-existent files
|
|
#[tokio::test]
|
|
async fn test_odt_extraction_missing_file_handling() {
|
|
let test_file = get_test_file_path("nonexistent.odt");
|
|
let config = ExtractionConfig::default();
|
|
|
|
let result = extract_file(&test_file, None, &config).await;
|
|
|
|
assert!(result.is_err(), "Should return error for non-existent file");
|
|
|
|
println!("✅ ODT extraction error handling test passed!");
|
|
}
|
|
|
|
/// Test extraction from multiple representative files
|
|
#[tokio::test]
|
|
async fn test_odt_extraction_variety() {
|
|
let test_files = vec![
|
|
"bold.odt",
|
|
"italic.odt",
|
|
"headers.odt",
|
|
"simpleTable.odt",
|
|
"footnote.odt",
|
|
];
|
|
|
|
let config = ExtractionConfig::default();
|
|
let mut successful_extractions = 0;
|
|
|
|
for filename in &test_files {
|
|
let test_file = get_test_file_path(filename);
|
|
if !test_file.exists() {
|
|
continue;
|
|
}
|
|
|
|
if let Ok(result) = extract_file(&test_file, None, &config).await
|
|
&& !result.content.is_empty()
|
|
{
|
|
successful_extractions += 1;
|
|
}
|
|
}
|
|
|
|
assert!(
|
|
successful_extractions >= 3,
|
|
"Should successfully extract from at least 3 test files"
|
|
);
|
|
|
|
println!("✅ ODT extraction variety test passed!");
|
|
println!(
|
|
" Successfully extracted {} out of {} files",
|
|
successful_extractions,
|
|
test_files.len()
|
|
);
|
|
}
|
|
|
|
/// Test that ODT table extraction doesn't include duplicate cell content
|
|
/// This is a regression test for the bug where table cells were extracted twice:
|
|
/// once as markdown tables and once as raw cell text
|
|
#[tokio::test]
|
|
async fn test_odt_table_no_duplicate_content() {
|
|
let test_file = get_test_file_path("simpleTable.odt");
|
|
if !ensure_test_file_exists(&test_file) {
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig::default();
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract table successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
let content_count = result.content.matches("Content").count();
|
|
|
|
println!(" 'Content' appears {} times in output", content_count);
|
|
println!(" Content preview:\n{}", result.content);
|
|
|
|
assert!(
|
|
content_count <= 3,
|
|
"Content should not appear excessively, indicating no duplicate table cell extraction"
|
|
);
|
|
|
|
println!("✅ ODT table no duplicate content test passed!");
|
|
}
|
|
|
|
/// Test comprehensive table extraction with headers, multiple rows, and tables
|
|
/// Uses the extraction_test document created with pandoc to ensure complete content
|
|
#[tokio::test]
|
|
async fn test_odt_comprehensive_table_extraction() {
|
|
let test_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
|
.parent()
|
|
.expect("Operation failed")
|
|
.parent()
|
|
.expect("Operation failed")
|
|
.join("test_documents/odt/extraction_test.odt");
|
|
|
|
if !test_file.exists() {
|
|
println!("⚠️ Test document not found at {:?}, skipping", test_file);
|
|
return;
|
|
}
|
|
|
|
let config = ExtractionConfig {
|
|
output_format: kreuzberg::core::config::OutputFormat::Markdown,
|
|
..Default::default()
|
|
};
|
|
let result = extract_file(&test_file, None, &config)
|
|
.await
|
|
.expect("Should extract comprehensive table document successfully");
|
|
|
|
assert!(!result.content.is_empty(), "Content should not be empty");
|
|
|
|
assert!(result.content.contains("Comprehensive"), "Should contain heading");
|
|
assert!(
|
|
result.content.contains("First Section") || result.content.contains("First"),
|
|
"Should contain first section"
|
|
);
|
|
assert!(
|
|
result.content.contains("Second Section") || result.content.contains("Second"),
|
|
"Should contain second section"
|
|
);
|
|
assert!(
|
|
result.content.contains("Third Section") || result.content.contains("Third"),
|
|
"Should contain third section"
|
|
);
|
|
|
|
assert!(
|
|
result.content.contains("|"),
|
|
"Should contain pipe characters for markdown tables"
|
|
);
|
|
assert!(result.content.contains("---"), "Should contain table separator");
|
|
|
|
assert!(
|
|
result.content.contains("Header 1") || result.content.contains("Cell 1A"),
|
|
"Should contain table data"
|
|
);
|
|
assert!(
|
|
result.content.contains("Product") || result.content.contains("Apple"),
|
|
"Should contain second table data"
|
|
);
|
|
|
|
let cell_count = result.content.matches("Cell 1A").count();
|
|
assert!(
|
|
cell_count <= 2,
|
|
"Cell content should not be heavily duplicated (found {} instances)",
|
|
cell_count
|
|
);
|
|
|
|
println!("✅ ODT comprehensive table extraction test passed!");
|
|
println!(" Extracted content length: {} chars", result.content.len());
|
|
println!(" Tables found in output: {}", result.tables.len());
|
|
}
|