//! CSV and spreadsheet integration tests.
//!
//! Tests for CSV and TSV extraction.
//! Validates data extraction, custom delimiters, quoted fields, and edge cases.
|
||
|
|
|
||
|
|
use kreuzberg::core::config::ExtractionConfig;
|
||
|
|
use kreuzberg::core::extractor::extract_bytes;
|
||
|
|
|
||
|
|
mod helpers;
|
||
|
|
|
||
|
|
/// Test basic CSV extraction - simple comma-separated values.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_csv_basic_extraction() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
let csv_content = b"Name,Age,City\nAlice,30,NYC\nBob,25,LA";
|
||
|
|
|
||
|
|
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
||
|
|
Ok(result) => result,
|
||
|
|
Err(_) => {
|
||
|
|
println!("Skipping test: CSV extraction not available");
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
assert_eq!(extraction.mime_type, "text/csv");
|
||
|
|
assert!(
|
||
|
|
extraction.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
|
||
|
|
assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
|
||
|
|
assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows");
|
||
|
|
assert!(
|
||
|
|
!extraction.tables[0].markdown.is_empty(),
|
||
|
|
"Table should have markdown representation"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Name"), "Should contain 'Name' header");
|
||
|
|
assert!(extraction.content.contains("Age"), "Should contain 'Age' header");
|
||
|
|
assert!(extraction.content.contains("City"), "Should contain 'City' header");
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Alice"), "Should contain Alice row");
|
||
|
|
assert!(extraction.content.contains("30"), "Should contain Alice's age");
|
||
|
|
assert!(extraction.content.contains("NYC"), "Should contain Alice's city");
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Bob"), "Should contain Bob row");
|
||
|
|
assert!(extraction.content.contains("25"), "Should contain Bob's age");
|
||
|
|
assert!(extraction.content.contains("LA"), "Should contain Bob's city");
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test CSV with headers - first row as headers.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_csv_with_headers() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
let csv_content = b"Product,Price,Quantity\nApple,1.50,100\nBanana,0.75,200\nOrange,2.00,150";
|
||
|
|
|
||
|
|
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
||
|
|
Ok(result) => result,
|
||
|
|
Err(_) => {
|
||
|
|
println!("Skipping test: CSV extraction not available");
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
extraction.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
|
||
|
|
assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
|
||
|
|
assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows");
|
||
|
|
assert!(
|
||
|
|
!extraction.tables[0].markdown.is_empty(),
|
||
|
|
"Table should have markdown representation"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Product"), "Should contain Product header");
|
||
|
|
assert!(extraction.content.contains("Price"), "Should contain Price header");
|
||
|
|
assert!(
|
||
|
|
extraction.content.contains("Quantity"),
|
||
|
|
"Should contain Quantity header"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
extraction.content.contains("Apple")
|
||
|
|
&& extraction.content.contains("1.50")
|
||
|
|
&& extraction.content.contains("100")
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.content.contains("Banana")
|
||
|
|
&& extraction.content.contains("0.75")
|
||
|
|
&& extraction.content.contains("200")
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.content.contains("Orange")
|
||
|
|
&& extraction.content.contains("2.00")
|
||
|
|
&& extraction.content.contains("150")
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test CSV with custom delimiter - tab and semicolon.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_csv_custom_delimiter() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
let csv_content = b"Name;Age;City\nAlice;30;NYC\nBob;25;LA";
|
||
|
|
|
||
|
|
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
||
|
|
Ok(result) => result,
|
||
|
|
Err(_) => {
|
||
|
|
println!("Skipping test: CSV extraction not available");
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
extraction.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
|
||
|
|
assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
|
||
|
|
assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows");
|
||
|
|
assert!(
|
||
|
|
!extraction.tables[0].markdown.is_empty(),
|
||
|
|
"Table should have markdown representation"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(!extraction.content.is_empty(), "Content should be extracted");
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Alice"), "Should contain Alice");
|
||
|
|
assert!(extraction.content.contains("30"), "Should contain age");
|
||
|
|
assert!(extraction.content.contains("NYC"), "Should contain city");
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test TSV (Tab-Separated Values) file.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_tsv_file() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
let tsv_content = b"Name\tAge\tCity\nAlice\t30\tNYC\nBob\t25\tLA";
|
||
|
|
|
||
|
|
let extraction = match extract_bytes(tsv_content, "text/tab-separated-values", &config).await {
|
||
|
|
Ok(result) => result,
|
||
|
|
Err(_) => {
|
||
|
|
println!("Skipping test: TSV extraction not available");
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
assert_eq!(extraction.mime_type, "text/tab-separated-values");
|
||
|
|
assert!(
|
||
|
|
extraction.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
|
||
|
|
assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
|
||
|
|
assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows");
|
||
|
|
assert!(
|
||
|
|
!extraction.tables[0].markdown.is_empty(),
|
||
|
|
"Table should have markdown representation"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Name"), "Should contain Name header");
|
||
|
|
assert!(extraction.content.contains("Age"), "Should contain Age header");
|
||
|
|
assert!(extraction.content.contains("City"), "Should contain City header");
|
||
|
|
assert!(extraction.content.contains("Alice"), "Should contain Alice");
|
||
|
|
assert!(extraction.content.contains("Bob"), "Should contain Bob");
|
||
|
|
assert!(extraction.content.contains("30") && extraction.content.contains("NYC"));
|
||
|
|
assert!(extraction.content.contains("25") && extraction.content.contains("LA"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test CSV with quoted fields - fields containing commas.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_csv_quoted_fields() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
let csv_content =
|
||
|
|
b"Name,Description,Price\n\"Smith, John\",\"Product A, premium\",100\n\"Doe, Jane\",\"Product B, standard\",50";
|
||
|
|
|
||
|
|
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
||
|
|
Ok(result) => result,
|
||
|
|
Err(_) => {
|
||
|
|
println!("Skipping test: CSV extraction not available");
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
extraction.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
|
||
|
|
assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
|
||
|
|
assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows");
|
||
|
|
assert!(
|
||
|
|
!extraction.tables[0].markdown.is_empty(),
|
||
|
|
"Table should have markdown representation"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Smith"), "Should contain Smith");
|
||
|
|
assert!(extraction.content.contains("John"), "Should contain John");
|
||
|
|
assert!(extraction.content.contains("Doe"), "Should contain Doe");
|
||
|
|
assert!(extraction.content.contains("Jane"), "Should contain Jane");
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Product A") || extraction.content.contains("premium"));
|
||
|
|
assert!(extraction.content.contains("Product B") || extraction.content.contains("standard"));
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("100") && extraction.content.contains("50"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test CSV with special characters - Unicode, newlines in fields.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_csv_special_characters() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
let csv_content = "Name,City,Emoji\nAlice,Tokyo 東京,🎉\nBob,París,✅\nCarlos,Москва,🌍".as_bytes();
|
||
|
|
|
||
|
|
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
||
|
|
Ok(result) => result,
|
||
|
|
Err(_) => {
|
||
|
|
println!("Skipping test: CSV extraction not available");
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
extraction.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
|
||
|
|
assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
|
||
|
|
assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows");
|
||
|
|
assert!(
|
||
|
|
!extraction.tables[0].markdown.is_empty(),
|
||
|
|
"Table should have markdown representation"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(!extraction.content.is_empty(), "Special characters should be handled");
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Alice"), "Should contain Alice");
|
||
|
|
assert!(extraction.content.contains("Bob"), "Should contain Bob");
|
||
|
|
assert!(extraction.content.contains("Carlos"), "Should contain Carlos");
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Tokyo") || extraction.content.contains("東京"));
|
||
|
|
assert!(extraction.content.contains("París") || extraction.content.contains("Paris"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test CSV with large file - 10,000+ rows (streaming).
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_csv_large_file() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
let mut csv_content = "ID,Name,Value\n".to_string();
|
||
|
|
for i in 1..=10_000 {
|
||
|
|
csv_content.push_str(&format!("{},Item{},{}.00\n", i, i, i * 10));
|
||
|
|
}
|
||
|
|
|
||
|
|
let extraction = match extract_bytes(csv_content.as_bytes(), "text/csv", &config).await {
|
||
|
|
Ok(result) => result,
|
||
|
|
Err(_) => {
|
||
|
|
println!("Skipping test: CSV extraction not available");
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
extraction.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
|
||
|
|
assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
|
||
|
|
assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows");
|
||
|
|
assert!(
|
||
|
|
!extraction.tables[0].markdown.is_empty(),
|
||
|
|
"Table should have markdown representation"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(!extraction.content.is_empty(), "Large CSV should be processed");
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
extraction.content.len() > 1000,
|
||
|
|
"Large CSV content should be substantial"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Item1") || extraction.content.contains("10.00"));
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Item5000") || extraction.content.contains("50000.00"));
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Item10000") || extraction.content.contains("100000.00"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test malformed CSV - inconsistent columns.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_csv_malformed() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
let csv_content = b"Name,Age,City\nAlice,30\nBob,25,LA,Extra\nCarlos,35,SF";
|
||
|
|
|
||
|
|
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
result.is_ok() || result.is_err(),
|
||
|
|
"Should handle malformed CSV gracefully"
|
||
|
|
);
|
||
|
|
|
||
|
|
if let Ok(extraction) = result {
|
||
|
|
assert!(!extraction.content.is_empty());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test empty CSV file.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_csv_empty() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
let empty_csv = b"";
|
||
|
|
|
||
|
|
let result = extract_bytes(empty_csv, "text/csv", &config).await;
|
||
|
|
|
||
|
|
assert!(result.is_ok() || result.is_err(), "Should handle empty CSV gracefully");
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test CSV with only headers.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_csv_headers_only() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
let csv_content = b"Name,Age,City";
|
||
|
|
|
||
|
|
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
||
|
|
Ok(result) => result,
|
||
|
|
Err(_) => {
|
||
|
|
println!("Skipping test: CSV extraction not available");
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
extraction.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
|
||
|
|
assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
|
||
|
|
assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows");
|
||
|
|
assert!(
|
||
|
|
!extraction.tables[0].markdown.is_empty(),
|
||
|
|
"Table should have markdown representation"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
extraction.content.contains("Name") || !extraction.content.is_empty(),
|
||
|
|
"Headers should be extracted"
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test CSV with blank lines.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_csv_blank_lines() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
let csv_content = b"Name,Age\nAlice,30\n\nBob,25\n\nCarlos,35";
|
||
|
|
|
||
|
|
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
||
|
|
Ok(result) => result,
|
||
|
|
Err(_) => {
|
||
|
|
println!("Skipping test: CSV extraction not available");
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
extraction.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
|
||
|
|
assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
|
||
|
|
assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows");
|
||
|
|
assert!(
|
||
|
|
!extraction.tables[0].markdown.is_empty(),
|
||
|
|
"Table should have markdown representation"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Alice") || extraction.content.contains("Bob"));
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Test CSV with numeric data.
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_csv_numeric_data() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
|
||
|
|
let csv_content = b"ID,Price,Quantity,Discount\n1,19.99,100,0.15\n2,29.99,50,0.20\n3,9.99,200,0.10";
|
||
|
|
|
||
|
|
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
||
|
|
Ok(result) => result,
|
||
|
|
Err(_) => {
|
||
|
|
println!("Skipping test: CSV extraction not available");
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
assert!(
|
||
|
|
extraction.chunks.is_none(),
|
||
|
|
"Chunks should be None without chunking config"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.detected_languages.is_none(),
|
||
|
|
"Language detection not enabled"
|
||
|
|
);
|
||
|
|
assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
|
||
|
|
assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
|
||
|
|
assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows");
|
||
|
|
assert!(
|
||
|
|
!extraction.tables[0].markdown.is_empty(),
|
||
|
|
"Table should have markdown representation"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("Price"), "Should contain Price header");
|
||
|
|
assert!(
|
||
|
|
extraction.content.contains("Quantity"),
|
||
|
|
"Should contain Quantity header"
|
||
|
|
);
|
||
|
|
assert!(
|
||
|
|
extraction.content.contains("Discount"),
|
||
|
|
"Should contain Discount header"
|
||
|
|
);
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("19.99"), "Should contain first price");
|
||
|
|
assert!(extraction.content.contains("100"), "Should contain first quantity");
|
||
|
|
assert!(extraction.content.contains("0.15"), "Should contain first discount");
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("29.99"), "Should contain second price");
|
||
|
|
assert!(extraction.content.contains("50"), "Should contain second quantity");
|
||
|
|
|
||
|
|
assert!(extraction.content.contains("9.99"), "Should contain third price");
|
||
|
|
assert!(extraction.content.contains("200"), "Should contain third quantity");
|
||
|
|
}
|