//! CSV and spreadsheet integration tests. //! //! Tests for CSV and TSV extraction. //! Validates data extraction, custom delimiters, quoted fields, and edge cases. use kreuzberg::core::config::ExtractionConfig; use kreuzberg::core::extractor::extract_bytes; mod helpers; /// Test basic CSV extraction - simple comma-separated values. #[tokio::test] async fn test_csv_basic_extraction() { let config = ExtractionConfig::default(); let csv_content = b"Name,Age,City\nAlice,30,NYC\nBob,25,LA"; let extraction = match extract_bytes(csv_content, "text/csv", &config).await { Ok(result) => result, Err(_) => { println!("Skipping test: CSV extraction not available"); return; } }; assert_eq!(extraction.mime_type, "text/csv"); assert!( extraction.chunks.is_none(), "Chunks should be None without chunking config" ); assert!( extraction.detected_languages.is_none(), "Language detection not enabled" ); assert!(!extraction.tables.is_empty(), "CSV should produce table structures"); assert_eq!(extraction.tables.len(), 1, "CSV should have one table"); assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows"); assert!( !extraction.tables[0].markdown.is_empty(), "Table should have markdown representation" ); assert!(extraction.content.contains("Name"), "Should contain 'Name' header"); assert!(extraction.content.contains("Age"), "Should contain 'Age' header"); assert!(extraction.content.contains("City"), "Should contain 'City' header"); assert!(extraction.content.contains("Alice"), "Should contain Alice row"); assert!(extraction.content.contains("30"), "Should contain Alice's age"); assert!(extraction.content.contains("NYC"), "Should contain Alice's city"); assert!(extraction.content.contains("Bob"), "Should contain Bob row"); assert!(extraction.content.contains("25"), "Should contain Bob's age"); assert!(extraction.content.contains("LA"), "Should contain Bob's city"); } /// Test CSV with headers - first row as headers. 
#[tokio::test]
async fn test_csv_with_headers() {
    let csv_content = b"Product,Price,Quantity\nApple,1.50,100\nBanana,0.75,200\nOrange,2.00,150";
    let config = ExtractionConfig::default();

    // An extraction error is treated as "CSV support unavailable": the test
    // returns early instead of failing.
    let Ok(extraction) = extract_bytes(csv_content, "text/csv", &config).await else {
        println!("Skipping test: CSV extraction not available");
        return;
    };

    // The default config enables neither chunking nor language detection.
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Exactly one table, with both cell data and a markdown rendering.
    assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
    assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
    let table = &extraction.tables[0];
    assert!(!table.cells.is_empty(), "Table should have rows");
    assert!(!table.markdown.is_empty(), "Table should have markdown representation");

    // Header row.
    for (header, message) in [
        ("Product", "Should contain Product header"),
        ("Price", "Should contain Price header"),
        ("Quantity", "Should contain Quantity header"),
    ] {
        assert!(extraction.content.contains(header), "{}", message);
    }

    // Data rows: all three fields of each row must be present.
    assert!(
        extraction.content.contains("Apple")
            && extraction.content.contains("1.50")
            && extraction.content.contains("100")
    );
    assert!(
        extraction.content.contains("Banana")
            && extraction.content.contains("0.75")
            && extraction.content.contains("200")
    );
    assert!(
        extraction.content.contains("Orange")
            && extraction.content.contains("2.00")
            && extraction.content.contains("150")
    );
}

/// Test CSV with a custom delimiter - semicolon-separated values.
#[tokio::test]
async fn test_csv_custom_delimiter() {
    // Semicolon-delimited input submitted under the plain CSV MIME type.
    let csv_content = b"Name;Age;City\nAlice;30;NYC\nBob;25;LA";
    let config = ExtractionConfig::default();

    // An extraction error is treated as "CSV support unavailable": the test
    // returns early instead of failing.
    let Ok(extraction) = extract_bytes(csv_content, "text/csv", &config).await else {
        println!("Skipping test: CSV extraction not available");
        return;
    };

    // The default config enables neither chunking nor language detection.
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Exactly one table, with both cell data and a markdown rendering.
    assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
    assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
    let table = &extraction.tables[0];
    assert!(!table.cells.is_empty(), "Table should have rows");
    assert!(!table.markdown.is_empty(), "Table should have markdown representation");

    assert!(!extraction.content.is_empty(), "Content should be extracted");

    // Values from the first data row must survive extraction.
    for (needle, message) in [
        ("Alice", "Should contain Alice"),
        ("30", "Should contain age"),
        ("NYC", "Should contain city"),
    ] {
        assert!(extraction.content.contains(needle), "{}", message);
    }
}

/// Test TSV (Tab-Separated Values) file.
#[tokio::test]
async fn test_tsv_file() {
    let tsv_content = b"Name\tAge\tCity\nAlice\t30\tNYC\nBob\t25\tLA";
    let config = ExtractionConfig::default();

    // An extraction error is treated as "TSV support unavailable": the test
    // returns early instead of failing.
    let Ok(extraction) = extract_bytes(tsv_content, "text/tab-separated-values", &config).await else {
        println!("Skipping test: TSV extraction not available");
        return;
    };

    assert_eq!(extraction.mime_type, "text/tab-separated-values");

    // The default config enables neither chunking nor language detection.
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Exactly one table, with both cell data and a markdown rendering.
    // The failure messages name TSV so a failing run points at the right format.
    assert!(!extraction.tables.is_empty(), "TSV should produce table structures");
    assert_eq!(extraction.tables.len(), 1, "TSV should have one table");
    let table = &extraction.tables[0];
    assert!(!table.cells.is_empty(), "Table should have rows");
    assert!(!table.markdown.is_empty(), "Table should have markdown representation");

    // Header row and the first column of each data row.
    for (needle, message) in [
        ("Name", "Should contain Name header"),
        ("Age", "Should contain Age header"),
        ("City", "Should contain City header"),
        ("Alice", "Should contain Alice"),
        ("Bob", "Should contain Bob"),
    ] {
        assert!(extraction.content.contains(needle), "{}", message);
    }

    // Remaining fields of each data row.
    assert!(extraction.content.contains("30") && extraction.content.contains("NYC"));
    assert!(extraction.content.contains("25") && extraction.content.contains("LA"));
}

/// Test CSV with quoted fields - fields containing commas.
#[tokio::test]
async fn test_csv_quoted_fields() {
    // Quoted fields embed the delimiter itself (e.g. "Smith, John").
    let csv_content =
        b"Name,Description,Price\n\"Smith, John\",\"Product A, premium\",100\n\"Doe, Jane\",\"Product B, standard\",50";
    let config = ExtractionConfig::default();

    // An extraction error is treated as "CSV support unavailable": the test
    // returns early instead of failing.
    let Ok(extraction) = extract_bytes(csv_content, "text/csv", &config).await else {
        println!("Skipping test: CSV extraction not available");
        return;
    };

    // The default config enables neither chunking nor language detection.
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Exactly one table, with both cell data and a markdown rendering.
    assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
    assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
    let table = &extraction.tables[0];
    assert!(!table.cells.is_empty(), "Table should have rows");
    assert!(!table.markdown.is_empty(), "Table should have markdown representation");

    // Both halves of each quoted name must be present.
    for (needle, message) in [
        ("Smith", "Should contain Smith"),
        ("John", "Should contain John"),
        ("Doe", "Should contain Doe"),
        ("Jane", "Should contain Jane"),
    ] {
        assert!(extraction.content.contains(needle), "{}", message);
    }

    // For the descriptions, either half of the quoted value is accepted.
    assert!(extraction.content.contains("Product A") || extraction.content.contains("premium"));
    assert!(extraction.content.contains("Product B") || extraction.content.contains("standard"));
    assert!(extraction.content.contains("100") && extraction.content.contains("50"));
}

/// Test CSV with special characters - non-ASCII text and emoji.
#[tokio::test]
async fn test_csv_special_characters() {
    let csv_content = "Name,City,Emoji\nAlice,Tokyo 東京,🎉\nBob,París,✅\nCarlos,Москва,🌍".as_bytes();
    let config = ExtractionConfig::default();

    // An extraction error is treated as "CSV support unavailable": the test
    // returns early instead of failing.
    let Ok(extraction) = extract_bytes(csv_content, "text/csv", &config).await else {
        println!("Skipping test: CSV extraction not available");
        return;
    };

    // The default config enables neither chunking nor language detection.
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Exactly one table, with both cell data and a markdown rendering.
    assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
    assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
    let table = &extraction.tables[0];
    assert!(!table.cells.is_empty(), "Table should have rows");
    assert!(!table.markdown.is_empty(), "Table should have markdown representation");

    assert!(!extraction.content.is_empty(), "Special characters should be handled");

    // The ASCII names in the first column must always be present.
    for (needle, message) in [
        ("Alice", "Should contain Alice"),
        ("Bob", "Should contain Bob"),
        ("Carlos", "Should contain Carlos"),
    ] {
        assert!(extraction.content.contains(needle), "{}", message);
    }

    // For the cities, either spelling listed below is accepted.
    assert!(extraction.content.contains("Tokyo") || extraction.content.contains("東京"));
    assert!(extraction.content.contains("París") || extraction.content.contains("Paris"));
}

/// Test CSV with large file - 10,000 generated rows.
#[tokio::test]
async fn test_csv_large_file() {
    let config = ExtractionConfig::default();

    // Build "ID,Name,Value" followed by 10,000 rows of the form `i,Item<i>,<i*10>.00`.
    let mut csv_content = String::from("ID,Name,Value\n");
    csv_content.extend((1..=10_000).map(|i| format!("{},Item{},{}.00\n", i, i, i * 10)));

    // An extraction error is treated as "CSV support unavailable": the test
    // returns early instead of failing.
    let Ok(extraction) = extract_bytes(csv_content.as_bytes(), "text/csv", &config).await else {
        println!("Skipping test: CSV extraction not available");
        return;
    };

    // The default config enables neither chunking nor language detection.
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Exactly one table, with both cell data and a markdown rendering.
    assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
    assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
    let table = &extraction.tables[0];
    assert!(!table.cells.is_empty(), "Table should have rows");
    assert!(!table.markdown.is_empty(), "Table should have markdown representation");

    assert!(!extraction.content.is_empty(), "Large CSV should be processed");
    assert!(
        extraction.content.len() > 1000,
        "Large CSV content should be substantial"
    );

    // Spot-check the first, middle and last generated rows.
    assert!(extraction.content.contains("Item1") || extraction.content.contains("10.00"));
    assert!(extraction.content.contains("Item5000") || extraction.content.contains("50000.00"));
    assert!(extraction.content.contains("Item10000") || extraction.content.contains("100000.00"));
}

/// Test malformed CSV - inconsistent columns.
///
/// Rows have two, four and three fields against a three-column header.
/// Both a successful extraction and an error are accepted; the test fails
/// only if the extractor panics or succeeds with empty content.
#[tokio::test]
async fn test_csv_malformed() {
    let config = ExtractionConfig::default();
    let csv_content = b"Name,Age,City\nAlice,30\nBob,25,LA,Extra\nCarlos,35,SF";

    match extract_bytes(csv_content, "text/csv", &config).await {
        // A lenient parser must still produce some text.
        Ok(extraction) => assert!(
            !extraction.content.is_empty(),
            "Malformed CSV that is accepted should still yield content"
        ),
        // A strict parser may reject the input; returning an error is graceful handling.
        Err(_) => println!("Malformed CSV was rejected with an error"),
    }
}

/// Test empty CSV file.
#[tokio::test]
async fn test_csv_empty() {
    let config = ExtractionConfig::default();
    let empty_csv = b"";

    // Smoke test: zero-byte input may succeed or return an error, but the call
    // must complete without panicking. No particular outcome is asserted.
    let outcome = extract_bytes(empty_csv, "text/csv", &config).await;
    println!(
        "Empty CSV handled without panicking (extraction succeeded: {})",
        outcome.is_ok()
    );
}

/// Test CSV with only headers.
#[tokio::test]
async fn test_csv_headers_only() {
    // A single header row with no data rows and no trailing newline.
    let csv_content = b"Name,Age,City";
    let config = ExtractionConfig::default();

    // An extraction error is treated as "CSV support unavailable": the test
    // returns early instead of failing.
    let Ok(extraction) = extract_bytes(csv_content, "text/csv", &config).await else {
        println!("Skipping test: CSV extraction not available");
        return;
    };

    // The default config enables neither chunking nor language detection.
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Exactly one table, with both cell data and a markdown rendering.
    assert!(!extraction.tables.is_empty(), "CSV should produce table structures");
    assert_eq!(extraction.tables.len(), 1, "CSV should have one table");
    let table = &extraction.tables[0];
    assert!(!table.cells.is_empty(), "Table should have rows");
    assert!(!table.markdown.is_empty(), "Table should have markdown representation");

    // Only non-emptiness is required; how a header-only table is rendered
    // in the text output is not pinned down by this test.
    assert!(!extraction.content.is_empty(), "Headers should be extracted");
}

/// Test CSV with blank lines.
#[tokio::test] async fn test_csv_blank_lines() { let config = ExtractionConfig::default(); let csv_content = b"Name,Age\nAlice,30\n\nBob,25\n\nCarlos,35"; let extraction = match extract_bytes(csv_content, "text/csv", &config).await { Ok(result) => result, Err(_) => { println!("Skipping test: CSV extraction not available"); return; } }; assert!( extraction.chunks.is_none(), "Chunks should be None without chunking config" ); assert!( extraction.detected_languages.is_none(), "Language detection not enabled" ); assert!(!extraction.tables.is_empty(), "CSV should produce table structures"); assert_eq!(extraction.tables.len(), 1, "CSV should have one table"); assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows"); assert!( !extraction.tables[0].markdown.is_empty(), "Table should have markdown representation" ); assert!(extraction.content.contains("Alice") || extraction.content.contains("Bob")); } /// Test CSV with numeric data. #[tokio::test] async fn test_csv_numeric_data() { let config = ExtractionConfig::default(); let csv_content = b"ID,Price,Quantity,Discount\n1,19.99,100,0.15\n2,29.99,50,0.20\n3,9.99,200,0.10"; let extraction = match extract_bytes(csv_content, "text/csv", &config).await { Ok(result) => result, Err(_) => { println!("Skipping test: CSV extraction not available"); return; } }; assert!( extraction.chunks.is_none(), "Chunks should be None without chunking config" ); assert!( extraction.detected_languages.is_none(), "Language detection not enabled" ); assert!(!extraction.tables.is_empty(), "CSV should produce table structures"); assert_eq!(extraction.tables.len(), 1, "CSV should have one table"); assert!(!extraction.tables[0].cells.is_empty(), "Table should have rows"); assert!( !extraction.tables[0].markdown.is_empty(), "Table should have markdown representation" ); assert!(extraction.content.contains("Price"), "Should contain Price header"); assert!( extraction.content.contains("Quantity"), "Should contain Quantity header" ); assert!( 
extraction.content.contains("Discount"), "Should contain Discount header" ); assert!(extraction.content.contains("19.99"), "Should contain first price"); assert!(extraction.content.contains("100"), "Should contain first quantity"); assert!(extraction.content.contains("0.15"), "Should contain first discount"); assert!(extraction.content.contains("29.99"), "Should contain second price"); assert!(extraction.content.contains("50"), "Should contain second quantity"); assert!(extraction.content.contains("9.99"), "Should contain third price"); assert!(extraction.content.contains("200"), "Should contain third quantity"); }