//! Email extraction integration tests. //! //! Tests for .eml (RFC822) email extraction. //! Validates metadata extraction, content extraction, HTML/plain text handling, and attachments. #![cfg(feature = "email")] use kreuzberg::core::config::ExtractionConfig; use kreuzberg::core::extractor::extract_bytes; mod helpers; /// Test basic EML extraction with subject, from, to, and body. #[tokio::test] async fn test_eml_basic_extraction() { let config = ExtractionConfig::default(); let eml_content = b"From: sender@example.com\r\n\ To: recipient@example.com\r\n\ Subject: Test Email Subject\r\n\ Date: Mon, 1 Jan 2024 12:00:00 +0000\r\n\ Message-ID: \r\n\ \r\n\ This is the email body content."; let result = extract_bytes(eml_content, "message/rfc822", &config) .await .expect("Should extract EML successfully"); assert_eq!(result.mime_type, "message/rfc822"); assert_eq!(result.metadata.subject, Some("Test Email Subject".to_string())); assert!(result.metadata.format.is_some()); let email_meta = match result.metadata.format.as_ref().expect("Operation failed") { kreuzberg::FormatMetadata::Email(meta) => meta, _ => panic!("Expected Email metadata"), }; assert_eq!(email_meta.from_email, Some("sender@example.com".to_string())); assert_eq!(email_meta.to_emails, vec!["recipient@example.com".to_string()]); assert!(email_meta.cc_emails.is_empty(), "CC should be empty"); assert!(email_meta.bcc_emails.is_empty(), "BCC should be empty"); assert!(email_meta.message_id.is_some()); let msg_id = email_meta.message_id.clone().expect("Operation failed"); assert!( msg_id.contains("unique123@example.com"), "Message ID should contain unique123@example.com" ); assert!(email_meta.attachments.is_empty(), "Should have no attachments"); assert!(result.metadata.created_at.is_some()); assert!(result.content.contains("Subject: Test Email Subject")); assert!(result.content.contains("From: sender@example.com")); assert!(result.content.contains("To: recipient@example.com")); assert!(result.content.contains("This is the email body content")); } /// Test EML with attachments - metadata extraction. #[tokio::test] async fn test_eml_with_attachments() { let config = ExtractionConfig::default(); let eml_content = b"From: sender@example.com\r\n\ To: recipient@example.com\r\n\ Subject: Email with Attachment\r\n\ Content-Type: multipart/mixed; boundary=\"----boundary\"\r\n\ \r\n\ ------boundary\r\n\ Content-Type: text/plain\r\n\ \r\n\ Email body text.\r\n\ ------boundary\r\n\ Content-Type: text/plain; name=\"file.txt\"\r\n\ Content-Disposition: attachment; filename=\"file.txt\"\r\n\ \r\n\ Attachment content here.\r\n\ ------boundary--\r\n"; let result = extract_bytes(eml_content, "message/rfc822", &config) .await .expect("Should extract EML with attachment"); assert!(result.metadata.format.is_some()); let email_meta = match result.metadata.format.as_ref().expect("Operation failed") { kreuzberg::FormatMetadata::Email(meta) => meta, _ => panic!("Expected Email metadata"), }; if !email_meta.attachments.is_empty() { assert!(result.content.contains("Attachments:")); } assert!(result.content.contains("Email body text") || result.content.contains("Attachment content")); } /// Test EML with HTML body. #[tokio::test] async fn test_eml_html_body() { let config = ExtractionConfig::default(); let eml_content = b"From: sender@example.com\r\n\ To: recipient@example.com\r\n\ Subject: HTML Email\r\n\ Content-Type: text/html; charset=utf-8\r\n\ \r\n\ \r\n\ \r\n\ \r\n\

HTML Heading

\r\n\

This is bold text in HTML.

\r\n\ \r\n\ \r\n\ "; let result = extract_bytes(eml_content, "message/rfc822", &config) .await .expect("Should extract HTML email"); assert!(!result.content.contains("