// This file is auto-generated by alef — DO NOT EDIT. // alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75 // To regenerate: alef generate // To verify freshness: alef verify --exit-code // Issues & docs: https://github.com/kreuzberg-dev/alef //! E2e tests for category: smoke use kreuzberg::{extract_bytes, extract_file}; #[tokio::test] async fn test_ocr_image_png() { // OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge. let content = std::fs::read(concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/images/test_hello_world.png" )) .expect("test_documents/images/test_hello_world.png must exist"); let mime_type = r#"image/png"#; let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_bytes(&content, mime_type, &config) .await .expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"image/png"#, "equals assertion failed" ); assert!( result.content.len() >= 1, "expected length >= 1, got {}", result.content.len() ); assert!( result.content.contains(r#"Hello"#) || result.content.contains(r#"World"#) || result.content.contains(r#"hello"#) || result.content.contains(r#"world"#), "expected to contain at least one of the specified values" ); } #[tokio::test] async fn test_smoke_docx_basic() { // Smoke test: DOCX with formatted text let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "docx/fake.docx"); let mime_type = Some(r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#); let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#, "equals assertion failed" ); assert!( result.content.len() >= 20, "expected length >= 20, got {}", result.content.len() ); assert!( result.content.contains(r#"Lorem"#) || result.content.contains(r#"ipsum"#) || result.content.contains(r#"document"#) || result.content.contains(r#"text"#), "expected to contain at least one of the specified values" ); } #[tokio::test] async fn test_smoke_html_basic() { // Smoke test: HTML table extraction let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "html/simple_table.html" ); let mime_type = Some(r#"text/html"#); let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"text/html"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); assert!( result.content.contains(r#"Sample Data Table"#) || result.content.contains(r#"Laptop"#) || result.content.contains(r#"Electronics"#) || result.content.contains(r#"Product"#), "expected to contain at least one of the specified values" ); } #[tokio::test] async fn test_smoke_image_png() { // Smoke test: PNG image (without OCR, metadata only) let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "images/sample.png" ); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str(r#"{"disable_ocr":true}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"image/png"#, "equals assertion failed" ); } #[tokio::test] async fn test_smoke_json_basic() { // Smoke test: JSON file extraction let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "json/simple.json"); let mime_type = Some(r#"application/json"#); let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/json"#, "equals assertion failed" ); assert!( result.content.len() >= 5, "expected length >= 5, got {}", result.content.len() ); } #[tokio::test] async fn test_smoke_pdf_basic() { // Smoke test: PDF with simple text extraction let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf" ); let mime_type = Some(r#"application/pdf"#); let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 50, "expected length >= 50, got {}", result.content.len() ); assert!( result.content.contains(r#"May 5, 2023"#) || result.content.contains(r#"To Whom it May Concern"#), "expected to contain at least one of the specified values" ); } #[tokio::test] async fn test_smoke_txt_basic() { // Smoke test: Plain text file let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "text/report.txt"); let mime_type = Some(r#"text/plain"#); let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"text/plain"#, "equals assertion failed" ); assert!( result.content.len() >= 5, "expected length >= 5, got {}", result.content.len() ); } #[tokio::test] async fn test_smoke_xlsx_basic() { // Smoke test: XLSX with basic spreadsheet data including tables let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "xlsx/stanley_cups.xlsx" ); let mime_type = Some(r#"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"#); let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"#, "equals assertion failed" ); assert!( result.content.len() >= 100, "expected length >= 100, got {}", result.content.len() ); assert!( format!("{:?}", result.content).contains(r#"Team"#), "expected to contain: {}", r#"Team"# ); assert!( format!("{:?}", result.content).contains(r#"Location"#), "expected to contain: {}", r#"Location"# ); assert!( format!("{:?}", result.content).contains(r#"Stanley Cups"#), "expected to contain: {}", r#"Stanley Cups"# ); assert!( format!("{:?}", result.content).contains(r#"Blues"#), "expected to contain: {}", r#"Blues"# ); assert!( format!("{:?}", result.content).contains(r#"Flyers"#), "expected to contain: {}", r#"Flyers"# ); assert!( format!("{:?}", result.content).contains(r#"Maple Leafs"#), "expected to contain: {}", r#"Maple Leafs"# ); assert!( format!("{:?}", result.content).contains(r#"STL"#), "expected to contain: {}", r#"STL"# ); assert!( format!("{:?}", result.content).contains(r#"PHI"#), "expected to contain: {}", r#"PHI"# ); assert!( format!("{:?}", result.content).contains(r#"TOR"#), "expected to contain: {}", r#"TOR"# ); // skipped: field 'tables' not available on result type // skipped: field 'metadata.format.excel.sheet_count' not available on result type // skipped: field 'metadata.format.excel.sheet_names' not available on result type }