This commit is contained in:
272
e2e/rust/tests/smoke_test.rs
generated
Normal file
272
e2e/rust/tests/smoke_test.rs
generated
Normal file
@@ -0,0 +1,272 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: smoke
|
||||
|
||||
use kreuzberg::{extract_bytes, extract_file};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ocr_image_png() {
|
||||
// OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.
|
||||
let content = std::fs::read(concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/images/test_hello_world.png"
|
||||
))
|
||||
.expect("test_documents/images/test_hello_world.png must exist");
|
||||
let mime_type = r#"image/png"#;
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_bytes(&content, mime_type, &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"image/png"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 1,
|
||||
"expected length >= 1, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
result.content.contains(r#"Hello"#)
|
||||
|| result.content.contains(r#"World"#)
|
||||
|| result.content.contains(r#"hello"#)
|
||||
|| result.content.contains(r#"world"#),
|
||||
"expected to contain at least one of the specified values"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_docx_basic() {
|
||||
// Smoke test: DOCX with formatted text
|
||||
let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "docx/fake.docx");
|
||||
let mime_type = Some(r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#);
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 20,
|
||||
"expected length >= 20, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
result.content.contains(r#"Lorem"#)
|
||||
|| result.content.contains(r#"ipsum"#)
|
||||
|| result.content.contains(r#"document"#)
|
||||
|| result.content.contains(r#"text"#),
|
||||
"expected to contain at least one of the specified values"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_html_basic() {
|
||||
// Smoke test: HTML table extraction
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"html/simple_table.html"
|
||||
);
|
||||
let mime_type = Some(r#"text/html"#);
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"text/html"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 10,
|
||||
"expected length >= 10, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
result.content.contains(r#"Sample Data Table"#)
|
||||
|| result.content.contains(r#"Laptop"#)
|
||||
|| result.content.contains(r#"Electronics"#)
|
||||
|| result.content.contains(r#"Product"#),
|
||||
"expected to contain at least one of the specified values"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_image_png() {
|
||||
// Smoke test: PNG image (without OCR, metadata only)
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"images/sample.png"
|
||||
);
|
||||
let mime_type: Option<String> = None;
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{"disable_ocr":true}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"image/png"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_json_basic() {
|
||||
// Smoke test: JSON file extraction
|
||||
let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "json/simple.json");
|
||||
let mime_type = Some(r#"application/json"#);
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/json"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 5,
|
||||
"expected length >= 5, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_pdf_basic() {
|
||||
// Smoke test: PDF with simple text extraction
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"pdf/fake_memo.pdf"
|
||||
);
|
||||
let mime_type = Some(r#"application/pdf"#);
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/pdf"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 50,
|
||||
"expected length >= 50, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
result.content.contains(r#"May 5, 2023"#) || result.content.contains(r#"To Whom it May Concern"#),
|
||||
"expected to contain at least one of the specified values"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_txt_basic() {
|
||||
// Smoke test: Plain text file
|
||||
let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "text/report.txt");
|
||||
let mime_type = Some(r#"text/plain"#);
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"text/plain"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 5,
|
||||
"expected length >= 5, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_xlsx_basic() {
|
||||
// Smoke test: XLSX with basic spreadsheet data including tables
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"xlsx/stanley_cups.xlsx"
|
||||
);
|
||||
let mime_type = Some(r#"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"#);
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 100,
|
||||
"expected length >= 100, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"Team"#),
|
||||
"expected to contain: {}",
|
||||
r#"Team"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"Location"#),
|
||||
"expected to contain: {}",
|
||||
r#"Location"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"Stanley Cups"#),
|
||||
"expected to contain: {}",
|
||||
r#"Stanley Cups"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"Blues"#),
|
||||
"expected to contain: {}",
|
||||
r#"Blues"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"Flyers"#),
|
||||
"expected to contain: {}",
|
||||
r#"Flyers"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"Maple Leafs"#),
|
||||
"expected to contain: {}",
|
||||
r#"Maple Leafs"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"STL"#),
|
||||
"expected to contain: {}",
|
||||
r#"STL"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"PHI"#),
|
||||
"expected to contain: {}",
|
||||
r#"PHI"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"TOR"#),
|
||||
"expected to contain: {}",
|
||||
r#"TOR"#
|
||||
);
|
||||
// skipped: field 'tables' not available on result type
|
||||
// skipped: field 'metadata.format.excel.sheet_count' not available on result type
|
||||
// skipped: field 'metadata.format.excel.sheet_names' not available on result type
|
||||
}
|
||||
Reference in New Issue
Block a user