Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

272
e2e/rust/tests/smoke_test.rs generated Normal file
View File

@@ -0,0 +1,272 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: smoke
use kreuzberg::{extract_bytes, extract_file};
/// OCR smoke test: extracts text from a PNG image supplied as raw bytes.
///
/// Fixture description (from the generator): PNG image extraction with OCR
/// enabled. In WASM this exercises the `Uint8Array` bridge parameter and
/// `Promise` await in the generated `OcrBackend` bridge.
///
/// # Panics
///
/// Panics when the fixture cannot be read, when extraction returns an error,
/// or when any of the assertions below fails.
#[tokio::test]
async fn test_ocr_image_png() {
    // The fixture is resolved relative to this crate's manifest directory.
    let content = std::fs::read(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/images/test_hello_world.png"
    ))
    .expect("test_documents/images/test_hello_world.png must exist");
    let mime_type = r#"image/png"#;

    // The config is deserialized from an empty JSON object; its concrete type
    // is inferred from the `extract_bytes` call below.
    let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
    let config = serde_json::from_value(config_json).unwrap();

    let result = extract_bytes(&content, mime_type, &config)
        .await
        .expect("should succeed");

    // The reported MIME type is compared after trimming surrounding whitespace.
    let reported_mime = result.mime_type.to_string();
    assert_eq!(reported_mime.trim(), r#"image/png"#, "equals assertion failed");

    // `is_empty()` is the idiomatic form of `len() >= 1`.
    assert!(
        !result.content.is_empty(),
        "expected length >= 1, got {}",
        result.content.len()
    );

    // Any one of these case variants is enough to pass.
    let expected_words = [r#"Hello"#, r#"World"#, r#"hello"#, r#"world"#];
    assert!(
        expected_words
            .iter()
            .any(|word| result.content.contains(*word)),
        "expected to contain at least one of the specified values"
    );
}
/// Smoke test: DOCX with formatted text.
///
/// Extracts `docx/fake.docx` from the shared `test_documents` directory with an
/// explicit MIME type hint, then checks the reported MIME type, a minimum
/// content length of 20 (as reported by `len()`), and the presence of at least
/// one expected keyword.
///
/// # Panics
///
/// Panics when extraction returns an error or when any assertion fails.
#[tokio::test]
async fn test_smoke_docx_basic() {
    let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "docx/fake.docx");
    // Used both as the MIME hint and as the expected reported MIME type.
    let expected_mime = r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#;

    // The config is deserialized from an empty JSON object; its concrete type
    // is inferred from the `extract_file` call below.
    let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
    let config = serde_json::from_value(config_json).unwrap();

    // `Some(&str)` is already an `Option<&str>`, so no `as_deref()` is needed.
    let result = extract_file(path, Some(expected_mime), &config)
        .await
        .expect("should succeed");

    let reported_mime = result.mime_type.to_string();
    assert_eq!(reported_mime.trim(), expected_mime, "equals assertion failed");

    assert!(
        result.content.len() >= 20,
        "expected length >= 20, got {}",
        result.content.len()
    );

    // Any one of these keywords is enough to pass.
    let keywords = [r#"Lorem"#, r#"ipsum"#, r#"document"#, r#"text"#];
    assert!(
        keywords
            .iter()
            .any(|keyword| result.content.contains(*keyword)),
        "expected to contain at least one of the specified values"
    );
}
/// Smoke test: HTML table extraction.
///
/// Extracts `html/simple_table.html` from the shared `test_documents` directory
/// with an explicit `text/html` hint, then checks the reported MIME type, a
/// minimum content length of 10 (as reported by `len()`), and the presence of
/// at least one expected phrase.
///
/// # Panics
///
/// Panics when extraction returns an error or when any assertion fails.
#[tokio::test]
async fn test_smoke_html_basic() {
    let path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "html/simple_table.html"
    );
    // Used both as the MIME hint and as the expected reported MIME type.
    let expected_mime = r#"text/html"#;

    // The config is deserialized from an empty JSON object; its concrete type
    // is inferred from the `extract_file` call below.
    let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
    let config = serde_json::from_value(config_json).unwrap();

    // `Some(&str)` is already an `Option<&str>`, so no `as_deref()` is needed.
    let result = extract_file(path, Some(expected_mime), &config)
        .await
        .expect("should succeed");

    let reported_mime = result.mime_type.to_string();
    assert_eq!(reported_mime.trim(), expected_mime, "equals assertion failed");

    assert!(
        result.content.len() >= 10,
        "expected length >= 10, got {}",
        result.content.len()
    );

    // Any one of these phrases is enough to pass.
    let phrases = [
        r#"Sample Data Table"#,
        r#"Laptop"#,
        r#"Electronics"#,
        r#"Product"#,
    ];
    assert!(
        phrases
            .iter()
            .any(|phrase| result.content.contains(*phrase)),
        "expected to contain at least one of the specified values"
    );
}
/// Smoke test: PNG image (without OCR, metadata only).
///
/// Extracts `images/sample.png` with no MIME type hint and a config
/// deserialized from `{"disable_ocr":true}`, then checks that the reported
/// MIME type is `image/png`.
///
/// # Panics
///
/// Panics when extraction returns an error or when the assertion fails.
#[tokio::test]
async fn test_smoke_image_png() {
    let image_path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "images/sample.png"
    );
    // No hint is supplied for this fixture.
    let no_mime_hint: Option<&str> = None;

    // The concrete config type is inferred from the `extract_file` call below.
    let raw_config: serde_json::Value = serde_json::from_str(r#"{"disable_ocr":true}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file(image_path, no_mime_hint, &config)
        .await
        .expect("should succeed");

    // The reported MIME type is compared after trimming surrounding whitespace.
    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), r#"image/png"#, "equals assertion failed");
}
/// Smoke test: JSON file extraction.
///
/// Extracts `json/simple.json` from the shared `test_documents` directory with
/// an explicit `application/json` hint, then checks the reported MIME type and
/// a minimum content length of 5 (as reported by `len()`).
///
/// # Panics
///
/// Panics when extraction returns an error or when any assertion fails.
#[tokio::test]
async fn test_smoke_json_basic() {
    let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "json/simple.json");
    // Used both as the MIME hint and as the expected reported MIME type.
    let expected_mime = r#"application/json"#;

    // The config is deserialized from an empty JSON object; its concrete type
    // is inferred from the `extract_file` call below.
    let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
    let config = serde_json::from_value(config_json).unwrap();

    // `Some(&str)` is already an `Option<&str>`, so no `as_deref()` is needed.
    let result = extract_file(path, Some(expected_mime), &config)
        .await
        .expect("should succeed");

    let reported_mime = result.mime_type.to_string();
    assert_eq!(reported_mime.trim(), expected_mime, "equals assertion failed");

    assert!(
        result.content.len() >= 5,
        "expected length >= 5, got {}",
        result.content.len()
    );
}
/// Smoke test: PDF with simple text extraction.
///
/// Extracts `pdf/fake_memo.pdf` from the shared `test_documents` directory with
/// an explicit `application/pdf` hint, then checks the reported MIME type, a
/// minimum content length of 50 (as reported by `len()`), and the presence of
/// at least one expected phrase.
///
/// # Panics
///
/// Panics when extraction returns an error or when any assertion fails.
#[tokio::test]
async fn test_smoke_pdf_basic() {
    let path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );
    // Used both as the MIME hint and as the expected reported MIME type.
    let expected_mime = r#"application/pdf"#;

    // The config is deserialized from an empty JSON object; its concrete type
    // is inferred from the `extract_file` call below.
    let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
    let config = serde_json::from_value(config_json).unwrap();

    // `Some(&str)` is already an `Option<&str>`, so no `as_deref()` is needed.
    let result = extract_file(path, Some(expected_mime), &config)
        .await
        .expect("should succeed");

    let reported_mime = result.mime_type.to_string();
    assert_eq!(reported_mime.trim(), expected_mime, "equals assertion failed");

    assert!(
        result.content.len() >= 50,
        "expected length >= 50, got {}",
        result.content.len()
    );

    // Either phrase is enough to pass.
    let phrases = [r#"May 5, 2023"#, r#"To Whom it May Concern"#];
    assert!(
        phrases
            .iter()
            .any(|phrase| result.content.contains(*phrase)),
        "expected to contain at least one of the specified values"
    );
}
/// Smoke test: plain text file.
///
/// Extracts `text/report.txt` from the shared `test_documents` directory with
/// an explicit `text/plain` hint, then checks the reported MIME type and a
/// minimum content length of 5 (as reported by `len()`).
///
/// # Panics
///
/// Panics when extraction returns an error or when any assertion fails.
#[tokio::test]
async fn test_smoke_txt_basic() {
    let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "text/report.txt");
    // Used both as the MIME hint and as the expected reported MIME type.
    let expected_mime = r#"text/plain"#;

    // The config is deserialized from an empty JSON object; its concrete type
    // is inferred from the `extract_file` call below.
    let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
    let config = serde_json::from_value(config_json).unwrap();

    // `Some(&str)` is already an `Option<&str>`, so no `as_deref()` is needed.
    let result = extract_file(path, Some(expected_mime), &config)
        .await
        .expect("should succeed");

    let reported_mime = result.mime_type.to_string();
    assert_eq!(reported_mime.trim(), expected_mime, "equals assertion failed");

    assert!(
        result.content.len() >= 5,
        "expected length >= 5, got {}",
        result.content.len()
    );
}
/// Smoke test: XLSX with basic spreadsheet data including tables.
///
/// Extracts `xlsx/stanley_cups.xlsx` from the shared `test_documents` directory
/// with an explicit spreadsheet MIME hint, then checks the reported MIME type,
/// a minimum content length of 100 (as reported by `len()`), and that every
/// expected header, team name, and abbreviation appears in the content.
///
/// # Panics
///
/// Panics when extraction returns an error or when any assertion fails.
#[tokio::test]
async fn test_smoke_xlsx_basic() {
    let path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "xlsx/stanley_cups.xlsx"
    );
    // Used both as the MIME hint and as the expected reported MIME type.
    let expected_mime = r#"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"#;

    // The config is deserialized from an empty JSON object; its concrete type
    // is inferred from the `extract_file` call below.
    let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
    let config = serde_json::from_value(config_json).unwrap();

    // `Some(&str)` is already an `Option<&str>`, so no `as_deref()` is needed.
    let result = extract_file(path, Some(expected_mime), &config)
        .await
        .expect("should succeed");

    let reported_mime = result.mime_type.to_string();
    assert_eq!(reported_mime.trim(), expected_mime, "equals assertion failed");

    assert!(
        result.content.len() >= 100,
        "expected length >= 100, got {}",
        result.content.len()
    );

    // Every needle must be present. The content is searched directly, as in the
    // other smoke tests, instead of allocating a Debug-formatted copy per check.
    // The needles are plain ASCII without quotes or backslashes, so Debug
    // escaping could not change the outcome.
    let required = [
        r#"Team"#,
        r#"Location"#,
        r#"Stanley Cups"#,
        r#"Blues"#,
        r#"Flyers"#,
        r#"Maple Leafs"#,
        r#"STL"#,
        r#"PHI"#,
        r#"TOR"#,
    ];
    for needle in required.iter() {
        assert!(
            result.content.contains(*needle),
            "expected to contain: {}",
            needle
        );
    }
    // skipped: field 'tables' not available on result type
    // skipped: field 'metadata.format.excel.sheet_count' not available on result type
    // skipped: field 'metadata.format.excel.sheet_names' not available on result type
}