This commit is contained in:
7184
e2e/rust/Cargo.lock
generated
Normal file
7184
e2e/rust/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
24
e2e/rust/Cargo.toml
generated
Normal file
24
e2e/rust/Cargo.toml
generated
Normal file
@@ -0,0 +1,24 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
[workspace]
|
||||
|
||||
[package]
|
||||
name = "kreuzberg-e2e-rust"
|
||||
version = "5.0.0-rc.3"
|
||||
edition = "2021"
|
||||
license = "MIT"
|
||||
publish = false
|
||||
|
||||
[dependencies]
|
||||
kreuzberg = { path = "../../crates/kreuzberg", default-features = false, features = ["full", "pdf", "ocr", "paddle-ocr", "paddle-ocr-types", "layout-detection", "layout-types", "embeddings", "embedding-presets", "chunking", "keywords-yake", "keywords-rake", "language-detection", "html", "tree-sitter", "office", "email", "archives", "stopwords", "auto-rotate", "auto-rotate-types", "tokio-runtime", "api", "mcp", "liter-llm", "quality"] }
|
||||
serde_json = "1"
|
||||
anyhow = "1"
|
||||
async-trait = "0.1"
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = ["serde_json", "anyhow", "async-trait"]
|
||||
63
e2e/rust/tests/async_test.rs
generated
Normal file
63
e2e/rust/tests/async_test.rs
generated
Normal file
@@ -0,0 +1,63 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: async
|
||||
|
||||
use kreuzberg::extract_bytes;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_async_extract_bytes() {
|
||||
// Async extract_bytes call on PDF document
|
||||
let content = std::fs::read(concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/pdf/fake_memo.pdf"
|
||||
))
|
||||
.expect("test_documents/pdf/fake_memo.pdf must exist");
|
||||
let mime_type = r#"application/pdf"#;
|
||||
let config = Default::default();
|
||||
let result = extract_bytes(&content, mime_type, &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/pdf"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 50,
|
||||
"expected length >= 50, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_async_extract_bytes_empty_mime() {
|
||||
// extract_bytes empty MIME async
|
||||
let content = std::fs::read(concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/text/plain.txt"
|
||||
))
|
||||
.expect("test_documents/text/plain.txt must exist");
|
||||
let mime_type = r#""#;
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_bytes(&content, mime_type, &config).await;
|
||||
assert!(result.is_err(), "expected call to fail");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_async_extract_bytes_invalid_mime() {
|
||||
// extract_bytes unsupported MIME async
|
||||
let content = std::fs::read(concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/text/plain.txt"
|
||||
))
|
||||
.expect("test_documents/text/plain.txt must exist");
|
||||
let mime_type = r#"application/x-nonexistent"#;
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_bytes(&content, mime_type, &config).await;
|
||||
assert!(result.is_err(), "expected call to fail");
|
||||
}
|
||||
111
e2e/rust/tests/batch_test.rs
generated
Normal file
111
e2e/rust/tests/batch_test.rs
generated
Normal file
@@ -0,0 +1,111 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: batch
|
||||
|
||||
use kreuzberg::BatchBytesItem;
|
||||
use kreuzberg::BatchFileItem;
|
||||
use kreuzberg::{batch_extract_bytes, batch_extract_bytes_sync, batch_extract_files, batch_extract_files_sync};
|
||||
|
||||
/// Feeds `batch_extract_bytes_sync` one item whose MIME type is
/// `application/x-nonexistent`; the batch call itself must return `Ok`.
#[test]
fn test_batch_bytes_invalid_mime() {
    let raw_items = serde_json::from_str::<serde_json::Value>(
        r#"[{"content":[72,101,108,108,111],"mime_type":"application/x-nonexistent"}]"#,
    )
    .unwrap();
    let items: Vec<BatchBytesItem> = serde_json::from_value(raw_items).unwrap();
    let config = Default::default();

    // Only the outer result is checked; the returned value is discarded.
    let _ = batch_extract_bytes_sync(items, &config).expect("should succeed");
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_batch_extract_bytes_happy() {
|
||||
// batch_extract_bytes: happy path with mixed inputs
|
||||
let items_json: serde_json::Value = serde_json::from_str(r#"[{"content":[72,101,108,108,111,44,32,119,111,114,108,100,33],"mime_type":"text/plain"},{"content":[60,104,116,109,108,62,60,98,111,100,121,62,84,101,115,116,60,47,98,111,100,121,62,60,47,104,116,109,108,62],"mime_type":"text/html"}]"#).unwrap();
|
||||
let items = serde_json::from_value::<Vec<BatchBytesItem>>(items_json).unwrap();
|
||||
let config = Default::default();
|
||||
let result = batch_extract_bytes(items, &config).await.expect("should succeed");
|
||||
assert!(!result.is_empty(), "expected >= 1");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_batch_extract_bytes_mixed_format() {
|
||||
// batch_extract_bytes: handles unsupported MIME gracefully
|
||||
let items_json: serde_json::Value = serde_json::from_str(
|
||||
r#"[{"content":[80,68,70,32,112,108,97,99,101,104,111,108,100,101,114],"mime_type":"application/x-unknown"}]"#,
|
||||
)
|
||||
.unwrap();
|
||||
let items = serde_json::from_value::<Vec<BatchBytesItem>>(items_json).unwrap();
|
||||
let config = Default::default();
|
||||
let _ = batch_extract_bytes(items, &config).await.expect("should succeed");
|
||||
}
|
||||
|
||||
/// An empty batch passed to `batch_extract_bytes_sync` must yield an empty
/// result collection.
#[test]
fn test_batch_extract_bytes_sync_empty_list() {
    let raw_items = serde_json::from_str::<serde_json::Value>(r#"[]"#).unwrap();
    let items: Vec<BatchBytesItem> = serde_json::from_value(raw_items).unwrap();
    let config = Default::default();

    let extracted = batch_extract_bytes_sync(items, &config).expect("should succeed");

    let count = extracted.len();
    assert_eq!(count, 0, "expected exactly 0 elements, got {count}");
}
|
||||
|
||||
/// Feeds `batch_extract_bytes_sync` one `application/x-unknown` item; the
/// batch call itself must return `Ok`.
#[test]
fn test_batch_extract_bytes_sync_invalid_mime() {
    let raw_items = serde_json::from_str::<serde_json::Value>(
        r#"[{"content":[100,97,116,97],"mime_type":"application/x-unknown"}]"#,
    )
    .unwrap();
    let items: Vec<BatchBytesItem> = serde_json::from_value(raw_items).unwrap();
    let config = Default::default();

    // Only the outer result is checked; the returned value is discarded.
    let _ = batch_extract_bytes_sync(items, &config).expect("should succeed");
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_batch_file_async_basic() {
|
||||
// Extract text from multiple files asynchronously
|
||||
let paths_json: serde_json::Value =
|
||||
serde_json::from_str(r#"[{"path":"pdf/fake_memo.pdf"},{"path":"text/fake_text.txt"}]"#).unwrap();
|
||||
let paths = serde_json::from_value::<Vec<BatchFileItem>>(paths_json).unwrap();
|
||||
let config = Default::default();
|
||||
let _ = batch_extract_files(paths, &config).await.expect("should succeed");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_batch_file_async_not_found() {
|
||||
// batch_extract_file async nonexistent
|
||||
let paths_json: serde_json::Value = serde_json::from_str(r#"[{"path":"/nonexistent/a.pdf"}]"#).unwrap();
|
||||
let paths = serde_json::from_value::<Vec<BatchFileItem>>(paths_json).unwrap();
|
||||
let config = Default::default();
|
||||
let _ = batch_extract_files(paths, &config).await.expect("should succeed");
|
||||
}
|
||||
|
||||
/// Passes two `/nonexistent/...` items to `batch_extract_files_sync`; the
/// batch call itself must return `Ok`.
#[test]
fn test_batch_file_not_found() {
    let raw_paths = serde_json::from_str::<serde_json::Value>(
        r#"[{"path":"/nonexistent/a.pdf"},{"path":"/nonexistent/b.txt"}]"#,
    )
    .unwrap();
    let paths: Vec<BatchFileItem> = serde_json::from_value(raw_paths).unwrap();
    let config = Default::default();

    // Only the outer result is checked; the returned value is discarded.
    let _ = batch_extract_files_sync(paths, &config).expect("should succeed");
}
|
||||
|
||||
/// Passes `text/plain.txt` together with `/nonexistent/missing.pdf` to
/// `batch_extract_files_sync`; the batch call itself must return `Ok`.
#[test]
fn test_batch_file_partial() {
    let raw_paths = serde_json::from_str::<serde_json::Value>(
        r#"[{"path":"text/plain.txt"},{"path":"/nonexistent/missing.pdf"}]"#,
    )
    .unwrap();
    let paths: Vec<BatchFileItem> = serde_json::from_value(raw_paths).unwrap();
    let config = Default::default();

    // Only the outer result is checked; the returned value is discarded.
    let _ = batch_extract_files_sync(paths, &config).expect("should succeed");
}
|
||||
|
||||
/// Passes two file items (`pdf/fake_memo.pdf`, `text/fake_text.txt`) to
/// `batch_extract_files_sync` and expects the batch call to return `Ok`.
#[test]
fn test_batch_file_sync_basic() {
    let raw_paths = serde_json::from_str::<serde_json::Value>(
        r#"[{"path":"pdf/fake_memo.pdf"},{"path":"text/fake_text.txt"}]"#,
    )
    .unwrap();
    let paths: Vec<BatchFileItem> = serde_json::from_value(raw_paths).unwrap();
    let config = Default::default();

    // Only the outer result is checked; the returned value is discarded.
    let _ = batch_extract_files_sync(paths, &config).expect("should succeed");
}
|
||||
37
e2e/rust/tests/code_test.rs
generated
Normal file
37
e2e/rust/tests/code_test.rs
generated
Normal file
@@ -0,0 +1,37 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: code
|
||||
|
||||
use kreuzberg::extract_file_sync;
|
||||
|
||||
/// Extracts `code/script.sh` with an explicit `text/x-source-code` MIME type
/// and checks the reported MIME type, a minimum content length, and that the
/// extracted text mentions both `build` and `clean`.
#[test]
fn test_code_shebang_detection() {
    let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "code/script.sh");
    let config = Default::default();

    // The script is read from disk through `extract_file_sync`; no byte buffer
    // is passed in.
    let result = extract_file_sync(path, Some("text/x-source-code"), &config).expect("should succeed");

    assert_eq!(
        result.mime_type.to_string().as_str().trim(),
        r#"text/x-source-code"#,
        "equals assertion failed"
    );
    assert!(
        result.content.len() >= 10,
        "expected length >= 10, got {}",
        result.content.len()
    );

    // Search the extracted text itself rather than its `Debug` rendering: the
    // latter allocates a throwaway string and wraps/escapes the content.
    for needle in ["build", "clean"] {
        assert!(result.content.contains(needle), "expected to contain: {}", needle);
    }
}
|
||||
496
e2e/rust/tests/contract_test.rs
generated
Normal file
496
e2e/rust/tests/contract_test.rs
generated
Normal file
@@ -0,0 +1,496 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: contract
|
||||
|
||||
use kreuzberg::{extract_bytes_sync, extract_file, extract_file_sync};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_api_batch_bytes_async() {
|
||||
// Tests async batch bytes extraction API (batch_extract_bytes)
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"pdf/fake_memo.pdf"
|
||||
);
|
||||
let mime_type: Option<String> = None;
|
||||
let config = Default::default();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/pdf"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 10,
|
||||
"expected length >= 10, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
result.content.contains(r#"May 5, 2023"#) || result.content.contains(r#"Mallori"#),
|
||||
"expected to contain at least one of the specified values"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_api_batch_bytes_with_configs_async() {
|
||||
// Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"pdf/fake_memo.pdf"
|
||||
);
|
||||
let mime_type: Option<String> = None;
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{"output_format":"markdown"}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
let _metadata_output_format = result
|
||||
.metadata
|
||||
.output_format
|
||||
.as_ref()
|
||||
.map(|v| v.to_string())
|
||||
.unwrap_or_default();
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/pdf"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 10,
|
||||
"expected length >= 10, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
// skipped: field 'metadata.output_format' not available on result type
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_api_batch_file_async() {
|
||||
// Tests async batch file extraction API (batch_extract_file)
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"pdf/fake_memo.pdf"
|
||||
);
|
||||
let mime_type: Option<String> = None;
|
||||
let config = Default::default();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/pdf"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 10,
|
||||
"expected length >= 10, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
result.content.contains(r#"May 5, 2023"#) || result.content.contains(r#"Mallori"#),
|
||||
"expected to contain at least one of the specified values"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_api_batch_file_with_configs_async() {
|
||||
// Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter)
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"pdf/fake_memo.pdf"
|
||||
);
|
||||
let mime_type: Option<String> = None;
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{"output_format":"markdown"}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
let _metadata_output_format = result
|
||||
.metadata
|
||||
.output_format
|
||||
.as_ref()
|
||||
.map(|v| v.to_string())
|
||||
.unwrap_or_default();
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/pdf"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 10,
|
||||
"expected length >= 10, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
// skipped: field 'metadata.output_format' not available on result type
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_api_extract_bytes_async() {
|
||||
// Tests async bytes extraction API (extract_bytes)
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"pdf/fake_memo.pdf"
|
||||
);
|
||||
let mime_type: Option<String> = None;
|
||||
let config = Default::default();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/pdf"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 10,
|
||||
"expected length >= 10, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
result.content.contains(r#"May 5, 2023"#) || result.content.contains(r#"Mallori"#),
|
||||
"expected to contain at least one of the specified values"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_api_extract_file_async() {
|
||||
// Tests async file extraction API (extract_file)
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"pdf/fake_memo.pdf"
|
||||
);
|
||||
let mime_type: Option<String> = None;
|
||||
let config = Default::default();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/pdf"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 10,
|
||||
"expected length >= 10, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
result.content.contains(r#"May 5, 2023"#) || result.content.contains(r#"Mallori"#),
|
||||
"expected to contain at least one of the specified values"
|
||||
);
|
||||
}
|
||||
|
||||
/// Chunks `markdown/extraction_test.md` with the markdown chunker
/// (`max_chars` 300, `max_overlap` 50, `prepend_heading_context` enabled) and
/// checks that at least two non-empty chunks come back and that the first one
/// starts with a `#` heading marker.
#[test]
fn test_config_chunking_prepend_heading_context() {
    let path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "markdown/extraction_test.md"
    );
    let mime_type: Option<String> = None;
    let config_json: serde_json::Value = serde_json::from_str(
        r#"{"chunking":{"chunker_type":"markdown","max_chars":300,"max_overlap":50,"prepend_heading_context":true}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(config_json).unwrap();

    let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed");
    // `Option<&_>` is `Copy`, so the same borrowed view is reused below.
    let chunks = result.chunks.as_ref();

    assert!(
        result.content.len() >= 10,
        "expected length >= 10, got {}",
        result.content.len()
    );
    assert!(chunks.map_or(0, |list| list.len()) >= 2, "expected >= 2 chunks");
    // A missing chunk list fails this check just like an empty one.
    assert!(
        chunks.is_some_and(|list| !list.is_empty() && list.iter().all(|chunk| !chunk.content.is_empty())),
        "expected all chunks to have content"
    );
    // Heading context is verified on the first chunk only.
    assert!(
        chunks.is_some_and(|list| list
            .first()
            .is_some_and(|chunk| chunk.content.trim_start().starts_with('#'))),
        "expected first chunk to start with heading"
    );
}
|
||||
|
||||
/// Extracts `docx/fake.docx` with `include_document_structure` enabled and
/// checks that the reported MIME type is the DOCX MIME type.
#[test]
fn test_config_document_structure_with_headings() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "docx/fake.docx");

    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{"include_document_structure":true}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let result = extract_file_sync(FIXTURE, None::<&str>, &config).expect("should succeed");

    let reported_mime = result.mime_type.to_string();
    assert_eq!(
        reported_mime.trim(),
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "equals assertion failed"
    );
    // skipped: fields 'document' and 'document.nodes' not available on result type
}
|
||||
|
||||
/// Extracts `docx/unit_test_headers.docx` with `result_format` set to
/// `element_based` and checks that the MIME type contains the DOCX MIME type.
#[test]
fn test_config_element_types() {
    const FIXTURE: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "docx/unit_test_headers.docx"
    );

    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{"result_format":"element_based"}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let result = extract_file_sync(FIXTURE, None::<&str>, &config).expect("should succeed");

    let is_docx = result
        .mime_type
        .contains("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    assert!(is_docx, "expected to contain at least one of the specified values");
    // skipped: field 'elements' not available on result type
}
|
||||
|
||||
/// Extracts `pdf/fake_memo.pdf` with `extraction_timeout_secs` set to 300 and
/// checks that extraction still succeeds with the expected MIME type and a
/// minimum content length.
#[test]
fn test_config_extraction_timeout() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf");

    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{"extraction_timeout_secs":300}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let result = extract_file_sync(FIXTURE, None::<&str>, &config).expect("should succeed");

    let reported_mime = result.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = result.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {content_len}");
}
|
||||
|
||||
/// Extracts `pdf/fake_memo.pdf` with the `yake` keyword algorithm
/// (`max_keywords` 10) and checks the MIME type, a minimum content length,
/// and that a non-empty keyword list is returned.
#[test]
fn test_config_keywords() {
    let path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );
    let mime_type: Option<String> = None;
    let config_json: serde_json::Value =
        serde_json::from_str(r#"{"keywords":{"algorithm":"yake","max_keywords":10}}"#).unwrap();
    let config = serde_json::from_value(config_json).unwrap();

    let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed");

    assert_eq!(
        result.mime_type.to_string().as_str().trim(),
        r#"application/pdf"#,
        "equals assertion failed"
    );
    assert!(
        result.content.len() >= 10,
        "expected length >= 10, got {}",
        result.content.len()
    );
    // One check covers both "present" and "at least one keyword"; the
    // generated file repeated this exact predicate twice.
    assert!(
        result.extracted_keywords.as_ref().is_some_and(|v| !v.is_empty()),
        "expected keywords to be present and non-empty"
    );
}
|
||||
|
||||
/// Extracts `pdf/fake_memo.pdf` with `extract_pages` and
/// `insert_page_markers` enabled and checks the MIME type, a minimum content
/// length, and that the content contains the text `PAGE`.
#[test]
fn test_config_pages() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf");

    let raw_config = serde_json::from_str::<serde_json::Value>(
        r#"{"pages":{"extract_pages":true,"insert_page_markers":true}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let result = extract_file_sync(FIXTURE, None::<&str>, &config).expect("should succeed");

    let reported_mime = result.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = result.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {content_len}");

    let has_page_marker = result.content.contains("PAGE");
    assert!(
        has_page_marker,
        "expected to contain at least one of the specified values"
    );
}
|
||||
|
||||
/// Extracts `pdf/fake_memo.pdf` with `enable_quality_processing` turned on
/// and checks the MIME type and a minimum content length.
#[test]
fn test_config_quality_enabled() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf");

    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{"enable_quality_processing":true}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let result = extract_file_sync(FIXTURE, None::<&str>, &config).expect("should succeed");

    let reported_mime = result.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = result.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {content_len}");
    // skipped (three checks): field 'quality_score' not available on result type
}
|
||||
|
||||
/// Extracts `archives/documents.zip` with custom `security_limits` and checks
/// that the MIME type is one of the accepted ZIP types and that the content
/// has a minimum length.
#[test]
fn test_config_security_limits() {
    const FIXTURE: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "archives/documents.zip"
    );

    let raw_config = serde_json::from_str::<serde_json::Value>(
        r#"{"security_limits":{"max_archive_size":104857600,"max_compression_ratio":50,"max_files_in_archive":100}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let result = extract_file_sync(FIXTURE, None::<&str>, &config).expect("should succeed");

    let is_zip = ["application/zip", "application/x-zip-compressed"]
        .into_iter()
        .any(|candidate| result.mime_type.contains(candidate));
    assert!(is_zip, "expected to contain at least one of the specified values");

    let content_len = result.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {content_len}");
}
|
||||
|
||||
/// Extracts `code/hello.py` with a `tree_sitter` configuration and checks the
/// reported MIME type and a minimum content length.
#[test]
fn test_config_tree_sitter() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "code/hello.py");

    let raw_config = serde_json::from_str::<serde_json::Value>(
        r#"{"tree_sitter":{"groups":["web"],"languages":["python","rust"],"process":{"comments":false,"diagnostics":false,"docstrings":false,"exports":true,"imports":true,"structure":true,"symbols":false}}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let result = extract_file_sync(FIXTURE, None::<&str>, &config).expect("should succeed");

    let reported_mime = result.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "text/x-source-code", "equals assertion failed");

    let content_len = result.content.len();
    assert!(content_len >= 5, "expected length >= 5, got {content_len}");
}
|
||||
|
||||
/// Runs `extract_bytes_sync` over the bytes of `pdf/fake_memo.pdf` with
/// `output_format` set to `markdown` and checks the MIME type and a minimum
/// content length.
#[test]
fn test_output_format_bytes_markdown() {
    let content = std::fs::read(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/pdf/fake_memo.pdf"
    ))
    .expect("test_documents/pdf/fake_memo.pdf must exist");
    let mime_type = r#"application/pdf"#;
    let config_json: serde_json::Value = serde_json::from_str(r#"{"output_format":"markdown"}"#).unwrap();
    let config = serde_json::from_value(config_json).unwrap();

    let result = extract_bytes_sync(&content, mime_type, &config).expect("should succeed");

    assert_eq!(
        result.mime_type.to_string().as_str().trim(),
        r#"application/pdf"#,
        "equals assertion failed"
    );
    assert!(
        result.content.len() >= 10,
        "expected length >= 10, got {}",
        result.content.len()
    );
    // No assertion is made on `metadata.output_format`: the generator skipped
    // that check, so the value is not read here either.
}
|
||||
|
||||
/// Extracts `pdf/fake_memo.pdf` via `extract_file_sync` with `output_format`
/// set to `markdown` and checks the MIME type and a minimum content length.
#[test]
fn test_output_format_markdown() {
    let path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );
    let mime_type: Option<String> = None;
    let config_json: serde_json::Value = serde_json::from_str(r#"{"output_format":"markdown"}"#).unwrap();
    let config = serde_json::from_value(config_json).unwrap();

    let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed");

    assert_eq!(
        result.mime_type.to_string().as_str().trim(),
        r#"application/pdf"#,
        "equals assertion failed"
    );
    assert!(
        result.content.len() >= 10,
        "expected length >= 10, got {}",
        result.content.len()
    );
    // No assertion is made on `metadata.output_format`: the generator skipped
    // that check, so the value is not read here either.
}
|
||||
49
e2e/rust/tests/detection_test.rs
generated
Normal file
49
e2e/rust/tests/detection_test.rs
generated
Normal file
@@ -0,0 +1,49 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: detection
|
||||
|
||||
use kreuzberg::{detect_mime_type_from_bytes, get_extensions_for_mime};
|
||||
|
||||
/// Calls `detect_mime_type_from_bytes` on the bytes of `html/html.html` and
/// expects the call to succeed.
///
/// NOTE(review): the detected value is discarded, so the MIME type itself is
/// not verified here.
#[test]
fn test_detect_mime_bytes_html() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/html/html.html");

    let html_bytes = std::fs::read(FIXTURE).expect("test_documents/html/html.html must exist");

    let _ = detect_mime_type_from_bytes(&html_bytes).expect("should succeed");
}
|
||||
|
||||
/// Calls `detect_mime_type_from_bytes` on the bytes of `pdf/fake_memo.pdf`
/// and expects the call to succeed.
///
/// NOTE(review): the detected value is discarded, so the MIME type itself is
/// not verified here.
#[test]
fn test_detect_mime_bytes_pdf() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");

    let pdf_bytes = std::fs::read(FIXTURE).expect("test_documents/pdf/fake_memo.pdf must exist");

    let _ = detect_mime_type_from_bytes(&pdf_bytes).expect("should succeed");
}
|
||||
|
||||
/// Calls `detect_mime_type_from_bytes` on the bytes of
/// `images/test_hello_world.png` and expects the call to succeed.
///
/// NOTE(review): the detected value is discarded, so the MIME type itself is
/// not verified here.
#[test]
fn test_detect_mime_bytes_png() {
    const FIXTURE: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/images/test_hello_world.png"
    );

    let png_bytes = std::fs::read(FIXTURE).expect("test_documents/images/test_hello_world.png must exist");

    let _ = detect_mime_type_from_bytes(&png_bytes).expect("should succeed");
}
|
||||
|
||||
/// Looking up extensions for an unknown MIME type must return an error.
#[test]
fn test_get_extensions_unknown_mime() {
    let lookup = get_extensions_for_mime(r#"application/x-totally-unknown"#);
    assert!(lookup.is_err(), "expected call to fail");
}
|
||||
20
e2e/rust/tests/document_extractor_management_test.rs
generated
Normal file
20
e2e/rust/tests/document_extractor_management_test.rs
generated
Normal file
@@ -0,0 +1,20 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: document_extractor_management
|
||||
|
||||
use kreuzberg::{clear_document_extractors, list_document_extractors};
|
||||
|
||||
/// Smoke test: invokes `clear_document_extractors`.
///
/// The return value is discarded and the registry is not inspected afterwards,
/// so this only proves the call does not panic.
#[test]
fn test_document_extractors_clear() {
    let _outcome = clear_document_extractors();
}
|
||||
|
||||
/// Smoke test: invokes `list_document_extractors` and discards whatever it returns.
#[test]
fn test_extractors_list() {
    let _listing = list_document_extractors();
}
|
||||
39
e2e/rust/tests/embed_async_pending_test.rs
generated
Normal file
39
e2e/rust/tests/embed_async_pending_test.rs
generated
Normal file
@@ -0,0 +1,39 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: embed_async_pending
|
||||
|
||||
use kreuzberg::embed_texts_async;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_embed_texts_async_empty_input() {
|
||||
// embed_texts_async: empty text list
|
||||
let texts_json: serde_json::Value = serde_json::from_str(r#"[]"#).unwrap();
|
||||
let texts = serde_json::from_value::<Vec<String>>(texts_json).unwrap();
|
||||
let config = Default::default();
|
||||
let result = embed_texts_async(texts, &config).await.expect("should succeed");
|
||||
assert_eq!(result.len(), 0, "expected exactly 0 elements, got {}", result.len());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_embed_texts_async_happy() {
|
||||
// embed_texts_async: basic async embedding
|
||||
let texts_json: serde_json::Value = serde_json::from_str(r#"["First","Second"]"#).unwrap();
|
||||
let texts = serde_json::from_value::<Vec<String>>(texts_json).unwrap();
|
||||
let config = Default::default();
|
||||
let result = embed_texts_async(texts, &config).await.expect("should succeed");
|
||||
assert!(result.len() >= 2, "expected at least 2 elements, got {}", result.len());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_embed_texts_async_preset_switch() {
|
||||
// embed_texts_async: preset override
|
||||
let texts_json: serde_json::Value = serde_json::from_str(r#"["Text"]"#).unwrap();
|
||||
let texts = serde_json::from_value::<Vec<String>>(texts_json).unwrap();
|
||||
let config_json: serde_json::Value =
|
||||
serde_json::from_str(r#"{"model":{"name":"balanced","type":"preset"}}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let _ = embed_texts_async(texts, &config).await.expect("should succeed");
|
||||
}
|
||||
19
e2e/rust/tests/embed_extra_test.rs
generated
Normal file
19
e2e/rust/tests/embed_extra_test.rs
generated
Normal file
@@ -0,0 +1,19 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: embed_extra
|
||||
|
||||
use kreuzberg::embed_texts;
|
||||
|
||||
/// Embeds a two-element batch with a config that selects the "balanced" preset model.
///
/// Only the success of the call is checked; the embeddings are discarded.
#[test]
fn test_embed_texts_batch() {
    let raw_texts = serde_json::from_str::<serde_json::Value>(r#"["Hello","World"]"#).unwrap();
    let texts: Vec<String> = serde_json::from_value(raw_texts).unwrap();

    let raw_config =
        serde_json::from_str::<serde_json::Value>(r#"{"model":{"name":"balanced","type":"preset"}}"#).unwrap();
    let preset_config = serde_json::from_value(raw_config).unwrap();

    let outcome = embed_texts(texts, &preset_config);
    let _ = outcome.expect("should succeed");
}
|
||||
20
e2e/rust/tests/embedding_backend_management_test.rs
generated
Normal file
20
e2e/rust/tests/embedding_backend_management_test.rs
generated
Normal file
@@ -0,0 +1,20 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: embedding_backend_management
|
||||
|
||||
use kreuzberg::{clear_embedding_backends, list_embedding_backends};
|
||||
|
||||
/// Smoke test: invokes `clear_embedding_backends`.
///
/// The return value is discarded and the registry is not inspected afterwards,
/// so this only proves the call does not panic.
#[test]
fn test_embedding_backends_clear() {
    let _outcome = clear_embedding_backends();
}
|
||||
|
||||
/// Smoke test: invokes `list_embedding_backends` and discards whatever it returns.
#[test]
fn test_embedding_backends_list() {
    let _listing = list_embedding_backends();
}
|
||||
49
e2e/rust/tests/embeddings_test.rs
generated
Normal file
49
e2e/rust/tests/embeddings_test.rs
generated
Normal file
@@ -0,0 +1,49 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: embeddings
|
||||
|
||||
use kreuzberg::{embed_texts, get_embedding_preset, list_embedding_presets};
|
||||
|
||||
/// Embedding two texts with the "multilingual" preset must yield at least two results.
#[test]
fn test_embed_texts_different_preset() {
    let raw_texts = serde_json::from_str::<serde_json::Value>(r#"["Hello world","Test"]"#).unwrap();
    let texts: Vec<String> = serde_json::from_value(raw_texts).unwrap();

    let raw_config =
        serde_json::from_str::<serde_json::Value>(r#"{"model":{"name":"multilingual","type":"preset"}}"#).unwrap();
    let preset_config = serde_json::from_value(raw_config).unwrap();

    let embeddings = embed_texts(texts, &preset_config).expect("should succeed");

    let count = embeddings.len();
    assert!(count >= 2, "expected at least 2 elements, got {}", count);
}
|
||||
|
||||
/// Looking up the known "balanced" preset must return `Some`.
///
/// The previous version discarded the lookup result, so the test could never fail;
/// the `Option` is now asserted to be populated.
#[test]
fn test_get_embedding_preset_known() {
    let name = r#"balanced"#;
    let preset = get_embedding_preset(name);
    assert!(preset.is_some(), "expected preset {:?} to be found", name);
}
|
||||
|
||||
/// Nominal case: the "balanced" preset lookup must return `Some`.
///
/// The previous version discarded the lookup result, so the test could never fail;
/// the `Option` is now asserted to be populated.
#[test]
fn test_get_embedding_preset_nominal() {
    let name = r#"balanced"#;
    let preset = get_embedding_preset(name);
    assert!(preset.is_some(), "expected preset {:?} to be found", name);
}
|
||||
|
||||
/// Looking up a preset name that does not exist must return `None`.
#[test]
fn test_get_embedding_preset_unknown() {
    let lookup = get_embedding_preset(r#"nonexistent-xyz"#);
    assert!(lookup.is_none(), "expected Option to be is_none");
}
|
||||
|
||||
/// The preset listing must contain at least one entry.
#[test]
fn test_list_embedding_presets_sanity() {
    let presets = list_embedding_presets();
    let has_entries = !presets.is_empty();
    assert!(has_entries, "expected non-empty value");
}
|
||||
82
e2e/rust/tests/error_test.rs
generated
Normal file
82
e2e/rust/tests/error_test.rs
generated
Normal file
@@ -0,0 +1,82 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: error
|
||||
|
||||
use kreuzberg::extract_bytes_sync;
|
||||
|
||||
/// Extracting the `empty.txt` fixture as `text/plain` must succeed rather than error.
///
/// The extraction result itself is discarded.
#[test]
fn test_error_empty_bytes() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/text/empty.txt");

    let bytes = std::fs::read(FIXTURE).expect("test_documents/text/empty.txt must exist");

    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let outcome = extract_bytes_sync(&bytes, r#"text/plain"#, &config);
    let _ = outcome.expect("should succeed");
}
|
||||
|
||||
/// Extraction with an empty MIME type string must return an error.
#[test]
fn test_error_empty_mime() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/text/plain.txt");

    let bytes = std::fs::read(FIXTURE).expect("test_documents/text/plain.txt must exist");

    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let outcome = extract_bytes_sync(&bytes, r#""#, &config);
    assert!(outcome.is_err(), "expected call to fail");
}
|
||||
|
||||
/// Extraction must fail when the config sets both `disable_ocr` and `force_ocr` to true.
#[test]
fn test_error_extract_bytes_conflicting_ocr() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/text/fake_text.txt");

    let bytes = std::fs::read(FIXTURE).expect("test_documents/text/fake_text.txt must exist");

    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{"disable_ocr":true,"force_ocr":true}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let outcome = extract_bytes_sync(&bytes, r#"text/plain"#, &config);
    assert!(outcome.is_err(), "expected call to fail");
}
|
||||
|
||||
/// Extraction with a malformed MIME type (`not-a-mime`) must return an error.
#[test]
fn test_error_invalid_mime_format() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/text/plain.txt");

    let bytes = std::fs::read(FIXTURE).expect("test_documents/text/plain.txt must exist");

    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let outcome = extract_bytes_sync(&bytes, r#"not-a-mime"#, &config);
    assert!(outcome.is_err(), "expected call to fail");
}
|
||||
|
||||
/// Extraction with an unsupported MIME type must return an error.
#[test]
fn test_error_unsupported_mime() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/text/plain.txt");

    let bytes = std::fs::read(FIXTURE).expect("test_documents/text/plain.txt must exist");

    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let outcome = extract_bytes_sync(&bytes, r#"application/x-nonexistent"#, &config);
    assert!(outcome.is_err(), "expected call to fail");
}
|
||||
93
e2e/rust/tests/format_specific_test.rs
generated
Normal file
93
e2e/rust/tests/format_specific_test.rs
generated
Normal file
@@ -0,0 +1,93 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: format_specific
|
||||
|
||||
use kreuzberg::{extract_bytes_sync, extract_file_sync};
|
||||
|
||||
/// Extracts the DOCX fixture via `extract_bytes_sync` with a default config
/// and requires at least 20 bytes of content.
#[test]
fn test_format_docx_standalone() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/docx/fake.docx");
    const MIME: &str = r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#;

    let bytes = std::fs::read(FIXTURE).expect("test_documents/docx/fake.docx must exist");
    let default_config = Default::default();

    let extraction = extract_bytes_sync(&bytes, MIME, &default_config).expect("should succeed");

    let content_len = extraction.content.len();
    assert!(content_len >= 20, "expected length >= 20, got {}", content_len);
}
|
||||
|
||||
/// Extracts the HWPX fixture via `extract_bytes_sync` with a default config.
///
/// Requires at least 20 bytes of content and that the content contains
/// "Hello from HWPX". The substring check runs on the content itself rather
/// than on its `Debug` rendering, which avoids an extra allocation and
/// escaped text, and matches how `test_format_pdf_text` queries `content`.
#[test]
fn test_format_hwpx_standalone() {
    let content = std::fs::read(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/hwpx/simple.hwpx"
    ))
    .expect("test_documents/hwpx/simple.hwpx must exist");
    let mime_type = r#"application/haansofthwpx"#;
    let config = Default::default();

    let result = extract_bytes_sync(&content, mime_type, &config).expect("should succeed");

    assert!(
        result.content.len() >= 20,
        "expected length >= 20, got {}",
        result.content.len()
    );
    assert!(
        result.content.contains(r#"Hello from HWPX"#),
        "expected to contain: {}",
        r#"Hello from HWPX"#
    );
}
|
||||
|
||||
/// Extracts the PDF fixture via `extract_bytes_sync` with a default config.
///
/// Requires at least 50 bytes of content and that the content contains
/// either "Mallori" or "May".
#[test]
fn test_format_pdf_text() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");

    let bytes = std::fs::read(FIXTURE).expect("test_documents/pdf/fake_memo.pdf must exist");
    let default_config = Default::default();

    let extraction = extract_bytes_sync(&bytes, r#"application/pdf"#, &default_config).expect("should succeed");

    let content_len = extraction.content.len();
    assert!(content_len >= 50, "expected length >= 50, got {}", content_len);

    let has_expected_text = extraction.content.contains(r#"Mallori"#) || extraction.content.contains(r#"May"#);
    assert!(
        has_expected_text,
        "expected to contain at least one of the specified values"
    );
}
|
||||
|
||||
/// Extracts the PPTX fixture via `extract_file_sync` with an explicit MIME type
/// and a default config.
///
/// Only the success of the call is checked; the extraction result is discarded.
#[test]
fn test_format_pptx() {
    const DOCUMENT: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pptx/simple.pptx");
    const MIME: &str = r#"application/vnd.openxmlformats-officedocument.presentationml.presentation"#;

    let default_config = Default::default();
    let outcome = extract_file_sync(DOCUMENT, Some(MIME), &default_config);
    let _ = outcome.expect("should succeed");
}
|
||||
|
||||
/// Extracts the XLSX fixture via `extract_file_sync` with an explicit MIME type
/// and a default config.
///
/// Only the success of the call is checked; the extraction result is discarded.
#[test]
fn test_format_xlsx() {
    const DOCUMENT: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "xlsx/stanley_cups.xlsx"
    );
    const MIME: &str = r#"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"#;

    let default_config = Default::default();
    let outcome = extract_file_sync(DOCUMENT, Some(MIME), &default_config);
    let _ = outcome.expect("should succeed");
}
|
||||
52
e2e/rust/tests/mime_utilities_test.rs
generated
Normal file
52
e2e/rust/tests/mime_utilities_test.rs
generated
Normal file
@@ -0,0 +1,52 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: mime_utilities
|
||||
|
||||
use kreuzberg::{detect_mime_type_from_bytes, get_extensions_for_mime};
|
||||
|
||||
/// Detects the MIME type of the PDF fixture from its bytes and checks that the
/// `Debug` rendering of the detected value contains "pdf".
#[test]
fn test_mime_detect_bytes() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");

    let bytes = std::fs::read(FIXTURE).expect("test_documents/pdf/fake_memo.pdf must exist");
    let detected = detect_mime_type_from_bytes(&bytes).expect("should succeed");

    let rendered = format!("{:?}", detected);
    assert!(rendered.contains(r#"pdf"#), "expected to contain: {}", r#"pdf"#);
}
|
||||
|
||||
/// Detects the MIME type of the PNG fixture from its bytes and checks that the
/// `Debug` rendering of the detected value contains "png".
#[test]
fn test_mime_detect_image() {
    const FIXTURE: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/images/test_hello_world.png"
    );

    let bytes = std::fs::read(FIXTURE).expect("test_documents/images/test_hello_world.png must exist");
    let detected = detect_mime_type_from_bytes(&bytes).expect("should succeed");

    let rendered = format!("{:?}", detected);
    assert!(rendered.contains(r#"png"#), "expected to contain: {}", r#"png"#);
}
|
||||
|
||||
/// Looks up the extensions for `application/pdf` and checks that the `Debug`
/// rendering of the result contains "pdf".
#[test]
fn test_mime_get_extensions() {
    let extensions = get_extensions_for_mime(r#"application/pdf"#).expect("should succeed");

    let rendered = format!("{:?}", extensions);
    assert!(rendered.contains(r#"pdf"#), "expected to contain: {}", r#"pdf"#);
}
|
||||
27
e2e/rust/tests/ocr_backend_management_test.rs
generated
Normal file
27
e2e/rust/tests/ocr_backend_management_test.rs
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: ocr_backend_management
|
||||
|
||||
use kreuzberg::{clear_ocr_backends, list_ocr_backends, unregister_ocr_backend};
|
||||
|
||||
/// Smoke test: invokes `clear_ocr_backends`.
///
/// The return value is discarded and the registry is not inspected afterwards,
/// so this only proves the call does not panic.
#[test]
fn test_ocr_backends_clear() {
    let _outcome = clear_ocr_backends();
}
|
||||
|
||||
/// Smoke test: invokes `list_ocr_backends` and discards whatever it returns.
#[test]
fn test_ocr_backends_list() {
    let _listing = list_ocr_backends();
}
|
||||
|
||||
/// Smoke test: unregisters an OCR backend name that was never registered.
///
/// The return value is discarded, so this only proves the call does not panic.
#[test]
fn test_ocr_backends_unregister() {
    let _outcome = unregister_ocr_backend(r#"nonexistent-backend-xyz"#);
}
|
||||
34
e2e/rust/tests/pdf_test.rs
generated
Normal file
34
e2e/rust/tests/pdf_test.rs
generated
Normal file
@@ -0,0 +1,34 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: pdf
|
||||
|
||||
use kreuzberg::render_pdf_page_to_png;
|
||||
|
||||
/// Renders page index 0 of the PDF fixture to PNG and requires at least
/// 100 elements in the result.
#[test]
fn test_render_pdf_page_first() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");

    let document = std::fs::read(FIXTURE).expect("test_documents/pdf/fake_memo.pdf must exist");

    let png = render_pdf_page_to_png(&document, 0, None, None).expect("should succeed");

    let png_len = png.len();
    assert!(png_len >= 100, "expected length >= 100, got {}", png_len);
}
|
||||
|
||||
/// Rendering page index 999 of the PDF fixture must return an error.
#[test]
fn test_render_pdf_page_out_of_range() {
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");

    let document = std::fs::read(FIXTURE).expect("test_documents/pdf/fake_memo.pdf must exist");

    let outcome = render_pdf_page_to_png(&document, 999, None, None);
    assert!(outcome.is_err(), "expected call to fail");
}
|
||||
232
e2e/rust/tests/plugin_api_test.rs
generated
Normal file
232
e2e/rust/tests/plugin_api_test.rs
generated
Normal file
@@ -0,0 +1,232 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: plugin_api
|
||||
|
||||
use kreuzberg::{
|
||||
register_document_extractor, register_embedding_backend, register_ocr_backend, register_post_processor,
|
||||
register_renderer, register_validator, unregister_document_extractor, unregister_embedding_backend,
|
||||
unregister_post_processor, unregister_renderer, unregister_validator,
|
||||
};
|
||||
|
||||
/// Registers a stub `DocumentExtractor` named "test-extractor".
///
/// The stub returns a default `InternalDocument` and reports no supported MIME
/// types. The registration result is discarded.
#[test]
fn test_register_document_extractor_trait_bridge() {
    #[allow(unused_imports)]
    use kreuzberg::{DocumentExtractor, ExtractionConfig, InternalDocument};

    // Minimal plugin that carries only the name it registers under.
    struct StubExtractor {
        plugin_name: &'static str,
    }

    impl kreuzberg::plugins::Plugin for StubExtractor {
        fn name(&self) -> &str {
            self.plugin_name
        }
    }

    #[async_trait::async_trait]
    impl DocumentExtractor for StubExtractor {
        async fn extract_bytes(
            &self,
            _bytes: &[u8],
            _mime_type: &str,
            _config: &ExtractionConfig,
        ) -> kreuzberg::Result<InternalDocument> {
            Ok(InternalDocument::default())
        }

        fn supported_mime_types(&self) -> &[&str] {
            &[]
        }
    }

    let _ = register_document_extractor(std::sync::Arc::new(StubExtractor {
        plugin_name: "test-extractor",
    }));
}
|
||||
|
||||
/// Registers a stub `EmbeddingBackend` named "test-embedding-backend".
///
/// The stub reports 0 dimensions and returns an empty embedding list.
/// The registration result is discarded.
#[test]
fn test_register_embedding_backend_trait_bridge() {
    #[allow(unused_imports)]
    use kreuzberg::EmbeddingBackend;

    // Minimal plugin that carries only the name it registers under.
    struct StubEmbeddingBackend {
        plugin_name: &'static str,
    }

    impl kreuzberg::plugins::Plugin for StubEmbeddingBackend {
        fn name(&self) -> &str {
            self.plugin_name
        }
    }

    #[async_trait::async_trait]
    impl EmbeddingBackend for StubEmbeddingBackend {
        fn dimensions(&self) -> usize {
            0
        }

        async fn embed(&self, _texts: Vec<String>) -> kreuzberg::Result<Vec<Vec<f32>>> {
            Ok(Vec::new())
        }
    }

    let _ = register_embedding_backend(std::sync::Arc::new(StubEmbeddingBackend {
        plugin_name: "test-embedding-backend",
    }));
}
|
||||
|
||||
/// Registers a stub `OcrBackend` named "test-backend".
///
/// The stub returns a default `ExtractionResult`, supports no language, and
/// reports the default `OcrBackendType`. The registration result is discarded.
#[test]
fn test_register_ocr_backend_trait_bridge() {
    #[allow(unused_imports)]
    use kreuzberg::{ExtractionResult, OcrBackend, OcrBackendType, OcrConfig};

    // Minimal plugin that carries only the name it registers under.
    struct StubOcrBackend {
        plugin_name: &'static str,
    }

    impl kreuzberg::plugins::Plugin for StubOcrBackend {
        fn name(&self) -> &str {
            self.plugin_name
        }
    }

    #[async_trait::async_trait]
    impl OcrBackend for StubOcrBackend {
        async fn process_image(&self, _image: &[u8], _config: &OcrConfig) -> kreuzberg::Result<ExtractionResult> {
            Ok(ExtractionResult::default())
        }

        fn supports_language(&self, _language: &str) -> bool {
            false
        }

        fn backend_type(&self) -> OcrBackendType {
            OcrBackendType::default()
        }
    }

    let _ = register_ocr_backend(std::sync::Arc::new(StubOcrBackend {
        plugin_name: "test-backend",
    }));
}
|
||||
|
||||
/// Registers a stub `PostProcessor` named "test-processor".
///
/// The stub's `process` is a no-op returning `Ok(())` and it reports the default
/// `ProcessingStage`. The registration result is discarded.
#[test]
fn test_register_post_processor_trait_bridge() {
    #[allow(unused_imports)]
    use kreuzberg::{ExtractionConfig, ExtractionResult, PostProcessor, ProcessingStage};

    // Minimal plugin that carries only the name it registers under.
    struct StubPostProcessor {
        plugin_name: &'static str,
    }

    impl kreuzberg::plugins::Plugin for StubPostProcessor {
        fn name(&self) -> &str {
            self.plugin_name
        }
    }

    #[async_trait::async_trait]
    impl PostProcessor for StubPostProcessor {
        async fn process(&self, _result: &mut ExtractionResult, _config: &ExtractionConfig) -> kreuzberg::Result<()> {
            Ok(())
        }

        fn processing_stage(&self) -> ProcessingStage {
            ProcessingStage::default()
        }
    }

    let _ = register_post_processor(std::sync::Arc::new(StubPostProcessor {
        plugin_name: "test-processor",
    }));
}
|
||||
|
||||
/// Registers a stub `Renderer` named "test-renderer".
///
/// The stub renders every document to an empty string. The registration result
/// is discarded.
#[test]
fn test_register_renderer_trait_bridge() {
    #[allow(unused_imports)]
    use kreuzberg::{InternalDocument, Renderer};

    // Minimal plugin that carries only the name it registers under.
    struct StubRenderer {
        plugin_name: &'static str,
    }

    impl kreuzberg::plugins::Plugin for StubRenderer {
        fn name(&self) -> &str {
            self.plugin_name
        }
    }

    impl Renderer for StubRenderer {
        fn render(&self, _document: &InternalDocument) -> kreuzberg::Result<String> {
            Ok(String::new())
        }
    }

    let _ = register_renderer(std::sync::Arc::new(StubRenderer {
        plugin_name: "test-renderer",
    }));
}
|
||||
|
||||
/// Registers a stub `Validator` named "test-validator".
///
/// The stub's `validate` always returns `Ok(())`. The registration result is
/// discarded.
#[test]
fn test_register_validator_trait_bridge() {
    #[allow(unused_imports)]
    use kreuzberg::{ExtractionConfig, ExtractionResult, Validator};

    // Minimal plugin that carries only the name it registers under.
    struct StubValidator {
        plugin_name: &'static str,
    }

    impl kreuzberg::plugins::Plugin for StubValidator {
        fn name(&self) -> &str {
            self.plugin_name
        }
    }

    #[async_trait::async_trait]
    impl Validator for StubValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> kreuzberg::Result<()> {
            Ok(())
        }
    }

    let _ = register_validator(std::sync::Arc::new(StubValidator {
        plugin_name: "test-validator",
    }));
}
|
||||
|
||||
/// Smoke test: unregisters the document extractor named "test-extractor".
///
/// This test performs no registration itself and discards the return value.
#[test]
fn test_unregister_document_extractor_after_register() {
    let _outcome = unregister_document_extractor(r#"test-extractor"#);
}
|
||||
|
||||
/// Smoke test: unregisters the embedding backend named "test-embedding-backend".
///
/// This test performs no registration itself and discards the return value.
#[test]
fn test_unregister_embedding_backend_after_register() {
    let _outcome = unregister_embedding_backend(r#"test-embedding-backend"#);
}
|
||||
|
||||
/// Smoke test: unregisters the post-processor named "test-processor".
///
/// This test performs no registration itself and discards the return value.
#[test]
fn test_unregister_post_processor_after_register() {
    let _outcome = unregister_post_processor(r#"test-processor"#);
}
|
||||
|
||||
/// Smoke test: unregisters the renderer named "test-renderer".
///
/// This test performs no registration itself and discards the return value.
#[test]
fn test_unregister_renderer_after_register() {
    let _outcome = unregister_renderer(r#"test-renderer"#);
}
|
||||
|
||||
/// Smoke test: unregisters the validator named "test-validator".
///
/// This test performs no registration itself and discards the return value.
#[test]
fn test_unregister_validator_after_register() {
    let _outcome = unregister_validator(r#"test-validator"#);
}
|
||||
20
e2e/rust/tests/post_processor_management_test.rs
generated
Normal file
20
e2e/rust/tests/post_processor_management_test.rs
generated
Normal file
@@ -0,0 +1,20 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: post_processor_management
|
||||
|
||||
use kreuzberg::{clear_post_processors, list_post_processors};
|
||||
|
||||
/// Smoke test: invokes `clear_post_processors`.
///
/// The return value is discarded and the registry is not inspected afterwards,
/// so this only proves the call does not panic.
#[test]
fn test_post_processors_clear() {
    let _outcome = clear_post_processors();
}
|
||||
|
||||
/// Smoke test: invokes `list_post_processors` and discards whatever it returns.
#[test]
fn test_post_processors_list() {
    let _listing = list_post_processors();
}
|
||||
29
e2e/rust/tests/registry_operations_test.rs
generated
Normal file
29
e2e/rust/tests/registry_operations_test.rs
generated
Normal file
@@ -0,0 +1,29 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: registry_operations
|
||||
|
||||
use kreuzberg::get_extensions_for_mime;
|
||||
|
||||
/// The extension lookup for the DOCX MIME type must succeed.
///
/// The returned extensions are discarded.
#[test]
fn test_extensions_docx() {
    const MIME: &str = r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#;

    let lookup = get_extensions_for_mime(MIME);
    let _ = lookup.expect("should succeed");
}
|
||||
|
||||
/// The extension lookup for `text/html` must succeed.
///
/// The returned extensions are discarded.
#[test]
fn test_extensions_html() {
    const MIME: &str = r#"text/html"#;

    let lookup = get_extensions_for_mime(MIME);
    let _ = lookup.expect("should succeed");
}
|
||||
|
||||
/// The extension lookup for `application/pdf` must succeed.
///
/// The returned extensions are discarded.
#[test]
fn test_extensions_pdf() {
    const MIME: &str = r#"application/pdf"#;

    let lookup = get_extensions_for_mime(MIME);
    let _ = lookup.expect("should succeed");
}
|
||||
47
e2e/rust/tests/registry_test.rs
generated
Normal file
47
e2e/rust/tests/registry_test.rs
generated
Normal file
@@ -0,0 +1,47 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: registry
|
||||
|
||||
use kreuzberg::{
|
||||
list_document_extractors, list_embedding_backends, list_ocr_backends, list_post_processors, list_renderers,
|
||||
list_validators,
|
||||
};
|
||||
|
||||
/// Smoke test: invokes `list_document_extractors` and discards whatever it returns.
#[test]
fn test_list_document_extractors() {
    let _listing = list_document_extractors();
}
|
||||
|
||||
/// Smoke test: invokes `list_embedding_backends` and discards whatever it returns.
#[test]
fn test_list_embedding_backends() {
    let _listing = list_embedding_backends();
}
|
||||
|
||||
/// Smoke test: invokes `list_ocr_backends` and discards whatever it returns.
#[test]
fn test_list_ocr_backends() {
    let _listing = list_ocr_backends();
}
|
||||
|
||||
/// Smoke test: invokes `list_post_processors` and discards whatever it returns.
#[test]
fn test_list_post_processors() {
    let _listing = list_post_processors();
}
|
||||
|
||||
/// Smoke test: invokes `list_renderers` and discards whatever it returns.
#[test]
fn test_list_renderers() {
    let _listing = list_renderers();
}
|
||||
|
||||
/// Smoke test: invokes `list_validators` and discards whatever it returns.
#[test]
fn test_list_validators() {
    let _listing = list_validators();
}
|
||||
20
e2e/rust/tests/renderer_management_test.rs
generated
Normal file
20
e2e/rust/tests/renderer_management_test.rs
generated
Normal file
@@ -0,0 +1,20 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: renderer_management
|
||||
|
||||
use kreuzberg::{clear_renderers, list_renderers};
|
||||
|
||||
#[test]
fn test_renderers_clear() {
    // Invokes `clear_renderers` and discards whatever it returns.
    // NOTE(review): the fixture describes this as "clear all renderers and verify
    // list is empty", but no emptiness check is performed here — only the call
    // itself is exercised.
    let _outcome = clear_renderers();
}
|
||||
|
||||
#[test]
fn test_renderers_list() {
    // Call-only check: listing the registered renderers must not panic.
    // The returned value is discarded without inspection.
    let _registered = list_renderers();
}
|
||||
272
e2e/rust/tests/smoke_test.rs
generated
Normal file
272
e2e/rust/tests/smoke_test.rs
generated
Normal file
@@ -0,0 +1,272 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: smoke
|
||||
|
||||
use kreuzberg::{extract_bytes, extract_file};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ocr_image_png() {
|
||||
// OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.
|
||||
let content = std::fs::read(concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/images/test_hello_world.png"
|
||||
))
|
||||
.expect("test_documents/images/test_hello_world.png must exist");
|
||||
let mime_type = r#"image/png"#;
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_bytes(&content, mime_type, &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"image/png"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 1,
|
||||
"expected length >= 1, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
result.content.contains(r#"Hello"#)
|
||||
|| result.content.contains(r#"World"#)
|
||||
|| result.content.contains(r#"hello"#)
|
||||
|| result.content.contains(r#"world"#),
|
||||
"expected to contain at least one of the specified values"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_docx_basic() {
|
||||
// Smoke test: DOCX with formatted text
|
||||
let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "docx/fake.docx");
|
||||
let mime_type = Some(r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#);
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 20,
|
||||
"expected length >= 20, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
result.content.contains(r#"Lorem"#)
|
||||
|| result.content.contains(r#"ipsum"#)
|
||||
|| result.content.contains(r#"document"#)
|
||||
|| result.content.contains(r#"text"#),
|
||||
"expected to contain at least one of the specified values"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_html_basic() {
|
||||
// Smoke test: HTML table extraction
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"html/simple_table.html"
|
||||
);
|
||||
let mime_type = Some(r#"text/html"#);
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"text/html"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 10,
|
||||
"expected length >= 10, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
result.content.contains(r#"Sample Data Table"#)
|
||||
|| result.content.contains(r#"Laptop"#)
|
||||
|| result.content.contains(r#"Electronics"#)
|
||||
|| result.content.contains(r#"Product"#),
|
||||
"expected to contain at least one of the specified values"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_image_png() {
|
||||
// Smoke test: PNG image (without OCR, metadata only)
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"images/sample.png"
|
||||
);
|
||||
let mime_type: Option<String> = None;
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{"disable_ocr":true}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"image/png"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_json_basic() {
|
||||
// Smoke test: JSON file extraction
|
||||
let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "json/simple.json");
|
||||
let mime_type = Some(r#"application/json"#);
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/json"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 5,
|
||||
"expected length >= 5, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_pdf_basic() {
|
||||
// Smoke test: PDF with simple text extraction
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"pdf/fake_memo.pdf"
|
||||
);
|
||||
let mime_type = Some(r#"application/pdf"#);
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/pdf"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 50,
|
||||
"expected length >= 50, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
result.content.contains(r#"May 5, 2023"#) || result.content.contains(r#"To Whom it May Concern"#),
|
||||
"expected to contain at least one of the specified values"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_txt_basic() {
|
||||
// Smoke test: Plain text file
|
||||
let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "text/report.txt");
|
||||
let mime_type = Some(r#"text/plain"#);
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"text/plain"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 5,
|
||||
"expected length >= 5, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_smoke_xlsx_basic() {
|
||||
// Smoke test: XLSX with basic spreadsheet data including tables
|
||||
let path: &str = concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/",
|
||||
"xlsx/stanley_cups.xlsx"
|
||||
);
|
||||
let mime_type = Some(r#"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"#);
|
||||
let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
|
||||
let config = serde_json::from_value(config_json).unwrap();
|
||||
let result = extract_file(path, mime_type.as_deref(), &config)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
assert_eq!(
|
||||
result.mime_type.to_string().as_str().trim(),
|
||||
r#"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"#,
|
||||
"equals assertion failed"
|
||||
);
|
||||
assert!(
|
||||
result.content.len() >= 100,
|
||||
"expected length >= 100, got {}",
|
||||
result.content.len()
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"Team"#),
|
||||
"expected to contain: {}",
|
||||
r#"Team"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"Location"#),
|
||||
"expected to contain: {}",
|
||||
r#"Location"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"Stanley Cups"#),
|
||||
"expected to contain: {}",
|
||||
r#"Stanley Cups"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"Blues"#),
|
||||
"expected to contain: {}",
|
||||
r#"Blues"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"Flyers"#),
|
||||
"expected to contain: {}",
|
||||
r#"Flyers"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"Maple Leafs"#),
|
||||
"expected to contain: {}",
|
||||
r#"Maple Leafs"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"STL"#),
|
||||
"expected to contain: {}",
|
||||
r#"STL"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"PHI"#),
|
||||
"expected to contain: {}",
|
||||
r#"PHI"#
|
||||
);
|
||||
assert!(
|
||||
format!("{:?}", result.content).contains(r#"TOR"#),
|
||||
"expected to contain: {}",
|
||||
r#"TOR"#
|
||||
);
|
||||
// skipped: field 'tables' not available on result type
|
||||
// skipped: field 'metadata.format.excel.sheet_count' not available on result type
|
||||
// skipped: field 'metadata.format.excel.sheet_names' not available on result type
|
||||
}
|
||||
20
e2e/rust/tests/validator_management_test.rs
generated
Normal file
20
e2e/rust/tests/validator_management_test.rs
generated
Normal file
@@ -0,0 +1,20 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
//! E2e tests for category: validator_management
|
||||
|
||||
use kreuzberg::{clear_validators, list_validators};
|
||||
|
||||
#[test]
fn test_validators_clear() {
    // Invokes `clear_validators` and discards whatever it returns.
    // NOTE(review): the fixture describes this as "clear all validators and verify
    // list is empty", but no emptiness check is performed here — only the call
    // itself is exercised.
    let _outcome = clear_validators();
}
|
||||
|
||||
#[test]
fn test_validators_list() {
    // Call-only check: listing the registered validators must not panic.
    // The returned value is discarded without inspection.
    let _registered = list_validators();
}
|
||||
Reference in New Issue
Block a user