Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

63
e2e/rust/tests/async_test.rs generated Normal file
View File

@@ -0,0 +1,63 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: async
use kreuzberg::extract_bytes;
/// Runs the async `extract_bytes` API on the `pdf/fake_memo.pdf` fixture with a default
/// configuration, then checks the reported MIME type and a minimum content length of 50.
#[tokio::test]
async fn test_async_extract_bytes() {
    let pdf_bytes = std::fs::read(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/pdf/fake_memo.pdf"
    ))
    .expect("test_documents/pdf/fake_memo.pdf must exist");
    let config = Default::default();

    let extraction = extract_bytes(&pdf_bytes, "application/pdf", &config)
        .await
        .expect("should succeed");

    // The MIME type is compared after stringifying and trimming surrounding whitespace.
    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 50, "expected length >= 50, got {}", content_len);
}
/// Feeds `text/plain.txt` to the async `extract_bytes` API with an empty MIME string and a
/// configuration deserialized from `{}`, and expects the call to return an error.
#[tokio::test]
async fn test_async_extract_bytes_empty_mime() {
    let text_bytes = std::fs::read(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/text/plain.txt"
    ))
    .expect("test_documents/text/plain.txt must exist");
    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let outcome = extract_bytes(&text_bytes, "", &config).await;

    assert!(outcome.is_err(), "expected call to fail");
}
/// Feeds `text/plain.txt` to the async `extract_bytes` API with the MIME type
/// `application/x-nonexistent` and expects the call to return an error.
#[tokio::test]
async fn test_async_extract_bytes_invalid_mime() {
    let text_bytes = std::fs::read(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/text/plain.txt"
    ))
    .expect("test_documents/text/plain.txt must exist");
    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let outcome = extract_bytes(&text_bytes, "application/x-nonexistent", &config).await;

    assert!(outcome.is_err(), "expected call to fail");
}

111
e2e/rust/tests/batch_test.rs generated Normal file
View File

@@ -0,0 +1,111 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: batch
use kreuzberg::BatchBytesItem;
use kreuzberg::BatchFileItem;
use kreuzberg::{batch_extract_bytes, batch_extract_bytes_sync, batch_extract_files, batch_extract_files_sync};
/// Submits one item tagged `application/x-nonexistent` to `batch_extract_bytes_sync` and
/// requires only that the batch call itself returns `Ok`; the returned value is discarded.
#[test]
fn test_batch_bytes_invalid_mime() {
    let raw_items = serde_json::from_str::<serde_json::Value>(
        r#"[{"content":[72,101,108,108,111],"mime_type":"application/x-nonexistent"}]"#,
    )
    .unwrap();
    let items: Vec<BatchBytesItem> = serde_json::from_value(raw_items).unwrap();
    let config = Default::default();

    let _ = batch_extract_bytes_sync(items, &config).expect("should succeed");
}
/// Sends a two-item batch (one `text/plain`, one `text/html`) through the async
/// `batch_extract_bytes` API and checks that the returned collection is not empty.
#[tokio::test]
async fn test_batch_extract_bytes_happy() {
    let raw_items = serde_json::from_str::<serde_json::Value>(
        r#"[{"content":[72,101,108,108,111,44,32,119,111,114,108,100,33],"mime_type":"text/plain"},{"content":[60,104,116,109,108,62,60,98,111,100,121,62,84,101,115,116,60,47,98,111,100,121,62,60,47,104,116,109,108,62],"mime_type":"text/html"}]"#,
    )
    .unwrap();
    let items: Vec<BatchBytesItem> = serde_json::from_value(raw_items).unwrap();
    let config = Default::default();

    let outcomes = batch_extract_bytes(items, &config).await.expect("should succeed");

    assert!(!outcomes.is_empty(), "expected >= 1");
}
/// Sends a single item tagged `application/x-unknown` through the async
/// `batch_extract_bytes` API and requires that the batch call returns `Ok`.
/// Nothing is asserted about the returned value.
#[tokio::test]
async fn test_batch_extract_bytes_mixed_format() {
    let raw_items = serde_json::from_str::<serde_json::Value>(
        r#"[{"content":[80,68,70,32,112,108,97,99,101,104,111,108,100,101,114],"mime_type":"application/x-unknown"}]"#,
    )
    .unwrap();
    let items: Vec<BatchBytesItem> = serde_json::from_value(raw_items).unwrap();
    let config = Default::default();

    let _ = batch_extract_bytes(items, &config).await.expect("should succeed");
}
/// Passes an empty batch to `batch_extract_bytes_sync` and checks that the call succeeds
/// with exactly zero results.
#[test]
fn test_batch_extract_bytes_sync_empty_list() {
    let raw_items = serde_json::from_str::<serde_json::Value>(r#"[]"#).unwrap();
    let items: Vec<BatchBytesItem> = serde_json::from_value(raw_items).unwrap();
    let config = Default::default();

    let outcomes = batch_extract_bytes_sync(items, &config).expect("should succeed");

    let count = outcomes.len();
    assert_eq!(count, 0, "expected exactly 0 elements, got {}", count);
}
/// Submits one item tagged `application/x-unknown` to `batch_extract_bytes_sync` and
/// requires that the batch call returns `Ok`; the returned value is discarded.
#[test]
fn test_batch_extract_bytes_sync_invalid_mime() {
    let raw_items = serde_json::from_str::<serde_json::Value>(
        r#"[{"content":[100,97,116,97],"mime_type":"application/x-unknown"}]"#,
    )
    .unwrap();
    let items: Vec<BatchBytesItem> = serde_json::from_value(raw_items).unwrap();
    let config = Default::default();

    let _ = batch_extract_bytes_sync(items, &config).expect("should succeed");
}
/// Hands two file items (`pdf/fake_memo.pdf`, `text/fake_text.txt`) to the async
/// `batch_extract_files` API and requires that the batch call returns `Ok`.
///
/// NOTE(review): the paths are relative and not anchored to `CARGO_MANIFEST_DIR`;
/// how they are resolved is not visible here — confirm.
#[tokio::test]
async fn test_batch_file_async_basic() {
    let raw_paths = serde_json::from_str::<serde_json::Value>(
        r#"[{"path":"pdf/fake_memo.pdf"},{"path":"text/fake_text.txt"}]"#,
    )
    .unwrap();
    let paths: Vec<BatchFileItem> = serde_json::from_value(raw_paths).unwrap();
    let config = Default::default();

    let _ = batch_extract_files(paths, &config).await.expect("should succeed");
}
/// Hands a single item pointing at `/nonexistent/a.pdf` to the async `batch_extract_files`
/// API. The test requires the batch call itself to return `Ok` and asserts nothing about
/// the returned value.
#[tokio::test]
async fn test_batch_file_async_not_found() {
    let raw_paths =
        serde_json::from_str::<serde_json::Value>(r#"[{"path":"/nonexistent/a.pdf"}]"#).unwrap();
    let paths: Vec<BatchFileItem> = serde_json::from_value(raw_paths).unwrap();
    let config = Default::default();

    let _ = batch_extract_files(paths, &config).await.expect("should succeed");
}
/// Hands two items with nonexistent paths to `batch_extract_files_sync`. The test requires
/// the batch call itself to return `Ok` and asserts nothing about the returned value.
#[test]
fn test_batch_file_not_found() {
    let raw_paths = serde_json::from_str::<serde_json::Value>(
        r#"[{"path":"/nonexistent/a.pdf"},{"path":"/nonexistent/b.txt"}]"#,
    )
    .unwrap();
    let paths: Vec<BatchFileItem> = serde_json::from_value(raw_paths).unwrap();
    let config = Default::default();

    let _ = batch_extract_files_sync(paths, &config).expect("should succeed");
}
/// Hands `batch_extract_files_sync` a mixed batch: `text/plain.txt` plus the nonexistent
/// `/nonexistent/missing.pdf`. Only the success of the batch call is required.
#[test]
fn test_batch_file_partial() {
    let raw_paths = serde_json::from_str::<serde_json::Value>(
        r#"[{"path":"text/plain.txt"},{"path":"/nonexistent/missing.pdf"}]"#,
    )
    .unwrap();
    let paths: Vec<BatchFileItem> = serde_json::from_value(raw_paths).unwrap();
    let config = Default::default();

    let _ = batch_extract_files_sync(paths, &config).expect("should succeed");
}
/// Hands two file items (`pdf/fake_memo.pdf`, `text/fake_text.txt`) to
/// `batch_extract_files_sync` and requires that the batch call returns `Ok`.
///
/// NOTE(review): the paths are relative and not anchored to `CARGO_MANIFEST_DIR`;
/// how they are resolved is not visible here — confirm.
#[test]
fn test_batch_file_sync_basic() {
    let raw_paths = serde_json::from_str::<serde_json::Value>(
        r#"[{"path":"pdf/fake_memo.pdf"},{"path":"text/fake_text.txt"}]"#,
    )
    .unwrap();
    let paths: Vec<BatchFileItem> = serde_json::from_value(raw_paths).unwrap();
    let config = Default::default();

    let _ = batch_extract_files_sync(paths, &config).expect("should succeed");
}

37
e2e/rust/tests/code_test.rs generated Normal file
View File

@@ -0,0 +1,37 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: code
use kreuzberg::extract_file_sync;
/// Extracts the `code/script.sh` fixture with an explicit `text/x-source-code` MIME hint.
///
/// The fixture is read through `extract_file_sync`, i.e. from a file path rather than a
/// byte buffer. The test checks the reported MIME type, a minimum content length of 10,
/// and that the extracted content mentions both `build` and `clean`.
#[test]
fn test_code_shebang_detection() {
    let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "code/script.sh");
    // Already an `Option<&str>`, so it is passed as-is without a redundant `as_deref()`.
    let mime_type = Some(r#"text/x-source-code"#);
    let config = Default::default();
    let result = extract_file_sync(path, mime_type, &config).expect("should succeed");
    assert_eq!(
        result.mime_type.to_string().as_str().trim(),
        r#"text/x-source-code"#,
        "equals assertion failed"
    );
    assert!(
        result.content.len() >= 10,
        "expected length >= 10, got {}",
        result.content.len()
    );
    // Search the content directly instead of its `Debug` rendering, which would allocate
    // an escaped copy for every check.
    for needle in [r#"build"#, r#"clean"#] {
        assert!(result.content.contains(needle), "expected to contain: {}", needle);
    }
}

496
e2e/rust/tests/contract_test.rs generated Normal file
View File

@@ -0,0 +1,496 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: contract
use kreuzberg::{extract_bytes_sync, extract_file, extract_file_sync};
/// Extracts `pdf/fake_memo.pdf` through the async `extract_file` API with no MIME hint and
/// a default configuration, then checks the MIME type, a minimum content length, and that
/// the content mentions `May 5, 2023` or `Mallori`.
///
/// NOTE(review): the test name refers to batch bytes extraction, but the body only calls
/// `extract_file` — confirm against the fixture definition.
#[tokio::test]
async fn test_api_batch_bytes_async() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let mime_hint: Option<&str> = None;
    let config = Default::default();

    let extraction = extract_file(fixture, mime_hint, &config)
        .await
        .expect("should succeed");

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);

    let mentions_expected = ["May 5, 2023", "Mallori"]
        .iter()
        .any(|needle| extraction.content.contains(*needle));
    assert!(
        mentions_expected,
        "expected to contain at least one of the specified values"
    );
}
/// Extracts `pdf/fake_memo.pdf` through the async `extract_file` API with
/// `{"output_format":"markdown"}` as configuration, then checks the MIME type and a
/// minimum content length.
///
/// NOTE(review): the test name refers to batch bytes extraction with per-file configs,
/// but the body only calls `extract_file` — confirm against the fixture definition.
#[tokio::test]
async fn test_api_batch_bytes_with_configs_async() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let mime_hint: Option<&str> = None;
    let raw_config =
        serde_json::from_str::<serde_json::Value>(r#"{"output_format":"markdown"}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file(fixture, mime_hint, &config)
        .await
        .expect("should succeed");

    // `metadata.output_format` is read but never asserted: the generator reported the
    // field as not available on the result type and skipped that check.
    let _output_format = extraction
        .metadata
        .output_format
        .as_ref()
        .map(|format| format.to_string())
        .unwrap_or_default();

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);
}
/// Extracts `pdf/fake_memo.pdf` through the async `extract_file` API with no MIME hint and
/// a default configuration, then checks the MIME type, a minimum content length, and that
/// the content mentions `May 5, 2023` or `Mallori`.
///
/// NOTE(review): the test name refers to batch file extraction, but the body only calls
/// `extract_file` — confirm against the fixture definition.
#[tokio::test]
async fn test_api_batch_file_async() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let mime_hint: Option<&str> = None;
    let config = Default::default();

    let extraction = extract_file(fixture, mime_hint, &config)
        .await
        .expect("should succeed");

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);

    let mentions_expected = ["May 5, 2023", "Mallori"]
        .iter()
        .any(|needle| extraction.content.contains(*needle));
    assert!(
        mentions_expected,
        "expected to contain at least one of the specified values"
    );
}
/// Extracts `pdf/fake_memo.pdf` through the async `extract_file` API with
/// `{"output_format":"markdown"}` as configuration, then checks the MIME type and a
/// minimum content length.
///
/// NOTE(review): the test name refers to batch file extraction with per-file configs,
/// but the body only calls `extract_file` — confirm against the fixture definition.
#[tokio::test]
async fn test_api_batch_file_with_configs_async() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let mime_hint: Option<&str> = None;
    let raw_config =
        serde_json::from_str::<serde_json::Value>(r#"{"output_format":"markdown"}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file(fixture, mime_hint, &config)
        .await
        .expect("should succeed");

    // `metadata.output_format` is read but never asserted: the generator reported the
    // field as not available on the result type and skipped that check.
    let _output_format = extraction
        .metadata
        .output_format
        .as_ref()
        .map(|format| format.to_string())
        .unwrap_or_default();

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);
}
/// Extracts `pdf/fake_memo.pdf` through the async `extract_file` API with no MIME hint and
/// a default configuration, then checks the MIME type, a minimum content length, and that
/// the content mentions `May 5, 2023` or `Mallori`.
///
/// NOTE(review): the test name refers to bytes extraction, but the body calls
/// `extract_file` with a path — confirm against the fixture definition.
#[tokio::test]
async fn test_api_extract_bytes_async() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let mime_hint: Option<&str> = None;
    let config = Default::default();

    let extraction = extract_file(fixture, mime_hint, &config)
        .await
        .expect("should succeed");

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);

    let mentions_expected = ["May 5, 2023", "Mallori"]
        .iter()
        .any(|needle| extraction.content.contains(*needle));
    assert!(
        mentions_expected,
        "expected to contain at least one of the specified values"
    );
}
/// Extracts `pdf/fake_memo.pdf` through the async `extract_file` API with no MIME hint and
/// a default configuration, then checks the MIME type, a minimum content length, and that
/// the content mentions `May 5, 2023` or `Mallori`.
#[tokio::test]
async fn test_api_extract_file_async() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let mime_hint: Option<&str> = None;
    let config = Default::default();

    let extraction = extract_file(fixture, mime_hint, &config)
        .await
        .expect("should succeed");

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);

    let mentions_expected = ["May 5, 2023", "Mallori"]
        .iter()
        .any(|needle| extraction.content.contains(*needle));
    assert!(
        mentions_expected,
        "expected to contain at least one of the specified values"
    );
}
/// Extracts `markdown/extraction_test.md` with the markdown chunker configured
/// (`max_chars` 300, `max_overlap` 50, `prepend_heading_context` true).
///
/// The test checks that the content has a minimum length of 10, that at least two chunks
/// are produced, that every chunk has non-empty content, and that the first chunk starts
/// (after leading whitespace) with `#`.
#[test]
fn test_config_chunking_prepend_heading_context() {
    let path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "markdown/extraction_test.md"
    );
    let mime_type: Option<String> = None;
    let config_json: serde_json::Value = serde_json::from_str(
        r#"{"chunking":{"chunker_type":"markdown","max_chars":300,"max_overlap":50,"prepend_heading_context":true}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(config_json).unwrap();
    let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed");
    // Borrow the optional chunk list once; `Option<&_>` is `Copy`, so it is reused below.
    let chunks = result.chunks.as_ref();
    assert!(
        result.content.len() >= 10,
        "expected length >= 10, got {}",
        result.content.len()
    );
    assert!(chunks.map_or(0, |v| v.len()) >= 2, "expected >= 2 chunks");
    // The generated test repeated this exact predicate a second time under a
    // "heading context" message; the duplicate could never fail after this one passes.
    assert!(
        chunks.is_some_and(|chunks| !chunks.is_empty() && chunks.iter().all(|c| !c.content.is_empty())),
        "expected all chunks to have content"
    );
    assert!(
        chunks
            .and_then(|chunks| chunks.first())
            .is_some_and(|c| c.content.trim_start().starts_with('#')),
        "expected first chunk to start with heading"
    );
}
/// Extracts `docx/fake.docx` with `include_document_structure` enabled and checks that the
/// reported MIME type is the DOCX MIME type.
#[test]
fn test_config_document_structure_with_headings() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/docx/fake.docx");
    let mime_hint: Option<&str> = None;
    let raw_config =
        serde_json::from_str::<serde_json::Value>(r#"{"include_document_structure":true}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file_sync(fixture, mime_hint, &config).expect("should succeed");

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(
        reported_mime.trim(),
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "equals assertion failed"
    );
    // The generator skipped the `document` and `document.nodes` checks, reporting those
    // fields as not available on the result type.
}
/// Extracts `docx/unit_test_headers.docx` with `result_format` set to `element_based` and
/// checks that the reported MIME type contains the DOCX MIME type.
#[test]
fn test_config_element_types() {
    let fixture: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/docx/unit_test_headers.docx"
    );
    let mime_hint: Option<&str> = None;
    let raw_config =
        serde_json::from_str::<serde_json::Value>(r#"{"result_format":"element_based"}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file_sync(fixture, mime_hint, &config).expect("should succeed");

    let is_docx = extraction
        .mime_type
        .contains("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    assert!(is_docx, "expected to contain at least one of the specified values");
    // The generator skipped the `elements` check, reporting that field as not available
    // on the result type.
}
/// Extracts `pdf/fake_memo.pdf` with `extraction_timeout_secs` set to 300 and checks that
/// extraction still succeeds with the expected MIME type and a minimum content length.
#[test]
fn test_config_extraction_timeout() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let mime_hint: Option<&str> = None;
    let raw_config =
        serde_json::from_str::<serde_json::Value>(r#"{"extraction_timeout_secs":300}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file_sync(fixture, mime_hint, &config).expect("should succeed");

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);
}
/// Extracts `pdf/fake_memo.pdf` with the `yake` keyword algorithm (`max_keywords` 10) and
/// checks the MIME type, a minimum content length, and that `extracted_keywords` is
/// present and non-empty.
#[test]
fn test_config_keywords() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let mime_hint: Option<&str> = None;
    let raw_config = serde_json::from_str::<serde_json::Value>(
        r#"{"keywords":{"algorithm":"yake","max_keywords":10}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file_sync(fixture, mime_hint, &config).expect("should succeed");

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);

    // Both generated expectations ("present and non-empty" and ">= 1") reduce to the same
    // predicate, so it is evaluated once and asserted under each message.
    let has_keywords = extraction
        .extracted_keywords
        .as_ref()
        .is_some_and(|found| !found.is_empty());
    assert!(has_keywords, "expected keywords to be present and non-empty");
    assert!(has_keywords, "expected >= 1");
}
/// Extracts `pdf/fake_memo.pdf` with `extract_pages` and `insert_page_markers` enabled and
/// checks the MIME type, a minimum content length, and that the content contains `PAGE`.
#[test]
fn test_config_pages() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let mime_hint: Option<&str> = None;
    let raw_config = serde_json::from_str::<serde_json::Value>(
        r#"{"pages":{"extract_pages":true,"insert_page_markers":true}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file_sync(fixture, mime_hint, &config).expect("should succeed");

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);

    let has_marker = extraction.content.contains("PAGE");
    assert!(has_marker, "expected to contain at least one of the specified values");
}
/// Extracts `pdf/fake_memo.pdf` with `enable_quality_processing` turned on and checks the
/// MIME type and a minimum content length. No quality score is asserted.
#[test]
fn test_config_quality_enabled() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let mime_hint: Option<&str> = None;
    let raw_config =
        serde_json::from_str::<serde_json::Value>(r#"{"enable_quality_processing":true}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file_sync(fixture, mime_hint, &config).expect("should succeed");

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);
    // The generator skipped three `quality_score` checks, reporting that field as not
    // available on the result type.
}
/// Extracts `archives/documents.zip` with custom `security_limits` and checks that the
/// reported MIME type contains one of the two accepted ZIP MIME types and that the content
/// has a minimum length of 10.
#[test]
fn test_config_security_limits() {
    let fixture: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/archives/documents.zip"
    );
    let mime_hint: Option<&str> = None;
    let raw_config = serde_json::from_str::<serde_json::Value>(
        r#"{"security_limits":{"max_archive_size":104857600,"max_compression_ratio":50,"max_files_in_archive":100}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file_sync(fixture, mime_hint, &config).expect("should succeed");

    let is_zip = ["application/zip", "application/x-zip-compressed"]
        .iter()
        .any(|accepted| extraction.mime_type.contains(*accepted));
    assert!(is_zip, "expected to contain at least one of the specified values");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);
}
/// Extracts `code/hello.py` with a `tree_sitter` configuration (groups, languages and
/// per-feature `process` flags) and checks that the result reports `text/x-source-code`
/// with a content length of at least 5.
#[test]
fn test_config_tree_sitter() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/code/hello.py");
    let mime_hint: Option<&str> = None;
    let raw_config = serde_json::from_str::<serde_json::Value>(
        r#"{"tree_sitter":{"groups":["web"],"languages":["python","rust"],"process":{"comments":false,"diagnostics":false,"docstrings":false,"exports":true,"imports":true,"structure":true,"symbols":false}}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file_sync(fixture, mime_hint, &config).expect("should succeed");

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "text/x-source-code", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 5, "expected length >= 5, got {}", content_len);
}
/// Reads `pdf/fake_memo.pdf` into memory and extracts it via `extract_bytes_sync` with
/// `output_format` set to `markdown`, then checks the MIME type and a minimum content
/// length.
#[test]
fn test_output_format_bytes_markdown() {
    let pdf_bytes = std::fs::read(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/pdf/fake_memo.pdf"
    ))
    .expect("test_documents/pdf/fake_memo.pdf must exist");
    let raw_config =
        serde_json::from_str::<serde_json::Value>(r#"{"output_format":"markdown"}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction =
        extract_bytes_sync(&pdf_bytes, "application/pdf", &config).expect("should succeed");

    // `metadata.output_format` is read but never asserted: the generator reported the
    // field as not available on the result type and skipped that check.
    let _output_format = extraction
        .metadata
        .output_format
        .as_ref()
        .map(|format| format.to_string())
        .unwrap_or_default();

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);
}
/// Extracts `pdf/fake_memo.pdf` via `extract_file_sync` with `output_format` set to
/// `markdown`, then checks the MIME type and a minimum content length.
#[test]
fn test_output_format_markdown() {
    let fixture: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let mime_hint: Option<&str> = None;
    let raw_config =
        serde_json::from_str::<serde_json::Value>(r#"{"output_format":"markdown"}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file_sync(fixture, mime_hint, &config).expect("should succeed");

    // `metadata.output_format` is read but never asserted: the generator reported the
    // field as not available on the result type and skipped that check.
    let _output_format = extraction
        .metadata
        .output_format
        .as_ref()
        .map(|format| format.to_string())
        .unwrap_or_default();

    let reported_mime = extraction.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);
}

49
e2e/rust/tests/detection_test.rs generated Normal file
View File

@@ -0,0 +1,49 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: detection
use kreuzberg::{detect_mime_type_from_bytes, get_extensions_for_mime};
/// Reads `html/html.html` and requires `detect_mime_type_from_bytes` to return `Ok` for
/// its bytes. The detected value itself is not inspected.
#[test]
fn test_detect_mime_bytes_html() {
    let html_bytes = std::fs::read(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/html/html.html"
    ))
    .expect("test_documents/html/html.html must exist");

    let _ = detect_mime_type_from_bytes(&html_bytes).expect("should succeed");
}
/// Reads `pdf/fake_memo.pdf` and requires `detect_mime_type_from_bytes` to return `Ok` for
/// its bytes. The detected value itself is not inspected.
#[test]
fn test_detect_mime_bytes_pdf() {
    let pdf_bytes = std::fs::read(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/pdf/fake_memo.pdf"
    ))
    .expect("test_documents/pdf/fake_memo.pdf must exist");

    let _ = detect_mime_type_from_bytes(&pdf_bytes).expect("should succeed");
}
/// Reads `images/test_hello_world.png` and requires `detect_mime_type_from_bytes` to
/// return `Ok` for its bytes. The detected value itself is not inspected.
#[test]
fn test_detect_mime_bytes_png() {
    let png_bytes = std::fs::read(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/images/test_hello_world.png"
    ))
    .expect("test_documents/images/test_hello_world.png must exist");

    let _ = detect_mime_type_from_bytes(&png_bytes).expect("should succeed");
}
/// Asks `get_extensions_for_mime` about `application/x-totally-unknown` and expects an
/// error.
#[test]
fn test_get_extensions_unknown_mime() {
    let lookup = get_extensions_for_mime("application/x-totally-unknown");

    assert!(lookup.is_err(), "expected call to fail");
}

View File

@@ -0,0 +1,20 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: document_extractor_management
use kreuzberg::{clear_document_extractors, list_document_extractors};
/// Calls `clear_document_extractors` and discards whatever it returns.
///
/// NOTE(review): nothing is asserted afterwards, so the test passes as long as the call
/// does not panic.
#[test]
fn test_document_extractors_clear() {
    let _ = clear_document_extractors();
}
/// Calls `list_document_extractors` and discards whatever it returns; the test passes as
/// long as the call does not panic.
#[test]
fn test_extractors_list() {
    let _ = list_document_extractors();
}

View File

@@ -0,0 +1,39 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: embed_async_pending
use kreuzberg::embed_texts_async;
/// Calls `embed_texts_async` with an empty text list and a default configuration, and
/// checks that it succeeds with exactly zero results.
#[tokio::test]
async fn test_embed_texts_async_empty_input() {
    let raw_texts = serde_json::from_str::<serde_json::Value>(r#"[]"#).unwrap();
    let texts: Vec<String> = serde_json::from_value(raw_texts).unwrap();
    let config = Default::default();

    let embeddings = embed_texts_async(texts, &config).await.expect("should succeed");

    let count = embeddings.len();
    assert_eq!(count, 0, "expected exactly 0 elements, got {}", count);
}
/// Calls `embed_texts_async` with two texts and a default configuration, and checks that
/// at least two results come back.
#[tokio::test]
async fn test_embed_texts_async_happy() {
    let raw_texts = serde_json::from_str::<serde_json::Value>(r#"["First","Second"]"#).unwrap();
    let texts: Vec<String> = serde_json::from_value(raw_texts).unwrap();
    let config = Default::default();

    let embeddings = embed_texts_async(texts, &config).await.expect("should succeed");

    let count = embeddings.len();
    assert!(count >= 2, "expected at least 2 elements, got {}", count);
}
/// Calls `embed_texts_async` with one text and a configuration selecting the `balanced`
/// preset model, and requires the call to return `Ok`. The result is discarded.
#[tokio::test]
async fn test_embed_texts_async_preset_switch() {
    let raw_texts = serde_json::from_str::<serde_json::Value>(r#"["Text"]"#).unwrap();
    let texts: Vec<String> = serde_json::from_value(raw_texts).unwrap();
    let raw_config = serde_json::from_str::<serde_json::Value>(
        r#"{"model":{"name":"balanced","type":"preset"}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let _ = embed_texts_async(texts, &config).await.expect("should succeed");
}

19
e2e/rust/tests/embed_extra_test.rs generated Normal file
View File

@@ -0,0 +1,19 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: embed_extra
use kreuzberg::embed_texts;
/// Calls `embed_texts` with two texts and a configuration selecting the `balanced` preset
/// model, and requires the call to return `Ok`. The result is discarded.
#[test]
fn test_embed_texts_batch() {
    let raw_texts = serde_json::from_str::<serde_json::Value>(r#"["Hello","World"]"#).unwrap();
    let texts: Vec<String> = serde_json::from_value(raw_texts).unwrap();
    let raw_config = serde_json::from_str::<serde_json::Value>(
        r#"{"model":{"name":"balanced","type":"preset"}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let _ = embed_texts(texts, &config).expect("should succeed");
}

View File

@@ -0,0 +1,20 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: embedding_backend_management
use kreuzberg::{clear_embedding_backends, list_embedding_backends};
/// Calls `clear_embedding_backends` and discards whatever it returns.
///
/// NOTE(review): nothing is asserted afterwards, so the test passes as long as the call
/// does not panic.
#[test]
fn test_embedding_backends_clear() {
    let _ = clear_embedding_backends();
}
/// Calls `list_embedding_backends` and discards whatever it returns; the test passes as
/// long as the call does not panic.
#[test]
fn test_embedding_backends_list() {
    let _ = list_embedding_backends();
}

49
e2e/rust/tests/embeddings_test.rs generated Normal file
View File

@@ -0,0 +1,49 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: embeddings
use kreuzberg::{embed_texts, get_embedding_preset, list_embedding_presets};
/// Calls `embed_texts` with two texts and a configuration selecting the `multilingual`
/// preset model, and checks that at least two results come back.
#[test]
fn test_embed_texts_different_preset() {
    let raw_texts = serde_json::from_str::<serde_json::Value>(r#"["Hello world","Test"]"#).unwrap();
    let texts: Vec<String> = serde_json::from_value(raw_texts).unwrap();
    let raw_config = serde_json::from_str::<serde_json::Value>(
        r#"{"model":{"name":"multilingual","type":"preset"}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let embeddings = embed_texts(texts, &config).expect("should succeed");

    let count = embeddings.len();
    assert!(count >= 2, "expected at least 2 elements, got {}", count);
}
/// Looks up the `balanced` preset via `get_embedding_preset` and discards the result.
///
/// NOTE(review): the returned value is not asserted, so this only verifies that the
/// lookup does not panic.
#[test]
fn test_get_embedding_preset_known() {
    let _ = get_embedding_preset("balanced");
}
/// Looks up the `balanced` preset via `get_embedding_preset` and discards the result.
///
/// NOTE(review): the returned value is not asserted, so this only verifies that the
/// lookup does not panic.
#[test]
fn test_get_embedding_preset_nominal() {
    let _ = get_embedding_preset("balanced");
}
/// Looks up the preset name `nonexistent-xyz` via `get_embedding_preset` and expects
/// `None`.
#[test]
fn test_get_embedding_preset_unknown() {
    let preset = get_embedding_preset("nonexistent-xyz");

    assert!(preset.is_none(), "expected Option to be is_none");
}
/// Checks that `list_embedding_presets` returns a non-empty collection.
#[test]
fn test_list_embedding_presets_sanity() {
    let presets = list_embedding_presets();

    assert!(!presets.is_empty(), "expected non-empty value");
}

82
e2e/rust/tests/error_test.rs generated Normal file
View File

@@ -0,0 +1,82 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: error
use kreuzberg::extract_bytes_sync;
/// Reads `text/empty.txt` and requires `extract_bytes_sync` to return `Ok` for it when
/// given the `text/plain` MIME type and a configuration deserialized from `{}`.
#[test]
fn test_error_empty_bytes() {
    let file_bytes = std::fs::read(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/text/empty.txt"
    ))
    .expect("test_documents/text/empty.txt must exist");
    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let _ = extract_bytes_sync(&file_bytes, "text/plain", &config).expect("should succeed");
}
#[test]
fn test_error_empty_mime() {
    // An empty MIME type string must make extract_bytes_sync return an error.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/text/plain.txt");
    let bytes = std::fs::read(FIXTURE).expect("test_documents/text/plain.txt must exist");
    let config = serde_json::from_value(serde_json::json!({})).unwrap();
    let outcome = extract_bytes_sync(&bytes, "", &config);
    assert!(outcome.is_err(), "expected call to fail");
}
#[test]
fn test_error_extract_bytes_conflicting_ocr() {
    // A config that sets both disable_ocr and force_ocr is contradictory;
    // extraction with it must return an error.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/text/fake_text.txt");
    let bytes = std::fs::read(FIXTURE).expect("test_documents/text/fake_text.txt must exist");
    let config = serde_json::from_value(serde_json::json!({
        "disable_ocr": true,
        "force_ocr": true
    }))
    .unwrap();
    let outcome = extract_bytes_sync(&bytes, r#"text/plain"#, &config);
    assert!(outcome.is_err(), "expected call to fail");
}
#[test]
fn test_error_invalid_mime_format() {
    // A MIME string that is not in type/subtype form ("not-a-mime") must make
    // extract_bytes_sync return an error.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/text/plain.txt");
    let bytes = std::fs::read(FIXTURE).expect("test_documents/text/plain.txt must exist");
    let config = serde_json::from_value(serde_json::json!({})).unwrap();
    let outcome = extract_bytes_sync(&bytes, r#"not-a-mime"#, &config);
    assert!(outcome.is_err(), "expected call to fail");
}
#[test]
fn test_error_unsupported_mime() {
    // A well-formed but unsupported MIME type must make extract_bytes_sync
    // return an error.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/text/plain.txt");
    let bytes = std::fs::read(FIXTURE).expect("test_documents/text/plain.txt must exist");
    let config = serde_json::from_value(serde_json::json!({})).unwrap();
    let outcome = extract_bytes_sync(&bytes, r#"application/x-nonexistent"#, &config);
    assert!(outcome.is_err(), "expected call to fail");
}

93
e2e/rust/tests/format_specific_test.rs generated Normal file
View File

@@ -0,0 +1,93 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: format_specific
use kreuzberg::{extract_bytes_sync, extract_file_sync};
#[test]
fn test_format_docx_standalone() {
    // Standalone DOCX extraction through extract_bytes_sync with a default
    // config; the extracted content must have a length of at least 20.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/docx/fake.docx");
    let bytes = std::fs::read(FIXTURE).expect("test_documents/docx/fake.docx must exist");
    let config = Default::default();
    let extracted = extract_bytes_sync(
        &bytes,
        r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#,
        &config,
    )
    .expect("should succeed");
    let content_len = extracted.content.len();
    assert!(content_len >= 20, "expected length >= 20, got {}", content_len);
}
#[test]
fn test_format_hwpx_standalone() {
    // Standalone HWPX extraction using extract_bytes_sync with a default
    // config. The content must have a length of at least 20 and contain the
    // marker text "Hello from HWPX".
    let content = std::fs::read(concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/hwpx/simple.hwpx"
    ))
    .expect("test_documents/hwpx/simple.hwpx must exist");
    let mime_type = r#"application/haansofthwpx"#;
    let config = Default::default();
    let result = extract_bytes_sync(&content, mime_type, &config).expect("should succeed");
    assert!(
        result.content.len() >= 20,
        "expected length >= 20, got {}",
        result.content.len()
    );
    // Search the extracted text itself rather than its `{:?}` rendering: the
    // Debug form is quoted and escaped, so it is not the text that was
    // actually extracted (and it costs an extra allocation).
    assert!(
        result.content.contains(r#"Hello from HWPX"#),
        "expected to contain: {}",
        r#"Hello from HWPX"#
    );
}
#[test]
fn test_format_pdf_text() {
    // Standalone PDF text extraction through extract_bytes_sync. The content
    // must have a length of at least 50 and mention "Mallori" or "May".
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let bytes = std::fs::read(FIXTURE).expect("test_documents/pdf/fake_memo.pdf must exist");
    let config = Default::default();
    let extracted = extract_bytes_sync(&bytes, r#"application/pdf"#, &config).expect("should succeed");
    let content_len = extracted.content.len();
    assert!(content_len >= 50, "expected length >= 50, got {}", content_len);
    let found_any = [r#"Mallori"#, r#"May"#]
        .iter()
        .any(|needle| extracted.content.contains(*needle));
    assert!(found_any, "expected to contain at least one of the specified values");
}
#[test]
fn test_format_pptx() {
    // PPTX presentation extraction through extract_file_sync with an explicit
    // MIME type; only success is checked, the result is discarded.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pptx/simple.pptx");
    let mime: Option<&str> = Some(r#"application/vnd.openxmlformats-officedocument.presentationml.presentation"#);
    let config = Default::default();
    let _ = extract_file_sync(FIXTURE, mime, &config).expect("should succeed");
}
#[test]
fn test_format_xlsx() {
    // XLSX spreadsheet extraction through extract_file_sync with an explicit
    // MIME type; only success is checked, the result is discarded.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/xlsx/stanley_cups.xlsx");
    let mime: Option<&str> = Some(r#"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"#);
    let config = Default::default();
    let _ = extract_file_sync(FIXTURE, mime, &config).expect("should succeed");
}

52
e2e/rust/tests/mime_utilities_test.rs generated Normal file
View File

@@ -0,0 +1,52 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: mime_utilities
use kreuzberg::{detect_mime_type_from_bytes, get_extensions_for_mime};
#[test]
fn test_mime_detect_bytes() {
    // MIME detection from raw bytes: the Debug rendering of the value detected
    // for a PDF fixture must mention "pdf".
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let bytes = std::fs::read(FIXTURE).expect("test_documents/pdf/fake_memo.pdf must exist");
    let detected = detect_mime_type_from_bytes(&bytes).expect("should succeed");
    let rendered = format!("{:?}", detected);
    assert!(rendered.contains(r#"pdf"#), "expected to contain: {}", r#"pdf"#);
}
#[test]
fn test_mime_detect_image() {
    // MIME detection from raw bytes: the Debug rendering of the value detected
    // for a PNG fixture must mention "png".
    const FIXTURE: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/images/test_hello_world.png"
    );
    let bytes = std::fs::read(FIXTURE).expect("test_documents/images/test_hello_world.png must exist");
    let detected = detect_mime_type_from_bytes(&bytes).expect("should succeed");
    let rendered = format!("{:?}", detected);
    assert!(rendered.contains(r#"png"#), "expected to contain: {}", r#"png"#);
}
#[test]
fn test_mime_get_extensions() {
    // Extension lookup for application/pdf: the Debug rendering of the
    // returned value must mention "pdf".
    let extensions = get_extensions_for_mime(r#"application/pdf"#).expect("should succeed");
    let rendered = format!("{:?}", extensions);
    assert!(rendered.contains(r#"pdf"#), "expected to contain: {}", r#"pdf"#);
}

View File

@@ -0,0 +1,27 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: ocr_backend_management
use kreuzberg::{clear_ocr_backends, list_ocr_backends, unregister_ocr_backend};
#[test]
fn test_ocr_backends_clear() {
    // Clear all OCR backends. The outcome is discarded and the backend list
    // is NOT inspected afterwards, so only a panic can fail this test.
    _ = clear_ocr_backends();
}
#[test]
fn test_ocr_backends_list() {
    // List the registered OCR backends; the listing is discarded, so this
    // only checks that the call does not panic.
    _ = list_ocr_backends();
}
#[test]
fn test_ocr_backends_unregister() {
    // Unregistering a backend name that is not expected to exist must not
    // panic; whatever the call returns is discarded.
    _ = unregister_ocr_backend(r#"nonexistent-backend-xyz"#);
}

34
e2e/rust/tests/pdf_test.rs generated Normal file
View File

@@ -0,0 +1,34 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: pdf
use kreuzberg::render_pdf_page_to_png;
#[test]
fn test_render_pdf_page_first() {
    // render_pdf_page_to_png on page index 0: the call must succeed and the
    // returned value must have a length of at least 100.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let pdf = std::fs::read(FIXTURE).expect("test_documents/pdf/fake_memo.pdf must exist");
    let png = render_pdf_page_to_png(&pdf, 0, None, None).expect("should succeed");
    let png_len = png.len();
    assert!(png_len >= 100, "expected length >= 100, got {}", png_len);
}
#[test]
fn test_render_pdf_page_out_of_range() {
    // render_pdf_page_to_png with page index 999, which the fixture is not
    // expected to have: the call must return an error.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let pdf = std::fs::read(FIXTURE).expect("test_documents/pdf/fake_memo.pdf must exist");
    let outcome = render_pdf_page_to_png(&pdf, 999, None, None);
    assert!(outcome.is_err(), "expected call to fail");
}

232
e2e/rust/tests/plugin_api_test.rs generated Normal file
View File

@@ -0,0 +1,232 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: plugin_api
use kreuzberg::{
register_document_extractor, register_embedding_backend, register_ocr_backend, register_post_processor,
register_renderer, register_validator, unregister_document_extractor, unregister_embedding_backend,
unregister_post_processor, unregister_renderer, unregister_validator,
};
#[test]
fn test_register_document_extractor_trait_bridge() {
    // register_document_extractor: trait bridge. Registers a do-nothing
    // extractor named "test-extractor"; the registration outcome is discarded.
    use kreuzberg::{DocumentExtractor, ExtractionConfig, InternalDocument};

    /// Minimal extractor stub: fixed name, default document, no MIME types.
    struct StubExtractor;

    impl kreuzberg::plugins::Plugin for StubExtractor {
        fn name(&self) -> &str {
            "test-extractor"
        }
    }

    #[async_trait::async_trait]
    impl DocumentExtractor for StubExtractor {
        async fn extract_bytes(
            &self,
            _bytes: &[u8],
            _mime_type: &str,
            _config: &ExtractionConfig,
        ) -> kreuzberg::Result<InternalDocument> {
            Ok(InternalDocument::default())
        }

        fn supported_mime_types(&self) -> &[&str] {
            &[]
        }
    }

    let _ = register_document_extractor(std::sync::Arc::new(StubExtractor));
}
#[test]
fn test_register_embedding_backend_trait_bridge() {
    // register_embedding_backend: trait bridge. Registers a do-nothing backend
    // named "test-embedding-backend"; the registration outcome is discarded.
    use kreuzberg::EmbeddingBackend;

    /// Minimal backend stub: zero dimensions and an empty embedding list.
    struct StubEmbeddingBackend;

    impl kreuzberg::plugins::Plugin for StubEmbeddingBackend {
        fn name(&self) -> &str {
            "test-embedding-backend"
        }
    }

    #[async_trait::async_trait]
    impl EmbeddingBackend for StubEmbeddingBackend {
        fn dimensions(&self) -> usize {
            0
        }

        async fn embed(&self, _texts: Vec<String>) -> kreuzberg::Result<Vec<Vec<f32>>> {
            Ok(Vec::new())
        }
    }

    let _ = register_embedding_backend(std::sync::Arc::new(StubEmbeddingBackend));
}
#[test]
fn test_register_ocr_backend_trait_bridge() {
    // register_ocr_backend: trait bridge. Registers a do-nothing OCR backend
    // named "test-backend"; the registration outcome is discarded.
    use kreuzberg::{ExtractionResult, OcrBackend, OcrBackendType, OcrConfig};

    /// Minimal OCR stub: default result, supports no language, default type.
    struct StubOcrBackend;

    impl kreuzberg::plugins::Plugin for StubOcrBackend {
        fn name(&self) -> &str {
            "test-backend"
        }
    }

    #[async_trait::async_trait]
    impl OcrBackend for StubOcrBackend {
        async fn process_image(&self, _image: &[u8], _config: &OcrConfig) -> kreuzberg::Result<ExtractionResult> {
            Ok(ExtractionResult::default())
        }

        fn supports_language(&self, _language: &str) -> bool {
            false
        }

        fn backend_type(&self) -> OcrBackendType {
            OcrBackendType::default()
        }
    }

    let _ = register_ocr_backend(std::sync::Arc::new(StubOcrBackend));
}
#[test]
fn test_register_post_processor_trait_bridge() {
    // register_post_processor: trait bridge. Registers a do-nothing processor
    // named "test-processor"; the registration outcome is discarded.
    use kreuzberg::{ExtractionConfig, ExtractionResult, PostProcessor, ProcessingStage};

    /// Minimal post-processor stub: leaves the result alone, default stage.
    struct StubPostProcessor;

    impl kreuzberg::plugins::Plugin for StubPostProcessor {
        fn name(&self) -> &str {
            "test-processor"
        }
    }

    #[async_trait::async_trait]
    impl PostProcessor for StubPostProcessor {
        async fn process(&self, _result: &mut ExtractionResult, _config: &ExtractionConfig) -> kreuzberg::Result<()> {
            Ok(())
        }

        fn processing_stage(&self) -> ProcessingStage {
            ProcessingStage::default()
        }
    }

    let _ = register_post_processor(std::sync::Arc::new(StubPostProcessor));
}
#[test]
fn test_register_renderer_trait_bridge() {
    // register_renderer: trait bridge. Registers a do-nothing renderer named
    // "test-renderer"; the registration outcome is discarded.
    use kreuzberg::{InternalDocument, Renderer};

    /// Minimal renderer stub: renders every document to an empty string.
    struct StubRenderer;

    impl kreuzberg::plugins::Plugin for StubRenderer {
        fn name(&self) -> &str {
            "test-renderer"
        }
    }

    impl Renderer for StubRenderer {
        fn render(&self, _document: &InternalDocument) -> kreuzberg::Result<String> {
            Ok(String::new())
        }
    }

    let _ = register_renderer(std::sync::Arc::new(StubRenderer));
}
#[test]
fn test_register_validator_trait_bridge() {
    // register_validator: trait bridge. Registers a validator named
    // "test-validator" that accepts everything; the outcome is discarded.
    use kreuzberg::{ExtractionConfig, ExtractionResult, Validator};

    /// Minimal validator stub: every result validates successfully.
    struct StubValidator;

    impl kreuzberg::plugins::Plugin for StubValidator {
        fn name(&self) -> &str {
            "test-validator"
        }
    }

    #[async_trait::async_trait]
    impl Validator for StubValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> kreuzberg::Result<()> {
            Ok(())
        }
    }

    let _ = register_validator(std::sync::Arc::new(StubValidator));
}
#[test]
fn test_unregister_document_extractor_after_register() {
    // unregister_document_extractor for the name "test-extractor". This test
    // registers nothing itself and discards the outcome, so it only checks
    // that the call does not panic.
    // NOTE(review): the name suggests a prior registration, but test order is
    // not fixed — confirm whether that dependency is intended.
    _ = unregister_document_extractor(r#"test-extractor"#);
}
#[test]
fn test_unregister_embedding_backend_after_register() {
    // unregister_embedding_backend for the name "test-embedding-backend".
    // This test registers nothing itself and discards the outcome, so it only
    // checks that the call does not panic.
    // NOTE(review): the name suggests a prior registration, but test order is
    // not fixed — confirm whether that dependency is intended.
    _ = unregister_embedding_backend(r#"test-embedding-backend"#);
}
#[test]
fn test_unregister_post_processor_after_register() {
    // unregister_post_processor for the name "test-processor". This test
    // registers nothing itself and discards the outcome, so it only checks
    // that the call does not panic.
    // NOTE(review): the name suggests a prior registration, but test order is
    // not fixed — confirm whether that dependency is intended.
    _ = unregister_post_processor(r#"test-processor"#);
}
#[test]
fn test_unregister_renderer_after_register() {
    // unregister_renderer for the name "test-renderer". This test registers
    // nothing itself and discards the outcome, so it only checks that the
    // call does not panic.
    // NOTE(review): the name suggests a prior registration, but test order is
    // not fixed — confirm whether that dependency is intended.
    _ = unregister_renderer(r#"test-renderer"#);
}
#[test]
fn test_unregister_validator_after_register() {
    // unregister_validator for the name "test-validator". This test registers
    // nothing itself and discards the outcome, so it only checks that the
    // call does not panic.
    // NOTE(review): the name suggests a prior registration, but test order is
    // not fixed — confirm whether that dependency is intended.
    _ = unregister_validator(r#"test-validator"#);
}

View File

@@ -0,0 +1,20 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: post_processor_management
use kreuzberg::{clear_post_processors, list_post_processors};
#[test]
fn test_post_processors_clear() {
    // Clear all post-processors. The outcome is discarded and the list is NOT
    // inspected afterwards, so only a panic can fail this test.
    _ = clear_post_processors();
}
#[test]
fn test_post_processors_list() {
    // List the registered post-processors; the listing is discarded, so this
    // only checks that the call does not panic.
    _ = list_post_processors();
}

View File

@@ -0,0 +1,29 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: registry_operations
use kreuzberg::get_extensions_for_mime;
#[test]
fn test_extensions_docx() {
    // Extension lookup for the DOCX MIME type must succeed; the returned
    // extensions are not inspected.
    _ = get_extensions_for_mime(r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#)
        .expect("should succeed");
}
#[test]
fn test_extensions_html() {
    // Extension lookup for text/html must succeed; the returned extensions
    // are not inspected.
    _ = get_extensions_for_mime(r#"text/html"#).expect("should succeed");
}
#[test]
fn test_extensions_pdf() {
    // Extension lookup for application/pdf must succeed; the returned
    // extensions are not inspected.
    _ = get_extensions_for_mime(r#"application/pdf"#).expect("should succeed");
}

47
e2e/rust/tests/registry_test.rs generated Normal file
View File

@@ -0,0 +1,47 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: registry
use kreuzberg::{
list_document_extractors, list_embedding_backends, list_ocr_backends, list_post_processors, list_renderers,
list_validators,
};
#[test]
fn test_list_document_extractors() {
    // List document extractors; the listing is discarded, so this only checks
    // that the call does not panic.
    _ = list_document_extractors();
}
#[test]
fn test_list_embedding_backends() {
    // List embedding backends; the listing is discarded, so this only checks
    // that the call does not panic.
    _ = list_embedding_backends();
}
#[test]
fn test_list_ocr_backends() {
    // List OCR backends; the listing is discarded, so this only checks that
    // the call does not panic.
    _ = list_ocr_backends();
}
#[test]
fn test_list_post_processors() {
    // List post-processors; the listing is discarded, so this only checks
    // that the call does not panic.
    _ = list_post_processors();
}
#[test]
fn test_list_renderers() {
    // List renderers; the listing is discarded, so this only checks that the
    // call does not panic.
    _ = list_renderers();
}
#[test]
fn test_list_validators() {
    // List validators; the listing is discarded, so this only checks that the
    // call does not panic.
    _ = list_validators();
}

View File

@@ -0,0 +1,20 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: renderer_management
use kreuzberg::{clear_renderers, list_renderers};
#[test]
fn test_renderers_clear() {
    // Clear all renderers. The outcome is discarded and the list is NOT
    // inspected afterwards, so only a panic can fail this test.
    _ = clear_renderers();
}
#[test]
fn test_renderers_list() {
    // List the registered renderers; the listing is discarded, so this only
    // checks that the call does not panic.
    _ = list_renderers();
}

272
e2e/rust/tests/smoke_test.rs generated Normal file
View File

@@ -0,0 +1,272 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: smoke
use kreuzberg::{extract_bytes, extract_file};
#[tokio::test]
async fn test_ocr_image_png() {
    // OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.
    // Checks: reported MIME type is image/png, content length is at least 1,
    // and the content mentions "Hello"/"World" in either capitalisation.
    const FIXTURE: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/images/test_hello_world.png"
    );
    let bytes = std::fs::read(FIXTURE).expect("test_documents/images/test_hello_world.png must exist");
    let config = serde_json::from_value(serde_json::json!({})).unwrap();
    let extracted = extract_bytes(&bytes, r#"image/png"#, &config)
        .await
        .expect("should succeed");

    let reported_mime = extracted.mime_type.to_string();
    assert_eq!(reported_mime.trim(), r#"image/png"#, "equals assertion failed");

    let content_len = extracted.content.len();
    assert!(content_len >= 1, "expected length >= 1, got {}", content_len);

    let found_any = [r#"Hello"#, r#"World"#, r#"hello"#, r#"world"#]
        .iter()
        .any(|needle| extracted.content.contains(*needle));
    assert!(found_any, "expected to contain at least one of the specified values");
}
#[tokio::test]
async fn test_smoke_docx_basic() {
    // Smoke test: DOCX with formatted text, read through extract_file.
    // Checks: reported MIME type, content length of at least 20, and at least
    // one of a few expected words in the content.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/docx/fake.docx");
    const DOCX_MIME: &str = r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#;
    let config = serde_json::from_value(serde_json::json!({})).unwrap();
    let extracted = extract_file(FIXTURE, Some(DOCX_MIME), &config)
        .await
        .expect("should succeed");

    let reported_mime = extracted.mime_type.to_string();
    assert_eq!(reported_mime.trim(), DOCX_MIME, "equals assertion failed");

    let content_len = extracted.content.len();
    assert!(content_len >= 20, "expected length >= 20, got {}", content_len);

    let found_any = [r#"Lorem"#, r#"ipsum"#, r#"document"#, r#"text"#]
        .iter()
        .any(|needle| extracted.content.contains(*needle));
    assert!(found_any, "expected to contain at least one of the specified values");
}
#[tokio::test]
async fn test_smoke_html_basic() {
    // Smoke test: HTML table extraction through extract_file.
    // Checks: reported MIME type, content length of at least 10, and at least
    // one of a few expected table strings in the content.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/html/simple_table.html");
    let config = serde_json::from_value(serde_json::json!({})).unwrap();
    let extracted = extract_file(FIXTURE, Some(r#"text/html"#), &config)
        .await
        .expect("should succeed");

    let reported_mime = extracted.mime_type.to_string();
    assert_eq!(reported_mime.trim(), r#"text/html"#, "equals assertion failed");

    let content_len = extracted.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);

    let found_any = [r#"Sample Data Table"#, r#"Laptop"#, r#"Electronics"#, r#"Product"#]
        .iter()
        .any(|needle| extracted.content.contains(*needle));
    assert!(found_any, "expected to contain at least one of the specified values");
}
#[tokio::test]
async fn test_smoke_image_png() {
    // Smoke test: PNG image (without OCR, metadata only). No MIME type is
    // passed in; the result must still report image/png.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/images/sample.png");
    let no_mime: Option<&str> = None;
    let config = serde_json::from_value(serde_json::json!({ "disable_ocr": true })).unwrap();
    let extracted = extract_file(FIXTURE, no_mime, &config)
        .await
        .expect("should succeed");

    let reported_mime = extracted.mime_type.to_string();
    assert_eq!(reported_mime.trim(), r#"image/png"#, "equals assertion failed");
}
#[tokio::test]
async fn test_smoke_json_basic() {
    // Smoke test: JSON file extraction through extract_file.
    // Checks: reported MIME type and a content length of at least 5.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/json/simple.json");
    let config = serde_json::from_value(serde_json::json!({})).unwrap();
    let extracted = extract_file(FIXTURE, Some(r#"application/json"#), &config)
        .await
        .expect("should succeed");

    let reported_mime = extracted.mime_type.to_string();
    assert_eq!(reported_mime.trim(), r#"application/json"#, "equals assertion failed");

    let content_len = extracted.content.len();
    assert!(content_len >= 5, "expected length >= 5, got {}", content_len);
}
#[tokio::test]
async fn test_smoke_pdf_basic() {
    // Smoke test: PDF with simple text extraction through extract_file.
    // Checks: reported MIME type, content length of at least 50, and one of
    // two expected phrases from the memo in the content.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf");
    let config = serde_json::from_value(serde_json::json!({})).unwrap();
    let extracted = extract_file(FIXTURE, Some(r#"application/pdf"#), &config)
        .await
        .expect("should succeed");

    let reported_mime = extracted.mime_type.to_string();
    assert_eq!(reported_mime.trim(), r#"application/pdf"#, "equals assertion failed");

    let content_len = extracted.content.len();
    assert!(content_len >= 50, "expected length >= 50, got {}", content_len);

    let found_any = [r#"May 5, 2023"#, r#"To Whom it May Concern"#]
        .iter()
        .any(|needle| extracted.content.contains(*needle));
    assert!(found_any, "expected to contain at least one of the specified values");
}
#[tokio::test]
async fn test_smoke_txt_basic() {
    // Smoke test: plain text file extraction through extract_file.
    // Checks: reported MIME type and a content length of at least 5.
    const FIXTURE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/text/report.txt");
    let config = serde_json::from_value(serde_json::json!({})).unwrap();
    let extracted = extract_file(FIXTURE, Some(r#"text/plain"#), &config)
        .await
        .expect("should succeed");

    let reported_mime = extracted.mime_type.to_string();
    assert_eq!(reported_mime.trim(), r#"text/plain"#, "equals assertion failed");

    let content_len = extracted.content.len();
    assert!(content_len >= 5, "expected length >= 5, got {}", content_len);
}
#[tokio::test]
async fn test_smoke_xlsx_basic() {
    // Smoke test: XLSX with basic spreadsheet data including tables.
    // Checks: reported MIME type, content length of at least 100, and the
    // presence of every expected header, team name and abbreviation.
    let path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "xlsx/stanley_cups.xlsx"
    );
    let mime_type = Some(r#"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"#);
    let config_json: serde_json::Value = serde_json::from_str(r#"{}"#).unwrap();
    let config = serde_json::from_value(config_json).unwrap();
    let result = extract_file(path, mime_type.as_deref(), &config)
        .await
        .expect("should succeed");
    assert_eq!(
        result.mime_type.to_string().as_str().trim(),
        r#"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"#,
        "equals assertion failed"
    );
    assert!(
        result.content.len() >= 100,
        "expected length >= 100, got {}",
        result.content.len()
    );
    // Every one of these strings must appear. Search the extracted text
    // itself rather than its `{:?}` rendering: the Debug form is quoted and
    // escaped, and building it once per check allocated needlessly.
    let expected = [
        r#"Team"#,
        r#"Location"#,
        r#"Stanley Cups"#,
        r#"Blues"#,
        r#"Flyers"#,
        r#"Maple Leafs"#,
        r#"STL"#,
        r#"PHI"#,
        r#"TOR"#,
    ];
    for needle in expected.iter() {
        assert!(result.content.contains(*needle), "expected to contain: {}", needle);
    }
    // skipped: field 'tables' not available on result type
    // skipped: field 'metadata.format.excel.sheet_count' not available on result type
    // skipped: field 'metadata.format.excel.sheet_names' not available on result type
}

View File

@@ -0,0 +1,20 @@
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: validator_management
use kreuzberg::{clear_validators, list_validators};
#[test]
fn test_validators_clear() {
    // Clear all validators. The outcome is discarded and the list is NOT
    // inspected afterwards, so only a panic can fail this test.
    _ = clear_validators();
}
#[test]
fn test_validators_list() {
    // List the registered validators; the listing is discarded, so this only
    // checks that the call does not panic.
    _ = list_validators();
}