Files
fil/e2e/rust/tests/contract_test.rs
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

497 lines
17 KiB
Rust
Generated

// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
//! E2e tests for category: contract
use kreuzberg::{extract_bytes_sync, extract_file, extract_file_sync};
#[tokio::test]
async fn test_api_batch_bytes_async() {
    // Fixture: async batch bytes extraction API (batch_extract_bytes).
    // NOTE(review): this body calls `extract_file` on a path, not a batch bytes API —
    // confirm whether the generator is expected to exercise the batch entry point here.
    let config = Default::default();
    let mime_hint: Option<&str> = None;
    let fixture: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );

    let extraction = extract_file(fixture, mime_hint, &config)
        .await
        .expect("should succeed");

    // The reported MIME type, ignoring surrounding whitespace, must be exactly PDF.
    let detected_mime = extraction.mime_type.to_string();
    assert_eq!(detected_mime.trim(), "application/pdf", "equals assertion failed");

    // At least ten bytes of text must have been extracted.
    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);

    // Either of the two known memo snippets is enough.
    let mentions_expected_text = ["May 5, 2023", "Mallori"]
        .iter()
        .any(|&needle| extraction.content.contains(needle));
    assert!(mentions_expected_text, "expected to contain at least one of the specified values");
}
#[tokio::test]
async fn test_api_batch_bytes_with_configs_async() {
    // Fixture: async batch bytes extraction with per-file configs
    // (batch_extract_bytes with file_configs parameter).
    // NOTE(review): this body calls `extract_file` with a single config, not a batch API.
    let fixture: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );
    let mime_hint: Option<&str> = None;

    // The config is parsed into a generic JSON value first, then deserialized from that value.
    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{"output_format":"markdown"}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file(fixture, mime_hint, &config)
        .await
        .expect("should succeed");

    // Read but never asserted. NOTE(review): the generator skipped the
    // 'metadata.output_format' assertion as "not available on result type", yet the field
    // is read here — confirm and either assert on it or drop the read.
    let _metadata_output_format = match extraction.metadata.output_format.as_ref() {
        Some(format) => format.to_string(),
        None => Default::default(),
    };

    let detected_mime = extraction.mime_type.to_string();
    assert_eq!(detected_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);
}
#[tokio::test]
async fn test_api_batch_file_async() {
    // Fixture: async batch file extraction API (batch_extract_file).
    // NOTE(review): only the single-file `extract_file` entry point is called below —
    // confirm whether a batch call is intended.
    let mime_hint: Option<&str> = None;
    let config = Default::default();
    let pdf_path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );

    let outcome = extract_file(pdf_path, mime_hint, &config).await;
    let extracted = outcome.expect("should succeed");

    // MIME type check: trimmed textual form must equal the PDF media type.
    let mime_text = extracted.mime_type.to_string();
    assert_eq!(mime_text.trim(), "application/pdf", "equals assertion failed");

    // Content length check (measured with `len()`).
    let extracted_len = extracted.content.len();
    assert!(extracted_len >= 10, "expected length >= 10, got {}", extracted_len);

    // Content check: the date is tried first, then the name.
    let has_date = || extracted.content.contains("May 5, 2023");
    let has_name = || extracted.content.contains("Mallori");
    assert!(
        has_date() || has_name(),
        "expected to contain at least one of the specified values"
    );
}
#[tokio::test]
async fn test_api_batch_file_with_configs_async() {
    // Fixture: async batch file extraction with per-file configs
    // (batch_extract_files with file_configs parameter).
    // NOTE(review): the call below is `extract_file` with one config — confirm whether the
    // batch API with per-file configs should be exercised instead.
    let mime_hint: Option<&str> = None;
    let pdf_path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );

    // JSON text -> `serde_json::Value` -> config, in two explicit steps.
    let config_value: serde_json::Value = serde_json::from_str(r#"{"output_format":"markdown"}"#).unwrap();
    let config = serde_json::from_value(config_value).unwrap();

    let extracted = extract_file(pdf_path, mime_hint, &config)
        .await
        .expect("should succeed");

    // The metadata output format is stringified (empty string when absent) but not checked.
    // NOTE(review): the generator marked the 'metadata.output_format' assertion as skipped
    // ("not available on result type") although the field is read here — confirm.
    let _metadata_output_format = match extracted.metadata.output_format.as_ref() {
        None => Default::default(),
        Some(declared) => declared.to_string(),
    };

    let mime_text = extracted.mime_type.to_string();
    assert_eq!(mime_text.trim(), "application/pdf", "equals assertion failed");

    let extracted_len = extracted.content.len();
    assert!(extracted_len >= 10, "expected length >= 10, got {}", extracted_len);
}
#[tokio::test]
async fn test_api_extract_bytes_async() {
    // Fixture: async bytes extraction API (extract_bytes).
    // NOTE(review): the document is handed to `extract_file` by path; no bytes-based entry
    // point is called in this body — confirm that this is intended.
    let document: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );
    let no_mime_override: Option<&str> = None;
    let config = Default::default();

    let output = extract_file(document, no_mime_override, &config)
        .await
        .expect("should succeed");

    // 1. MIME type equals the PDF media type after trimming.
    let reported_mime = output.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    // 2. Extracted text has a length of at least 10.
    let text_len = output.content.len();
    assert!(text_len >= 10, "expected length >= 10, got {}", text_len);

    // 3. Extracted text mentions one of the expected snippets.
    let expected_snippets = ["May 5, 2023", "Mallori"];
    let found_snippet = expected_snippets
        .iter()
        .any(|&snippet| output.content.contains(snippet));
    assert!(found_snippet, "expected to contain at least one of the specified values");
}
#[tokio::test]
async fn test_api_extract_file_async() {
    // Exercises the async `extract_file` API on `pdf/fake_memo.pdf` with a default config
    // and no MIME type override.
    let config = Default::default();
    let no_mime_override: Option<&str> = None;
    let document: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );

    let pending = extract_file(document, no_mime_override, &config);
    let output = pending.await.expect("should succeed");

    // Compare the trimmed MIME string against the PDF media type.
    let reported_mime = output.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    // Require a minimum amount of extracted text.
    let text_len = output.content.len();
    assert!(text_len >= 10, "expected length >= 10, got {}", text_len);

    // Accept the memo date or, failing that, the name.
    let text = &output.content;
    let matched = text.contains("May 5, 2023") || text.contains("Mallori");
    assert!(matched, "expected to contain at least one of the specified values");
}
#[test]
fn test_config_chunking_prepend_heading_context() {
    // Tests markdown chunker prepends heading hierarchy to chunk content.
    //
    // Extracts `markdown/extraction_test.md` with the markdown chunker configured through
    // JSON (`max_chars` 300, `max_overlap` 50, `prepend_heading_context` true) and checks the
    // shape of the returned chunks.
    let path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "markdown/extraction_test.md"
    );
    let mime_type: Option<String> = None;
    let config_json: serde_json::Value = serde_json::from_str(
        r#"{"chunking":{"chunker_type":"markdown","max_chars":300,"max_overlap":50,"prepend_heading_context":true}}"#,
    )
    .unwrap();
    let config = serde_json::from_value(config_json).unwrap();
    let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed");

    // `Option<&_>` is `Copy`, so this one borrow is reused by every chunk assertion below.
    let chunks = result.chunks.as_ref();

    assert!(
        result.content.len() >= 10,
        "expected length >= 10, got {}",
        result.content.len()
    );
    // A missing chunk list counts as zero chunks.
    assert!(chunks.map_or(0, |v| v.len()) >= 2, "expected >= 2 chunks");
    // Chunks must be present, and every one of them must carry non-empty content.
    assert!(
        chunks.is_some_and(|v| !v.is_empty() && v.iter().all(|c| !c.content.is_empty())),
        "expected all chunks to have content"
    );
    // Heading context: the first chunk, after leading whitespace, starts with `#`.
    assert!(
        chunks
            .and_then(|v| v.first())
            .is_some_and(|c| c.content.trim_start().starts_with('#')),
        "expected first chunk to start with heading"
    );
}
#[test]
fn test_config_document_structure_with_headings() {
    // Fixture: document structure with DOCX heading-driven nesting.
    // Only the MIME type is asserted: the generator skipped the checks on 'document' and
    // 'document.nodes', reporting those fields as not available on the result type.
    let mime_hint: Option<&str> = None;
    let docx_path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "docx/fake.docx");

    // JSON text -> `serde_json::Value` -> config.
    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{"include_document_structure":true}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file_sync(docx_path, mime_hint, &config).expect("should succeed");

    let detected_mime = extraction.mime_type.to_string();
    assert_eq!(
        detected_mime.trim(),
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "equals assertion failed"
    );
}
#[test]
fn test_config_element_types() {
    // Fixture: element-based result format with element type assertions on DOCX.
    // Only the MIME type is asserted: the generator skipped the 'elements' check, reporting
    // that field as not available on the result type.
    let docx_path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "docx/unit_test_headers.docx"
    );
    let mime_hint: Option<&str> = None;

    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{"result_format":"element_based"}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_file_sync(docx_path, mime_hint, &config).expect("should succeed");

    // A substring match (not equality) against the DOCX media type.
    let is_docx = extraction
        .mime_type
        .contains("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    assert!(is_docx, "expected to contain at least one of the specified values");
}
#[test]
fn test_config_extraction_timeout() {
    // Checks that a config carrying `extraction_timeout_secs` (300 here) is accepted and the
    // PDF still extracts successfully.
    let mime_hint: Option<&str> = None;
    let pdf_path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );

    let config_value: serde_json::Value = serde_json::from_str(r#"{"extraction_timeout_secs":300}"#).unwrap();
    let config = serde_json::from_value(config_value).unwrap();

    let extracted = extract_file_sync(pdf_path, mime_hint, &config).expect("should succeed");

    // MIME type, trimmed, must be the PDF media type.
    let mime_text = extracted.mime_type.to_string();
    assert_eq!(mime_text.trim(), "application/pdf", "equals assertion failed");

    // Some text must have been produced.
    let extracted_len = extracted.content.len();
    assert!(extracted_len >= 10, "expected length >= 10, got {}", extracted_len);
}
#[test]
fn test_config_keywords() {
    // Tests keyword extraction via YAKE algorithm.
    //
    // Extracts `pdf/fake_memo.pdf` with a keywords config (`algorithm` "yake",
    // `max_keywords` 10) and checks that a non-empty keyword collection is returned.
    let path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );
    let mime_type: Option<String> = None;
    let config_json: serde_json::Value =
        serde_json::from_str(r#"{"keywords":{"algorithm":"yake","max_keywords":10}}"#).unwrap();
    let config = serde_json::from_value(config_json).unwrap();
    let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed");
    assert_eq!(
        result.mime_type.to_string().as_str().trim(),
        r#"application/pdf"#,
        "equals assertion failed"
    );
    assert!(
        result.content.len() >= 10,
        "expected length >= 10, got {}",
        result.content.len()
    );
    // Present and non-empty also means "at least one keyword", so a single check covers both.
    assert!(
        result.extracted_keywords.as_ref().is_some_and(|v| !v.is_empty()),
        "expected keywords to be present and non-empty"
    );
}
#[test]
fn test_config_pages() {
    // Fixture: page extraction and page marker configuration.
    // The config enables `extract_pages` and `insert_page_markers`; the content is then
    // expected to contain the text "PAGE".
    let document: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );
    let no_mime_override: Option<&str> = None;

    let config_value: serde_json::Value =
        serde_json::from_str(r#"{"pages":{"extract_pages":true,"insert_page_markers":true}}"#).unwrap();
    let config = serde_json::from_value(config_value).unwrap();

    let output = extract_file_sync(document, no_mime_override, &config).expect("should succeed");

    let reported_mime = output.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let text_len = output.content.len();
    assert!(text_len >= 10, "expected length >= 10, got {}", text_len);

    let has_page_text = output.content.contains("PAGE");
    assert!(has_page_text, "expected to contain at least one of the specified values");
}
#[test]
fn test_config_quality_enabled() {
    // Fixture: quality scoring produces a score value in [0.0, 1.0].
    // The score itself is not checked here: the generator skipped all three 'quality_score'
    // assertions, reporting the field as not available on the result type. Only the MIME
    // type and content length are verified.
    let no_mime_override: Option<&str> = None;
    let document: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );

    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{"enable_quality_processing":true}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let output = extract_file_sync(document, no_mime_override, &config).expect("should succeed");

    let reported_mime = output.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let text_len = output.content.len();
    assert!(text_len >= 10, "expected length >= 10, got {}", text_len);
}
#[test]
fn test_config_security_limits() {
    // Fixture: archive extraction with custom security limits.
    // Extracts `archives/documents.zip` with `max_archive_size`, `max_compression_ratio`
    // and `max_files_in_archive` set through the JSON config.
    let archive_path: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "archives/documents.zip"
    );
    let mime_hint: Option<&str> = None;

    let limits_json = r#"{"security_limits":{"max_archive_size":104857600,"max_compression_ratio":50,"max_files_in_archive":100}}"#;
    let config_value: serde_json::Value = serde_json::from_str(limits_json).unwrap();
    let config = serde_json::from_value(config_value).unwrap();

    let extraction = extract_file_sync(archive_path, mime_hint, &config).expect("should succeed");

    // Either ZIP media type spelling is accepted; the match is by substring.
    let looks_like_zip = ["application/zip", "application/x-zip-compressed"]
        .iter()
        .any(|&candidate| extraction.mime_type.contains(candidate));
    assert!(looks_like_zip, "expected to contain at least one of the specified values");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);
}
#[test]
fn test_config_tree_sitter() {
    // Fixture: tree-sitter configuration round-trip.
    // Extracts `code/hello.py` with a `tree_sitter` config (groups, languages and per-feature
    // `process` flags) and checks the MIME type and a minimum content length.
    let mime_hint: Option<&str> = None;
    let source_path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "code/hello.py");

    let tree_sitter_json = r#"{"tree_sitter":{"groups":["web"],"languages":["python","rust"],"process":{"comments":false,"diagnostics":false,"docstrings":false,"exports":true,"imports":true,"structure":true,"symbols":false}}}"#;
    let config_value: serde_json::Value = serde_json::from_str(tree_sitter_json).unwrap();
    let config = serde_json::from_value(config_value).unwrap();

    let extraction = extract_file_sync(source_path, mime_hint, &config).expect("should succeed");

    let detected_mime = extraction.mime_type.to_string();
    assert_eq!(detected_mime.trim(), "text/x-source-code", "equals assertion failed");

    // The threshold here is 5, lower than the 10 used by the document tests in this file.
    let content_len = extraction.content.len();
    assert!(content_len >= 5, "expected length >= 5, got {}", content_len);
}
#[test]
fn test_output_format_bytes_markdown() {
    // Fixture: markdown output format via bytes extraction API.
    // The PDF is read into memory and passed to `extract_bytes_sync` together with an
    // explicit MIME type.
    let pdf_location = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/pdf/fake_memo.pdf"
    );
    let pdf_bytes = std::fs::read(pdf_location).expect("test_documents/pdf/fake_memo.pdf must exist");
    let declared_mime = "application/pdf";

    let raw_config = serde_json::from_str::<serde_json::Value>(r#"{"output_format":"markdown"}"#).unwrap();
    let config = serde_json::from_value(raw_config).unwrap();

    let extraction = extract_bytes_sync(&pdf_bytes, declared_mime, &config).expect("should succeed");

    // Stringified (empty when absent) but not asserted. NOTE(review): the generator skipped
    // the 'metadata.output_format' assertion as "not available on result type", yet the
    // field is read here — confirm and either assert on it or drop the read.
    let _metadata_output_format = match extraction.metadata.output_format.as_ref() {
        Some(format) => format.to_string(),
        None => Default::default(),
    };

    let detected_mime = extraction.mime_type.to_string();
    assert_eq!(detected_mime.trim(), "application/pdf", "equals assertion failed");

    let content_len = extraction.content.len();
    assert!(content_len >= 10, "expected length >= 10, got {}", content_len);
}
#[test]
fn test_output_format_markdown() {
    // Fixture: Markdown output format.
    // Extracts `pdf/fake_memo.pdf` through `extract_file_sync` with `output_format` set to
    // "markdown" in the JSON config.
    let no_mime_override: Option<&str> = None;
    let document: &str = concat!(
        env!("CARGO_MANIFEST_DIR"),
        "/../../test_documents/",
        "pdf/fake_memo.pdf"
    );

    let config_value: serde_json::Value = serde_json::from_str(r#"{"output_format":"markdown"}"#).unwrap();
    let config = serde_json::from_value(config_value).unwrap();

    let output = extract_file_sync(document, no_mime_override, &config).expect("should succeed");

    // The metadata output format is read and stringified, but nothing is asserted on it.
    // NOTE(review): the generator's skip note says 'metadata.output_format' is not available
    // on the result type, which is at odds with this read — confirm.
    let _metadata_output_format = match output.metadata.output_format.as_ref() {
        None => Default::default(),
        Some(declared) => declared.to_string(),
    };

    let reported_mime = output.mime_type.to_string();
    assert_eq!(reported_mime.trim(), "application/pdf", "equals assertion failed");

    let text_len = output.content.len();
    assert!(text_len >= 10, "expected length >= 10, got {}", text_len);
}