// This file is auto-generated by alef — DO NOT EDIT. // alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75 // To regenerate: alef generate // To verify freshness: alef verify --exit-code // Issues & docs: https://github.com/kreuzberg-dev/alef //! E2e tests for category: contract use kreuzberg::{extract_bytes_sync, extract_file, extract_file_sync}; #[tokio::test] async fn test_api_batch_bytes_async() { // Tests async batch bytes extraction API (batch_extract_bytes) let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf" ); let mime_type: Option = None; let config = Default::default(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); assert!( result.content.contains(r#"May 5, 2023"#) || result.content.contains(r#"Mallori"#), "expected to contain at least one of the specified values" ); } #[tokio::test] async fn test_api_batch_bytes_with_configs_async() { // Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter) let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf" ); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str(r#"{"output_format":"markdown"}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); let _metadata_output_format = result .metadata .output_format .as_ref() .map(|v| v.to_string()) .unwrap_or_default(); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); // skipped: field 'metadata.output_format' not available on result type } #[tokio::test] async fn test_api_batch_file_async() { // Tests async batch file extraction API (batch_extract_file) let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf" ); let mime_type: Option = None; let config = Default::default(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); assert!( result.content.contains(r#"May 5, 2023"#) || result.content.contains(r#"Mallori"#), "expected to contain at least one of the specified values" ); } #[tokio::test] async fn test_api_batch_file_with_configs_async() { // Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter) let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf" ); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str(r#"{"output_format":"markdown"}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); let _metadata_output_format = result .metadata .output_format .as_ref() .map(|v| v.to_string()) .unwrap_or_default(); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); // skipped: field 'metadata.output_format' not available on result type } #[tokio::test] async fn test_api_extract_bytes_async() { // Tests async bytes extraction API (extract_bytes) let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf" ); let mime_type: Option = None; let config = Default::default(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); assert!( result.content.contains(r#"May 5, 2023"#) || result.content.contains(r#"Mallori"#), "expected to contain at least one of the specified values" ); } #[tokio::test] async fn test_api_extract_file_async() { // Tests async file extraction API (extract_file) let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf" ); let mime_type: Option = None; let config = Default::default(); let result = extract_file(path, mime_type.as_deref(), &config) .await .expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); assert!( result.content.contains(r#"May 5, 2023"#) || result.content.contains(r#"Mallori"#), "expected to contain at least one of the specified values" ); } #[test] fn test_config_chunking_prepend_heading_context() { // Tests markdown chunker prepends heading hierarchy to chunk content let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "markdown/extraction_test.md" ); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str( r#"{"chunking":{"chunker_type":"markdown","max_chars":300,"max_overlap":50,"prepend_heading_context":true}}"#, ) .unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed"); let chunks = &result.chunks; assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); assert!( chunks.as_ref().map_or(0, |v| v.len()) >= 2 as usize, "expected >= 2 chunks" ); assert!( result .chunks .as_ref() .is_some_and(|chunks| !chunks.is_empty() && chunks.iter().all(|c| !c.content.is_empty())), "expected all chunks to have content" ); assert!( result .chunks .as_ref() .is_some_and(|chunks| !chunks.is_empty() && chunks.iter().all(|c| !c.content.is_empty())), "expected chunks with heading context" ); assert!( result.chunks.as_ref().is_some_and(|chunks| chunks .first() .map_or(false, |c| c.content.trim_start().starts_with('#'))), "expected first chunk to start with heading" ); } #[test] fn test_config_document_structure_with_headings() { // Tests document structure with DOCX heading-driven nesting let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "docx/fake.docx"); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str(r#"{"include_document_structure":true}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#, "equals assertion failed" ); // skipped: field 'document' not available on result type // skipped: field 'document.nodes' not available on result type } #[test] fn test_config_element_types() { // Tests element-based result format with element type assertions on DOCX let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "docx/unit_test_headers.docx" ); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str(r#"{"result_format":"element_based"}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed"); assert!( result .mime_type .contains(r#"application/vnd.openxmlformats-officedocument.wordprocessingml.document"#), "expected to contain at least one of the specified values" ); // skipped: field 'elements' not available on result type } #[test] fn test_config_extraction_timeout() { // Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf" ); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str(r#"{"extraction_timeout_secs":300}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); } #[test] fn test_config_keywords() { // Tests keyword extraction via YAKE algorithm let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf" ); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str(r#"{"keywords":{"algorithm":"yake","max_keywords":10}}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); assert!( result.extracted_keywords.as_ref().is_some_and(|v| !v.is_empty()), "expected keywords to be present and non-empty" ); assert!( result.extracted_keywords.as_ref().is_some_and(|v| !v.is_empty()), "expected >= 1" ); } #[test] fn test_config_pages() { // Tests page extraction and page marker configuration let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf" ); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str(r#"{"pages":{"extract_pages":true,"insert_page_markers":true}}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); assert!( result.content.contains(r#"PAGE"#), "expected to contain at least one of the specified values" ); } #[test] fn test_config_quality_enabled() { // Tests quality scoring produces a score value in [0.0, 1.0] let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf" ); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str(r#"{"enable_quality_processing":true}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); // skipped: field 'quality_score' not available on result type // skipped: field 'quality_score' not available on result type // skipped: field 'quality_score' not available on result type } #[test] fn test_config_security_limits() { // Tests archive extraction with custom security limits let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "archives/documents.zip" ); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str( r#"{"security_limits":{"max_archive_size":104857600,"max_compression_ratio":50,"max_files_in_archive":100}}"#, ) .unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed"); assert!( result.mime_type.contains(r#"application/zip"#) || result.mime_type.contains(r#"application/x-zip-compressed"#), "expected to contain at least one of the specified values" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); } #[test] fn test_config_tree_sitter() { // Tests tree-sitter configuration round-trip let path: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "code/hello.py"); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str(r#"{"tree_sitter":{"groups":["web"],"languages":["python","rust"],"process":{"comments":false,"diagnostics":false,"docstrings":false,"exports":true,"imports":true,"structure":true,"symbols":false}}}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed"); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"text/x-source-code"#, "equals assertion failed" ); assert!( result.content.len() >= 5, "expected length >= 5, got {}", result.content.len() ); } #[test] fn test_output_format_bytes_markdown() { // Tests markdown output format via bytes extraction API let content = std::fs::read(concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/pdf/fake_memo.pdf" )) .expect("test_documents/pdf/fake_memo.pdf must exist"); let mime_type = r#"application/pdf"#; let config_json: serde_json::Value = serde_json::from_str(r#"{"output_format":"markdown"}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_bytes_sync(&content, mime_type, &config).expect("should succeed"); let _metadata_output_format = result .metadata .output_format .as_ref() .map(|v| v.to_string()) .unwrap_or_default(); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); // skipped: field 'metadata.output_format' not available on result type } #[test] fn test_output_format_markdown() { // Tests Markdown output format let path: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../../test_documents/", "pdf/fake_memo.pdf" ); let mime_type: Option = None; let config_json: serde_json::Value = serde_json::from_str(r#"{"output_format":"markdown"}"#).unwrap(); let config = serde_json::from_value(config_json).unwrap(); let result = extract_file_sync(path, mime_type.as_deref(), &config).expect("should succeed"); let _metadata_output_format = result .metadata .output_format .as_ref() .map(|v| v.to_string()) .unwrap_or_default(); assert_eq!( result.mime_type.to_string().as_str().trim(), r#"application/pdf"#, "equals assertion failed" ); assert!( result.content.len() >= 10, "expected length >= 10, got {}", result.content.len() ); // skipped: field 'metadata.output_format' not available on result type }