This commit is contained in:
88
crates/kreuzberg/tests/jsonl_integration.rs
Normal file
88
crates/kreuzberg/tests/jsonl_integration.rs
Normal file
@@ -0,0 +1,88 @@
|
||||
//! JSONL (newline-delimited JSON) integration tests.
|
||||
//!
|
||||
//! Tests end-to-end extraction of `.jsonl` / `.ndjson` files through the full
|
||||
//! extraction pipeline, verifying content preservation, metadata extraction,
|
||||
//! and blank-line handling.
|
||||
|
||||
use kreuzberg::core::config::ExtractionConfig;
|
||||
use kreuzberg::core::extractor::{extract_bytes, extract_file};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_extract_jsonl_file() {
|
||||
let config = ExtractionConfig::default();
|
||||
let path = std::path::Path::new(concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/jsonl/simple.jsonl"
|
||||
));
|
||||
|
||||
let result = extract_file(path, None, &config)
|
||||
.await
|
||||
.expect("JSONL file extraction should succeed");
|
||||
|
||||
assert!(result.content.contains("Alice"), "Should contain Alice");
|
||||
assert!(result.content.contains("Bob"), "Should contain Bob");
|
||||
assert!(result.content.contains("Carol"), "Should contain Carol");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_extract_jsonl_bytes() {
|
||||
let config = ExtractionConfig::default();
|
||||
let jsonl = b"{\"name\": \"Alice\"}\n{\"name\": \"Bob\"}";
|
||||
|
||||
let result = extract_bytes(jsonl, "application/x-ndjson", &config)
|
||||
.await
|
||||
.expect("JSONL bytes extraction should succeed");
|
||||
|
||||
assert!(result.content.contains("Alice"));
|
||||
assert!(result.content.contains("Bob"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_extract_jsonl_metadata() {
|
||||
let config = ExtractionConfig::default();
|
||||
let jsonl = b"{\"title\": \"Doc One\"}\n{\"title\": \"Doc Two\"}";
|
||||
|
||||
let result = extract_bytes(jsonl, "application/x-ndjson", &config)
|
||||
.await
|
||||
.expect("JSONL metadata extraction should succeed");
|
||||
|
||||
let data_format = result.metadata.additional.get("data_format");
|
||||
assert!(data_format.is_some(), "Metadata should contain data_format");
|
||||
assert_eq!(
|
||||
data_format.unwrap().as_str().unwrap(),
|
||||
"jsonl",
|
||||
"data_format should be 'jsonl'"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_extract_jsonl_empty_lines() {
|
||||
let config = ExtractionConfig::default();
|
||||
let path = std::path::Path::new(concat!(
|
||||
env!("CARGO_MANIFEST_DIR"),
|
||||
"/../../test_documents/jsonl/with_blanks.jsonl"
|
||||
));
|
||||
|
||||
let result = extract_file(path, None, &config)
|
||||
.await
|
||||
.expect("JSONL with blanks should succeed");
|
||||
|
||||
assert!(result.content.contains("First"), "Should contain First");
|
||||
assert!(result.content.contains("Second"), "Should contain Second");
|
||||
assert!(result.content.contains("Third"), "Should contain Third");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_extract_jsonl_content_contains_all_objects() {
|
||||
let config = ExtractionConfig::default();
|
||||
let jsonl = b"{\"a\": 1}\n{\"b\": 2}\n{\"c\": 3}";
|
||||
|
||||
let result = extract_bytes(jsonl, "application/x-ndjson", &config)
|
||||
.await
|
||||
.expect("JSONL extraction should succeed");
|
||||
|
||||
// Content is pretty-printed JSON array
|
||||
assert!(result.content.contains("\"a\": 1"));
|
||||
assert!(result.content.contains("\"b\": 2"));
|
||||
assert!(result.content.contains("\"c\": 3"));
|
||||
}
|
||||
Reference in New Issue
Block a user