89 lines
3.0 KiB
Rust
89 lines
3.0 KiB
Rust
|
|
//! JSONL (newline-delimited JSON) integration tests.
//!
//! Tests end-to-end extraction of JSONL input (`.jsonl` fixture files and
//! in-memory `application/x-ndjson` payloads) through the full extraction
//! pipeline, verifying content preservation, metadata extraction, and
//! blank-line handling.
|
||
|
|
|
||
|
|
use kreuzberg::core::config::ExtractionConfig;
|
||
|
|
use kreuzberg::core::extractor::{extract_bytes, extract_file};
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_extract_jsonl_file() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
let path = std::path::Path::new(concat!(
|
||
|
|
env!("CARGO_MANIFEST_DIR"),
|
||
|
|
"/../../test_documents/jsonl/simple.jsonl"
|
||
|
|
));
|
||
|
|
|
||
|
|
let result = extract_file(path, None, &config)
|
||
|
|
.await
|
||
|
|
.expect("JSONL file extraction should succeed");
|
||
|
|
|
||
|
|
assert!(result.content.contains("Alice"), "Should contain Alice");
|
||
|
|
assert!(result.content.contains("Bob"), "Should contain Bob");
|
||
|
|
assert!(result.content.contains("Carol"), "Should contain Carol");
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_extract_jsonl_bytes() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
let jsonl = b"{\"name\": \"Alice\"}\n{\"name\": \"Bob\"}";
|
||
|
|
|
||
|
|
let result = extract_bytes(jsonl, "application/x-ndjson", &config)
|
||
|
|
.await
|
||
|
|
.expect("JSONL bytes extraction should succeed");
|
||
|
|
|
||
|
|
assert!(result.content.contains("Alice"));
|
||
|
|
assert!(result.content.contains("Bob"));
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_extract_jsonl_metadata() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
let jsonl = b"{\"title\": \"Doc One\"}\n{\"title\": \"Doc Two\"}";
|
||
|
|
|
||
|
|
let result = extract_bytes(jsonl, "application/x-ndjson", &config)
|
||
|
|
.await
|
||
|
|
.expect("JSONL metadata extraction should succeed");
|
||
|
|
|
||
|
|
let data_format = result.metadata.additional.get("data_format");
|
||
|
|
assert!(data_format.is_some(), "Metadata should contain data_format");
|
||
|
|
assert_eq!(
|
||
|
|
data_format.unwrap().as_str().unwrap(),
|
||
|
|
"jsonl",
|
||
|
|
"data_format should be 'jsonl'"
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_extract_jsonl_empty_lines() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
let path = std::path::Path::new(concat!(
|
||
|
|
env!("CARGO_MANIFEST_DIR"),
|
||
|
|
"/../../test_documents/jsonl/with_blanks.jsonl"
|
||
|
|
));
|
||
|
|
|
||
|
|
let result = extract_file(path, None, &config)
|
||
|
|
.await
|
||
|
|
.expect("JSONL with blanks should succeed");
|
||
|
|
|
||
|
|
assert!(result.content.contains("First"), "Should contain First");
|
||
|
|
assert!(result.content.contains("Second"), "Should contain Second");
|
||
|
|
assert!(result.content.contains("Third"), "Should contain Third");
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_extract_jsonl_content_contains_all_objects() {
|
||
|
|
let config = ExtractionConfig::default();
|
||
|
|
let jsonl = b"{\"a\": 1}\n{\"b\": 2}\n{\"c\": 3}";
|
||
|
|
|
||
|
|
let result = extract_bytes(jsonl, "application/x-ndjson", &config)
|
||
|
|
.await
|
||
|
|
.expect("JSONL extraction should succeed");
|
||
|
|
|
||
|
|
// Content is pretty-printed JSON array
|
||
|
|
assert!(result.content.contains("\"a\": 1"));
|
||
|
|
assert!(result.content.contains("\"b\": 2"));
|
||
|
|
assert!(result.content.contains("\"c\": 3"));
|
||
|
|
}
|