Files
fil/crates/kreuzberg/tests/jsonl_integration.rs

89 lines
3.0 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
//! JSONL (newline-delimited JSON) integration tests.
//!
//! Tests end-to-end extraction of `.jsonl` / `.ndjson` files through the full
//! extraction pipeline, verifying content preservation, metadata extraction,
//! and blank-line handling.
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::{extract_bytes, extract_file};
#[tokio::test]
async fn test_extract_jsonl_file() {
let config = ExtractionConfig::default();
let path = std::path::Path::new(concat!(
env!("CARGO_MANIFEST_DIR"),
"/../../test_documents/jsonl/simple.jsonl"
));
let result = extract_file(path, None, &config)
.await
.expect("JSONL file extraction should succeed");
assert!(result.content.contains("Alice"), "Should contain Alice");
assert!(result.content.contains("Bob"), "Should contain Bob");
assert!(result.content.contains("Carol"), "Should contain Carol");
}
#[tokio::test]
async fn test_extract_jsonl_bytes() {
let config = ExtractionConfig::default();
let jsonl = b"{\"name\": \"Alice\"}\n{\"name\": \"Bob\"}";
let result = extract_bytes(jsonl, "application/x-ndjson", &config)
.await
.expect("JSONL bytes extraction should succeed");
assert!(result.content.contains("Alice"));
assert!(result.content.contains("Bob"));
}
#[tokio::test]
async fn test_extract_jsonl_metadata() {
let config = ExtractionConfig::default();
let jsonl = b"{\"title\": \"Doc One\"}\n{\"title\": \"Doc Two\"}";
let result = extract_bytes(jsonl, "application/x-ndjson", &config)
.await
.expect("JSONL metadata extraction should succeed");
let data_format = result.metadata.additional.get("data_format");
assert!(data_format.is_some(), "Metadata should contain data_format");
assert_eq!(
data_format.unwrap().as_str().unwrap(),
"jsonl",
"data_format should be 'jsonl'"
);
}
#[tokio::test]
async fn test_extract_jsonl_empty_lines() {
let config = ExtractionConfig::default();
let path = std::path::Path::new(concat!(
env!("CARGO_MANIFEST_DIR"),
"/../../test_documents/jsonl/with_blanks.jsonl"
));
let result = extract_file(path, None, &config)
.await
.expect("JSONL with blanks should succeed");
assert!(result.content.contains("First"), "Should contain First");
assert!(result.content.contains("Second"), "Should contain Second");
assert!(result.content.contains("Third"), "Should contain Third");
}
#[tokio::test]
async fn test_extract_jsonl_content_contains_all_objects() {
let config = ExtractionConfig::default();
let jsonl = b"{\"a\": 1}\n{\"b\": 2}\n{\"c\": 3}";
let result = extract_bytes(jsonl, "application/x-ndjson", &config)
.await
.expect("JSONL extraction should succeed");
// Content is pretty-printed JSON array
assert!(result.content.contains("\"a\": 1"));
assert!(result.content.contains("\"b\": 2"));
assert!(result.content.contains("\"c\": 3"));
}