Files
fil/crates/kreuzberg/tests/json_schema_validation.rs

179 lines
5.5 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
//! Validates that `render_json()` output conforms to the JSON Schema
//! defined in `docs/schemas/document-content.schema.json`.
use bytes::Bytes;
use kreuzberg::rendering::render_json;
use kreuzberg::types::extraction::ExtractedImage;
use kreuzberg::types::internal_builder::InternalDocumentBuilder;
use std::borrow::Cow;
/// Load the JSON Schema once and return the parsed value.
fn load_schema() -> serde_json::Value {
let raw = include_str!("../../../docs/schemas/document-content.schema.json");
serde_json::from_str(raw).expect("schema is valid JSON")
}
/// Validate a JSON string against the document-content schema.
/// Panics with a descriptive message on validation failure.
fn validate_json(json_str: &str) {
let schema = load_schema();
let instance: serde_json::Value = serde_json::from_str(json_str).expect("output is valid JSON");
let validator = jsonschema::validator_for(&schema).expect("schema compiles");
let mut errors = Vec::new();
for error in validator.iter_errors(&instance) {
errors.push(format!(" - {error}"));
}
if !errors.is_empty() {
panic!(
"JSON Schema validation failed for document:\n{}\nErrors:\n{}",
serde_json::to_string_pretty(&instance).unwrap_or_default(),
errors.join("\n")
);
}
}
fn make_image(description: Option<String>, source_path: Option<String>, data: Bytes) -> ExtractedImage {
ExtractedImage {
data,
format: Cow::Borrowed("png"),
image_index: 0,
page_number: None,
width: None,
height: None,
colorspace: None,
bits_per_component: None,
is_mask: false,
description,
ocr_result: None,
bounding_box: None,
source_path,
image_kind: None,
kind_confidence: None,
cluster_id: None,
}
}
#[test]
fn test_empty_document_validates() {
let b = InternalDocumentBuilder::new("test");
let doc = b.build();
let json_str = render_json(&doc);
validate_json(&json_str);
}
#[test]
fn test_headings_and_paragraphs_validate() {
let mut b = InternalDocumentBuilder::new("test");
b.push_title("My Document", None, None);
b.push_heading(1, "Introduction", None, None);
b.push_paragraph("This is the intro.", vec![], None, None);
b.push_heading(2, "Details", None, None);
b.push_paragraph("Some details here.", vec![], None, None);
b.push_heading(1, "Conclusion", None, None);
b.push_paragraph("Final thoughts.", vec![], None, None);
let doc = b.build();
let json_str = render_json(&doc);
validate_json(&json_str);
}
#[test]
fn test_tables_code_blocks_and_lists_validate() {
let mut b = InternalDocumentBuilder::new("test");
// Table
let cells = vec![
vec!["Name".to_string(), "Age".to_string()],
vec!["Alice".to_string(), "30".to_string()],
vec!["Bob".to_string(), "25".to_string()],
];
b.push_table_from_cells(&cells, None, None);
// Code block with language
b.push_code("fn main() {}", Some("rust"), None, None);
// Code block without language
b.push_code("echo hello", None, None, None);
// Unordered list
b.push_list(false);
b.push_list_item("Apples", false, vec![], None, None);
b.push_list_item("Bananas", false, vec![], None, None);
b.end_list();
// Ordered list
b.push_list(true);
b.push_list_item("First", true, vec![], None, None);
b.push_list_item("Second", true, vec![], None, None);
b.end_list();
let doc = b.build();
let json_str = render_json(&doc);
validate_json(&json_str);
}
#[test]
fn test_images_formulas_blockquotes_validate() {
let mut b = InternalDocumentBuilder::new("test");
// Image with description and data
let image = make_image(
Some("A diagram".to_string()),
None,
Bytes::from_static(b"fake-png-data"),
);
b.push_image(Some("A diagram"), image, None, None);
// Image with no description and a source path
let image2 = make_image(None, Some("/images/photo.jpg".to_string()), Bytes::new());
b.push_image(None, image2, None, None);
// Formula
b.push_formula("E = mc^2", None, None);
b.push_formula("\\int_0^1 x^2 dx", None, None);
// Blockquote
b.push_quote_start();
b.push_paragraph("To be or not to be.", vec![], None, None);
b.push_quote_end();
let doc = b.build();
let json_str = render_json(&doc);
validate_json(&json_str);
}
#[test]
fn test_complex_nested_document_validates() {
let mut b = InternalDocumentBuilder::new("test");
b.push_title("Complex Document", None, None);
// Section with mixed content
b.push_heading(1, "Overview", None, None);
b.push_paragraph("This document tests all node types.", vec![], None, None);
// Nested section
b.push_heading(2, "Data", None, None);
let cells = vec![
vec!["Key".to_string(), "Value".to_string()],
vec!["x".to_string(), "1".to_string()],
];
b.push_table_from_cells(&cells, None, None);
b.push_code("let x = 1;", Some("rust"), None, None);
// Another top-level section
b.push_heading(1, "Appendix", None, None);
b.push_formula("a^2 + b^2 = c^2", None, None);
b.push_list(false);
b.push_list_item("Note 1", false, vec![], None, None);
b.push_list_item("Note 2", false, vec![], None, None);
b.end_list();
b.push_quote_start();
b.push_paragraph("Important quote.", vec![], None, None);
b.push_quote_end();
let doc = b.build();
let json_str = render_json(&doc);
validate_json(&json_str);
}