//! Tests verifying XML extraction produces embedding-friendly hierarchical output.
//!
//! Indented output preserves document structure so that related elements
//! (e.g. a plant's name and zone) stay grouped together for vector search.
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_bytes;
/// Sibling elements should be grouped under their parent with indentation.
///
/// The fixture is a one-plant catalog. Each leaf label (`COMMON`, `ZONE`) must
/// sit on an indented line and be immediately followed by its text on another
/// indented line. The check deliberately ignores the exact indent width so a
/// change to the indent step does not break it.
#[tokio::test]
async fn test_xml_preserves_hierarchy() {
    let config = ExtractionConfig::default();
    let xml = br#"<CATALOG><PLANT><COMMON>Bloodroot</COMMON><ZONE>4</ZONE></PLANT></CATALOG>"#;
    let result = extract_bytes(xml, "application/xml", &config).await.unwrap();

    assert!(
        result.content.contains("PLANT"),
        "missing PLANT label in: {:?}",
        result.content
    );

    // PLANT children should be indented under PLANT, each label directly
    // followed by its value.
    let lines: Vec<&str> = result.content.lines().collect();
    for (label, value) in [("COMMON", "Bloodroot"), ("ZONE", "4")] {
        let idx = lines
            .iter()
            .position(|line| line.trim() == label)
            .unwrap_or_else(|| panic!("missing `{label}` label in: {:?}", result.content));
        let label_line = lines[idx];
        let value_line = lines
            .get(idx + 1)
            .copied()
            .unwrap_or_else(|| panic!("no line follows `{label}` in: {:?}", result.content));

        assert!(
            label_line.starts_with(char::is_whitespace),
            "`{label}` should be indented, got: {label_line:?}"
        );
        assert!(
            value_line.starts_with(char::is_whitespace),
            "value under `{label}` should be indented, got: {value_line:?}"
        );
        assert_eq!(
            value_line.trim(),
            value,
            "expected `{value}` directly under `{label}` in: {:?}",
            result.content
        );
    }
}
/// Deeper nesting should produce deeper indentation.
///
/// Uses a three-level document and checks that `grandchild` is indented more
/// than `child`, and that the text `Deep` follows the `grandchild` label on an
/// indented line. Exact indent widths are not asserted.
#[tokio::test]
async fn test_xml_indentation_shows_nesting() {
    let config = ExtractionConfig::default();
    let xml = b"<root><child><grandchild>Deep</grandchild></child></root>";
    let result = extract_bytes(xml, "application/xml", &config).await.unwrap();

    let lines: Vec<&str> = result.content.lines().collect();
    let indent_of = |line: &str| line.len() - line.trim_start().len();
    let index_of = |label: &str| {
        lines
            .iter()
            .position(|line| line.trim() == label)
            .unwrap_or_else(|| panic!("missing `{label}` label in: {:?}", result.content))
    };

    let child_idx = index_of("child");
    let grandchild_idx = index_of("grandchild");

    assert!(
        indent_of(lines[grandchild_idx]) > indent_of(lines[child_idx]),
        "grandchild should be indented deeper than child, got: {:?}",
        result.content
    );

    // The text node must come right after its element label, still indented.
    let text_line = lines
        .get(grandchild_idx + 1)
        .copied()
        .unwrap_or_else(|| panic!("no line follows `grandchild` in: {:?}", result.content));
    assert!(
        text_line.starts_with(char::is_whitespace),
        "text under grandchild should be indented, got: {text_line:?}"
    );
    assert_eq!(text_line.trim(), "Deep");
}
/// Attributes should appear inline with the element label, in any order.
///
/// The fixture carries two attributes on `item`; both must show up on the
/// `item (...)` label line and the element text must still be extracted.
#[tokio::test]
async fn test_xml_attributes_inline() {
    let config = ExtractionConfig::default();
    let xml = br#"<root><item id="42" type="book">Title</item></root>"#;
    let result = extract_bytes(xml, "application/xml", &config).await.unwrap();

    // Both attributes must appear inline with the `item` label, but order
    // is not part of the contract — `AHashMap` iteration is non-deterministic
    // and the renderer sorts alphabetically for stability across runs.
    let item_line = result
        .content
        .lines()
        .find(|line| line.contains("item ("))
        .expect("expected an `item (...)` label line");

    for attr in ["type: book", "id: 42"] {
        assert!(item_line.contains(attr), "missing `{attr}` in: {item_line:?}");
    }
    assert!(
        result.content.contains("Title"),
        "element text missing from: {:?}",
        result.content
    );
}
/// Sibling groups should be separated by a blank line, regardless of the
/// indent depth at which the siblings sit.
///
/// The two `PLANT` siblings are nested under `CATALOG`, so they do not sit at
/// the top indent level.
#[tokio::test]
async fn test_xml_sibling_separation() {
    let config = ExtractionConfig::default();
    let xml = b"<CATALOG><PLANT><COMMON>A</COMMON></PLANT><PLANT><COMMON>B</COMMON></PLANT></CATALOG>";
    let result = extract_bytes(xml, "application/xml", &config).await.unwrap();

    // Both siblings are present.
    assert_eq!(
        result.content.matches("PLANT").count(),
        2,
        "expected two PLANT siblings, got: {:?}",
        result.content
    );

    // A blank line appears somewhere between the two PLANT labels. With
    // exactly two matches, the second split piece is the text between them.
    let between = result
        .content
        .split("PLANT")
        .nth(1)
        .expect("two PLANT labels imply a middle segment");
    assert!(
        between.contains("\n\n"),
        "expected blank line between sibling PLANT entries, got: {between:?}"
    );
}
/// Namespace attributes (xmlns:*) should be filtered from output.
///
/// The root element declares a namespace prefix alongside a regular `id`
/// attribute; only the latter may appear in the rendered label.
#[tokio::test]
async fn test_xml_namespace_filtering() {
    let config = ExtractionConfig::default();
    let xml = br#"<root xmlns:ns="http://example.com" id="1"><item>Text</item></root>"#;
    let result = extract_bytes(xml, "application/xml", &config).await.unwrap();

    assert!(!result.content.contains("xmlns"), "Namespace attrs should be filtered");
    assert!(
        result.content.contains("root (id: 1)"),
        "Non-namespace attrs should be preserved"
    );
    assert!(
        result.content.contains("Text"),
        "element text missing from: {:?}",
        result.content
    );
}
/// Mixed content (text between elements) should be preserved with indentation.
///
/// The root holds text, then a child element, then more text; all three text
/// fragments must survive extraction.
#[tokio::test]
async fn test_xml_mixed_content() {
    let config = ExtractionConfig::default();
    let xml = b"<root>Text before<item>nested</item>Text after</root>";
    let result = extract_bytes(xml, "application/xml", &config).await.unwrap();

    for fragment in ["Text before", "nested", "Text after"] {
        assert!(
            result.content.contains(fragment),
            "missing `{fragment}` in: {:?}",
            result.content
        );
    }
}
/// Self-closing tags should appear in the output.
///
/// An empty `<item/>` has no text, so its label (with attributes) is the only
/// evidence that it was seen.
#[tokio::test]
async fn test_xml_self_closing_tags() {
    let config = ExtractionConfig::default();
    let xml = br#"<root><item type="empty"/></root>"#;
    let result = extract_bytes(xml, "application/xml", &config).await.unwrap();

    assert!(
        result.content.contains("item (type: empty)"),
        "self-closing element label missing from: {:?}",
        result.content
    );
}
/// Empty attribute values should be filtered from the label.
///
/// `id=""` must be dropped while the non-empty `type` attribute is kept.
#[tokio::test]
async fn test_xml_empty_attribute_filtered() {
    let config = ExtractionConfig::default();
    let xml = br#"<root><item id="" type="book">Text</item></root>"#;
    let result = extract_bytes(xml, "application/xml", &config).await.unwrap();

    assert!(
        result.content.contains("item (type: book)"),
        "non-empty attribute missing from: {:?}",
        result.content
    );
    assert!(!result.content.contains("id:"), "Empty attribute should be filtered");
}
/// Text directly inside the root element should still be indented.
///
/// This test only asserts that the root label and its text are both present;
/// it does not pin the indent width.
#[tokio::test]
async fn test_xml_root_level_text() {
    let config = ExtractionConfig::default();
    let xml = b"<root>Some text</root>";
    let result = extract_bytes(xml, "application/xml", &config).await.unwrap();

    for expected in ["root", "Some text"] {
        assert!(
            result.content.contains(expected),
            "missing `{expected}` in: {:?}",
            result.content
        );
    }
}
/// Real XML file should produce grouped plant entries: each plant's COMMON
/// label and its value should appear close together under the plant's label.
///
/// The fixture lives outside the crate, so the test is skipped (with a note on
/// stderr) when the file is not present.
#[tokio::test]
async fn test_xml_real_file_plant_catalog() {
    let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/xml/plant_catalog.xml");
    if !path.exists() {
        eprintln!("skipping test_xml_real_file_plant_catalog: {} not found", path.display());
        return;
    }
    let content = std::fs::read(&path).unwrap();
    let config = ExtractionConfig::default();
    let result = extract_bytes(&content, "application/xml", &config).await.unwrap();

    // The assertion is loose on exact indentation so that future tweaks to
    // the indent step or blank-line policy don't break it.
    for plant in ["Bloodroot", "Columbine"] {
        let pos_value = result
            .content
            .find(plant)
            .unwrap_or_else(|| panic!("missing plant value: {plant}"));
        let preceding = &result.content[..pos_value];

        // Anchor on the PLANT label nearest to the value, so labels belonging
        // to an earlier plant cannot satisfy the check.
        let pos_plant = preceding
            .rfind("PLANT")
            .unwrap_or_else(|| panic!("no PLANT label precedes {plant}"));

        // "Grouped together" means a COMMON label sits between that PLANT
        // label and the value.
        assert!(
            preceding[pos_plant..].contains("COMMON"),
            "expected `PLANT ... COMMON` to precede {plant}; PLANT at {pos_plant}, value at {pos_value}"
        );
    }
}