//! Integration scaffold for the generated fixture corpus produced by //! `tools/generate_test_fixtures/`. //! //! Each generator under `tools/generate_test_fixtures/src/generate_test_fixtures/` //! writes a binary fixture alongside a `*.gt.json` ground-truth sidecar. This //! module: //! //! 1. Defines `FixtureGt` — a Rust mirror of the Python `GroundTruth` //! dataclass (see `tools/generate_test_fixtures/src/generate_test_fixtures/gt_schema.py`). //! 2. Exposes `load_fixture(name)` which returns `(PathBuf, FixtureGt)` for a //! fixture whose stem (`docx_track_changes_basic`, `xlsx_revisions_basic`, //! …) is given. //! 3. Includes one `#[ignore]`d example test per feature stream that //! demonstrates the load-and-assert shape. Tests are gated behind //! `#[ignore]` because the binary fixtures are not yet checked into the //! `test_documents/` submodule — the harness lands first, and the //! fixtures follow once the user decides where they belong. //! //! To run the example tests locally after generating fixtures: //! //! ```text //! task fixtures:generate //! cargo test --test generated_fixtures -- --ignored //! ``` #![allow(dead_code, missing_docs)] use std::fs; use std::path::{Path, PathBuf}; use serde::Deserialize; /// Mirror of the Python `GroundTruth` dataclass. /// /// The `expectations` field is intentionally typed as `serde_json::Value` — /// the shape varies per feature, and the per-feature assertion helpers below /// pick fields out of it directly. See the Python `gt_schema.py` module for /// the per-feature shape (`revisions_expectation`, `diff_expectation`, /// `security_expectation`). #[derive(Debug, Clone, Deserialize)] pub struct FixtureGt { pub fixture_path: String, pub format: String, pub feature: String, pub expectations: serde_json::Value, pub generated_by: String, } /// Walk upward from `CARGO_MANIFEST_DIR` to find the repository root. /// /// Anchored on the presence of `Cargo.toml` AND `test_documents/`. Falls back /// to `CARGO_MANIFEST_DIR` itself when no marker is found, which makes /// failures show useful paths rather than panicking deep inside the loader. fn repo_root() -> PathBuf { let mut current = PathBuf::from(env!("CARGO_MANIFEST_DIR")); loop { if current.join("Cargo.toml").is_file() && current.join("test_documents").is_dir() { return current; } if !current.pop() { return PathBuf::from(env!("CARGO_MANIFEST_DIR")); } } } /// Root of the generated-fixtures tree. fn generated_root() -> PathBuf { repo_root().join("test_documents").join("generated") } /// Search the generated tree for a fixture whose stem matches `name`. /// /// Returns the `(binary_path, ground_truth)` pair. Panics with a descriptive /// message when either the binary or the sidecar is missing — integration /// tests should `#[ignore]` themselves when the generated tree is empty. pub fn load_fixture(name: &str) -> (PathBuf, FixtureGt) { let root = generated_root(); let sidecar = find_sidecar(&root, name).unwrap_or_else(|| { panic!( "ground-truth sidecar {name}.gt.json not found under {root}\n\ Run `task fixtures:generate` to produce the fixture set.", root = root.display(), ) }); let raw = fs::read_to_string(&sidecar) .unwrap_or_else(|e| panic!("failed to read {}: {e}", sidecar.display())); let gt: FixtureGt = serde_json::from_str(&raw) .unwrap_or_else(|e| panic!("failed to parse {}: {e}", sidecar.display())); let binary = repo_root().join(>.fixture_path); assert!( binary.is_file(), "binary fixture {} declared by {} does not exist", binary.display(), sidecar.display(), ); (binary, gt) } fn find_sidecar(root: &Path, name: &str) -> Option { let needle = format!("{name}.gt.json"); let mut stack = vec![root.to_path_buf()]; while let Some(dir) = stack.pop() { let entries = match fs::read_dir(&dir) { Ok(e) => e, Err(_) => continue, }; for entry in entries.flatten() { let path = entry.path(); if path.is_dir() { stack.push(path); } else if path.file_name().and_then(|s| s.to_str()) == Some(needle.as_str()) { return Some(path); } } } None } /// Returns `true` when the generated-fixtures tree is empty / absent. /// /// The example tests below short-circuit through this so a fresh clone (no /// submodule init, no `task fixtures:generate` run) doesn't fail the suite /// noisily — the `#[ignore]` gate is the primary guard, this is belt + braces. fn generated_tree_present() -> bool { generated_root().is_dir() && fs::read_dir(generated_root()) .map(|mut it| it.next().is_some()) .unwrap_or(false) } // ── Example tests ───────────────────────────────────────────────────────────── // // One per feature stream. Each is `#[ignore]`d until the binary fixtures land // in `test_documents/`. They double as documentation of the intended // integration-test shape. #[test] #[ignore = "requires fixtures from `task fixtures:generate` to be checked into test_documents/"] fn revisions_docx_track_changes_basic_matches_ground_truth() { if !generated_tree_present() { return; } let (fixture_path, gt) = load_fixture("docx_track_changes_basic"); assert_eq!(gt.feature, "revisions"); assert_eq!(gt.format, "docx"); let expected_count = gt.expectations["expected_count"] .as_u64() .expect("expected_count must be present in revisions GT"); assert!(expected_count > 0, "revisions fixture must declare at least one revision"); // When the kreuzberg extractor lands the revisions field on // ExtractionResult, replace the placeholder below with: // // let result = kreuzberg::extract::extract_file_sync(&fixture_path, &cfg)?; // let revisions = result.revisions.expect("DOCX track-changes fixture must yield revisions"); // assert_eq!(revisions.len() as u64, expected_count); let _ = fixture_path; } #[test] #[ignore = "requires fixtures from `task fixtures:generate` to be checked into test_documents/"] fn diff_xlsx_budget_pair_round_trips() { if !generated_tree_present() { return; } let (_v1_path, gt) = load_fixture("xlsx_budget_v1"); assert_eq!(gt.feature, "diff"); let cell_changes = gt.expectations["table_cell_changes"] .as_array() .expect("table_cell_changes must be an array"); assert_eq!(cell_changes.len(), 1, "budget diff must declare exactly one cell change"); let change = &cell_changes[0]; assert_eq!(change["row"].as_u64(), Some(1)); assert_eq!(change["col"].as_u64(), Some(1)); assert_eq!(change["from"].as_str(), Some("100")); assert_eq!(change["to"].as_str(), Some("150")); // When the integration lands: // // let v1 = kreuzberg::extract::extract_file_sync(&v1_path, &cfg)?; // let v2 = kreuzberg::extract::extract_file_sync(&v2_path, &cfg)?; // let diff = kreuzberg::diff::compare(&v1, &v2, &DiffOptions::default()); // assert_eq!(diff.tables_changed.len(), 1); // assert_eq!(diff.tables_changed[0].cell_changes.len(), 1); } #[test] #[ignore = "requires fixtures from `task fixtures:generate` to be checked into test_documents/"] fn security_zip_bomb_pathological_is_rejected() { if !generated_tree_present() { return; } let (_fixture_path, gt) = load_fixture("zip_bomb_xlsx_pathological"); assert_eq!(gt.feature, "security"); assert_eq!( gt.expectations["should_extract"].as_bool(), Some(false), "pathological zip-bomb fixture must declare should_extract=false", ); let warnings = gt.expectations["expected_warnings"] .as_array() .expect("expected_warnings must be an array"); let has_bomb_term = warnings .iter() .filter_map(|v| v.as_str()) .any(|s| s.contains("bomb") || s.contains("zip")); assert!(has_bomb_term, "pathological zip-bomb GT must mention zip/bomb in warnings"); }