Files
fil/crates/kreuzberg-cli/tests/extract_envelope.rs

238 lines
7.5 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
//! Integration tests for the JSON timing envelope added to `kreuzberg extract` and
//! `kreuzberg batch`.
//!
//! Verifies:
//! - `extract --format json` emits `{ result, extraction_time_ms }` shape
//! - `batch --format json` emits `{ results, total_ms, per_file_ms }` shape
//! - `result.metadata.ocr_used` exists as a bool field
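//! - `--pdf-backend xyz` exits non-zero and mentions "pdf-oxide"
//! - `--pdf-backend pdf-oxide --format json` succeeds and emits the
//!   `{ result, extraction_time_ms }` envelope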
use std::path::{Path, PathBuf};
use std::process::Command;
/// Returns the path to the compiled `kreuzberg` binary under test.
///
/// Prefers the `CARGO_BIN_EXE_kreuzberg` path that Cargo exposes at compile
/// time when it builds an integration test, so the result stays correct under
/// a custom `CARGO_TARGET_DIR` or a non-debug profile. When that variable was
/// not set at compile time, falls back to `<root>/target/debug/kreuzberg`,
/// where `<root>` is the directory two levels above this crate's manifest
/// directory.
///
/// # Panics
///
/// The fallback path panics if the manifest directory has fewer than two
/// parent directories.
fn kreuzberg_bin() -> PathBuf {
    // `option_env!` (rather than `env!`) keeps the file compiling even when
    // Cargo does not provide the variable for this target.
    if let Some(exe) = option_env!("CARGO_BIN_EXE_kreuzberg") {
        return PathBuf::from(exe);
    }

    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    manifest_dir
        .parent()
        .expect("crates/kreuzberg-cli parent")
        .parent()
        .expect("crates parent")
        .join("target")
        .join("debug")
        .join("kreuzberg")
}
/// Returns path to the small reference PDF used in these tests.
fn pdf_fixture() -> PathBuf {
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
manifest_dir
.parent()
.expect("crates/kreuzberg-cli parent")
.parent()
.expect("crates parent")
.join("test_documents")
.join("pdf")
.join("pdfa_001.pdf")
}
/// Returns path to the small plain-text fixture used for batch tests.
fn txt_fixture() -> PathBuf {
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
manifest_dir
.parent()
.expect("crates/kreuzberg-cli parent")
.parent()
.expect("crates parent")
.join("test_documents")
.join("text")
.join("fake_text.txt")
}
/// Builds the `kreuzberg` binary, at most once per test process.
///
/// Every test in this file calls this function, and the test harness may run
/// them concurrently. A [`std::sync::Once`] guard makes sure only the first
/// caller spawns `cargo build`; the others block until it has finished.
///
/// # Panics
///
/// Panics if `cargo` cannot be spawned or the build exits unsuccessfully. A
/// panic inside the guarded closure poisons the `Once`, so every later caller
/// panics as well.
fn build_binary() {
    static BUILD: std::sync::Once = std::sync::Once::new();

    BUILD.call_once(|| {
        let status = Command::new("cargo")
            .args(["build", "--bin", "kreuzberg"])
            .status()
            .expect("cargo build invocation failed");
        assert!(status.success(), "cargo build failed — binary unavailable");
    });
}
/// Skip-guard: reports whether `path` resolves to an existing regular file.
///
/// Tests use the result to decide whether a fixture is available. Any failure
/// to read the path's metadata (for example, the path does not exist) is
/// treated as "not available" and yields `false`.
fn fixture_exists(path: &Path) -> bool {
    match std::fs::metadata(path) {
        Ok(meta) => meta.is_file(),
        Err(_) => false,
    }
}
// ── extract --format json envelope ──────────────────────────────────────────
/// `extract --format json` must print a `{ result, extraction_time_ms }` envelope
/// whose `result.metadata.ocr_used` is `false` for the reference PDF.
///
/// Returns early (after printing a SKIP notice) when the PDF fixture is absent.
#[test]
fn test_extract_json_has_result_and_timing() {
    build_binary();

    let fixture = pdf_fixture();
    if !fixture_exists(&fixture) {
        eprintln!("SKIP: PDF fixture not found at {}", fixture.display());
        return;
    }

    let fixture_arg = fixture.to_string_lossy();
    let run = Command::new(kreuzberg_bin())
        .args(["extract", &*fixture_arg, "--format", "json"])
        .output()
        .expect("failed to run kreuzberg extract");
    let stderr = String::from_utf8_lossy(&run.stderr);
    assert!(run.status.success(), "extract exited non-zero: {stderr}");

    let envelope: serde_json::Value =
        serde_json::from_slice(&run.stdout).expect("stdout is not valid JSON");

    // Envelope shape: both top-level keys are required, and the timing must be
    // a strictly positive number.
    assert!(envelope.get("result").is_some(), "missing 'result' key in envelope");
    let elapsed_ms = envelope
        .get("extraction_time_ms")
        .and_then(serde_json::Value::as_f64)
        .expect("'extraction_time_ms' must be a number");
    assert!(
        elapsed_ms > 0.0,
        "extraction_time_ms must be positive, got {elapsed_ms}"
    );

    // `ocr_used` has to be present and typed as a boolean.
    let metadata = &envelope["result"]["metadata"];
    let ocr_flag = metadata
        .get("ocr_used")
        .expect("'result.metadata.ocr_used' must be present");
    let ocr_used = ocr_flag
        .as_bool()
        .expect("'result.metadata.ocr_used' must be a boolean");

    // No --force-ocr was passed, so the test expects OCR not to have run on
    // this fixture.
    assert!(!ocr_used, "expected ocr_used=false for native PDF extraction");
}
// ── batch --format json envelope ─────────────────────────────────────────────
/// `batch --format json` must print a `{ results, total_ms, per_file_ms }` envelope
/// with one result and one positive timing per input file, and a boolean
/// `metadata.ocr_used` on every result.
///
/// Returns early (after printing a SKIP notice) when either fixture is absent.
#[test]
fn test_batch_json_has_results_and_timing() {
    build_binary();

    let pdf_path = pdf_fixture();
    let txt_path = txt_fixture();
    if !(fixture_exists(&pdf_path) && fixture_exists(&txt_path)) {
        eprintln!("SKIP: one or more batch fixtures not found");
        return;
    }

    let pdf_arg = pdf_path.to_string_lossy();
    let txt_arg = txt_path.to_string_lossy();
    let run = Command::new(kreuzberg_bin())
        .args(["batch", &*pdf_arg, &*txt_arg, "--format", "json"])
        .output()
        .expect("failed to run kreuzberg batch");
    let stderr = String::from_utf8_lossy(&run.stderr);
    assert!(run.status.success(), "batch exited non-zero: {stderr}");

    let envelope: serde_json::Value =
        serde_json::from_slice(&run.stdout).expect("stdout is not valid JSON");

    // Envelope shape: one result per input file.
    let results = envelope
        .get("results")
        .and_then(serde_json::Value::as_array)
        .expect("'results' must be an array");
    assert_eq!(results.len(), 2, "expected 2 results for 2 input files");

    // Overall timing must be a strictly positive number.
    let total_ms = envelope
        .get("total_ms")
        .and_then(serde_json::Value::as_f64)
        .expect("'total_ms' must be a number");
    assert!(total_ms > 0.0, "total_ms must be positive, got {total_ms}");

    // Per-file timings: one strictly positive number per input file.
    let per_file_ms = envelope
        .get("per_file_ms")
        .and_then(serde_json::Value::as_array)
        .expect("'per_file_ms' must be an array");
    assert_eq!(per_file_ms.len(), 2, "per_file_ms must have one entry per file");
    let timings = per_file_ms
        .iter()
        .map(|entry| entry.as_f64().expect("per_file_ms entry must be a number"));
    for (i, ms) in timings.enumerate() {
        assert!(ms > 0.0, "per_file_ms[{i}] must be positive, got {ms}");
    }

    // Each result must have metadata.ocr_used as a bool
    for (i, entry) in results.iter().enumerate() {
        let ocr_used = entry["metadata"].get("ocr_used");
        assert!(
            matches!(ocr_used, Some(serde_json::Value::Bool(_))),
            "results[{i}].metadata.ocr_used must be a bool"
        );
    }
}
// ── --pdf-backend validation ─────────────────────────────────────────────────
/// An unrecognised `--pdf-backend` value must make `extract` exit non-zero and
/// print an error on stderr that mentions "pdf-oxide".
///
/// Returns early (after printing a SKIP notice) when the PDF fixture is absent.
#[test]
fn test_pdf_backend_invalid_value_exits_nonzero() {
    build_binary();

    let fixture = pdf_fixture();
    if !fixture_exists(&fixture) {
        eprintln!("SKIP: PDF fixture not found at {}", fixture.display());
        return;
    }

    let fixture_arg = fixture.to_string_lossy();
    let run = Command::new(kreuzberg_bin())
        .args(["extract", &*fixture_arg, "--pdf-backend", "xyz"])
        .output()
        .expect("failed to run kreuzberg extract");

    let succeeded = run.status.success();
    assert!(!succeeded, "expected non-zero exit for unknown --pdf-backend");

    let stderr = String::from_utf8_lossy(&run.stderr);
    let mentions_backend = stderr.contains("pdf-oxide");
    assert!(
        mentions_backend,
        "error message should mention 'pdf-oxide', got: {stderr}"
    );
}
/// `--pdf-backend pdf-oxide` combined with `--format json` must succeed and
/// print JSON containing both `result` and `extraction_time_ms`.
///
/// Returns early (after printing a SKIP notice) when the PDF fixture is absent.
#[test]
fn test_pdf_backend_valid_value_succeeds() {
    build_binary();

    let fixture = pdf_fixture();
    if !fixture_exists(&fixture) {
        eprintln!("SKIP: PDF fixture not found at {}", fixture.display());
        return;
    }

    let fixture_arg = fixture.to_string_lossy();
    let cli_args = [
        "extract",
        &*fixture_arg,
        "--pdf-backend",
        "pdf-oxide",
        "--format",
        "json",
    ];
    let run = Command::new(kreuzberg_bin())
        .args(cli_args)
        .output()
        .expect("failed to run kreuzberg extract");
    let stderr = String::from_utf8_lossy(&run.stderr);
    assert!(
        run.status.success(),
        "--pdf-backend pdf-oxide should succeed: {stderr}"
    );

    let envelope: serde_json::Value =
        serde_json::from_slice(&run.stdout).expect("stdout is not valid JSON");
    let has_result = envelope.get("result").is_some();
    let has_timing = envelope.get("extraction_time_ms").is_some();
    assert!(has_result, "missing 'result' key");
    assert!(has_timing, "missing 'extraction_time_ms'");
}