238 lines
7.5 KiB
Rust
238 lines
7.5 KiB
Rust
|
|
//! Integration tests for the JSON timing envelope added to `kreuzberg extract` and
//! `kreuzberg batch`.
//!
//! Verifies:
//! - `extract --format json` emits the `{ result, extraction_time_ms }` shape
//! - `batch --format json` emits the `{ results, total_ms, per_file_ms }` shape
//! - `result.metadata.ocr_used` exists as a bool field
//! - `--pdf-backend xyz` exits non-zero and mentions "pdf-oxide"
|
||
|
|
|
||
|
|
use std::path::{Path, PathBuf};
|
||
|
|
use std::process::Command;
|
||
|
|
|
||
|
|
/// Returns path to the compiled `kreuzberg` binary (debug build).
|
||
|
|
fn kreuzberg_bin() -> PathBuf {
|
||
|
|
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||
|
|
manifest_dir
|
||
|
|
.parent()
|
||
|
|
.expect("crates/kreuzberg-cli parent")
|
||
|
|
.parent()
|
||
|
|
.expect("crates parent")
|
||
|
|
.join("target")
|
||
|
|
.join("debug")
|
||
|
|
.join("kreuzberg")
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Returns path to the small reference PDF used in these tests.
|
||
|
|
fn pdf_fixture() -> PathBuf {
|
||
|
|
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||
|
|
manifest_dir
|
||
|
|
.parent()
|
||
|
|
.expect("crates/kreuzberg-cli parent")
|
||
|
|
.parent()
|
||
|
|
.expect("crates parent")
|
||
|
|
.join("test_documents")
|
||
|
|
.join("pdf")
|
||
|
|
.join("pdfa_001.pdf")
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Returns path to the small plain-text fixture used for batch tests.
|
||
|
|
fn txt_fixture() -> PathBuf {
|
||
|
|
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
|
||
|
|
manifest_dir
|
||
|
|
.parent()
|
||
|
|
.expect("crates/kreuzberg-cli parent")
|
||
|
|
.parent()
|
||
|
|
.expect("crates parent")
|
||
|
|
.join("test_documents")
|
||
|
|
.join("text")
|
||
|
|
.join("fake_text.txt")
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Builds the `kreuzberg` binary at most once per test process. Panics on failure.
///
/// Every test in this file calls this helper, and the test harness may run them on
/// parallel threads. The `Once` guard makes the first caller run `cargo build` while the
/// others wait for it, instead of each test spawning its own `cargo build`.
///
/// # Panics
///
/// Panics if `cargo` cannot be spawned or the build exits unsuccessfully. A panic inside
/// the guarded closure poisons the `Once`, so later callers panic as well rather than
/// proceeding without a binary.
fn build_binary() {
    static BUILD: std::sync::Once = std::sync::Once::new();

    BUILD.call_once(|| {
        let status = Command::new("cargo")
            .args(["build", "--bin", "kreuzberg"])
            .status()
            .expect("cargo build invocation failed");
        assert!(status.success(), "cargo build failed — binary unavailable");
    });
}
|
||
|
|
|
||
|
|
/// Skip-guard: reports whether `path` refers to an existing regular file, so a test can
/// bail out early when its fixture is missing.
///
/// Returns `false` when the path is absent, cannot be inspected, or is not a file
/// (for example a directory).
fn fixture_exists(path: &Path) -> bool {
    // A failed metadata lookup covers the "does not exist" case, so one query suffices.
    matches!(path.metadata(), Ok(meta) if meta.is_file())
}
|
||
|
|
|
||
|
|
// ── extract --format json envelope ──────────────────────────────────────────
|
||
|
|
|
||
|
|
/// `extract --format json` must emit a `{ result, extraction_time_ms }` envelope with a
/// positive timing and a boolean `result.metadata.ocr_used` that is `false` for the
/// reference PDF.
///
/// Returns early (printing a SKIP note to stderr) when the PDF fixture is missing.
#[test]
fn test_extract_json_has_result_and_timing() {
    build_binary();

    let pdf = pdf_fixture();
    if !fixture_exists(&pdf) {
        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
        return;
    }

    // Hand the path over as an OS string: `to_string_lossy` would replace any non-UTF-8
    // bytes and make the CLI look for a file that does not exist.
    let output = Command::new(kreuzberg_bin())
        .arg("extract")
        .arg(&pdf)
        .args(["--format", "json"])
        .output()
        .expect("failed to run kreuzberg extract");

    assert!(
        output.status.success(),
        "extract exited non-zero: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let json: serde_json::Value = serde_json::from_slice(&output.stdout).expect("stdout is not valid JSON");

    // Envelope shape
    assert!(json.get("result").is_some(), "missing 'result' key in envelope");
    let extraction_time_ms = json
        .get("extraction_time_ms")
        .and_then(|v| v.as_f64())
        .expect("'extraction_time_ms' must be a number");
    assert!(
        extraction_time_ms > 0.0,
        "extraction_time_ms must be positive, got {extraction_time_ms}"
    );

    // ocr_used field must exist as a bool
    let ocr_used = json["result"]["metadata"]
        .get("ocr_used")
        .expect("'result.metadata.ocr_used' must be present")
        .as_bool()
        .expect("'result.metadata.ocr_used' must be a boolean");
    // For a native-text PDF without --force-ocr, OCR should NOT have run.
    assert!(!ocr_used, "expected ocr_used=false for native PDF extraction");
}
|
||
|
|
|
||
|
|
// ── batch --format json envelope ─────────────────────────────────────────────
|
||
|
|
|
||
|
|
/// `batch --format json` must emit a `{ results, total_ms, per_file_ms }` envelope: one
/// result and one positive timing per input file, a positive total, and a boolean
/// `metadata.ocr_used` on every result.
///
/// Returns early (printing a SKIP note to stderr) when either fixture is missing.
#[test]
fn test_batch_json_has_results_and_timing() {
    build_binary();

    let pdf = pdf_fixture();
    let txt = txt_fixture();
    if !fixture_exists(&pdf) || !fixture_exists(&txt) {
        eprintln!("SKIP: one or more batch fixtures not found");
        return;
    }

    // Hand the paths over as OS strings: `to_string_lossy` would replace any non-UTF-8
    // bytes and make the CLI look for files that do not exist.
    let output = Command::new(kreuzberg_bin())
        .arg("batch")
        .arg(&pdf)
        .arg(&txt)
        .args(["--format", "json"])
        .output()
        .expect("failed to run kreuzberg batch");

    assert!(
        output.status.success(),
        "batch exited non-zero: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let json: serde_json::Value = serde_json::from_slice(&output.stdout).expect("stdout is not valid JSON");

    // Envelope shape
    let results = json
        .get("results")
        .and_then(|v| v.as_array())
        .expect("'results' must be an array");
    assert_eq!(results.len(), 2, "expected 2 results for 2 input files");

    let total_ms = json
        .get("total_ms")
        .and_then(|v| v.as_f64())
        .expect("'total_ms' must be a number");
    assert!(total_ms > 0.0, "total_ms must be positive, got {total_ms}");

    let per_file_ms = json
        .get("per_file_ms")
        .and_then(|v| v.as_array())
        .expect("'per_file_ms' must be an array");
    assert_eq!(per_file_ms.len(), 2, "per_file_ms must have one entry per file");

    for (i, timing) in per_file_ms.iter().enumerate() {
        let ms = timing.as_f64().expect("per_file_ms entry must be a number");
        assert!(ms > 0.0, "per_file_ms[{i}] must be positive, got {ms}");
    }

    // Each result must have metadata.ocr_used as a bool
    for (i, result) in results.iter().enumerate() {
        assert!(
            result["metadata"].get("ocr_used").and_then(|v| v.as_bool()).is_some(),
            "results[{i}].metadata.ocr_used must be a bool"
        );
    }
}
|
||
|
|
|
||
|
|
// ── --pdf-backend validation ─────────────────────────────────────────────────
|
||
|
|
|
||
|
|
/// An unknown `--pdf-backend` value must make `extract` exit non-zero with an error on
/// stderr that mentions "pdf-oxide".
///
/// Returns early (printing a SKIP note to stderr) when the PDF fixture is missing.
#[test]
fn test_pdf_backend_invalid_value_exits_nonzero() {
    build_binary();

    let pdf = pdf_fixture();
    if !fixture_exists(&pdf) {
        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
        return;
    }

    // Hand the path over as an OS string so a non-UTF-8 path cannot be mangled into a
    // different failure than the one under test.
    let output = Command::new(kreuzberg_bin())
        .arg("extract")
        .arg(&pdf)
        .args(["--pdf-backend", "xyz"])
        .output()
        .expect("failed to run kreuzberg extract");

    assert!(
        !output.status.success(),
        "expected non-zero exit for unknown --pdf-backend"
    );

    let stderr = String::from_utf8_lossy(&output.stderr);
    assert!(
        stderr.contains("pdf-oxide"),
        "error message should mention 'pdf-oxide', got: {stderr}"
    );
}
|
||
|
|
|
||
|
|
/// `--pdf-backend pdf-oxide` must be accepted: `extract` succeeds and still emits the
/// JSON envelope with `result` and `extraction_time_ms` keys.
///
/// Returns early (printing a SKIP note to stderr) when the PDF fixture is missing.
#[test]
fn test_pdf_backend_valid_value_succeeds() {
    build_binary();

    let pdf = pdf_fixture();
    if !fixture_exists(&pdf) {
        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
        return;
    }

    // Hand the path over as an OS string: `to_string_lossy` would replace any non-UTF-8
    // bytes and make the CLI look for a file that does not exist.
    let output = Command::new(kreuzberg_bin())
        .arg("extract")
        .arg(&pdf)
        .args(["--pdf-backend", "pdf-oxide", "--format", "json"])
        .output()
        .expect("failed to run kreuzberg extract");

    assert!(
        output.status.success(),
        "--pdf-backend pdf-oxide should succeed: {}",
        String::from_utf8_lossy(&output.stderr)
    );

    let json: serde_json::Value = serde_json::from_slice(&output.stdout).expect("stdout is not valid JSON");
    assert!(json.get("result").is_some(), "missing 'result' key");
    assert!(json.get("extraction_time_ms").is_some(), "missing 'extraction_time_ms'");
}
|