Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,168 @@
//! Shared test helpers for integration tests.
//!
//! This module provides common utilities for loading test files,
//! making assertions, and setting up test environments.
#![allow(dead_code)]
use kreuzberg::types::ExtractionResult;
use std::path::PathBuf;
/// Get the `test_documents` directory path.
///
/// The location is derived from `CARGO_MANIFEST_DIR`, which is captured at
/// compile time, so the result does not depend on the current working
/// directory. The directory two levels above this crate's manifest directory
/// is treated as the workspace root, and `test_documents` is resolved
/// directly beneath it.
///
/// # Panics
///
/// Panics if the manifest directory has fewer than two ancestors.
pub fn get_test_documents_dir() -> PathBuf {
    let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    // `ancestors()` yields the path itself first, so index 2 is the grandparent.
    manifest_dir
        .ancestors()
        .nth(2)
        .expect("CARGO_MANIFEST_DIR should have a grandparent directory (the workspace root)")
        .join("test_documents")
}
/// Resolve a test fixture to its full path inside the `test_documents` directory.
///
/// # Arguments
///
/// * `relative_path` - Location of the file, relative to `test_documents/`
pub fn get_test_file_path(relative_path: &str) -> PathBuf {
    let mut full_path = get_test_documents_dir();
    full_path.push(relative_path);
    full_path
}
/// Assert that the extraction produced some non-blank content.
///
/// Most extraction tests only need to know that *something* came out, not
/// what the exact text is; this helper covers that case.
///
/// # Panics
///
/// Panics when `result.content` is empty after trimming.
pub fn assert_non_empty_content(result: &ExtractionResult) {
    let content = &result.content;
    let is_blank = content.trim().is_empty();
    assert!(
        !is_blank,
        "Extraction result should have non-empty content, got: '{}'",
        content
    );
}
/// Assert that the MIME type reported by the extraction equals `expected`.
///
/// # Panics
///
/// Panics when `result.mime_type` differs from `expected`.
pub fn assert_mime_type(result: &ExtractionResult, expected: &str) {
    let actual = &result.mime_type;
    assert_eq!(
        *actual, expected,
        "Expected MIME type '{}', got '{}'",
        expected, actual
    );
}
/// Report whether a test should be skipped because its (optional) fixture is absent.
///
/// A debug-level message is logged when the file cannot be found.
///
/// Returns `true` if the file is missing and the test should be skipped,
/// `false` otherwise.
pub fn skip_if_missing(relative_path: &str) -> bool {
    let path = get_test_file_path(relative_path);
    let missing = !path.exists();
    if missing {
        tracing::debug!("Skipping test: file not found at {}", path.display());
    }
    missing
}
/// Check whether the test documents directory exists and contains at least one entry.
///
/// Handy in CI environments where `test_documents` might be a git submodule
/// that has not been initialized.
pub fn test_documents_available() -> bool {
    let docs_dir = get_test_documents_dir();
    if !docs_dir.exists() {
        return false;
    }
    // A directory that cannot be read is treated the same as an empty one.
    match docs_dir.read_dir() {
        Ok(mut entries) => entries.next().is_some(),
        Err(_) => false,
    }
}
/// Assert that the extracted content is at least `min_length` long.
///
/// Intended for smoke tests: it confirms that a substantial amount of content
/// was extracted without checking the exact text. The length compared is the
/// value returned by `result.content.len()`.
///
/// # Panics
///
/// Panics when the content is shorter than `min_length`; the message includes
/// the first 200 characters of the content as a preview.
pub fn assert_min_content_length(result: &ExtractionResult, min_length: usize) {
    let content = &result.content;
    let actual_len = content.len();
    assert!(
        actual_len >= min_length,
        "Expected content length >= {}, got {}. Content preview: '{}'",
        min_length,
        actual_len,
        content.chars().take(200).collect::<String>()
    );
}
/// Assert that the content includes at least one of the candidate substrings.
///
/// # Panics
///
/// Panics when none of `substrings` occurs in `result.content`; an empty
/// `substrings` slice therefore always panics.
pub fn assert_content_contains_any(result: &ExtractionResult, substrings: &[&str]) {
    let content = &result.content;
    let any_present = substrings
        .iter()
        .copied()
        .any(|needle| content.contains(needle));
    assert!(
        any_present,
        "Expected content to contain at least one of {:?}, but found none",
        substrings
    );
}
/// Assert that the extraction result carries one or more tables.
///
/// # Panics
///
/// Panics when `result.tables` is empty.
pub fn assert_has_tables(result: &ExtractionResult) {
    let has_tables = !result.tables.is_empty();
    assert!(has_tables, "Expected result to have tables, but found none");
}
/// Build an extraction configuration with OCR switched on for tests.
///
/// The OCR section selects the `"tesseract"` backend with language `"eng"`,
/// `force_ocr` is set to `false`, and every other field keeps its default.
pub fn test_config_with_ocr() -> kreuzberg::core::config::ExtractionConfig {
    use kreuzberg::core::config::{ExtractionConfig, OcrConfig};

    let ocr = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng"),
        ..OcrConfig::default()
    };

    ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        ..ExtractionConfig::default()
    }
}
// Helpers specific to PDF tests; compiled only when the `pdf` feature is enabled.
#[cfg(feature = "pdf")]
pub mod pdf_helpers {
    use kreuzberg::core::config::ExtractionConfig;
    use kreuzberg::pdf::hierarchy::BoundingBox;

    /// Build a [`BoundingBox`] from its four edge coordinates.
    pub fn create_bounding_box(left: f32, top: f32, right: f32, bottom: f32) -> BoundingBox {
        BoundingBox {
            left,
            top,
            right,
            bottom,
        }
    }

    /// Create the extraction configuration used when testing hierarchy extraction.
    ///
    /// # Returns
    ///
    /// The default [`ExtractionConfig`].
    // NOTE(review): no hierarchy-specific options are set here; confirm that the
    // defaults enable whatever the hierarchy tests rely on.
    pub fn create_hierarchy_extraction_config() -> ExtractionConfig {
        Default::default()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The resolved fixture directory path must end in `test_documents`.
    #[test]
    fn test_get_test_documents_dir() {
        let docs_dir = get_test_documents_dir();
        let rendered = docs_dir.to_string_lossy();
        assert!(rendered.ends_with("test_documents"));
    }

    /// Smoke test: the availability check must run without panicking.
    ///
    /// A missing directory is only logged, never treated as a failure.
    #[test]
    fn test_test_documents_available() {
        if test_documents_available() {
            return;
        }
        tracing::debug!("Warning: test_documents directory not available");
        tracing::debug!("This is expected in CI without git submodules initialized");
    }
}