//! Shared test helpers for integration tests. //! //! This module provides common utilities for loading test files, //! making assertions, and setting up test environments. #![allow(dead_code)] use kreuzberg::types::ExtractionResult; use std::path::PathBuf; /// Get the test_documents directory path. /// /// This assumes the test is running from the workspace root. pub fn get_test_documents_dir() -> PathBuf { let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .parent() .unwrap() .parent() .unwrap() .to_path_buf(); workspace_root.join("test_documents") } /// Get the full path to a test file. /// /// # Arguments /// /// * `relative_path` - Path relative to test_documents/ pub fn get_test_file_path(relative_path: &str) -> PathBuf { get_test_documents_dir().join(relative_path) } /// Assert that extraction result contains non-empty content. /// /// This is a common assertion for most extraction tests - we want /// to verify that *something* was extracted, even if we don't know /// the exact content. pub fn assert_non_empty_content(result: &ExtractionResult) { assert!( !result.content.trim().is_empty(), "Extraction result should have non-empty content, got: '{}'", result.content ); } /// Assert that extraction result has expected MIME type. pub fn assert_mime_type(result: &ExtractionResult, expected: &str) { assert_eq!( result.mime_type, expected, "Expected MIME type '{}', got '{}'", expected, result.mime_type ); } /// Skip test if file doesn't exist (for optional test files). /// /// Returns true if test should be skipped. pub fn skip_if_missing(relative_path: &str) -> bool { let path = get_test_file_path(relative_path); if !path.exists() { tracing::debug!("Skipping test: file not found at {}", path.display()); return true; } false } /// Check if test documents directory exists and has files. /// /// This is useful for CI environments where test_documents might /// be a git submodule that hasn't been initialized. pub fn test_documents_available() -> bool { let dir = get_test_documents_dir(); dir.exists() && dir.read_dir().map(|mut d| d.next().is_some()).unwrap_or(false) } /// Assert that content length is above a minimum threshold. /// /// This is useful for smoke testing - ensuring substantial content /// was extracted without needing to verify exact text. pub fn assert_min_content_length(result: &ExtractionResult, min_length: usize) { assert!( result.content.len() >= min_length, "Expected content length >= {}, got {}. Content preview: '{}'", min_length, result.content.len(), result.content.chars().take(200).collect::() ); } /// Assert that content contains at least one of the given substrings. pub fn assert_content_contains_any(result: &ExtractionResult, substrings: &[&str]) { let found = substrings.iter().any(|s| result.content.contains(s)); assert!( found, "Expected content to contain at least one of {:?}, but found none", substrings ); } /// Assert that extraction result has at least one table. pub fn assert_has_tables(result: &ExtractionResult) { assert!( !result.tables.is_empty(), "Expected result to have tables, but found none" ); } /// Create a test configuration with OCR enabled. pub fn test_config_with_ocr() -> kreuzberg::core::config::ExtractionConfig { use kreuzberg::core::config::{ExtractionConfig, OcrConfig}; ExtractionConfig { ocr: Some(OcrConfig { backend: "tesseract".to_string(), language: "eng".to_string(), ..Default::default() }), force_ocr: false, ..Default::default() } } // PDF-specific test helpers (only available with pdf feature) #[cfg(feature = "pdf")] pub mod pdf_helpers { use kreuzberg::core::config::ExtractionConfig; use kreuzberg::pdf::hierarchy::BoundingBox; /// Create a bounding box with simple coordinates. pub fn create_bounding_box(left: f32, top: f32, right: f32, bottom: f32) -> BoundingBox { BoundingBox { left, top, right, bottom, } } /// Create a default extraction configuration for testing hierarchy extraction. /// /// # Returns /// /// A new ExtractionConfig with PDF hierarchy options enabled pub fn create_hierarchy_extraction_config() -> ExtractionConfig { ExtractionConfig::default() } } #[cfg(test)] mod tests { use super::*; #[test] fn test_get_test_documents_dir() { let dir = get_test_documents_dir(); assert!(dir.to_string_lossy().ends_with("test_documents")); } #[test] fn test_test_documents_available() { let available = test_documents_available(); if !available { tracing::debug!("Warning: test_documents directory not available"); tracing::debug!("This is expected in CI without git submodules initialized"); } } }