fil/crates/kreuzberg/tests/helpers/mod.rs

//! Shared test helpers for integration tests.
//!
//! This module provides common utilities for loading test files,
//! making assertions, and setting up test environments.

#![allow(dead_code)]

use kreuzberg::types::ExtractionResult;
use std::path::PathBuf;

/// Get the test_documents directory path.
///
/// This assumes the test is running from the workspace root.
pub fn get_test_documents_dir() -> PathBuf {
    let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .unwrap()
        .parent()
        .unwrap()
        .to_path_buf();

    workspace_root.join("test_documents")
}

/// Get the full path to a test file.
///
/// # Arguments
///
/// * `relative_path` - Path relative to test_documents/
pub fn get_test_file_path(relative_path: &str) -> PathBuf {
    get_test_documents_dir().join(relative_path)
}

/// Assert that extraction result contains non-empty content.
///
/// This is a common assertion for most extraction tests - we want
/// to verify that *something* was extracted, even if we don't know
/// the exact content.
pub fn assert_non_empty_content(result: &ExtractionResult) {
    assert!(
        !result.content.trim().is_empty(),
        "Extraction result should have non-empty content, got: '{}'",
        result.content
    );
}

/// Assert that extraction result has expected MIME type.
pub fn assert_mime_type(result: &ExtractionResult, expected: &str) {
    assert_eq!(
        result.mime_type, expected,
        "Expected MIME type '{}', got '{}'",
        expected, result.mime_type
    );
}

/// Skip test if file doesn't exist (for optional test files).
///
/// Returns true if test should be skipped.
pub fn skip_if_missing(relative_path: &str) -> bool {
    let path = get_test_file_path(relative_path);
    if !path.exists() {
        tracing::debug!("Skipping test: file not found at {}", path.display());
        return true;
    }
    false
}

/// Check if test documents directory exists and has files.
///
/// This is useful for CI environments where test_documents might
/// be a git submodule that hasn't been initialized.
pub fn test_documents_available() -> bool {
    let dir = get_test_documents_dir();
    dir.exists() && dir.read_dir().map(|mut d| d.next().is_some()).unwrap_or(false)
}

/// Assert that content length is above a minimum threshold.
///
/// This is useful for smoke testing - ensuring substantial content
/// was extracted without needing to verify exact text.
pub fn assert_min_content_length(result: &ExtractionResult, min_length: usize) {
    assert!(
        result.content.len() >= min_length,
        "Expected content length >= {}, got {}. Content preview: '{}'",
        min_length,
        result.content.len(),
        result.content.chars().take(200).collect::<String>()
    );
}

/// Assert that content contains at least one of the given substrings.
pub fn assert_content_contains_any(result: &ExtractionResult, substrings: &[&str]) {
    let found = substrings.iter().any(|s| result.content.contains(s));
    assert!(
        found,
        "Expected content to contain at least one of {:?}, but found none",
        substrings
    );
}

/// Assert that extraction result has at least one table.
pub fn assert_has_tables(result: &ExtractionResult) {
    assert!(
        !result.tables.is_empty(),
        "Expected result to have tables, but found none"
    );
}

/// Create a test configuration with OCR enabled.
pub fn test_config_with_ocr() -> kreuzberg::core::config::ExtractionConfig {
    use kreuzberg::core::config::{ExtractionConfig, OcrConfig};

    ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: false,
        ..Default::default()
    }
}

// PDF-specific test helpers (only available with pdf feature)
#[cfg(feature = "pdf")]
pub mod pdf_helpers {
    use kreuzberg::core::config::ExtractionConfig;
    use kreuzberg::pdf::hierarchy::BoundingBox;

    /// Create a bounding box with simple coordinates.
    pub fn create_bounding_box(left: f32, top: f32, right: f32, bottom: f32) -> BoundingBox {
        BoundingBox {
            left,
            top,
            right,
            bottom,
        }
    }

    /// Create a default extraction configuration for testing hierarchy extraction.
    ///
    /// # Returns
    ///
    /// A new ExtractionConfig with PDF hierarchy options enabled
    pub fn create_hierarchy_extraction_config() -> ExtractionConfig {
        ExtractionConfig::default()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_get_test_documents_dir() {
        let dir = get_test_documents_dir();
        assert!(dir.to_string_lossy().ends_with("test_documents"));
    }

    #[test]
    fn test_test_documents_available() {
        let available = test_documents_available();
        if !available {
            tracing::debug!("Warning: test_documents directory not available");
            tracing::debug!("This is expected in CI without git submodules initialized");
        }
    }
}