// crates/kreuzberg/tests/error_handling.rs

//! Error handling and edge case integration tests.
//!
//! Tests for corrupted files, edge cases, and invalid inputs.
//! Validates that the system handles errors gracefully without panics.

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::{extract_bytes, extract_file};
use std::io::Write;
use tempfile::NamedTempFile;

mod helpers;

/// A PDF that ends right after its header must be rejected, not crash.
///
/// Feeds the first few bytes of an unfinished PDF object to `extract_bytes`
/// and expects a `KreuzbergError::Parsing` error in return.
#[tokio::test]
#[cfg(feature = "pdf")]
async fn test_truncated_pdf() {
    // Header plus the opening of an object dictionary that is never closed.
    let payload = b"%PDF-1.4\n1 0 obj\n<<";
    let cfg = ExtractionConfig::default();

    let error = match extract_bytes(payload, "application/pdf", &cfg).await {
        Ok(_) => panic!("Truncated PDF should fail gracefully"),
        Err(error) => error,
    };

    let is_parsing_error = matches!(error, kreuzberg::KreuzbergError::Parsing { .. });
    assert!(
        is_parsing_error,
        "Truncated PDF should produce Parsing error, got: {:?}",
        error
    );
}

/// A ZIP archive with a valid signature but garbage afterwards must be rejected.
///
/// The payload starts with the `PK\x03\x04` magic followed by filler bytes;
/// extraction is expected to fail with `KreuzbergError::Parsing`.
#[tokio::test]
#[cfg(feature = "archives")]
async fn test_corrupted_zip() {
    // Local-file-header signature, then 0xFF / 0x00 filler instead of real fields.
    let payload: [u8; 12] = [
        0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
    ];
    let cfg = ExtractionConfig::default();

    let error = match extract_bytes(&payload, "application/zip", &cfg).await {
        Ok(_) => panic!("Corrupted ZIP should fail gracefully"),
        Err(error) => error,
    };

    let is_parsing_error = matches!(error, kreuzberg::KreuzbergError::Parsing { .. });
    assert!(
        is_parsing_error,
        "Corrupted ZIP should produce Parsing error, got: {:?}",
        error
    );
}

/// Malformed XML must either be tolerated or reported as a parsing error.
///
/// The document contains an unclosed element, a mismatched closing tag and a
/// truncated final tag. Both outcomes are accepted:
/// - success, in which case no chunks may be produced (no chunking configured);
/// - failure, in which case the error must be `KreuzbergError::Parsing`.
#[tokio::test]
#[cfg(feature = "xml")]
async fn test_invalid_xml() {
    let cfg = ExtractionConfig::default();

    let malformed = b"<?xml version=\"1.0\"?>\n\
        <root>\n\
        <unclosed>\n\
        <another>text</wrong_tag>\n\
        </root";

    let extraction = match extract_bytes(malformed, "application/xml", &cfg).await {
        Ok(extraction) => extraction,
        Err(error) => {
            // A strict parser is allowed to give up, but only with a Parsing error.
            let is_parsing_error = matches!(error, kreuzberg::KreuzbergError::Parsing { .. });
            assert!(
                is_parsing_error,
                "Invalid XML error should be Parsing type, got: {:?}",
                error
            );
            return;
        }
    };

    // A lenient parser recovered: the default config must still not chunk.
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
}

/// A PNG with a valid signature but no usable data must not cause a panic.
///
/// Both outcomes are accepted:
/// - success, in which case no chunks may be produced (no chunking configured);
/// - failure, in which case the error must be `KreuzbergError::Parsing` or
///   `KreuzbergError::Ocr`.
#[tokio::test]
#[cfg(feature = "ocr")]
async fn test_corrupted_image() {
    // The 8-byte PNG signature followed by four 0xFF filler bytes.
    let payload: [u8; 12] = [
        0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0xFF, 0xFF, 0xFF, 0xFF,
    ];
    let cfg = ExtractionConfig::default();

    let extraction = match extract_bytes(&payload, "image/png", &cfg).await {
        Ok(extraction) => extraction,
        Err(error) => {
            let is_expected_kind = matches!(
                error,
                kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Ocr { .. }
            );
            assert!(
                is_expected_kind,
                "Corrupted image error should be Parsing or OCR type, got: {:?}",
                error
            );
            return;
        }
    };

    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
}

/// Zero-byte input is handled for every enabled format.
///
/// - `text/plain`: must succeed with empty content and no chunks.
/// - `application/pdf` (feature `pdf`): may succeed with empty content, or fail
///   with a `Parsing` or `Validation` error.
/// - `application/xml` (feature `xml`): may succeed with empty content, or fail
///   with a `Parsing` error.
#[tokio::test]
async fn test_empty_file() {
    let cfg = ExtractionConfig::default();
    let no_bytes = b"";

    // The plain-text extraction runs first; its outcome is checked after the PDF case.
    let text_outcome = extract_bytes(no_bytes, "text/plain", &cfg).await;

    #[cfg(feature = "pdf")]
    {
        let pdf_outcome = extract_bytes(no_bytes, "application/pdf", &cfg).await;
        if let Err(error) = pdf_outcome {
            let is_expected_kind = matches!(
                error,
                kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Validation { .. }
            );
            assert!(
                is_expected_kind,
                "Empty PDF should produce Parsing or Validation error, got: {:?}",
                error
            );
        } else if let Ok(extraction) = pdf_outcome {
            assert!(
                extraction.content.is_empty(),
                "Empty PDF should have empty content if it succeeds"
            );
            assert!(extraction.chunks.is_none(), "Chunks should be None");
        }
    }

    // Plain text is the one format for which an empty input is never an error.
    let text_extraction = text_outcome
        .unwrap_or_else(|error| panic!("Empty text file should not fail, got error: {:?}", error));
    assert!(
        text_extraction.content.is_empty(),
        "Empty text file should have empty content"
    );
    assert!(text_extraction.chunks.is_none(), "Chunks should be None");

    #[cfg(feature = "xml")]
    {
        let xml_outcome = extract_bytes(no_bytes, "application/xml", &cfg).await;
        if let Err(error) = xml_outcome {
            let is_parsing_error = matches!(error, kreuzberg::KreuzbergError::Parsing { .. });
            assert!(
                is_parsing_error,
                "Empty XML error should be Parsing type, got: {:?}",
                error
            );
        } else if let Ok(extraction) = xml_outcome {
            assert!(
                extraction.content.is_empty(),
                "Empty XML should have empty content if it succeeds"
            );
            assert!(extraction.chunks.is_none(), "Chunks should be None");
        }
    }
}

/// Stress test: a multi-megabyte plain-text payload is extracted successfully.
///
/// The input is one line of text repeated 200,000 times. The result must be
/// non-empty, longer than 1,000,000 bytes, still contain the source line, and
/// carry no chunks, detected languages or tables under the default config.
#[tokio::test]
async fn test_very_large_file() {
    const LINE: &str = "This is a line of text that will be repeated many times.\n";
    const REPETITIONS: usize = 200_000;

    let cfg = ExtractionConfig::default();
    let big_text = LINE.repeat(REPETITIONS);

    let extraction = match extract_bytes(big_text.as_bytes(), "text/plain", &cfg).await {
        Ok(extraction) => extraction,
        Err(_) => panic!("Large file should be processed successfully"),
    };

    // Size checks.
    assert!(!extraction.content.is_empty(), "Large file content should not be empty");
    assert!(extraction.content.len() > 1_000_000, "Content should be large");

    // Optional features stay off with the default configuration.
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );
    assert!(extraction.tables.is_empty(), "Text file should not have tables");

    // The text itself must survive extraction.
    let keeps_source_text = extraction.content.contains("This is a line of text");
    assert!(keeps_source_text, "Content should preserve original text");
}

/// Test unicode filenames - non-ASCII paths.
#[tokio::test]
async fn test_unicode_filenames() {
    let config = ExtractionConfig::default();

    let mut temp_file = NamedTempFile::new().expect("Should create temp file");
    temp_file
        .write_all(b"Test content with Unicode filename.")
        .expect("Operation failed");

    let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;

    assert!(result.is_ok(), "Unicode filename should be handled");
    let extraction = result.expect("Operation failed");

    assert!(
        extraction.content.contains("Test content"),
        "Content should be extracted"
    );
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );
}

/// Text mixing emojis, right-to-left script, CJK and symbols is extracted.
///
/// The extraction must succeed with more than 10 bytes of content, no chunks
/// and no detected languages, and at least one of the ASCII labels
/// ("Emojis", "Arabic", "Chinese") must still be present in the output.
#[tokio::test]
async fn test_special_characters_content() {
    let cfg = ExtractionConfig::default();

    // One entry per script/symbol family, joined with newlines into the payload.
    let special_text = [
        "Emojis: 🎉 🚀 ✅ 🌍",
        "Arabic (RTL): مرحبا بالعالم",
        "Chinese: 你好世界",
        "Japanese: こんにちは世界",
        "Special chars: © ® ™ € £ ¥",
        "Math symbols: ∑ ∫ √ ≈ ∞",
    ]
    .join("\n");

    let extraction = match extract_bytes(special_text.as_bytes(), "text/plain", &cfg).await {
        Ok(extraction) => extraction,
        Err(_) => panic!("Special characters should be handled"),
    };

    assert!(!extraction.content.is_empty(), "Content should not be empty");
    assert!(extraction.content.len() > 10, "Should have substantial content");
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction.detected_languages.is_none(),
        "Language detection not enabled"
    );

    // Only the ASCII labels are probed; one surviving label is enough.
    let keeps_any_label = ["Emojis", "Arabic", "Chinese"]
        .iter()
        .any(|&label| extraction.content.contains(label));
    assert!(
        keeps_any_label,
        "Should preserve at least some special character text"
    );
}

/// Test nonexistent file - file not found.
#[tokio::test]
async fn test_nonexistent_file() {
    let config = ExtractionConfig::default();

    let nonexistent_path = "/nonexistent/path/to/file.pdf";

    let result = extract_file(nonexistent_path, Some("application/pdf"), &config).await;

    assert!(result.is_err(), "Nonexistent file should return error");

    let error = result.unwrap_err();
    assert!(
        matches!(error, kreuzberg::KreuzbergError::Io(_))
            || matches!(error, kreuzberg::KreuzbergError::Validation { .. }),
        "Should be IO or Validation error for nonexistent file, got: {:?}",
        error
    );
}

/// Test unsupported format - unknown file type.
#[tokio::test]
async fn test_unsupported_format() {
    let config = ExtractionConfig::default();

    let data = b"Some random data";

    let result = extract_bytes(data, "application/x-unknown-format", &config).await;

    assert!(result.is_err(), "Unsupported format should return error");

    let error = result.unwrap_err();
    assert!(
        matches!(error, kreuzberg::KreuzbergError::UnsupportedFormat(_)),
        "Should be UnsupportedFormat error, got: {:?}",
        error
    );
}

/// Test permission denied - no read access (platform-specific).
///
/// Clears every permission bit on a temporary file and expects `extract_file`
/// to report an error. Permissions are restored before the assertion runs, so
/// a failing assertion never leaves an unreadable file behind.
///
/// Privileged processes (for example root inside a CI container) are not
/// subject to file mode bits. The test therefore probes whether the file can
/// still be opened after the mode was cleared and skips the assertion in that
/// case instead of failing spuriously.
#[tokio::test]
#[cfg(unix)]
async fn test_permission_denied() {
    use std::fs;
    use std::os::unix::fs::PermissionsExt;

    let config = ExtractionConfig::default();

    let mut temp_file = NamedTempFile::new().expect("Should create temp file");
    temp_file.write_all(b"Test content").expect("Operation failed");

    let path = temp_file.path();
    fs::set_permissions(path, fs::Permissions::from_mode(0o000)).expect("Operation failed");

    // If a fresh open still works, mode bits are not enforced for this process
    // and the scenario under test cannot be reproduced.
    let mode_bits_enforced = fs::File::open(path).is_err();

    let result = if mode_bits_enforced {
        Some(extract_file(path, Some("text/plain"), &config).await)
    } else {
        None
    };

    // Restore permissions before asserting anything.
    fs::set_permissions(path, fs::Permissions::from_mode(0o644)).expect("Operation failed");

    match result {
        Some(result) => assert!(result.is_err(), "Permission denied should return error"),
        None => eprintln!(
            "Skipping test_permission_denied: file mode bits are not enforced for this process"
        ),
    }
}

/// Content whose magic bytes contradict the declared MIME type is rejected.
///
/// The payload begins with the `PK\x03\x04` ZIP signature (used here as the
/// start of a DOCX container) but is submitted as `application/pdf`;
/// extraction must return an error.
#[tokio::test]
async fn test_file_extension_mismatch() {
    // ZIP local-file-header signature followed by four header bytes.
    let zip_header: [u8; 8] = [0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00];
    let cfg = ExtractionConfig::default();

    let outcome = extract_bytes(&zip_header, "application/pdf", &cfg).await;

    let was_rejected = outcome.is_err();
    assert!(was_rejected, "MIME type mismatch should fail");
}

/// Plain text with embedded NUL bytes is still extracted.
///
/// The extraction must succeed with non-empty content and no chunks, and at
/// least one of the fragments around the NUL bytes ("Text before" or "after")
/// must be present in the output.
#[tokio::test]
async fn test_null_bytes_in_content() {
    let payload = b"Text before\x00null\x00bytes\x00after";
    let cfg = ExtractionConfig::default();

    let extraction = match extract_bytes(payload, "text/plain", &cfg).await {
        Ok(extraction) => extraction,
        Err(_) => panic!("Null bytes should be handled"),
    };

    assert!(!extraction.content.is_empty(), "Content should not be empty");
    assert!(
        extraction.chunks.is_none(),
        "Chunks should be None without chunking config"
    );

    // Either the leading or the trailing fragment is enough.
    let keeps_some_text = ["Text before", "after"]
        .iter()
        .any(|&fragment| extraction.content.contains(fragment));
    assert!(
        keeps_some_text,
        "Should preserve at least some of the text content"
    );
}

/// Ten extractions of the same payload run as separate tokio tasks.
///
/// All tasks are spawned first, each with its own clone of the configuration,
/// and then awaited in spawn order. Every task must finish, succeed, return the
/// source text and leave chunks and detected languages unset.
#[tokio::test]
async fn test_concurrent_extractions() {
    const TASK_COUNT: usize = 10;

    let payload = b"Concurrent extraction test content.";
    let base_cfg = ExtractionConfig::default();

    // Spawn everything up front so the tasks can overlap.
    let mut tasks = Vec::with_capacity(TASK_COUNT);
    for _ in 0..TASK_COUNT {
        let task_cfg = base_cfg.clone();
        tasks.push(tokio::spawn(async move {
            extract_bytes(payload, "text/plain", &task_cfg).await
        }));
    }

    for task in tasks {
        let outcome = task.await.expect("Task should complete");
        let extraction = match outcome {
            Ok(extraction) => extraction,
            Err(_) => panic!("Concurrent extraction should succeed"),
        };

        let has_source_text = extraction.content.contains("Concurrent extraction");
        assert!(has_source_text, "Content should be extracted correctly");
        assert!(extraction.chunks.is_none(), "Chunks should be None");
        assert!(
            extraction.detected_languages.is_none(),
            "Language detection not enabled"
        );
    }
}