//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
//! pub(crate) APIs that the migration deliberately narrowed; gated until
//! either (a) these APIs are re-exposed publicly, or (b) the test is
//! rewritten against the public extraction surface.
//!
//! The file is disabled twice over: the crate-level `#![cfg(any())]` below is
//! never satisfied (an empty `any()` is always false), and the original test
//! source is additionally wrapped in a block comment. Re-enabling the tests
//! requires removing BOTH gates; dropping only the `cfg` recompiles nothing.
#![cfg(any())]

// Original content preserved below inside a block comment (layout and doc
// comments aside, the code is unchanged). To re-enable: delete the
// `#![cfg(any())]` attribute above AND the block-comment delimiters that
// surround the code below.
/*
//! Live integration tests for liter-llm features.
//!
//! These tests require real API keys (loaded from `.env` at the workspace root).
//! Tests skip gracefully when the required key is not set.
//!
//! Run with: `cargo test -p kreuzberg --features "liter-llm,embeddings,pdf" --test llm_integration -- --nocapture --test-threads=1`
//!
//! Use `--test-threads=1` to avoid provider rate limiting when running all tests.

#![cfg(feature = "liter-llm")]

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::config::llm::{LlmConfig, StructuredExtractionConfig};
use serde_json::json;

/// Skip a test if the named env var is not set or empty.
///
/// Expands to the variable's value; otherwise prints a SKIP notice to stderr
/// and `return`s from the enclosing function, so it may only be used inside
/// functions returning `()`.
macro_rules! require_env {
    ($var:expr) => {
        match std::env::var($var) {
            Ok(val) if !val.is_empty() => val,
            _ => {
                eprintln!("SKIP: {} not set, skipping live integration test", $var);
                return;
            }
        }
    };
}

/// Loads environment variables through `dotenvy::dotenv()`.
///
/// The result is deliberately discarded so a failed load is not an error;
/// tests then skip via `require_env!` when their key is absent.
fn init() {
    let _ = dotenvy::dotenv();
}

/// Builds an `LlmConfig` for `model` authenticated with `api_key`.
///
/// Uses a 120 second timeout and 2 retries; `base_url`, `temperature` and
/// `max_tokens` are left unset.
fn make_llm_config(model: &str, api_key: String) -> LlmConfig {
    LlmConfig {
        model: model.to_string(),
        api_key: Some(api_key),
        base_url: None,
        timeout_secs: Some(120),
        max_retries: Some(2),
        temperature: None,
        max_tokens: None,
    }
}

/// JSON schema for the memo fixture: an object with required string fields
/// `title`, `date` and `summary`, and no additional properties.
fn memo_schema() -> serde_json::Value {
    json!({
        "type": "object",
        "properties": {
            "title": { "type": "string" },
            "date": { "type": "string" },
            "summary": { "type": "string" }
        },
        "required": ["title", "date", "summary"],
        "additionalProperties": false
    })
}

/// Extracts `fake_memo.pdf` with the default `ExtractionConfig` and returns
/// its text content. Panics if extraction fails.
async fn extract_memo_text() -> String {
    kreuzberg::extract_file(
        "../../test_documents/pdf/fake_memo.pdf",
        None,
        &ExtractionConfig::default(),
    )
    .await
    .expect("Failed to extract fake_memo.pdf")
    .content
}

// ---------------------------------------------------------------------------
// VLM OCR tests
// ---------------------------------------------------------------------------

/// Runs VLM OCR on the hello-world PNG with `openai/gpt-4o-mini` and expects
/// a non-empty result containing "hello" (case-insensitive).
#[tokio::test]
async fn test_vlm_ocr_openai() {
    init();
    let api_key = require_env!("OPENAI_API_KEY");
    let config = make_llm_config("openai/gpt-4o-mini", api_key);

    let image_bytes = std::fs::read("../../test_documents/images/test_hello_world.png").unwrap();

    let (result, _usage) = kreuzberg::llm::vlm_ocr::vlm_ocr(&image_bytes, "image/png", "eng", &config, None)
        .await
        .unwrap();

    assert!(!result.is_empty(), "VLM OCR returned empty string");
    assert!(
        result.to_lowercase().contains("hello"),
        "Expected 'hello' in OCR result, got: {result}"
    );
}

/// Same check as the OpenAI OCR test, using
/// `anthropic/claude-sonnet-4-20250514`.
#[tokio::test]
async fn test_vlm_ocr_anthropic() {
    init();
    let api_key = require_env!("ANTHROPIC_API_KEY");
    let config = make_llm_config("anthropic/claude-sonnet-4-20250514", api_key);

    let image_bytes = std::fs::read("../../test_documents/images/test_hello_world.png").unwrap();

    let (result, _usage) = kreuzberg::llm::vlm_ocr::vlm_ocr(&image_bytes, "image/png", "eng", &config, None)
        .await
        .unwrap();

    assert!(!result.is_empty(), "VLM OCR returned empty string");
    assert!(
        result.to_lowercase().contains("hello"),
        "Expected 'hello' in OCR result, got: {result}"
    );
}

/// Same check as the OpenAI OCR test, using `gemini/gemini-2.5-flash`.
#[tokio::test]
async fn test_vlm_ocr_gemini() {
    init();
    let api_key = require_env!("GEMINI_API_KEY");
    let config = make_llm_config("gemini/gemini-2.5-flash", api_key);

    let image_bytes = std::fs::read("../../test_documents/images/test_hello_world.png").unwrap();

    let (result, _usage) = kreuzberg::llm::vlm_ocr::vlm_ocr(&image_bytes, "image/png", "eng", &config, None)
        .await
        .unwrap();

    assert!(!result.is_empty(), "VLM OCR returned empty string");
    assert!(
        result.to_lowercase().contains("hello"),
        "Expected 'hello' in OCR result, got: {result}"
    );
}

// ---------------------------------------------------------------------------
// LLM Embedding tests
// ---------------------------------------------------------------------------

/// Embeds two strings with `openai/text-embedding-3-small` and checks that two
/// vectors come back and the first has more than 100 dimensions.
#[cfg(feature = "embeddings")]
#[tokio::test]
async fn test_llm_embed_openai() {
    init();
    let api_key = require_env!("OPENAI_API_KEY");

    let config = kreuzberg::core::config::processing::EmbeddingConfig {
        model: kreuzberg::core::config::processing::EmbeddingModelType::Llm {
            llm: make_llm_config("openai/text-embedding-3-small", api_key),
        },
        normalize: true,
        batch_size: 32,
        show_download_progress: false,
        cache_dir: None,
        acceleration: None,
    };

    let texts = vec!["Hello, world!".to_string(), "Rust is great".to_string()];
    let result = kreuzberg::embed_texts_async(texts, &config).await.unwrap();

    assert_eq!(result.len(), 2, "Expected 2 embeddings");
    assert!(!result[0].is_empty(), "Embedding vector is empty");
    assert!(
        result[0].len() > 100,
        "Embedding dimension too small: {}",
        result[0].len()
    );
}

/// Embeds a single string with `mistral/mistral-embed` and checks that one
/// non-empty vector comes back.
#[cfg(feature = "embeddings")]
#[tokio::test]
async fn test_llm_embed_mistral() {
    init();
    let api_key = require_env!("MISTRAL_API_KEY");

    let config = kreuzberg::core::config::processing::EmbeddingConfig {
        model: kreuzberg::core::config::processing::EmbeddingModelType::Llm {
            llm: make_llm_config("mistral/mistral-embed", api_key),
        },
        normalize: true,
        batch_size: 32,
        show_download_progress: false,
        cache_dir: None,
        acceleration: None,
    };

    let texts = vec!["Hello, world!".to_string()];
    let result = kreuzberg::embed_texts_async(texts, &config).await.unwrap();

    assert_eq!(result.len(), 1, "Expected 1 embedding");
    assert!(!result[0].is_empty(), "Embedding vector is empty");
}

// ---------------------------------------------------------------------------
// Structured Extraction tests
// ---------------------------------------------------------------------------

/// Strict structured extraction of the memo text with `openai/gpt-4o-mini`;
/// expects a JSON object that has a `title` field.
#[tokio::test]
async fn test_structured_extraction_openai() {
    init();
    let api_key = require_env!("OPENAI_API_KEY");
    let text = extract_memo_text().await;

    let config = StructuredExtractionConfig {
        schema: memo_schema(),
        schema_name: "memo_data".to_string(),
        schema_description: Some("Extract memo metadata".to_string()),
        strict: true,
        prompt: None,
        llm: make_llm_config("openai/gpt-4o-mini", api_key),
    };

    let (result, _usage) = kreuzberg::llm::structured::extract_structured(&text, &config)
        .await
        .unwrap();

    assert!(result.is_object(), "Expected JSON object, got: {result}");
    assert!(
        result.get("title").is_some(),
        "Expected 'title' field in result: {result}"
    );
}

/// Non-strict structured extraction of the memo text with
/// `anthropic/claude-sonnet-4-20250514`; expects a JSON object.
#[tokio::test]
async fn test_structured_extraction_anthropic() {
    init();
    let api_key = require_env!("ANTHROPIC_API_KEY");
    let text = extract_memo_text().await;

    let config = StructuredExtractionConfig {
        schema: memo_schema(),
        schema_name: "memo_data".to_string(),
        schema_description: None,
        strict: false,
        prompt: None,
        llm: make_llm_config("anthropic/claude-sonnet-4-20250514", api_key),
    };

    let (result, _usage) = kreuzberg::llm::structured::extract_structured(&text, &config)
        .await
        .unwrap();

    assert!(result.is_object(), "Expected JSON object");
}

/// Non-strict structured extraction of the memo text with
/// `gemini/gemini-2.5-flash`; expects a JSON object.
#[tokio::test]
async fn test_structured_extraction_gemini() {
    init();
    let api_key = require_env!("GEMINI_API_KEY");
    let text = extract_memo_text().await;

    // Schema sanitization now strips additionalProperties automatically for Gemini.
    let config = StructuredExtractionConfig {
        schema: memo_schema(),
        schema_name: "memo_data".to_string(),
        schema_description: None,
        strict: false,
        prompt: None,
        llm: make_llm_config("gemini/gemini-2.5-flash", api_key),
    };

    let (result, _usage) = kreuzberg::llm::structured::extract_structured(&text, &config)
        .await
        .unwrap();

    assert!(result.is_object(), "Expected JSON object");
}

/// Strict structured extraction with a caller-supplied prompt template and a
/// `word_count`/`language` schema; expects both fields in the returned object.
#[tokio::test]
async fn test_structured_extraction_custom_prompt() {
    init();
    let api_key = require_env!("OPENAI_API_KEY");
    let text = extract_memo_text().await;

    let config = StructuredExtractionConfig {
        schema: json!({
            "type": "object",
            "properties": {
                "word_count": { "type": "integer" },
                "language": { "type": "string" }
            },
            "required": ["word_count", "language"],
            "additionalProperties": false
        }),
        schema_name: "doc_stats".to_string(),
        schema_description: None,
        strict: true,
        prompt: Some(
            "Analyze this document and return statistics.\n\nDocument:\n{{ content }}\n\nReturn JSON with word_count and language.".to_string()
        ),
        llm: make_llm_config("openai/gpt-4o-mini", api_key),
    };

    let (result, _usage) = kreuzberg::llm::structured::extract_structured(&text, &config)
        .await
        .unwrap();

    assert!(result.is_object(), "Expected JSON object");
    assert!(result.get("word_count").is_some(), "Missing word_count");
    assert!(result.get("language").is_some(), "Missing language");
}

// ---------------------------------------------------------------------------
// Full pipeline integration tests
// ---------------------------------------------------------------------------

/// Runs `extract_file` on the memo PDF with `structured_extraction` configured
/// and expects `structured_output` to be populated with a JSON object.
#[tokio::test]
async fn test_structured_extraction_pipeline() {
    init();
    let api_key = require_env!("OPENAI_API_KEY");

    let config = ExtractionConfig {
        structured_extraction: Some(StructuredExtractionConfig {
            schema: memo_schema(),
            schema_name: "memo_data".to_string(),
            schema_description: None,
            strict: true,
            prompt: None,
            llm: make_llm_config("openai/gpt-4o-mini", api_key),
        }),
        ..Default::default()
    };

    let result = kreuzberg::extract_file("../../test_documents/pdf/fake_memo.pdf", None, &config)
        .await
        .unwrap();

    assert!(
        result.structured_output.is_some(),
        "Expected structured_output to be populated"
    );
    let output = result.structured_output.unwrap();
    assert!(output.is_object(), "Expected JSON object in structured_output");
}
*/