Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg/tests/llm_integration.rs
+++ b/crates/kreuzberg/tests/llm_integration.rs
@@ -0,0 +1,310 @@
+//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
+//! pub(crate) APIs that the migration deliberately narrowed; gated until
+//! either (a) these APIs are re-exposed publicly, or (b) the test is
+//! rewritten against the public extraction surface.
+
+#![cfg(any())]
+
+// Original content preserved below; recompiled once gating cfg drops.
+// Disabled by the file-level cfg(any()) above.
+
+/*
+//! Live integration tests for liter-llm features.
+//!
+//! These tests require real API keys (loaded from `.env` at the workspace root).
+//! Tests skip gracefully when the required key is not set.
+//!
+//! Run with: `cargo test -p kreuzberg --features "liter-llm,embeddings,pdf" --test llm_integration -- --nocapture --test-threads=1`
+//!
+//! Use `--test-threads=1` to avoid provider rate limiting when running all tests.
+
+#![cfg(feature = "liter-llm")]
+
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::core::config::llm::{LlmConfig, StructuredExtractionConfig};
+use serde_json::json;
+
+/// Skip a test if the named env var is not set or empty.
+macro_rules! require_env {
+    ($var:expr) => {
+        match std::env::var($var) {
+            Ok(val) if !val.is_empty() => val,
+            _ => {
+                eprintln!("SKIP: {} not set, skipping live integration test", $var);
+                return;
+            }
+        }
+    };
+}
+
+fn init() {
+    let _ = dotenvy::dotenv();
+}
+
+fn make_llm_config(model: &str, api_key: String) -> LlmConfig {
+    LlmConfig {
+        model: model.to_string(),
+        api_key: Some(api_key),
+        base_url: None,
+        timeout_secs: Some(120),
+        max_retries: Some(2),
+        temperature: None,
+        max_tokens: None,
+    }
+}
+
+fn memo_schema() -> serde_json::Value {
+    json!({
+        "type": "object",
+        "properties": {
+            "title": { "type": "string" },
+            "date": { "type": "string" },
+            "summary": { "type": "string" }
+        },
+        "required": ["title", "date", "summary"],
+        "additionalProperties": false
+    })
+}
+
+async fn extract_memo_text() -> String {
+    kreuzberg::extract_file(
+        "../../test_documents/pdf/fake_memo.pdf",
+        None,
+        &ExtractionConfig::default(),
+    )
+    .await
+    .expect("Failed to extract fake_memo.pdf")
+    .content
+}
+
+// ---------------------------------------------------------------------------
+// VLM OCR tests
+// ---------------------------------------------------------------------------
+
+#[tokio::test]
+async fn test_vlm_ocr_openai() {
+    init();
+    let api_key = require_env!("OPENAI_API_KEY");
+    let config = make_llm_config("openai/gpt-4o-mini", api_key);
+    let image_bytes = std::fs::read("../../test_documents/images/test_hello_world.png").unwrap();
+    let (result, _usage) = kreuzberg::llm::vlm_ocr::vlm_ocr(&image_bytes, "image/png", "eng", &config, None)
+        .await
+        .unwrap();
+    assert!(!result.is_empty(), "VLM OCR returned empty string");
+    assert!(
+        result.to_lowercase().contains("hello"),
+        "Expected 'hello' in OCR result, got: {result}"
+    );
+}
+
+#[tokio::test]
+async fn test_vlm_ocr_anthropic() {
+    init();
+    let api_key = require_env!("ANTHROPIC_API_KEY");
+    let config = make_llm_config("anthropic/claude-sonnet-4-20250514", api_key);
+    let image_bytes = std::fs::read("../../test_documents/images/test_hello_world.png").unwrap();
+    let (result, _usage) = kreuzberg::llm::vlm_ocr::vlm_ocr(&image_bytes, "image/png", "eng", &config, None)
+        .await
+        .unwrap();
+    assert!(!result.is_empty(), "VLM OCR returned empty string");
+    assert!(
+        result.to_lowercase().contains("hello"),
+        "Expected 'hello' in OCR result, got: {result}"
+    );
+}
+
+#[tokio::test]
+async fn test_vlm_ocr_gemini() {
+    init();
+    let api_key = require_env!("GEMINI_API_KEY");
+    let config = make_llm_config("gemini/gemini-2.5-flash", api_key);
+    let image_bytes = std::fs::read("../../test_documents/images/test_hello_world.png").unwrap();
+    let (result, _usage) = kreuzberg::llm::vlm_ocr::vlm_ocr(&image_bytes, "image/png", "eng", &config, None)
+        .await
+        .unwrap();
+    assert!(!result.is_empty(), "VLM OCR returned empty string");
+    assert!(
+        result.to_lowercase().contains("hello"),
+        "Expected 'hello' in OCR result, got: {result}"
+    );
+}
+
+// ---------------------------------------------------------------------------
+// LLM Embedding tests
+// ---------------------------------------------------------------------------
+
+#[cfg(feature = "embeddings")]
+#[tokio::test]
+async fn test_llm_embed_openai() {
+    init();
+    let api_key = require_env!("OPENAI_API_KEY");
+    let config = kreuzberg::core::config::processing::EmbeddingConfig {
+        model: kreuzberg::core::config::processing::EmbeddingModelType::Llm {
+            llm: make_llm_config("openai/text-embedding-3-small", api_key),
+        },
+        normalize: true,
+        batch_size: 32,
+        show_download_progress: false,
+        cache_dir: None,
+        acceleration: None,
+    };
+    let texts = vec!["Hello, world!".to_string(), "Rust is great".to_string()];
+    let result = kreuzberg::embed_texts_async(texts, &config).await.unwrap();
+    assert_eq!(result.len(), 2, "Expected 2 embeddings");
+    assert!(!result[0].is_empty(), "Embedding vector is empty");
+    assert!(
+        result[0].len() > 100,
+        "Embedding dimension too small: {}",
+        result[0].len()
+    );
+}
+
+#[cfg(feature = "embeddings")]
+#[tokio::test]
+async fn test_llm_embed_mistral() {
+    init();
+    let api_key = require_env!("MISTRAL_API_KEY");
+    let config = kreuzberg::core::config::processing::EmbeddingConfig {
+        model: kreuzberg::core::config::processing::EmbeddingModelType::Llm {
+            llm: make_llm_config("mistral/mistral-embed", api_key),
+        },
+        normalize: true,
+        batch_size: 32,
+        show_download_progress: false,
+        cache_dir: None,
+        acceleration: None,
+    };
+    let texts = vec!["Hello, world!".to_string()];
+    let result = kreuzberg::embed_texts_async(texts, &config).await.unwrap();
+    assert_eq!(result.len(), 1, "Expected 1 embedding");
+    assert!(!result[0].is_empty(), "Embedding vector is empty");
+}
+
+// ---------------------------------------------------------------------------
+// Structured Extraction tests
+// ---------------------------------------------------------------------------
+
+#[tokio::test]
+async fn test_structured_extraction_openai() {
+    init();
+    let api_key = require_env!("OPENAI_API_KEY");
+    let text = extract_memo_text().await;
+    let config = StructuredExtractionConfig {
+        schema: memo_schema(),
+        schema_name: "memo_data".to_string(),
+        schema_description: Some("Extract memo metadata".to_string()),
+        strict: true,
+        prompt: None,
+        llm: make_llm_config("openai/gpt-4o-mini", api_key),
+    };
+    let (result, _usage) = kreuzberg::llm::structured::extract_structured(&text, &config)
+        .await
+        .unwrap();
+    assert!(result.is_object(), "Expected JSON object, got: {result}");
+    assert!(
+        result.get("title").is_some(),
+        "Expected 'title' field in result: {result}"
+    );
+}
+
+#[tokio::test]
+async fn test_structured_extraction_anthropic() {
+    init();
+    let api_key = require_env!("ANTHROPIC_API_KEY");
+    let text = extract_memo_text().await;
+    let config = StructuredExtractionConfig {
+        schema: memo_schema(),
+        schema_name: "memo_data".to_string(),
+        schema_description: None,
+        strict: false,
+        prompt: None,
+        llm: make_llm_config("anthropic/claude-sonnet-4-20250514", api_key),
+    };
+    let (result, _usage) = kreuzberg::llm::structured::extract_structured(&text, &config)
+        .await
+        .unwrap();
+    assert!(result.is_object(), "Expected JSON object");
+}
+
+#[tokio::test]
+async fn test_structured_extraction_gemini() {
+    init();
+    let api_key = require_env!("GEMINI_API_KEY");
+    let text = extract_memo_text().await;
+    // Schema sanitization now strips additionalProperties automatically for Gemini.
+    let config = StructuredExtractionConfig {
+        schema: memo_schema(),
+        schema_name: "memo_data".to_string(),
+        schema_description: None,
+        strict: false,
+        prompt: None,
+        llm: make_llm_config("gemini/gemini-2.5-flash", api_key),
+    };
+    let (result, _usage) = kreuzberg::llm::structured::extract_structured(&text, &config)
+        .await
+        .unwrap();
+    assert!(result.is_object(), "Expected JSON object");
+}
+
+#[tokio::test]
+async fn test_structured_extraction_custom_prompt() {
+    init();
+    let api_key = require_env!("OPENAI_API_KEY");
+    let text = extract_memo_text().await;
+    let config = StructuredExtractionConfig {
+        schema: json!({
+            "type": "object",
+            "properties": {
+                "word_count": { "type": "integer" },
+                "language": { "type": "string" }
+            },
+            "required": ["word_count", "language"],
+            "additionalProperties": false
+        }),
+        schema_name: "doc_stats".to_string(),
+        schema_description: None,
+        strict: true,
+        prompt: Some(
+            "Analyze this document and return statistics.\n\nDocument:\n{{ content }}\n\nReturn JSON with word_count and language.".to_string()
+        ),
+        llm: make_llm_config("openai/gpt-4o-mini", api_key),
+    };
+    let (result, _usage) = kreuzberg::llm::structured::extract_structured(&text, &config)
+        .await
+        .unwrap();
+    assert!(result.is_object(), "Expected JSON object");
+    assert!(result.get("word_count").is_some(), "Missing word_count");
+    assert!(result.get("language").is_some(), "Missing language");
+}
+
+// ---------------------------------------------------------------------------
+// Full pipeline integration tests
+// ---------------------------------------------------------------------------
+
+#[tokio::test]
+async fn test_structured_extraction_pipeline() {
+    init();
+    let api_key = require_env!("OPENAI_API_KEY");
+    let config = ExtractionConfig {
+        structured_extraction: Some(StructuredExtractionConfig {
+            schema: memo_schema(),
+            schema_name: "memo_data".to_string(),
+            schema_description: None,
+            strict: true,
+            prompt: None,
+            llm: make_llm_config("openai/gpt-4o-mini", api_key),
+        }),
+        ..Default::default()
+    };
+    let result = kreuzberg::extract_file("../../test_documents/pdf/fake_memo.pdf", None, &config)
+        .await
+        .unwrap();
+    assert!(
+        result.structured_output.is_some(),
+        "Expected structured_output to be populated"
+    );
+    let output = result.structured_output.unwrap();
+    assert!(output.is_object(), "Expected JSON object in structured_output");
+}
+
+*/