This commit is contained in:
671
crates/kreuzberg/tests/config_features.rs
Normal file
671
crates/kreuzberg/tests/config_features.rs
Normal file
@@ -0,0 +1,671 @@
|
||||
//! Configuration features integration tests.
|
||||
//!
|
||||
//! Tests for chunking, language detection, caching, token reduction, and quality processing.
|
||||
//! Validates that configuration options work correctly end-to-end.
|
||||
|
||||
#[cfg(feature = "chunking")]
|
||||
use kreuzberg::core::config::ChunkingConfig;
|
||||
use kreuzberg::core::config::ExtractionConfig;
|
||||
#[cfg(feature = "language-detection")]
|
||||
use kreuzberg::core::config::LanguageDetectionConfig;
|
||||
use kreuzberg::core::config::TokenReductionOptions;
|
||||
use kreuzberg::core::extractor::extract_bytes;
|
||||
|
||||
mod helpers;
|
||||
|
||||
/// Verifies that enabling chunking splits plain text into several bounded chunks.
///
/// Ten repetitions of a sentence are extracted with `max_characters: 50` and
/// `overlap: 10`. The test then requires that more than one chunk is produced,
/// that no chunk is empty, and that no chunk's content is longer than
/// `50 + 10` bytes.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunking_enabled() {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 50,
            overlap: 10,
            ..Default::default()
        }),
        ..Default::default()
    };

    let text = "This is a long text that should be split into multiple chunks. ".repeat(10);

    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // One `expect` both checks presence and unwraps, and its message names the
    // violated expectation instead of a generic "Operation failed".
    let chunks = result.chunks.expect("Chunks should be present");
    assert!(chunks.len() > 1, "Should have multiple chunks");

    for chunk in &chunks {
        assert!(!chunk.content.is_empty(), "Chunk should not be empty");
        // `len()` counts bytes; the input is ASCII, so bytes and characters coincide here.
        assert!(
            chunk.content.len() <= 50 + 10,
            "Chunk length {} exceeds max_chars + overlap",
            chunk.content.len()
        );
    }
}
|
||||
|
||||
/// Verifies that consecutive chunks share text when an overlap is configured.
///
/// 250 `'a'` characters are extracted with `max_characters: 100` and `overlap: 20`.
/// At least two chunks must be produced, and the last (up to) 20 bytes of the first
/// chunk must line up with the beginning of the second chunk.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunking_with_overlap() {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 100,
            overlap: 20,
            ..Default::default()
        }),
        ..Default::default()
    };

    let text = "a".repeat(250);

    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // One `expect` both checks presence and unwraps, with a message that names
    // the violated expectation.
    let chunks = result.chunks.expect("Chunks should be present");
    assert!(chunks.len() >= 2, "Should have at least 2 chunks");

    // The assertion above guarantees indices 0 and 1 exist, so no further guard is needed.
    let first = &chunks[0].content;
    let second = &chunks[1].content;

    // Byte-range slicing is safe here because the input is pure ASCII.
    let tail = &first[first.len().saturating_sub(20)..];
    let head = &second[..tail.len().min(second.len())];

    // NOTE(review): the input consists of a single repeated character, so this check
    // presumably cannot distinguish a real overlap from none; confirm whether a more
    // varied input is wanted.
    assert!(
        second.starts_with(tail) || tail.starts_with(head),
        "Chunks should have overlap"
    );
}
|
||||
|
||||
/// Verifies that custom chunk size and overlap values are honoured.
///
/// Fifty repetitions of a short sentence are extracted with `max_characters: 200`
/// and `overlap: 50`. At least one chunk must be produced and no chunk's content
/// may be longer than `200 + 50` bytes.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunking_custom_sizes() {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 200,
            overlap: 50,
            ..Default::default()
        }),
        ..Default::default()
    };

    let text = "Custom chunk test. ".repeat(50);

    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // One `expect` both checks presence and unwraps, with a message that names
    // the violated expectation.
    let chunks = result.chunks.expect("Chunks should be present");
    assert!(!chunks.is_empty(), "Should have at least 1 chunk");

    for chunk in &chunks {
        // `len()` counts bytes; the input is ASCII, so bytes and characters coincide here.
        assert!(
            chunk.content.len() <= 200 + 50,
            "Chunk length {} exceeds custom max_chars + overlap",
            chunk.content.len()
        );
    }
}
|
||||
|
||||
/// Test chunking disabled - no chunking when disabled.
|
||||
#[tokio::test]
|
||||
async fn test_chunking_disabled() {
|
||||
let config = ExtractionConfig {
|
||||
chunking: None,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let text = "This is a long text that should NOT be split into chunks. ".repeat(10);
|
||||
let text_bytes = text.as_bytes();
|
||||
|
||||
let result = extract_bytes(text_bytes, "text/plain", &config)
|
||||
.await
|
||||
.expect("Should extract successfully");
|
||||
|
||||
assert!(result.chunks.is_none(), "Should not have chunks when chunking disabled");
|
||||
|
||||
assert!(!result.content.is_empty(), "Content should be extracted");
|
||||
assert!(result.content.contains("long text"), "Should contain original text");
|
||||
}
|
||||
|
||||
/// Verifies single-language detection on an English sentence.
///
/// Detection is enabled with `min_confidence: 0.8` and `detect_multiple: false`.
/// The result must report at least one language and the first entry must be `"eng"`.
#[tokio::test]
#[cfg(feature = "language-detection")]
async fn test_language_detection_single() {
    let config = ExtractionConfig {
        language_detection: Some(LanguageDetectionConfig {
            enabled: true,
            min_confidence: 0.8,
            detect_multiple: false,
        }),
        ..Default::default()
    };

    let text = "Hello world! This is English text. It should be detected as English language.";

    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // One `expect` both checks presence and unwraps, with a message that names
    // the violated expectation.
    let languages = result.detected_languages.expect("Should detect language");
    assert!(!languages.is_empty(), "Should detect at least one language");
    assert_eq!(languages[0], "eng", "Should detect English");
}
|
||||
|
||||
/// Verifies language detection on a document mixing English and Spanish text.
///
/// Detection is enabled with `min_confidence: 0.7` and `detect_multiple: true`.
/// The test only requires that at least one language is reported; it does not
/// pin which languages are detected. It is ignored under the `coverage` cfg, as
/// stated in the `ignore` reason below.
#[cfg_attr(coverage, ignore = "coverage instrumentation affects multi-language heuristics")]
#[tokio::test]
#[cfg(feature = "language-detection")]
async fn test_language_detection_multiple() {
    let config = ExtractionConfig {
        language_detection: Some(LanguageDetectionConfig {
            enabled: true,
            min_confidence: 0.7,
            detect_multiple: true,
        }),
        ..Default::default()
    };

    // Ten English sentences followed by ten Spanish sentences.
    let mut text = "Hello world! This is English. ".repeat(10);
    text.push_str(&"Hola mundo! Este es español. ".repeat(10));

    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // One `expect` both checks presence and unwraps, with a message that names
    // the violated expectation.
    let languages = result.detected_languages.expect("Should detect languages");
    assert!(!languages.is_empty(), "Should detect at least one language");
}
|
||||
|
||||
/// Exercises language detection with a high confidence threshold (`0.9`).
///
/// Extraction must succeed. A missing detection result is accepted, but if
/// languages are reported the list must not be empty.
#[tokio::test]
#[cfg(feature = "language-detection")]
async fn test_language_detection_confidence() {
    let input = "This is clear English text that should have high confidence.";

    let detection = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.9,
        detect_multiple: false,
    };
    let config = ExtractionConfig {
        language_detection: Some(detection),
        ..Default::default()
    };

    let result = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // No detection result at this threshold is tolerated; nothing more to check then.
    let Some(languages) = result.detected_languages else {
        return;
    };
    assert!(!languages.is_empty());
}
|
||||
|
||||
/// With a language-detection config whose `enabled` flag is `false`, the
/// extraction result must not report any detected languages.
#[tokio::test]
#[cfg(feature = "language-detection")]
async fn test_language_detection_disabled() {
    let input = "Hello world! This is English text.";

    let detection_off = LanguageDetectionConfig {
        enabled: false,
        min_confidence: 0.8,
        detect_multiple: false,
    };
    let config = ExtractionConfig {
        language_detection: Some(detection_off),
        ..Default::default()
    };

    let extraction = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let detected = &extraction.detected_languages;
    assert!(detected.is_none(), "Should not detect language when disabled");
}
|
||||
|
||||
/// Test cache hit behavior - second extraction from cache.
|
||||
#[tokio::test]
|
||||
async fn test_cache_hit_behavior() {
|
||||
let config = ExtractionConfig {
|
||||
use_cache: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let text = "Test text for caching behavior.";
|
||||
let text_bytes = text.as_bytes();
|
||||
|
||||
let result1 = extract_bytes(text_bytes, "text/plain", &config)
|
||||
.await
|
||||
.expect("First extraction should succeed");
|
||||
|
||||
let result2 = extract_bytes(text_bytes, "text/plain", &config)
|
||||
.await
|
||||
.expect("Second extraction should succeed");
|
||||
|
||||
assert_eq!(result1.content, result2.content);
|
||||
}
|
||||
|
||||
/// Test cache miss and invalidation.
|
||||
#[tokio::test]
|
||||
async fn test_cache_miss_invalidation() {
|
||||
let config = ExtractionConfig {
|
||||
use_cache: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let text1 = "First text for cache test.";
|
||||
let text2 = "Second different text.";
|
||||
|
||||
let result1 = extract_bytes(text1.as_bytes(), "text/plain", &config)
|
||||
.await
|
||||
.expect("First extraction should succeed");
|
||||
|
||||
let result2 = extract_bytes(text2.as_bytes(), "text/plain", &config)
|
||||
.await
|
||||
.expect("Second extraction should succeed");
|
||||
|
||||
assert_ne!(result1.content, result2.content);
|
||||
}
|
||||
|
||||
/// Test custom cache directory (Note: OCR cache uses hardcoded directory).
|
||||
#[tokio::test]
|
||||
async fn test_custom_cache_directory() {
|
||||
let config = ExtractionConfig {
|
||||
use_cache: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let text = "Test text for cache directory test.";
|
||||
let text_bytes = text.as_bytes();
|
||||
|
||||
let result = extract_bytes(text_bytes, "text/plain", &config)
|
||||
.await
|
||||
.expect("Should extract successfully");
|
||||
|
||||
assert!(!result.content.is_empty());
|
||||
}
|
||||
|
||||
/// Test cache disabled - bypass cache.
|
||||
#[tokio::test]
|
||||
async fn test_cache_disabled() {
|
||||
let config = ExtractionConfig {
|
||||
use_cache: false,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let text = "Test text without caching.";
|
||||
let text_bytes = text.as_bytes();
|
||||
|
||||
let result1 = extract_bytes(text_bytes, "text/plain", &config)
|
||||
.await
|
||||
.expect("First extraction should succeed");
|
||||
|
||||
let result2 = extract_bytes(text_bytes, "text/plain", &config)
|
||||
.await
|
||||
.expect("Second extraction should succeed");
|
||||
|
||||
assert_eq!(result1.content, result2.content);
|
||||
}
|
||||
|
||||
/// Test token reduction in aggressive mode.
|
||||
#[tokio::test]
|
||||
async fn test_token_reduction_aggressive() {
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionOptions {
|
||||
mode: "aggressive".to_string(),
|
||||
preserve_important_words: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let text = "This is a very long sentence with many unnecessary words that could be reduced. ".repeat(5);
|
||||
let text_bytes = text.as_bytes();
|
||||
|
||||
let result = extract_bytes(text_bytes, "text/plain", &config)
|
||||
.await
|
||||
.expect("Should extract successfully");
|
||||
|
||||
assert!(!result.content.is_empty());
|
||||
}
|
||||
|
||||
/// Test token reduction in conservative mode.
|
||||
#[tokio::test]
|
||||
async fn test_token_reduction_conservative() {
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionOptions {
|
||||
mode: "light".to_string(),
|
||||
preserve_important_words: true,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let text = "Conservative token reduction test with moderate text length.";
|
||||
let text_bytes = text.as_bytes();
|
||||
|
||||
let result = extract_bytes(text_bytes, "text/plain", &config)
|
||||
.await
|
||||
.expect("Should extract successfully");
|
||||
|
||||
assert!(!result.content.is_empty());
|
||||
}
|
||||
|
||||
/// Test token reduction disabled.
|
||||
#[tokio::test]
|
||||
async fn test_token_reduction_disabled() {
|
||||
let config = ExtractionConfig {
|
||||
token_reduction: Some(TokenReductionOptions {
|
||||
mode: "off".to_string(),
|
||||
preserve_important_words: false,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let text = "Text without token reduction applied.";
|
||||
let text_bytes = text.as_bytes();
|
||||
|
||||
let result = extract_bytes(text_bytes, "text/plain", &config)
|
||||
.await
|
||||
.expect("Should extract successfully");
|
||||
|
||||
assert!(result.content.contains("without token reduction"));
|
||||
}
|
||||
|
||||
/// Runs extraction with `enable_quality_processing: true`.
///
/// A quality score is optional for this test, but if one is present it must lie
/// in `0.0..=1.0`. The extracted content must be non-empty.
#[tokio::test]
#[cfg(feature = "quality")]
async fn test_quality_processing_enabled() {
    let input = "This is well-structured text. It has multiple sentences. And proper punctuation.";

    let config = ExtractionConfig {
        enable_quality_processing: true,
        ..Default::default()
    };

    let result = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // A missing score is tolerated; only a present score is range-checked.
    match result.quality_score {
        Some(score_value) => assert!((0.0..=1.0).contains(&score_value)),
        None => {}
    }

    assert!(!result.content.is_empty());
}
|
||||
|
||||
/// Checks that quality processing produces a score for both clean and noisy text.
///
/// Two inputs are extracted with `enable_quality_processing: true`. Both results
/// must carry a quality score, and each score must lie in `0.0..=1.0`. The two
/// scores are not compared with each other.
#[tokio::test]
#[cfg(feature = "quality")]
async fn test_quality_threshold_filtering() {
    let high_quality = "This is a well-structured document. It has proper sentences. And good formatting.";
    let low_quality = "a b c d ....... word123mixed . . ";

    let config = ExtractionConfig {
        enable_quality_processing: true,
        ..Default::default()
    };

    let result_high = extract_bytes(high_quality.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");
    let result_low = extract_bytes(low_quality.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // Presence of both scores is checked before either is unwrapped.
    assert!(result_high.quality_score.is_some(), "High quality should have score");
    assert!(result_low.quality_score.is_some(), "Low quality should have score");

    let score_high = result_high.quality_score.expect("High quality should have score");
    let score_low = result_low.quality_score.expect("Low quality should have score");

    assert!((0.0..=1.0).contains(&score_high));
    assert!((0.0..=1.0).contains(&score_low));
}
|
||||
|
||||
/// Test quality processing disabled.
|
||||
#[tokio::test]
|
||||
async fn test_quality_processing_disabled() {
|
||||
let config = ExtractionConfig {
|
||||
enable_quality_processing: false,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let text = "Text without quality processing.";
|
||||
let text_bytes = text.as_bytes();
|
||||
|
||||
let result = extract_bytes(text_bytes, "text/plain", &config)
|
||||
.await
|
||||
.expect("Should extract successfully");
|
||||
|
||||
assert!(result.quality_score.is_none());
|
||||
assert!(!result.content.is_empty());
|
||||
}
|
||||
|
||||
/// Checks that the Markdown chunker attaches heading context to chunks.
///
/// A small Markdown document with nested headings is extracted as
/// `text/markdown` using `ChunkerType::Markdown`, `max_characters: 80` and
/// `overlap: 10`. At least two chunks must be produced, at least one of them
/// must have `heading_context` set, and every heading recorded in any context
/// must have a level in `1..=6` and non-empty text.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_markdown_chunker_heading_context() {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 80,
            overlap: 10,
            chunker_type: kreuzberg::ChunkerType::Markdown,
            ..Default::default()
        }),
        ..Default::default()
    };

    let markdown = r#"# Title

Some intro text.

## Section One

Content in section one with enough text to create a chunk.

## Section Two

Content in section two with enough text to create another chunk.

### Subsection

More detailed content here in the subsection.
"#;

    let result = extract_bytes(markdown.as_bytes(), "text/markdown", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.chunks.is_some(), "Chunks should be present");
    let chunks = result.chunks.expect("Should have chunks");
    assert!(chunks.len() >= 2, "Should have at least 2 chunks");

    // Heading context must be populated on at least one chunk.
    let mut contexts = chunks
        .iter()
        .filter_map(|chunk| chunk.metadata.heading_context.as_ref());
    assert!(
        contexts.next().is_some(),
        "At least one chunk should have heading_context"
    );

    // Every heading in every populated context must be structurally valid.
    for chunk in &chunks {
        let Some(ctx) = chunk.metadata.heading_context.as_ref() else {
            continue;
        };
        for heading in &ctx.headings {
            assert!((1..=6).contains(&heading.level), "Heading level should be 1-6");
            assert!(!heading.text.is_empty(), "Heading text should not be empty");
        }
    }
}
|
||||
|
||||
/// Checks chunk metadata for a Markdown document containing a fenced code block.
///
/// The document is extracted as `text/markdown` using `ChunkerType::Markdown`,
/// `max_characters: 200` and `overlap: 0`. At least one chunk must be produced,
/// no chunk may be empty, and at least one chunk must carry heading context.
/// The `chunk_type` field is only referenced, not compared against a value.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_chunk_type_populated() {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 200,
            overlap: 0,
            chunker_type: kreuzberg::ChunkerType::Markdown,
            ..Default::default()
        }),
        ..Default::default()
    };

    // NOTE(review): the fixture text is reproduced exactly as it appears in the
    // reviewed source, including the unindented line inside the code fence.
    let markdown = r#"# Introduction

This section introduces the document with some content.

## Code Example

```rust
fn hello() {
println!("Hello, world!");
}
```

## Summary

A brief summary of the document.
"#;

    let result = extract_bytes(markdown.as_bytes(), "text/markdown", &config)
        .await
        .expect("Should extract successfully");

    assert!(result.chunks.is_some(), "Chunks should be present");
    let chunks = result.chunks.expect("Should have chunks");
    assert!(!chunks.is_empty(), "Should have at least one chunk");

    // Per-chunk metadata checks.
    for piece in &chunks {
        assert!(!piece.content.is_empty(), "Chunk should not be empty");
        // chunk_type must always be set (never uninitialized); this line only
        // references the field and performs no runtime comparison.
        let _ = &piece.chunk_type;
    }

    // The Markdown structure should give at least one chunk a heading context.
    let has_heading_context = chunks
        .iter()
        .filter_map(|piece| piece.metadata.heading_context.as_ref())
        .next()
        .is_some();
    assert!(
        has_heading_context,
        "At least one chunk should have heading context from markdown structure"
    );
}
|
||||
|
||||
/// Test chunking with embeddings using the default `EmbeddingConfig`
/// (described by the assertion below as the balanced preset).
///
/// Text is chunked with `max_characters: 100` and `overlap: 20`. More than one
/// chunk must be produced, the result metadata must not contain an
/// `embedding_error` entry, and every chunk must carry a 768-dimensional
/// embedding whose Euclidean norm is within `0.01` of `1.0`.
///
/// This test requires ONNX Runtime to be installed as a system dependency.
/// On macOS with Homebrew: `brew install onnxruntime`
/// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
/// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
#[tokio::test]
#[cfg(feature = "embeddings")]
#[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
#[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
async fn test_chunking_with_embeddings() {
    use kreuzberg::core::config::EmbeddingConfig;

    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 100,
            overlap: 20,
            embedding: Some(EmbeddingConfig::default()),
            ..Default::default()
        }),
        ..Default::default()
    };

    let text = "This is a test document for embedding generation. ".repeat(10);

    let result = extract_bytes(text.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    // One `expect` both checks presence and unwraps, with a message that names
    // the violated expectation instead of a generic "Operation failed".
    let chunks = result.chunks.expect("Chunks should be present");
    assert!(chunks.len() > 1, "Should have multiple chunks");

    // Surface a recorded embedding failure directly rather than via missing vectors.
    if let Some(error) = result.metadata.additional.get("embedding_error") {
        panic!("Embedding generation failed: {}", error);
    }

    for chunk in &chunks {
        let embedding = chunk
            .embedding
            .as_ref()
            .expect("Each chunk should have an embedding");
        assert_eq!(
            embedding.len(),
            768,
            "Embedding should have 768 dimensions for balanced preset"
        );

        // Euclidean (L2) norm of the vector.
        let magnitude: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!(
            (magnitude - 1.0).abs() < 0.01,
            "Embedding should be normalized (magnitude ~= 1.0)"
        );
    }
}
|
||||
|
||||
/// Test chunking with the embedding preset named `"fast"`.
///
/// Text is chunked with `max_characters: 100` and `overlap: 20`. At least one
/// chunk must be produced, the result metadata must not contain an
/// `embedding_error` entry, and every chunk must carry a 384-dimensional embedding.
///
/// This test requires ONNX Runtime to be installed as a system dependency.
/// On macOS with Homebrew: `brew install onnxruntime`
/// On Linux: Install via your package manager or download from https://github.com/microsoft/onnxruntime/releases
/// On Windows: Download from https://github.com/microsoft/onnxruntime/releases
#[tokio::test]
#[cfg(feature = "embeddings")]
#[cfg_attr(target_os = "macos", ignore = "ONNX models not cached on macOS")]
#[cfg_attr(target_os = "windows", ignore = "ONNX models not cached on Windows")]
async fn test_chunking_with_fast_embeddings() {
    use kreuzberg::core::config::{EmbeddingConfig, EmbeddingModelType};

    let input = "Fast embedding test. ".repeat(10);

    let fast_embedding = EmbeddingConfig {
        model: EmbeddingModelType::Preset {
            name: String::from("fast"),
        },
        ..Default::default()
    };
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 100,
            overlap: 20,
            embedding: Some(fast_embedding),
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(input.as_bytes(), "text/plain", &config)
        .await
        .expect("Should extract successfully");

    let chunks = result.chunks.expect("Should have chunks");
    assert!(!chunks.is_empty(), "Should have at least one chunk");

    // Surface a recorded embedding failure directly rather than via missing vectors.
    if let Some(error) = result.metadata.additional.get("embedding_error") {
        panic!("Embedding generation failed: {}", error);
    }

    for piece in &chunks {
        let vector = piece.embedding.as_ref().expect("Should have embedding");
        assert_eq!(vector.len(), 384, "Fast preset should produce 384-dim embeddings");
    }
}
|
||||
Reference in New Issue
Block a user