349 lines
11 KiB
Rust
349 lines
11 KiB
Rust
|
|
//! Integration tests for the /chunk API endpoint.
|
||
|
|
|
||
|
|
#![cfg(feature = "api")]
|
||
|
|
|
||
|
|
use axum::{
|
||
|
|
body::Body,
|
||
|
|
http::{Request, StatusCode},
|
||
|
|
};
|
||
|
|
use serde_json::json;
|
||
|
|
use tower::ServiceExt;
|
||
|
|
|
||
|
|
use kreuzberg::{ExtractionConfig, api::create_router};
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_chunk_basic() {
|
||
|
|
let app = create_router(ExtractionConfig::default());
|
||
|
|
let response = app
|
||
|
|
.oneshot(
|
||
|
|
Request::builder()
|
||
|
|
.uri("/chunk")
|
||
|
|
.method("POST")
|
||
|
|
.header("content-type", "application/json")
|
||
|
|
.body(Body::from(
|
||
|
|
json!({
|
||
|
|
"text": "Short text. More text here. Even more content to chunk."
|
||
|
|
})
|
||
|
|
.to_string(),
|
||
|
|
))
|
||
|
|
.expect("Operation failed"),
|
||
|
|
)
|
||
|
|
.await
|
||
|
|
.expect("Operation failed");
|
||
|
|
|
||
|
|
assert_eq!(response.status(), StatusCode::OK);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_chunk_empty_text_returns_400() {
|
||
|
|
let app = create_router(ExtractionConfig::default());
|
||
|
|
let response = app
|
||
|
|
.oneshot(
|
||
|
|
Request::builder()
|
||
|
|
.uri("/chunk")
|
||
|
|
.method("POST")
|
||
|
|
.header("content-type", "application/json")
|
||
|
|
.body(Body::from(json!({"text": ""}).to_string()))
|
||
|
|
.expect("Operation failed"),
|
||
|
|
)
|
||
|
|
.await
|
||
|
|
.expect("Operation failed");
|
||
|
|
|
||
|
|
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_chunk_markdown_strategy() {
|
||
|
|
let app = create_router(ExtractionConfig::default());
|
||
|
|
let response = app
|
||
|
|
.oneshot(
|
||
|
|
Request::builder()
|
||
|
|
.uri("/chunk")
|
||
|
|
.method("POST")
|
||
|
|
.header("content-type", "application/json")
|
||
|
|
.body(Body::from(
|
||
|
|
json!({
|
||
|
|
"text": "# Heading\n\nParagraph text here.",
|
||
|
|
"chunker_type": "markdown"
|
||
|
|
})
|
||
|
|
.to_string(),
|
||
|
|
))
|
||
|
|
.expect("Operation failed"),
|
||
|
|
)
|
||
|
|
.await
|
||
|
|
.expect("Operation failed");
|
||
|
|
|
||
|
|
assert_eq!(response.status(), StatusCode::OK);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_chunk_response_structure() {
|
||
|
|
use kreuzberg::api::ChunkResponse;
|
||
|
|
|
||
|
|
let app = create_router(ExtractionConfig::default());
|
||
|
|
let response = app
|
||
|
|
.oneshot(
|
||
|
|
Request::builder()
|
||
|
|
.uri("/chunk")
|
||
|
|
.method("POST")
|
||
|
|
.header("content-type", "application/json")
|
||
|
|
.body(Body::from(
|
||
|
|
json!({
|
||
|
|
"text": "This is a test. Another sentence here. And one more sentence to ensure we get chunks.",
|
||
|
|
"config": {
|
||
|
|
"max_characters": 50,
|
||
|
|
"overlap": 10,
|
||
|
|
"trim": true
|
||
|
|
},
|
||
|
|
"chunker_type": "text"
|
||
|
|
})
|
||
|
|
.to_string(),
|
||
|
|
))
|
||
|
|
.expect("Operation failed"),
|
||
|
|
)
|
||
|
|
.await
|
||
|
|
.expect("Operation failed");
|
||
|
|
|
||
|
|
assert_eq!(response.status(), StatusCode::OK);
|
||
|
|
|
||
|
|
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
||
|
|
.await
|
||
|
|
.expect("Failed to convert to bytes");
|
||
|
|
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
||
|
|
|
||
|
|
// Verify response structure
|
||
|
|
assert!(chunk_response.chunk_count > 0);
|
||
|
|
assert_eq!(chunk_response.chunks.len(), chunk_response.chunk_count);
|
||
|
|
assert_eq!(chunk_response.chunker_type, "text");
|
||
|
|
assert_eq!(chunk_response.config.max_characters, 50);
|
||
|
|
assert_eq!(chunk_response.config.overlap, 10);
|
||
|
|
assert!(chunk_response.config.trim);
|
||
|
|
assert!(chunk_response.input_size_bytes > 0);
|
||
|
|
|
||
|
|
// Verify chunk metadata
|
||
|
|
for (idx, chunk) in chunk_response.chunks.iter().enumerate() {
|
||
|
|
assert!(!chunk.content.is_empty());
|
||
|
|
assert_eq!(chunk.chunk_index, idx);
|
||
|
|
assert_eq!(chunk.total_chunks, chunk_response.chunk_count);
|
||
|
|
assert!(chunk.byte_end > chunk.byte_start);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_chunk_invalid_strategy_returns_400() {
|
||
|
|
let app = create_router(ExtractionConfig::default());
|
||
|
|
let response = app
|
||
|
|
.oneshot(
|
||
|
|
Request::builder()
|
||
|
|
.uri("/chunk")
|
||
|
|
.method("POST")
|
||
|
|
.header("content-type", "application/json")
|
||
|
|
.body(Body::from(
|
||
|
|
json!({
|
||
|
|
"text": "Test text",
|
||
|
|
"chunker_type": "invalid_type"
|
||
|
|
})
|
||
|
|
.to_string(),
|
||
|
|
))
|
||
|
|
.expect("Operation failed"),
|
||
|
|
)
|
||
|
|
.await
|
||
|
|
.expect("Operation failed");
|
||
|
|
|
||
|
|
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_chunk_with_defaults() {
|
||
|
|
use kreuzberg::api::ChunkResponse;
|
||
|
|
|
||
|
|
let app = create_router(ExtractionConfig::default());
|
||
|
|
let response = app
|
||
|
|
.oneshot(
|
||
|
|
Request::builder()
|
||
|
|
.uri("/chunk")
|
||
|
|
.method("POST")
|
||
|
|
.header("content-type", "application/json")
|
||
|
|
.body(Body::from(
|
||
|
|
json!({
|
||
|
|
"text": "This is a test sentence. Another sentence here."
|
||
|
|
})
|
||
|
|
.to_string(),
|
||
|
|
))
|
||
|
|
.expect("Operation failed"),
|
||
|
|
)
|
||
|
|
.await
|
||
|
|
.expect("Operation failed");
|
||
|
|
|
||
|
|
assert_eq!(response.status(), StatusCode::OK);
|
||
|
|
|
||
|
|
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
||
|
|
.await
|
||
|
|
.expect("Failed to convert to bytes");
|
||
|
|
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
||
|
|
|
||
|
|
// Verify defaults are applied
|
||
|
|
assert_eq!(chunk_response.config.max_characters, 2000);
|
||
|
|
assert_eq!(chunk_response.config.overlap, 100);
|
||
|
|
assert!(chunk_response.config.trim);
|
||
|
|
assert_eq!(chunk_response.chunker_type, "text");
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_chunk_malformed_json_returns_400() {
|
||
|
|
let app = create_router(ExtractionConfig::default());
|
||
|
|
let response = app
|
||
|
|
.oneshot(
|
||
|
|
Request::builder()
|
||
|
|
.uri("/chunk")
|
||
|
|
.method("POST")
|
||
|
|
.header("content-type", "application/json")
|
||
|
|
.body(Body::from("{invalid json}"))
|
||
|
|
.expect("Operation failed"),
|
||
|
|
)
|
||
|
|
.await
|
||
|
|
.expect("Operation failed");
|
||
|
|
|
||
|
|
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_chunk_case_insensitive_chunker_type() {
|
||
|
|
use kreuzberg::api::ChunkResponse;
|
||
|
|
|
||
|
|
let app = create_router(ExtractionConfig::default());
|
||
|
|
let response = app
|
||
|
|
.oneshot(
|
||
|
|
Request::builder()
|
||
|
|
.uri("/chunk")
|
||
|
|
.method("POST")
|
||
|
|
.header("content-type", "application/json")
|
||
|
|
.body(Body::from(
|
||
|
|
json!({
|
||
|
|
"text": "# Title\n\nContent here.",
|
||
|
|
"chunker_type": "MARKDOWN"
|
||
|
|
})
|
||
|
|
.to_string(),
|
||
|
|
))
|
||
|
|
.expect("Operation failed"),
|
||
|
|
)
|
||
|
|
.await
|
||
|
|
.expect("Operation failed");
|
||
|
|
|
||
|
|
assert_eq!(response.status(), StatusCode::OK);
|
||
|
|
|
||
|
|
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
||
|
|
.await
|
||
|
|
.expect("Failed to convert to bytes");
|
||
|
|
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
||
|
|
|
||
|
|
// Verify it's normalized to lowercase
|
||
|
|
assert_eq!(chunk_response.chunker_type, "markdown");
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_chunk_long_text() {
|
||
|
|
use kreuzberg::api::ChunkResponse;
|
||
|
|
|
||
|
|
let app = create_router(ExtractionConfig::default());
|
||
|
|
let long_text = "Lorem ipsum dolor sit amet. ".repeat(200);
|
||
|
|
|
||
|
|
let response = app
|
||
|
|
.oneshot(
|
||
|
|
Request::builder()
|
||
|
|
.uri("/chunk")
|
||
|
|
.method("POST")
|
||
|
|
.header("content-type", "application/json")
|
||
|
|
.body(Body::from(
|
||
|
|
json!({
|
||
|
|
"text": long_text,
|
||
|
|
"config": {
|
||
|
|
"max_characters": 500,
|
||
|
|
"overlap": 50
|
||
|
|
}
|
||
|
|
})
|
||
|
|
.to_string(),
|
||
|
|
))
|
||
|
|
.expect("Operation failed"),
|
||
|
|
)
|
||
|
|
.await
|
||
|
|
.expect("Operation failed");
|
||
|
|
|
||
|
|
assert_eq!(response.status(), StatusCode::OK);
|
||
|
|
|
||
|
|
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
||
|
|
.await
|
||
|
|
.expect("Failed to convert to bytes");
|
||
|
|
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
||
|
|
|
||
|
|
// Should have multiple chunks
|
||
|
|
assert!(chunk_response.chunk_count > 1);
|
||
|
|
assert_eq!(chunk_response.chunks.len(), chunk_response.chunk_count);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_chunk_custom_config() {
|
||
|
|
use kreuzberg::api::ChunkResponse;
|
||
|
|
|
||
|
|
let app = create_router(ExtractionConfig::default());
|
||
|
|
let response = app
|
||
|
|
.oneshot(
|
||
|
|
Request::builder()
|
||
|
|
.uri("/chunk")
|
||
|
|
.method("POST")
|
||
|
|
.header("content-type", "application/json")
|
||
|
|
.body(Body::from(
|
||
|
|
json!({
|
||
|
|
"text": "Test sentence one. Test sentence two. Test sentence three.",
|
||
|
|
"config": {
|
||
|
|
"max_characters": 30,
|
||
|
|
"overlap": 5,
|
||
|
|
"trim": false
|
||
|
|
},
|
||
|
|
"chunker_type": "text"
|
||
|
|
})
|
||
|
|
.to_string(),
|
||
|
|
))
|
||
|
|
.expect("Operation failed"),
|
||
|
|
)
|
||
|
|
.await
|
||
|
|
.expect("Operation failed");
|
||
|
|
|
||
|
|
assert_eq!(response.status(), StatusCode::OK);
|
||
|
|
|
||
|
|
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
||
|
|
.await
|
||
|
|
.expect("Failed to convert to bytes");
|
||
|
|
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
||
|
|
|
||
|
|
// Verify custom config was applied
|
||
|
|
assert_eq!(chunk_response.config.max_characters, 30);
|
||
|
|
assert_eq!(chunk_response.config.overlap, 5);
|
||
|
|
assert!(!chunk_response.config.trim);
|
||
|
|
}
|
||
|
|
|
||
|
|
#[tokio::test]
|
||
|
|
async fn test_chunk_rejects_json_array() {
|
||
|
|
let app = create_router(ExtractionConfig::default());
|
||
|
|
|
||
|
|
// Send a JSON array instead of object
|
||
|
|
let response = app
|
||
|
|
.oneshot(
|
||
|
|
Request::builder()
|
||
|
|
.uri("/chunk")
|
||
|
|
.method("POST")
|
||
|
|
.header("content-type", "application/json")
|
||
|
|
.body(Body::from(r#"[["text"], {"text": "content"}]"#))
|
||
|
|
.expect("Operation failed"),
|
||
|
|
)
|
||
|
|
.await
|
||
|
|
.expect("Operation failed");
|
||
|
|
|
||
|
|
// Should reject with 400 or 422, NOT 200
|
||
|
|
assert!(
|
||
|
|
response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNPROCESSABLE_ENTITY,
|
||
|
|
"Expected 400 or 422, got {}",
|
||
|
|
response.status()
|
||
|
|
);
|
||
|
|
}
|