This commit is contained in:
348
crates/kreuzberg/tests/api_chunk.rs
Normal file
348
crates/kreuzberg/tests/api_chunk.rs
Normal file
@@ -0,0 +1,348 @@
|
||||
//! Integration tests for the /chunk API endpoint.
|
||||
|
||||
#![cfg(feature = "api")]
|
||||
|
||||
use axum::{
|
||||
body::Body,
|
||||
http::{Request, StatusCode},
|
||||
};
|
||||
use serde_json::json;
|
||||
use tower::ServiceExt;
|
||||
|
||||
use kreuzberg::{ExtractionConfig, api::create_router};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_chunk_basic() {
|
||||
let app = create_router(ExtractionConfig::default());
|
||||
let response = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/chunk")
|
||||
.method("POST")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(
|
||||
json!({
|
||||
"text": "Short text. More text here. Even more content to chunk."
|
||||
})
|
||||
.to_string(),
|
||||
))
|
||||
.expect("Operation failed"),
|
||||
)
|
||||
.await
|
||||
.expect("Operation failed");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::OK);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_chunk_empty_text_returns_400() {
|
||||
let app = create_router(ExtractionConfig::default());
|
||||
let response = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/chunk")
|
||||
.method("POST")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(json!({"text": ""}).to_string()))
|
||||
.expect("Operation failed"),
|
||||
)
|
||||
.await
|
||||
.expect("Operation failed");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_chunk_markdown_strategy() {
|
||||
let app = create_router(ExtractionConfig::default());
|
||||
let response = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/chunk")
|
||||
.method("POST")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(
|
||||
json!({
|
||||
"text": "# Heading\n\nParagraph text here.",
|
||||
"chunker_type": "markdown"
|
||||
})
|
||||
.to_string(),
|
||||
))
|
||||
.expect("Operation failed"),
|
||||
)
|
||||
.await
|
||||
.expect("Operation failed");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::OK);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_chunk_response_structure() {
|
||||
use kreuzberg::api::ChunkResponse;
|
||||
|
||||
let app = create_router(ExtractionConfig::default());
|
||||
let response = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/chunk")
|
||||
.method("POST")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(
|
||||
json!({
|
||||
"text": "This is a test. Another sentence here. And one more sentence to ensure we get chunks.",
|
||||
"config": {
|
||||
"max_characters": 50,
|
||||
"overlap": 10,
|
||||
"trim": true
|
||||
},
|
||||
"chunker_type": "text"
|
||||
})
|
||||
.to_string(),
|
||||
))
|
||||
.expect("Operation failed"),
|
||||
)
|
||||
.await
|
||||
.expect("Operation failed");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::OK);
|
||||
|
||||
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
||||
.await
|
||||
.expect("Failed to convert to bytes");
|
||||
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
||||
|
||||
// Verify response structure
|
||||
assert!(chunk_response.chunk_count > 0);
|
||||
assert_eq!(chunk_response.chunks.len(), chunk_response.chunk_count);
|
||||
assert_eq!(chunk_response.chunker_type, "text");
|
||||
assert_eq!(chunk_response.config.max_characters, 50);
|
||||
assert_eq!(chunk_response.config.overlap, 10);
|
||||
assert!(chunk_response.config.trim);
|
||||
assert!(chunk_response.input_size_bytes > 0);
|
||||
|
||||
// Verify chunk metadata
|
||||
for (idx, chunk) in chunk_response.chunks.iter().enumerate() {
|
||||
assert!(!chunk.content.is_empty());
|
||||
assert_eq!(chunk.chunk_index, idx);
|
||||
assert_eq!(chunk.total_chunks, chunk_response.chunk_count);
|
||||
assert!(chunk.byte_end > chunk.byte_start);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_chunk_invalid_strategy_returns_400() {
|
||||
let app = create_router(ExtractionConfig::default());
|
||||
let response = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/chunk")
|
||||
.method("POST")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(
|
||||
json!({
|
||||
"text": "Test text",
|
||||
"chunker_type": "invalid_type"
|
||||
})
|
||||
.to_string(),
|
||||
))
|
||||
.expect("Operation failed"),
|
||||
)
|
||||
.await
|
||||
.expect("Operation failed");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_chunk_with_defaults() {
|
||||
use kreuzberg::api::ChunkResponse;
|
||||
|
||||
let app = create_router(ExtractionConfig::default());
|
||||
let response = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/chunk")
|
||||
.method("POST")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(
|
||||
json!({
|
||||
"text": "This is a test sentence. Another sentence here."
|
||||
})
|
||||
.to_string(),
|
||||
))
|
||||
.expect("Operation failed"),
|
||||
)
|
||||
.await
|
||||
.expect("Operation failed");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::OK);
|
||||
|
||||
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
||||
.await
|
||||
.expect("Failed to convert to bytes");
|
||||
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
||||
|
||||
// Verify defaults are applied
|
||||
assert_eq!(chunk_response.config.max_characters, 2000);
|
||||
assert_eq!(chunk_response.config.overlap, 100);
|
||||
assert!(chunk_response.config.trim);
|
||||
assert_eq!(chunk_response.chunker_type, "text");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_chunk_malformed_json_returns_400() {
|
||||
let app = create_router(ExtractionConfig::default());
|
||||
let response = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/chunk")
|
||||
.method("POST")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from("{invalid json}"))
|
||||
.expect("Operation failed"),
|
||||
)
|
||||
.await
|
||||
.expect("Operation failed");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_chunk_case_insensitive_chunker_type() {
|
||||
use kreuzberg::api::ChunkResponse;
|
||||
|
||||
let app = create_router(ExtractionConfig::default());
|
||||
let response = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/chunk")
|
||||
.method("POST")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(
|
||||
json!({
|
||||
"text": "# Title\n\nContent here.",
|
||||
"chunker_type": "MARKDOWN"
|
||||
})
|
||||
.to_string(),
|
||||
))
|
||||
.expect("Operation failed"),
|
||||
)
|
||||
.await
|
||||
.expect("Operation failed");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::OK);
|
||||
|
||||
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
||||
.await
|
||||
.expect("Failed to convert to bytes");
|
||||
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
||||
|
||||
// Verify it's normalized to lowercase
|
||||
assert_eq!(chunk_response.chunker_type, "markdown");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_chunk_long_text() {
|
||||
use kreuzberg::api::ChunkResponse;
|
||||
|
||||
let app = create_router(ExtractionConfig::default());
|
||||
let long_text = "Lorem ipsum dolor sit amet. ".repeat(200);
|
||||
|
||||
let response = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/chunk")
|
||||
.method("POST")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(
|
||||
json!({
|
||||
"text": long_text,
|
||||
"config": {
|
||||
"max_characters": 500,
|
||||
"overlap": 50
|
||||
}
|
||||
})
|
||||
.to_string(),
|
||||
))
|
||||
.expect("Operation failed"),
|
||||
)
|
||||
.await
|
||||
.expect("Operation failed");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::OK);
|
||||
|
||||
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
||||
.await
|
||||
.expect("Failed to convert to bytes");
|
||||
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
||||
|
||||
// Should have multiple chunks
|
||||
assert!(chunk_response.chunk_count > 1);
|
||||
assert_eq!(chunk_response.chunks.len(), chunk_response.chunk_count);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_chunk_custom_config() {
|
||||
use kreuzberg::api::ChunkResponse;
|
||||
|
||||
let app = create_router(ExtractionConfig::default());
|
||||
let response = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/chunk")
|
||||
.method("POST")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(
|
||||
json!({
|
||||
"text": "Test sentence one. Test sentence two. Test sentence three.",
|
||||
"config": {
|
||||
"max_characters": 30,
|
||||
"overlap": 5,
|
||||
"trim": false
|
||||
},
|
||||
"chunker_type": "text"
|
||||
})
|
||||
.to_string(),
|
||||
))
|
||||
.expect("Operation failed"),
|
||||
)
|
||||
.await
|
||||
.expect("Operation failed");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::OK);
|
||||
|
||||
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
||||
.await
|
||||
.expect("Failed to convert to bytes");
|
||||
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
||||
|
||||
// Verify custom config was applied
|
||||
assert_eq!(chunk_response.config.max_characters, 30);
|
||||
assert_eq!(chunk_response.config.overlap, 5);
|
||||
assert!(!chunk_response.config.trim);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_chunk_rejects_json_array() {
|
||||
let app = create_router(ExtractionConfig::default());
|
||||
|
||||
// Send a JSON array instead of object
|
||||
let response = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/chunk")
|
||||
.method("POST")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(r#"[["text"], {"text": "content"}]"#))
|
||||
.expect("Operation failed"),
|
||||
)
|
||||
.await
|
||||
.expect("Operation failed");
|
||||
|
||||
// Should reject with 400 or 422, NOT 200
|
||||
assert!(
|
||||
response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNPROCESSABLE_ENTITY,
|
||||
"Expected 400 or 422, got {}",
|
||||
response.status()
|
||||
);
|
||||
}
|
||||
Reference in New Issue
Block a user