309 lines
10 KiB
Rust
309 lines
10 KiB
Rust
//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
//! pub(crate) APIs that the migration deliberately narrowed; gated until
//! either (a) these APIs are re-exposed publicly, or (b) the test is
//! rewritten against the public extraction surface.
|
|
|
|
#![cfg(any())]
|
|
|
|
// Original content preserved below inside a block comment. Re-enabling it
// requires removing both the file-level cfg(any()) gate above and the
// block-comment delimiters that surround the tests.
|
|
|
|
/*
|
|
#![cfg(feature = "api")]
|
|
//! Diagnostic tests for large PDF file extraction issues.
//!
//! These tests are designed to isolate and identify the root cause of
//! issues with large PDF file handling in the Kreuzberg API server.
//!
//! Current Status:
//! - 5MB PDF tests are returning HTTP 400 instead of HTTP 200
//! - This suggests either:
//!   a) The mock PDF structure is invalid
//!   b) The PDF extraction logic has issues with the generated content
//!   c) The multipart parsing is failing on large payloads
//!
//! These diagnostic tests help narrow down which component is failing.
|
|
|
|
use axum::{
|
|
body::{Body, to_bytes},
|
|
http::{Request, StatusCode},
|
|
};
|
|
use kreuzberg::{
|
|
ExtractionConfig,
|
|
api::{ApiSizeLimits, create_router_with_limits},
|
|
};
|
|
use serde_json::Value;
|
|
use tower::ServiceExt;
|
|
|
|
/// Test extracting a minimal valid PDF (control test).
|
|
///
|
|
/// This serves as a baseline to verify the API can handle valid PDFs
|
|
/// before testing with large files.
|
|
#[tokio::test]
|
|
async fn test_extract_minimal_valid_pdf() {
|
|
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));
|
|
|
|
let pdf_content = b"%PDF-1.4
|
|
1 0 obj
|
|
<< /Type /Catalog /Pages 2 0 R >>
|
|
endobj
|
|
2 0 obj
|
|
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
|
endobj
|
|
3 0 obj
|
|
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
|
|
endobj
|
|
4 0 obj
|
|
<< >>
|
|
stream
|
|
BT /F1 12 Tf 50 750 Td (Hello) Tj ET
|
|
endstream
|
|
endobj
|
|
xref
|
|
0 5
|
|
0000000000 65535 f
|
|
0000000009 00000 n
|
|
0000000074 00000 n
|
|
0000000133 00000 n
|
|
0000000214 00000 n
|
|
trailer
|
|
<< /Size 5 /Root 1 0 R >>
|
|
startxref
|
|
340
|
|
%%EOF";
|
|
|
|
let boundary = "----minimal-pdf";
|
|
let mut body = Vec::new();
|
|
|
|
body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
|
body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"minimal.pdf\"\r\n");
|
|
body.extend_from_slice(b"Content-Type: application/pdf\r\n\r\n");
|
|
body.extend_from_slice(pdf_content);
|
|
body.extend_from_slice(b"\r\n");
|
|
body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
|
|
|
let request = Request::builder()
|
|
.method("POST")
|
|
.uri("/extract")
|
|
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
.header("content-length", body.len())
|
|
.body(Body::from(body))
|
|
.expect("Failed to build request");
|
|
|
|
let response = router.oneshot(request).await.expect("Request failed");
|
|
|
|
assert_eq!(
|
|
response.status(),
|
|
StatusCode::OK,
|
|
"Minimal PDF should extract successfully. Status: {} indicates baseline is working",
|
|
response.status()
|
|
);
|
|
|
|
let body = to_bytes(response.into_body(), 1_000_000)
|
|
.await
|
|
.expect("Failed to read response body");
|
|
|
|
let parsed: Value = serde_json::from_slice(&body).expect("Failed to parse response");
|
|
eprintln!(
|
|
"Extraction result: {}",
|
|
serde_json::to_string_pretty(&parsed).expect("Failed to parse")
|
|
);
|
|
}
|
|
|
|
/// Test extracting a 1MB text file (control test without PDF).
|
|
///
|
|
/// This isolates whether the issue is specific to PDF handling or
|
|
/// a general problem with large multipart uploads.
|
|
#[tokio::test]
|
|
async fn test_extract_1mb_text_file() {
|
|
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));
|
|
|
|
let boundary = "----large-text";
|
|
let large_text = "This is test content. ".repeat(50000);
|
|
|
|
let mut body = Vec::new();
|
|
body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
|
body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"large.txt\"\r\n");
|
|
body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");
|
|
body.extend_from_slice(large_text.as_bytes());
|
|
body.extend_from_slice(b"\r\n");
|
|
body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
|
|
|
let request = Request::builder()
|
|
.method("POST")
|
|
.uri("/extract")
|
|
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
.header("content-length", body.len())
|
|
.body(Body::from(body))
|
|
.expect("Failed to build request");
|
|
|
|
let response = router.oneshot(request).await.expect("Request failed");
|
|
|
|
println!("1MB text file extraction status: {}", response.status());
|
|
|
|
assert_eq!(
|
|
response.status(),
|
|
StatusCode::OK,
|
|
"1MB text file should extract successfully. If this fails, multipart parsing may have issues."
|
|
);
|
|
}
|
|
|
|
/// Test extracting progressively larger text files to find breaking point.
|
|
///
|
|
/// This helps identify at what size the API starts failing.
|
|
#[tokio::test]
|
|
async fn test_find_size_breaking_point() {
|
|
let sizes = vec![
|
|
("100KB", 100 * 1024),
|
|
("500KB", 500 * 1024),
|
|
("1MB", 1024 * 1024),
|
|
("2MB", 2 * 1024 * 1024),
|
|
("5MB", 5 * 1024 * 1024),
|
|
];
|
|
|
|
for (label, size) in sizes {
|
|
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(20, 20));
|
|
|
|
let boundary = "----size-test";
|
|
let content = "A".repeat(size);
|
|
|
|
let mut body = Vec::new();
|
|
body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
|
body.extend_from_slice(
|
|
format!(
|
|
"Content-Disposition: form-data; name=\"files\"; filename=\"test_{}.txt\"\r\n",
|
|
label
|
|
)
|
|
.as_bytes(),
|
|
);
|
|
body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");
|
|
body.extend_from_slice(content.as_bytes());
|
|
body.extend_from_slice(b"\r\n");
|
|
body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
|
|
|
let request = Request::builder()
|
|
.method("POST")
|
|
.uri("/extract")
|
|
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
.header("content-length", body.len())
|
|
.body(Body::from(body))
|
|
.expect("Failed to build request");
|
|
|
|
let response = router.oneshot(request).await.expect("Request failed");
|
|
|
|
println!("Size {} ({}B): HTTP {}", label, size, response.status().as_u16());
|
|
|
|
if response.status() != StatusCode::OK {
|
|
eprintln!("Extraction failed at size: {}", label);
|
|
|
|
let body = to_bytes(response.into_body(), 1_000_000)
|
|
.await
|
|
.expect("Failed to read response body");
|
|
|
|
if let Ok(parsed) = serde_json::from_slice::<Value>(&body) {
|
|
eprintln!(
|
|
"Error response: {}",
|
|
serde_json::to_string_pretty(&parsed).expect("Failed to parse")
|
|
);
|
|
} else {
|
|
eprintln!("Response body (not JSON): {}", String::from_utf8_lossy(&body));
|
|
}
|
|
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Test that the default 100MB limit is being applied.
|
|
///
|
|
/// Verifies that the server is actually respecting the configured limits,
|
|
/// and documents what the default limit actually is.
|
|
#[tokio::test]
|
|
async fn test_default_size_limits() {
|
|
let default_limits = ApiSizeLimits::default();
|
|
assert_eq!(default_limits.max_request_body_bytes, 100 * 1024 * 1024);
|
|
assert_eq!(default_limits.max_multipart_field_bytes, 100 * 1024 * 1024);
|
|
|
|
println!(
|
|
"Default limits: {} bytes request, {} bytes per field",
|
|
default_limits.max_request_body_bytes, default_limits.max_multipart_field_bytes
|
|
);
|
|
}
|
|
|
|
/// Test that the router layer actually applies RequestBodyLimitLayer.
|
|
///
|
|
/// Creates a router and verifies that size limit enforcement is active.
|
|
#[tokio::test]
|
|
async fn test_request_body_limit_layer_applied() {
|
|
let small_limits = ApiSizeLimits::from_mb(1, 1);
|
|
let router = create_router_with_limits(ExtractionConfig::default(), small_limits);
|
|
|
|
let boundary = "----exceed-limits";
|
|
let large_content = "X".repeat(2 * 1024 * 1024);
|
|
|
|
let mut body = Vec::new();
|
|
body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
|
body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"test.txt\"\r\n");
|
|
body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");
|
|
body.extend_from_slice(large_content.as_bytes());
|
|
body.extend_from_slice(b"\r\n");
|
|
body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
|
|
|
let request = Request::builder()
|
|
.method("POST")
|
|
.uri("/extract")
|
|
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
|
.header("content-length", body.len())
|
|
.body(Body::from(body))
|
|
.expect("Failed to build request");
|
|
|
|
let response = router.oneshot(request).await.expect("Request failed");
|
|
|
|
assert_eq!(
|
|
response.status(),
|
|
StatusCode::PAYLOAD_TOO_LARGE,
|
|
"2MB file should be rejected when limit is 1MB"
|
|
);
|
|
}
|
|
|
|
/// Test multipart parsing with incremental content.
|
|
///
|
|
/// Some implementations have issues with streaming multipart parsing.
|
|
/// This test uses proper CRLF line endings to ensure correct parsing.
|
|
#[tokio::test]
|
|
async fn test_multipart_proper_crlf_formatting() {
|
|
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));
|
|
|
|
let content = "Test PDF content that is at least somewhat large for testing purposes.";
|
|
|
|
let mut body = Vec::new();
|
|
|
|
body.extend_from_slice(b"--BOUNDARY123456\r\n");
|
|
|
|
body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"test.pdf\"\r\n");
|
|
body.extend_from_slice(b"Content-Type: application/pdf\r\n");
|
|
|
|
body.extend_from_slice(b"\r\n");
|
|
|
|
body.extend_from_slice(content.as_bytes());
|
|
|
|
body.extend_from_slice(b"\r\n");
|
|
|
|
body.extend_from_slice(b"--BOUNDARY123456--\r\n");
|
|
|
|
let request = Request::builder()
|
|
.method("POST")
|
|
.uri("/extract")
|
|
.header("content-type", "multipart/form-data; boundary=BOUNDARY123456")
|
|
.header("content-length", body.len())
|
|
.body(Body::from(body))
|
|
.expect("Failed to build request");
|
|
|
|
let response = router.oneshot(request).await.expect("Request failed");
|
|
|
|
println!("Multipart with proper CRLF: HTTP {}", response.status().as_u16());
|
|
assert!(response.status().is_success() || response.status().is_client_error());
|
|
}
|
|
|
|
*/
|