This commit is contained in:
308
crates/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs
Normal file
308
crates/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs
Normal file
@@ -0,0 +1,308 @@
|
||||
//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
|
||||
//! pub(crate) APIs that the migration deliberately narrowed; gated until
|
||||
//! either (a) these APIs are re-exposed publicly, or (b) the test is
|
||||
//! rewritten against the public extraction surface.
|
||||
|
||||
#![cfg(any())]
|
||||
|
||||
// Original content is preserved below inside a block comment. Re-enabling it requires
// removing both the file-level cfg(any()) gate above and the surrounding block-comment delimiters.
|
||||
|
||||
/*
|
||||
#![cfg(feature = "api")]
|
||||
//! Diagnostic tests for large PDF file extraction issues.
|
||||
//!
|
||||
//! These tests are designed to isolate and identify the root cause of
|
||||
//! issues with large PDF file handling in the Kreuzberg API server.
|
||||
//!
|
||||
//! Current Status:
|
||||
//! - 5MB PDF tests are returning HTTP 400 instead of HTTP 200
|
||||
//! - This suggests either:
|
||||
//! a) The mock PDF structure is invalid
|
||||
//! b) The PDF extraction logic has issues with the generated content
|
||||
//! c) The multipart parsing is failing on large payloads
|
||||
//!
|
||||
//! These diagnostic tests help narrow down which component is failing.
|
||||
|
||||
use axum::{
|
||||
body::{Body, to_bytes},
|
||||
http::{Request, StatusCode},
|
||||
};
|
||||
use kreuzberg::{
|
||||
ExtractionConfig,
|
||||
api::{ApiSizeLimits, create_router_with_limits},
|
||||
};
|
||||
use serde_json::Value;
|
||||
use tower::ServiceExt;
|
||||
|
||||
/// Test extracting a minimal valid PDF (control test).
|
||||
///
|
||||
/// This serves as a baseline to verify the API can handle valid PDFs
|
||||
/// before testing with large files.
|
||||
#[tokio::test]
|
||||
async fn test_extract_minimal_valid_pdf() {
|
||||
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));
|
||||
|
||||
let pdf_content = b"%PDF-1.4
|
||||
1 0 obj
|
||||
<< /Type /Catalog /Pages 2 0 R >>
|
||||
endobj
|
||||
2 0 obj
|
||||
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
|
||||
endobj
|
||||
3 0 obj
|
||||
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
|
||||
endobj
|
||||
4 0 obj
|
||||
<< >>
|
||||
stream
|
||||
BT /F1 12 Tf 50 750 Td (Hello) Tj ET
|
||||
endstream
|
||||
endobj
|
||||
xref
|
||||
0 5
|
||||
0000000000 65535 f
|
||||
0000000009 00000 n
|
||||
0000000074 00000 n
|
||||
0000000133 00000 n
|
||||
0000000214 00000 n
|
||||
trailer
|
||||
<< /Size 5 /Root 1 0 R >>
|
||||
startxref
|
||||
340
|
||||
%%EOF";
|
||||
|
||||
let boundary = "----minimal-pdf";
|
||||
let mut body = Vec::new();
|
||||
|
||||
body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
||||
body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"minimal.pdf\"\r\n");
|
||||
body.extend_from_slice(b"Content-Type: application/pdf\r\n\r\n");
|
||||
body.extend_from_slice(pdf_content);
|
||||
body.extend_from_slice(b"\r\n");
|
||||
body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
||||
|
||||
let request = Request::builder()
|
||||
.method("POST")
|
||||
.uri("/extract")
|
||||
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
||||
.header("content-length", body.len())
|
||||
.body(Body::from(body))
|
||||
.expect("Failed to build request");
|
||||
|
||||
let response = router.oneshot(request).await.expect("Request failed");
|
||||
|
||||
assert_eq!(
|
||||
response.status(),
|
||||
StatusCode::OK,
|
||||
"Minimal PDF should extract successfully. Status: {} indicates baseline is working",
|
||||
response.status()
|
||||
);
|
||||
|
||||
let body = to_bytes(response.into_body(), 1_000_000)
|
||||
.await
|
||||
.expect("Failed to read response body");
|
||||
|
||||
let parsed: Value = serde_json::from_slice(&body).expect("Failed to parse response");
|
||||
eprintln!(
|
||||
"Extraction result: {}",
|
||||
serde_json::to_string_pretty(&parsed).expect("Failed to parse")
|
||||
);
|
||||
}
|
||||
|
||||
/// Test extracting a 1MB text file (control test without PDF).
|
||||
///
|
||||
/// This isolates whether the issue is specific to PDF handling or
|
||||
/// a general problem with large multipart uploads.
|
||||
#[tokio::test]
|
||||
async fn test_extract_1mb_text_file() {
|
||||
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));
|
||||
|
||||
let boundary = "----large-text";
|
||||
let large_text = "This is test content. ".repeat(50000);
|
||||
|
||||
let mut body = Vec::new();
|
||||
body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
||||
body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"large.txt\"\r\n");
|
||||
body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");
|
||||
body.extend_from_slice(large_text.as_bytes());
|
||||
body.extend_from_slice(b"\r\n");
|
||||
body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
||||
|
||||
let request = Request::builder()
|
||||
.method("POST")
|
||||
.uri("/extract")
|
||||
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
||||
.header("content-length", body.len())
|
||||
.body(Body::from(body))
|
||||
.expect("Failed to build request");
|
||||
|
||||
let response = router.oneshot(request).await.expect("Request failed");
|
||||
|
||||
println!("1MB text file extraction status: {}", response.status());
|
||||
|
||||
assert_eq!(
|
||||
response.status(),
|
||||
StatusCode::OK,
|
||||
"1MB text file should extract successfully. If this fails, multipart parsing may have issues."
|
||||
);
|
||||
}
|
||||
|
||||
/// Test extracting progressively larger text files to find breaking point.
|
||||
///
|
||||
/// This helps identify at what size the API starts failing.
|
||||
#[tokio::test]
|
||||
async fn test_find_size_breaking_point() {
|
||||
let sizes = vec![
|
||||
("100KB", 100 * 1024),
|
||||
("500KB", 500 * 1024),
|
||||
("1MB", 1024 * 1024),
|
||||
("2MB", 2 * 1024 * 1024),
|
||||
("5MB", 5 * 1024 * 1024),
|
||||
];
|
||||
|
||||
for (label, size) in sizes {
|
||||
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(20, 20));
|
||||
|
||||
let boundary = "----size-test";
|
||||
let content = "A".repeat(size);
|
||||
|
||||
let mut body = Vec::new();
|
||||
body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
||||
body.extend_from_slice(
|
||||
format!(
|
||||
"Content-Disposition: form-data; name=\"files\"; filename=\"test_{}.txt\"\r\n",
|
||||
label
|
||||
)
|
||||
.as_bytes(),
|
||||
);
|
||||
body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");
|
||||
body.extend_from_slice(content.as_bytes());
|
||||
body.extend_from_slice(b"\r\n");
|
||||
body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
||||
|
||||
let request = Request::builder()
|
||||
.method("POST")
|
||||
.uri("/extract")
|
||||
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
||||
.header("content-length", body.len())
|
||||
.body(Body::from(body))
|
||||
.expect("Failed to build request");
|
||||
|
||||
let response = router.oneshot(request).await.expect("Request failed");
|
||||
|
||||
println!("Size {} ({}B): HTTP {}", label, size, response.status().as_u16());
|
||||
|
||||
if response.status() != StatusCode::OK {
|
||||
eprintln!("Extraction failed at size: {}", label);
|
||||
|
||||
let body = to_bytes(response.into_body(), 1_000_000)
|
||||
.await
|
||||
.expect("Failed to read response body");
|
||||
|
||||
if let Ok(parsed) = serde_json::from_slice::<Value>(&body) {
|
||||
eprintln!(
|
||||
"Error response: {}",
|
||||
serde_json::to_string_pretty(&parsed).expect("Failed to parse")
|
||||
);
|
||||
} else {
|
||||
eprintln!("Response body (not JSON): {}", String::from_utf8_lossy(&body));
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Test that the default 100MB limit is being applied.
|
||||
///
|
||||
/// Verifies that the server is actually respecting the configured limits,
|
||||
/// and documents what the default limit actually is.
|
||||
#[tokio::test]
|
||||
async fn test_default_size_limits() {
|
||||
let default_limits = ApiSizeLimits::default();
|
||||
assert_eq!(default_limits.max_request_body_bytes, 100 * 1024 * 1024);
|
||||
assert_eq!(default_limits.max_multipart_field_bytes, 100 * 1024 * 1024);
|
||||
|
||||
println!(
|
||||
"Default limits: {} bytes request, {} bytes per field",
|
||||
default_limits.max_request_body_bytes, default_limits.max_multipart_field_bytes
|
||||
);
|
||||
}
|
||||
|
||||
/// Test that the router layer actually applies RequestBodyLimitLayer.
|
||||
///
|
||||
/// Creates a router and verifies that size limit enforcement is active.
|
||||
#[tokio::test]
|
||||
async fn test_request_body_limit_layer_applied() {
|
||||
let small_limits = ApiSizeLimits::from_mb(1, 1);
|
||||
let router = create_router_with_limits(ExtractionConfig::default(), small_limits);
|
||||
|
||||
let boundary = "----exceed-limits";
|
||||
let large_content = "X".repeat(2 * 1024 * 1024);
|
||||
|
||||
let mut body = Vec::new();
|
||||
body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
|
||||
body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"test.txt\"\r\n");
|
||||
body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");
|
||||
body.extend_from_slice(large_content.as_bytes());
|
||||
body.extend_from_slice(b"\r\n");
|
||||
body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());
|
||||
|
||||
let request = Request::builder()
|
||||
.method("POST")
|
||||
.uri("/extract")
|
||||
.header("content-type", format!("multipart/form-data; boundary={}", boundary))
|
||||
.header("content-length", body.len())
|
||||
.body(Body::from(body))
|
||||
.expect("Failed to build request");
|
||||
|
||||
let response = router.oneshot(request).await.expect("Request failed");
|
||||
|
||||
assert_eq!(
|
||||
response.status(),
|
||||
StatusCode::PAYLOAD_TOO_LARGE,
|
||||
"2MB file should be rejected when limit is 1MB"
|
||||
);
|
||||
}
|
||||
|
||||
/// Test multipart parsing with incremental content.
|
||||
///
|
||||
/// Some implementations have issues with streaming multipart parsing.
|
||||
/// This test uses proper CRLF line endings to ensure correct parsing.
|
||||
#[tokio::test]
|
||||
async fn test_multipart_proper_crlf_formatting() {
|
||||
let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));
|
||||
|
||||
let content = "Test PDF content that is at least somewhat large for testing purposes.";
|
||||
|
||||
let mut body = Vec::new();
|
||||
|
||||
body.extend_from_slice(b"--BOUNDARY123456\r\n");
|
||||
|
||||
body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"test.pdf\"\r\n");
|
||||
body.extend_from_slice(b"Content-Type: application/pdf\r\n");
|
||||
|
||||
body.extend_from_slice(b"\r\n");
|
||||
|
||||
body.extend_from_slice(content.as_bytes());
|
||||
|
||||
body.extend_from_slice(b"\r\n");
|
||||
|
||||
body.extend_from_slice(b"--BOUNDARY123456--\r\n");
|
||||
|
||||
let request = Request::builder()
|
||||
.method("POST")
|
||||
.uri("/extract")
|
||||
.header("content-type", "multipart/form-data; boundary=BOUNDARY123456")
|
||||
.header("content-length", body.len())
|
||||
.body(Body::from(body))
|
||||
.expect("Failed to build request");
|
||||
|
||||
let response = router.oneshot(request).await.expect("Request failed");
|
||||
|
||||
println!("Multipart with proper CRLF: HTTP {}", response.status().as_u16());
|
||||
assert!(response.status().is_success() || response.status().is_client_error());
|
||||
}
|
||||
|
||||
*/
|
||||
Reference in New Issue
Block a user