//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises //! pub(crate) APIs that the migration deliberately narrowed; gated until //! either (a) these APIs are re-exposed publicly, or (b) the test is //! rewritten against the public extraction surface. #![cfg(any())] // Original content preserved below; recompiled once gating cfg drops. // Disabled by the file-level cfg(any()) above. /* #![cfg(feature = "api")] //! Diagnostic tests for large PDF file extraction issues. //! //! These tests are designed to isolate and identify the root cause of //! issues with large PDF file handling in the Kreuzberg API server. //! //! Current Status: //! - 5MB PDF tests are returning HTTP 400 instead of HTTP 200 //! - This suggests either: //! a) The mock PDF structure is invalid //! b) The PDF extraction logic has issues with the generated content //! c) The multipart parsing is failing on large payloads //! //! These diagnostic tests help narrow down which component is failing. use axum::{ body::{Body, to_bytes}, http::{Request, StatusCode}, }; use kreuzberg::{ ExtractionConfig, api::{ApiSizeLimits, create_router_with_limits}, }; use serde_json::Value; use tower::ServiceExt; /// Test extracting a minimal valid PDF (control test). /// /// This serves as a baseline to verify the API can handle valid PDFs /// before testing with large files. #[tokio::test] async fn test_extract_minimal_valid_pdf() { let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10)); let pdf_content = b"%PDF-1.4 1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj 2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj 3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >> endobj 4 0 obj << >> stream BT /F1 12 Tf 50 750 Td (Hello) Tj ET endstream endobj xref 0 5 0000000000 65535 f 0000000009 00000 n 0000000074 00000 n 0000000133 00000 n 0000000214 00000 n trailer << /Size 5 /Root 1 0 R >> startxref 340 %%EOF"; let boundary = "----minimal-pdf"; let mut body = Vec::new(); body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes()); body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"minimal.pdf\"\r\n"); body.extend_from_slice(b"Content-Type: application/pdf\r\n\r\n"); body.extend_from_slice(pdf_content); body.extend_from_slice(b"\r\n"); body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes()); let request = Request::builder() .method("POST") .uri("/extract") .header("content-type", format!("multipart/form-data; boundary={}", boundary)) .header("content-length", body.len()) .body(Body::from(body)) .expect("Failed to build request"); let response = router.oneshot(request).await.expect("Request failed"); assert_eq!( response.status(), StatusCode::OK, "Minimal PDF should extract successfully. Status: {} indicates baseline is working", response.status() ); let body = to_bytes(response.into_body(), 1_000_000) .await .expect("Failed to read response body"); let parsed: Value = serde_json::from_slice(&body).expect("Failed to parse response"); eprintln!( "Extraction result: {}", serde_json::to_string_pretty(&parsed).expect("Failed to parse") ); } /// Test extracting a 1MB text file (control test without PDF). /// /// This isolates whether the issue is specific to PDF handling or /// a general problem with large multipart uploads. #[tokio::test] async fn test_extract_1mb_text_file() { let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10)); let boundary = "----large-text"; let large_text = "This is test content. ".repeat(50000); let mut body = Vec::new(); body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes()); body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"large.txt\"\r\n"); body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n"); body.extend_from_slice(large_text.as_bytes()); body.extend_from_slice(b"\r\n"); body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes()); let request = Request::builder() .method("POST") .uri("/extract") .header("content-type", format!("multipart/form-data; boundary={}", boundary)) .header("content-length", body.len()) .body(Body::from(body)) .expect("Failed to build request"); let response = router.oneshot(request).await.expect("Request failed"); println!("1MB text file extraction status: {}", response.status()); assert_eq!( response.status(), StatusCode::OK, "1MB text file should extract successfully. If this fails, multipart parsing may have issues." ); } /// Test extracting progressively larger text files to find breaking point. /// /// This helps identify at what size the API starts failing. #[tokio::test] async fn test_find_size_breaking_point() { let sizes = vec![ ("100KB", 100 * 1024), ("500KB", 500 * 1024), ("1MB", 1024 * 1024), ("2MB", 2 * 1024 * 1024), ("5MB", 5 * 1024 * 1024), ]; for (label, size) in sizes { let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(20, 20)); let boundary = "----size-test"; let content = "A".repeat(size); let mut body = Vec::new(); body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes()); body.extend_from_slice( format!( "Content-Disposition: form-data; name=\"files\"; filename=\"test_{}.txt\"\r\n", label ) .as_bytes(), ); body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n"); body.extend_from_slice(content.as_bytes()); body.extend_from_slice(b"\r\n"); body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes()); let request = Request::builder() .method("POST") .uri("/extract") .header("content-type", format!("multipart/form-data; boundary={}", boundary)) .header("content-length", body.len()) .body(Body::from(body)) .expect("Failed to build request"); let response = router.oneshot(request).await.expect("Request failed"); println!("Size {} ({}B): HTTP {}", label, size, response.status().as_u16()); if response.status() != StatusCode::OK { eprintln!("Extraction failed at size: {}", label); let body = to_bytes(response.into_body(), 1_000_000) .await .expect("Failed to read response body"); if let Ok(parsed) = serde_json::from_slice::(&body) { eprintln!( "Error response: {}", serde_json::to_string_pretty(&parsed).expect("Failed to parse") ); } else { eprintln!("Response body (not JSON): {}", String::from_utf8_lossy(&body)); } return; } } } /// Test that the default 100MB limit is being applied. /// /// Verifies that the server is actually respecting the configured limits, /// and documents what the default limit actually is. #[tokio::test] async fn test_default_size_limits() { let default_limits = ApiSizeLimits::default(); assert_eq!(default_limits.max_request_body_bytes, 100 * 1024 * 1024); assert_eq!(default_limits.max_multipart_field_bytes, 100 * 1024 * 1024); println!( "Default limits: {} bytes request, {} bytes per field", default_limits.max_request_body_bytes, default_limits.max_multipart_field_bytes ); } /// Test that the router layer actually applies RequestBodyLimitLayer. /// /// Creates a router and verifies that size limit enforcement is active. #[tokio::test] async fn test_request_body_limit_layer_applied() { let small_limits = ApiSizeLimits::from_mb(1, 1); let router = create_router_with_limits(ExtractionConfig::default(), small_limits); let boundary = "----exceed-limits"; let large_content = "X".repeat(2 * 1024 * 1024); let mut body = Vec::new(); body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes()); body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"test.txt\"\r\n"); body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n"); body.extend_from_slice(large_content.as_bytes()); body.extend_from_slice(b"\r\n"); body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes()); let request = Request::builder() .method("POST") .uri("/extract") .header("content-type", format!("multipart/form-data; boundary={}", boundary)) .header("content-length", body.len()) .body(Body::from(body)) .expect("Failed to build request"); let response = router.oneshot(request).await.expect("Request failed"); assert_eq!( response.status(), StatusCode::PAYLOAD_TOO_LARGE, "2MB file should be rejected when limit is 1MB" ); } /// Test multipart parsing with incremental content. /// /// Some implementations have issues with streaming multipart parsing. /// This test uses proper CRLF line endings to ensure correct parsing. #[tokio::test] async fn test_multipart_proper_crlf_formatting() { let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10)); let content = "Test PDF content that is at least somewhat large for testing purposes."; let mut body = Vec::new(); body.extend_from_slice(b"--BOUNDARY123456\r\n"); body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"test.pdf\"\r\n"); body.extend_from_slice(b"Content-Type: application/pdf\r\n"); body.extend_from_slice(b"\r\n"); body.extend_from_slice(content.as_bytes()); body.extend_from_slice(b"\r\n"); body.extend_from_slice(b"--BOUNDARY123456--\r\n"); let request = Request::builder() .method("POST") .uri("/extract") .header("content-type", "multipart/form-data; boundary=BOUNDARY123456") .header("content-length", body.len()) .body(Body::from(body)) .expect("Failed to build request"); let response = router.oneshot(request).await.expect("Request failed"); println!("Multipart with proper CRLF: HTTP {}", response.status().as_u16()); assert!(response.status().is_success() || response.status().is_client_error()); } */