crates/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs

//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
//! pub(crate) APIs that the migration deliberately narrowed; gated until
//! either (a) these APIs are re-exposed publicly, or (b) the test is
//! rewritten against the public extraction surface.

#![cfg(any())]

// Original content preserved below inside a block comment.
// It is disabled twice: by the file-level cfg(any()) above and by the block
// comment itself. Re-enabling the tests requires removing both.

/*
#![cfg(feature = "api")]
//! Diagnostic tests for large PDF file extraction issues.
//!
//! These tests are designed to isolate and identify the root cause of
//! issues with large PDF file handling in the Kreuzberg API server.
//!
//! Current Status:
//! - 5MB PDF tests are returning HTTP 400 instead of HTTP 200
//! - This suggests either:
//!   a) The mock PDF structure is invalid
//!   b) The PDF extraction logic has issues with the generated content
//!   c) The multipart parsing is failing on large payloads
//!
//! These diagnostic tests help narrow down which component is failing.

use axum::{
    body::{Body, to_bytes},
    http::{Request, StatusCode},
};
use kreuzberg::{
    ExtractionConfig,
    api::{ApiSizeLimits, create_router_with_limits},
};
use serde_json::Value;
use tower::ServiceExt;

/// Baseline check: upload a small hand-written, single-page PDF to `/extract`.
///
/// A one-part `multipart/form-data` body (field name `files`, CRLF-delimited)
/// is assembled by hand and sent through a router built with
/// `ApiSizeLimits::from_mb(10, 10)`. The test requires HTTP 200 and that the
/// response body parses as JSON; the JSON is pretty-printed to stderr for
/// inspection but its contents are not asserted.
///
/// NOTE(review): the xref offsets and the `startxref` value in the embedded
/// PDF do not appear to match the actual byte positions of the objects, and
/// the content stream dictionary has no `/Length` entry. A 200 here presumably
/// relies on the PDF backend's recovery path - confirm before treating this
/// fixture as a strictly valid control sample.
#[tokio::test]
async fn test_extract_minimal_valid_pdf() {
    let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));

    let pdf_content = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
endobj
4 0 obj
<< >>
stream
BT /F1 12 Tf 50 750 Td (Hello) Tj ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000074 00000 n
0000000133 00000 n
0000000214 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
340
%%EOF";

    let boundary = "----minimal-pdf";
    let opening = format!("--{}\r\n", boundary);
    let closing = format!("--{}--\r\n", boundary);

    // Multipart layout: opening delimiter, part headers, blank line, payload,
    // CRLF, closing delimiter.
    let parts: [&[u8]; 6] = [
        opening.as_bytes(),
        b"Content-Disposition: form-data; name=\"files\"; filename=\"minimal.pdf\"\r\n",
        b"Content-Type: application/pdf\r\n\r\n",
        pdf_content,
        b"\r\n",
        closing.as_bytes(),
    ];
    let body = parts.concat();

    let request = Request::builder()
        .method("POST")
        .uri("/extract")
        .header("content-type", format!("multipart/form-data; boundary={}", boundary))
        .header("content-length", body.len())
        .body(Body::from(body))
        .expect("Failed to build request");

    let response = router.oneshot(request).await.expect("Request failed");

    // The message is only shown on failure, so it must describe the failure.
    let status = response.status();
    assert_eq!(
        status,
        StatusCode::OK,
        "Minimal PDF should extract successfully, but the API returned {}; the baseline itself is broken",
        status
    );

    let bytes = to_bytes(response.into_body(), 1_000_000)
        .await
        .expect("Failed to read response body");

    let parsed: Value = serde_json::from_slice(&bytes).expect("Failed to parse response");
    eprintln!(
        "Extraction result: {}",
        serde_json::to_string_pretty(&parsed).expect("Failed to serialize response as pretty JSON")
    );
}

/// Control case with no PDF involved: upload one large plain-text part.
///
/// The payload is the 22-byte phrase `"This is test content. "` repeated
/// 50,000 times (1,100,000 bytes), sent as the single `files` part of a
/// hand-built multipart body. The response status is printed and must be 200;
/// a failure here points at multipart/upload handling rather than at PDF
/// extraction.
#[tokio::test]
async fn test_extract_1mb_text_file() {
    let app = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));

    let boundary = "----large-text";
    let payload = "This is test content. ".repeat(50000);

    let opening = format!("--{}\r\n", boundary);
    let closing = format!("--{}--\r\n", boundary);
    let segments: [&[u8]; 6] = [
        opening.as_bytes(),
        b"Content-Disposition: form-data; name=\"files\"; filename=\"large.txt\"\r\n",
        b"Content-Type: text/plain\r\n\r\n",
        payload.as_bytes(),
        b"\r\n",
        closing.as_bytes(),
    ];
    let multipart = segments.concat();

    let content_type = format!("multipart/form-data; boundary={}", boundary);
    let request = Request::builder()
        .method("POST")
        .uri("/extract")
        .header("content-type", content_type)
        .header("content-length", multipart.len())
        .body(Body::from(multipart))
        .expect("Failed to build request");

    let response = app.oneshot(request).await.expect("Request failed");
    let status = response.status();

    println!("1MB text file extraction status: {}", status);

    assert_eq!(
        status,
        StatusCode::OK,
        "1MB text file should extract successfully. If this fails, multipart parsing may have issues."
    );
}

/// Test extracting progressively larger text files to find breaking point.
///
/// This helps identify at what size the API starts failing.
#[tokio::test]
async fn test_find_size_breaking_point() {
    let sizes = vec![
        ("100KB", 100 * 1024),
        ("500KB", 500 * 1024),
        ("1MB", 1024 * 1024),
        ("2MB", 2 * 1024 * 1024),
        ("5MB", 5 * 1024 * 1024),
    ];

    for (label, size) in sizes {
        let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(20, 20));

        let boundary = "----size-test";
        let content = "A".repeat(size);

        let mut body = Vec::new();
        body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());
        body.extend_from_slice(
            format!(
                "Content-Disposition: form-data; name=\"files\"; filename=\"test_{}.txt\"\r\n",
                label
            )
            .as_bytes(),
        );
        body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");
        body.extend_from_slice(content.as_bytes());
        body.extend_from_slice(b"\r\n");
        body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());

        let request = Request::builder()
            .method("POST")
            .uri("/extract")
            .header("content-type", format!("multipart/form-data; boundary={}", boundary))
            .header("content-length", body.len())
            .body(Body::from(body))
            .expect("Failed to build request");

        let response = router.oneshot(request).await.expect("Request failed");

        println!("Size {} ({}B): HTTP {}", label, size, response.status().as_u16());

        if response.status() != StatusCode::OK {
            eprintln!("Extraction failed at size: {}", label);

            let body = to_bytes(response.into_body(), 1_000_000)
                .await
                .expect("Failed to read response body");

            if let Ok(parsed) = serde_json::from_slice::<Value>(&body) {
                eprintln!(
                    "Error response: {}",
                    serde_json::to_string_pretty(&parsed).expect("Failed to parse")
                );
            } else {
                eprintln!("Response body (not JSON): {}", String::from_utf8_lossy(&body));
            }

            return;
        }
    }
}

/// Pin the values produced by `ApiSizeLimits::default()`.
///
/// Both `max_request_body_bytes` and `max_multipart_field_bytes` are expected
/// to equal 100 * 1024 * 1024. Only the defaults struct is inspected: no router
/// is built and no request is sent, so the test is synchronous.
#[test]
fn test_default_size_limits() {
    let limits = ApiSizeLimits::default();

    // Print before asserting so both values are visible in the captured
    // output even when the first assertion fails.
    println!(
        "Default limits: {} bytes request, {} bytes per field",
        limits.max_request_body_bytes, limits.max_multipart_field_bytes
    );

    assert_eq!(limits.max_request_body_bytes, 100 * 1024 * 1024);
    assert_eq!(limits.max_multipart_field_bytes, 100 * 1024 * 1024);
}

/// Confirm that an oversized upload is rejected by the router.
///
/// The router is created with `ApiSizeLimits::from_mb(1, 1)` and receives a
/// multipart request whose single `files` part carries 2 * 1024 * 1024 bytes
/// of `X`. The expected status is 413 Payload Too Large.
#[tokio::test]
async fn test_request_body_limit_layer_applied() {
    let app = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(1, 1));

    let boundary = "----exceed-limits";
    let oversized = "X".repeat(2 * 1024 * 1024);

    let opening = format!("--{}\r\n", boundary);
    let closing = format!("--{}--\r\n", boundary);
    let segments: [&[u8]; 6] = [
        opening.as_bytes(),
        b"Content-Disposition: form-data; name=\"files\"; filename=\"test.txt\"\r\n",
        b"Content-Type: text/plain\r\n\r\n",
        oversized.as_bytes(),
        b"\r\n",
        closing.as_bytes(),
    ];
    let multipart = segments.concat();

    let content_type = format!("multipart/form-data; boundary={}", boundary);
    let request = Request::builder()
        .method("POST")
        .uri("/extract")
        .header("content-type", content_type)
        .header("content-length", multipart.len())
        .body(Body::from(multipart))
        .expect("Failed to build request");

    let status = app.oneshot(request).await.expect("Request failed").status();

    assert_eq!(
        status,
        StatusCode::PAYLOAD_TOO_LARGE,
        "2MB file should be rejected when limit is 1MB"
    );
}

/// Send a small, strictly CRLF-delimited multipart body with a literal boundary.
///
/// The body is written out delimiter by delimiter (boundary `BOUNDARY123456`)
/// so the framing is explicit. The part is labelled `application/pdf`, but its
/// bytes are plain text, not a PDF. The test accepts any 2xx or 4xx status;
/// any other status (for example 5xx) fails it.
#[tokio::test]
async fn test_multipart_proper_crlf_formatting() {
    let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));

    let content = "Test PDF content that is at least somewhat large for testing purposes.";

    let parts: [&[u8]; 7] = [
        // Opening delimiter.
        b"--BOUNDARY123456\r\n",
        // Part headers.
        b"Content-Disposition: form-data; name=\"files\"; filename=\"test.pdf\"\r\n",
        b"Content-Type: application/pdf\r\n",
        // Blank line separating headers from the payload.
        b"\r\n",
        content.as_bytes(),
        // CRLF that belongs to the closing delimiter.
        b"\r\n",
        b"--BOUNDARY123456--\r\n",
    ];
    let body = parts.concat();

    let request = Request::builder()
        .method("POST")
        .uri("/extract")
        .header("content-type", "multipart/form-data; boundary=BOUNDARY123456")
        .header("content-length", body.len())
        .body(Body::from(body))
        .expect("Failed to build request");

    let response = router.oneshot(request).await.expect("Request failed");
    let status = response.status();

    println!("Multipart with proper CRLF: HTTP {}", status.as_u16());
    assert!(
        status.is_success() || status.is_client_error(),
        "Expected a 2xx or 4xx response for a well-formed multipart body, got {}",
        status
    );
}

*/
Nomad changes 2026-06-01 23:40:55 +02:00			`//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises`
			`//! pub(crate) APIs that the migration deliberately narrowed; gated until`
			`//! either (a) these APIs are re-exposed publicly, or (b) the test is`
			`//! rewritten against the public extraction surface.`

			`#![cfg(any())]`

			`// Original content preserved below; recompiled once gating cfg drops.`
			`// Disabled by the file-level cfg(any()) above.`

			`/*`
			`#![cfg(feature = "api")]`
			`//! Diagnostic tests for large PDF file extraction issues.`
			`//!`
			`//! These tests are designed to isolate and identify the root cause of`
			`//! issues with large PDF file handling in the Kreuzberg API server.`
			`//!`
			`//! Current Status:`
			`//! - 5MB PDF tests are returning HTTP 400 instead of HTTP 200`
			`//! - This suggests either:`
			`//! a) The mock PDF structure is invalid`
			`//! b) The PDF extraction logic has issues with the generated content`
			`//! c) The multipart parsing is failing on large payloads`
			`//!`
			`//! These diagnostic tests help narrow down which component is failing.`

			`use axum::{`
			`body::{Body, to_bytes},`
			`http::{Request, StatusCode},`
			`};`
			`use kreuzberg::{`
			`ExtractionConfig,`
			`api::{ApiSizeLimits, create_router_with_limits},`
			`};`
			`use serde_json::Value;`
			`use tower::ServiceExt;`

			`/// Test extracting a minimal valid PDF (control test).`
			`///`
			`/// This serves as a baseline to verify the API can handle valid PDFs`
			`/// before testing with large files.`
			`#[tokio::test]`
			`async fn test_extract_minimal_valid_pdf() {`
			`let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));`

			`let pdf_content = b"%PDF-1.4`
			`1 0 obj`
			`<< /Type /Catalog /Pages 2 0 R >>`
			`endobj`
			`2 0 obj`
			`<< /Type /Pages /Kids [3 0 R] /Count 1 >>`
			`endobj`
			`3 0 obj`
			`<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>`
			`endobj`
			`4 0 obj`
			`<< >>`
			`stream`
			`BT /F1 12 Tf 50 750 Td (Hello) Tj ET`
			`endstream`
			`endobj`
			`xref`
			`0 5`
			`0000000000 65535 f`
			`0000000009 00000 n`
			`0000000074 00000 n`
			`0000000133 00000 n`
			`0000000214 00000 n`
			`trailer`
			`<< /Size 5 /Root 1 0 R >>`
			`startxref`
			`340`
			`%%EOF";`

			`let boundary = "----minimal-pdf";`
			`let mut body = Vec::new();`

			`body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());`
			`body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"minimal.pdf\"\r\n");`
			`body.extend_from_slice(b"Content-Type: application/pdf\r\n\r\n");`
			`body.extend_from_slice(pdf_content);`
			`body.extend_from_slice(b"\r\n");`
			`body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());`

			`let request = Request::builder()`
			`.method("POST")`
			`.uri("/extract")`
			`.header("content-type", format!("multipart/form-data; boundary={}", boundary))`
			`.header("content-length", body.len())`
			`.body(Body::from(body))`
			`.expect("Failed to build request");`

			`let response = router.oneshot(request).await.expect("Request failed");`

			`assert_eq!(`
			`response.status(),`
			`StatusCode::OK,`
			`"Minimal PDF should extract successfully. Status: {} indicates baseline is working",`
			`response.status()`
			`);`

			`let body = to_bytes(response.into_body(), 1_000_000)`
			`.await`
			`.expect("Failed to read response body");`

			`let parsed: Value = serde_json::from_slice(&body).expect("Failed to parse response");`
			`eprintln!(`
			`"Extraction result: {}",`
			`serde_json::to_string_pretty(&parsed).expect("Failed to parse")`
			`);`
			`}`

			`/// Test extracting a 1MB text file (control test without PDF).`
			`///`
			`/// This isolates whether the issue is specific to PDF handling or`
			`/// a general problem with large multipart uploads.`
			`#[tokio::test]`
			`async fn test_extract_1mb_text_file() {`
			`let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));`

			`let boundary = "----large-text";`
			`let large_text = "This is test content. ".repeat(50000);`

			`let mut body = Vec::new();`
			`body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());`
			`body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"large.txt\"\r\n");`
			`body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");`
			`body.extend_from_slice(large_text.as_bytes());`
			`body.extend_from_slice(b"\r\n");`
			`body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());`

			`let request = Request::builder()`
			`.method("POST")`
			`.uri("/extract")`
			`.header("content-type", format!("multipart/form-data; boundary={}", boundary))`
			`.header("content-length", body.len())`
			`.body(Body::from(body))`
			`.expect("Failed to build request");`

			`let response = router.oneshot(request).await.expect("Request failed");`

			`println!("1MB text file extraction status: {}", response.status());`

			`assert_eq!(`
			`response.status(),`
			`StatusCode::OK,`
			`"1MB text file should extract successfully. If this fails, multipart parsing may have issues."`
			`);`
			`}`

			`/// Test extracting progressively larger text files to find breaking point.`
			`///`
			`/// This helps identify at what size the API starts failing.`
			`#[tokio::test]`
			`async fn test_find_size_breaking_point() {`
			`let sizes = vec![`
			`("100KB", 100 * 1024),`
			`("500KB", 500 * 1024),`
			`("1MB", 1024 * 1024),`
			`("2MB", 2 * 1024 * 1024),`
			`("5MB", 5 * 1024 * 1024),`
			`];`

			`for (label, size) in sizes {`
			`let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(20, 20));`

			`let boundary = "----size-test";`
			`let content = "A".repeat(size);`

			`let mut body = Vec::new();`
			`body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());`
			`body.extend_from_slice(`
			`format!(`
			`"Content-Disposition: form-data; name=\"files\"; filename=\"test_{}.txt\"\r\n",`
			`label`
			`)`
			`.as_bytes(),`
			`);`
			`body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");`
			`body.extend_from_slice(content.as_bytes());`
			`body.extend_from_slice(b"\r\n");`
			`body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());`

			`let request = Request::builder()`
			`.method("POST")`
			`.uri("/extract")`
			`.header("content-type", format!("multipart/form-data; boundary={}", boundary))`
			`.header("content-length", body.len())`
			`.body(Body::from(body))`
			`.expect("Failed to build request");`

			`let response = router.oneshot(request).await.expect("Request failed");`

			`println!("Size {} ({}B): HTTP {}", label, size, response.status().as_u16());`

			`if response.status() != StatusCode::OK {`
			`eprintln!("Extraction failed at size: {}", label);`

			`let body = to_bytes(response.into_body(), 1_000_000)`
			`.await`
			`.expect("Failed to read response body");`

			`if let Ok(parsed) = serde_json::from_slice::<Value>(&body) {`
			`eprintln!(`
			`"Error response: {}",`
			`serde_json::to_string_pretty(&parsed).expect("Failed to parse")`
			`);`
			`} else {`
			`eprintln!("Response body (not JSON): {}", String::from_utf8_lossy(&body));`
			`}`

			`return;`
			`}`
			`}`
			`}`

			`/// Test that the default 100MB limit is being applied.`
			`///`
			`/// Verifies that the server is actually respecting the configured limits,`
			`/// and documents what the default limit actually is.`
			`#[tokio::test]`
			`async fn test_default_size_limits() {`
			`let default_limits = ApiSizeLimits::default();`
			`assert_eq!(default_limits.max_request_body_bytes, 100 * 1024 * 1024);`
			`assert_eq!(default_limits.max_multipart_field_bytes, 100 * 1024 * 1024);`

			`println!(`
			`"Default limits: {} bytes request, {} bytes per field",`
			`default_limits.max_request_body_bytes, default_limits.max_multipart_field_bytes`
			`);`
			`}`

			`/// Test that the router layer actually applies RequestBodyLimitLayer.`
			`///`
			`/// Creates a router and verifies that size limit enforcement is active.`
			`#[tokio::test]`
			`async fn test_request_body_limit_layer_applied() {`
			`let small_limits = ApiSizeLimits::from_mb(1, 1);`
			`let router = create_router_with_limits(ExtractionConfig::default(), small_limits);`

			`let boundary = "----exceed-limits";`
			`let large_content = "X".repeat(2 * 1024 * 1024);`

			`let mut body = Vec::new();`
			`body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes());`
			`body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"test.txt\"\r\n");`
			`body.extend_from_slice(b"Content-Type: text/plain\r\n\r\n");`
			`body.extend_from_slice(large_content.as_bytes());`
			`body.extend_from_slice(b"\r\n");`
			`body.extend_from_slice(format!("--{}--\r\n", boundary).as_bytes());`

			`let request = Request::builder()`
			`.method("POST")`
			`.uri("/extract")`
			`.header("content-type", format!("multipart/form-data; boundary={}", boundary))`
			`.header("content-length", body.len())`
			`.body(Body::from(body))`
			`.expect("Failed to build request");`

			`let response = router.oneshot(request).await.expect("Request failed");`

			`assert_eq!(`
			`response.status(),`
			`StatusCode::PAYLOAD_TOO_LARGE,`
			`"2MB file should be rejected when limit is 1MB"`
			`);`
			`}`

			`/// Test multipart parsing with incremental content.`
			`///`
			`/// Some implementations have issues with streaming multipart parsing.`
			`/// This test uses proper CRLF line endings to ensure correct parsing.`
			`#[tokio::test]`
			`async fn test_multipart_proper_crlf_formatting() {`
			`let router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(10, 10));`

			`let content = "Test PDF content that is at least somewhat large for testing purposes.";`

			`let mut body = Vec::new();`

			`body.extend_from_slice(b"--BOUNDARY123456\r\n");`

			`body.extend_from_slice(b"Content-Disposition: form-data; name=\"files\"; filename=\"test.pdf\"\r\n");`
			`body.extend_from_slice(b"Content-Type: application/pdf\r\n");`

			`body.extend_from_slice(b"\r\n");`

			`body.extend_from_slice(content.as_bytes());`

			`body.extend_from_slice(b"\r\n");`

			`body.extend_from_slice(b"--BOUNDARY123456--\r\n");`

			`let request = Request::builder()`
			`.method("POST")`
			`.uri("/extract")`
			`.header("content-type", "multipart/form-data; boundary=BOUNDARY123456")`
			`.header("content-length", body.len())`
			`.body(Body::from(body))`
			`.expect("Failed to build request");`

			`let response = router.oneshot(request).await.expect("Request failed");`

			`println!("Multipart with proper CRLF: HTTP {}", response.status().as_u16());`
			`assert!(response.status().is_success() \|\| response.status().is_client_error());`
			`}`

			`*/`