fil/crates/kreuzberg/tests/pdf_hierarchy_detection.rs

//! Integration tests for PDF text hierarchy detection.
//!
//! Tests the extraction and detection of document hierarchy levels (H1-H6)
//! from PDF text using font size clustering and semantic analysis.

#![cfg(feature = "pdf")]

use kreuzberg::core::config::{ExtractionConfig, HierarchyConfig, PageConfig, PdfConfig};
use kreuzberg::extract_bytes;
use std::path::Path;

// Note: All tests must run serially because Pdfium can only be initialized once.
// Using tokio::test with single_threaded doesn't work well, so we use the serial_test crate.
// For now, we'll just accept that tests run in parallel but handle the Pdfium initialization error.

/// Test full hierarchy extraction from a real PDF.
///
/// Loads a PDF from test data directory, extracts with hierarchy detection enabled,
/// and verifies that PageContent.hierarchy is properly populated with expected
/// blocks and hierarchy levels.
#[tokio::test]
async fn test_full_hierarchy_extraction() {
    // Use the embedded_images_tables.pdf which has clear text structure
    // Path is relative to workspace root, not crate root
    let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";

    if !Path::new(pdf_path).exists() {
        eprintln!("Test PDF not found at: {}", pdf_path);
        // Skip the test if PDF doesn't exist
        return;
    }

    let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");

    // Create extraction config with hierarchy detection enabled
    let config = ExtractionConfig {
        pages: Some(PageConfig {
            extract_pages: true,
            ..Default::default()
        }),
        pdf_options: Some(PdfConfig {
            extract_images: false,
            extract_tables: true,
            passwords: None,
            extract_metadata: true,
            hierarchy: Some(HierarchyConfig {
                enabled: true,
                k_clusters: 6,
                include_bbox: true,
                ocr_coverage_threshold: None,
            }),
            extract_annotations: false,
            ..Default::default()
        }),
        ..Default::default()
    };

    // Extract the PDF
    let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
        .await
        .expect("PDF extraction failed");

    // Verify that pages were extracted
    assert!(
        result.pages.is_some(),
        "Pages should be extracted when extract_pages is enabled"
    );

    let pages = result.pages.as_ref().expect("Operation failed");
    assert!(!pages.is_empty(), "At least one page should be extracted");

    // Check that the first page has hierarchy information
    let first_page = &pages[0];
    assert!(
        first_page.hierarchy.is_some(),
        "First page should have hierarchy information when hierarchy extraction is enabled"
    );

    let hierarchy = first_page.hierarchy.as_ref().expect("Operation failed");

    // Verify hierarchy structure
    assert!(hierarchy.block_count > 0, "Hierarchy should contain at least one block");
    assert!(!hierarchy.blocks.is_empty(), "Hierarchy blocks should not be empty");

    eprintln!("Extracted {} hierarchy blocks from page 1", hierarchy.block_count);

    // Verify that we have multiple hierarchy levels
    let levels: std::collections::HashSet<String> = hierarchy.blocks.iter().map(|b| b.level.clone()).collect();

    eprintln!("Found hierarchy levels: {:?}", levels);

    // Should have at least 1 level
    assert!(!levels.is_empty(), "Should have at least one hierarchy level");

    // Verify block structure
    for block in &hierarchy.blocks {
        assert!(!block.text.is_empty(), "Block text should not be empty");
        assert!(block.font_size > 0.0, "Font size should be positive");

        // Check that level is a valid heading level or body
        let is_valid_level = matches!(block.level.as_str(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "body");
        assert!(is_valid_level, "Invalid hierarchy level: {}", block.level);

        // Verify bounding box if present
        if let Some((left, top, right, bottom)) = block.bbox {
            assert!(left < right, "Bounding box left should be less than right");
            assert!(top < bottom, "Bounding box top should be less than bottom");
            assert!(
                left >= 0.0 && top >= 0.0,
                "Bounding box coordinates should be non-negative"
            );
            eprintln!(
                "Block '{}' (level: {}, font_size: {}) bbox: ({}, {}, {}, {})",
                block.text.chars().take(30).collect::<String>(),
                block.level,
                block.font_size,
                left,
                top,
                right,
                bottom
            );
        } else {
            eprintln!(
                "Block '{}' (level: {}, font_size: {}) no bbox",
                block.text.chars().take(30).collect::<String>(),
                block.level,
                block.font_size
            );
        }
    }

    eprintln!("Hierarchy extraction test passed!");
}

/// Test that hierarchy extraction respects the enabled flag.
/// Note: This test is combined with the full_hierarchy_extraction test due to Pdfium initialization constraints.
#[tokio::test]
#[ignore]
async fn test_hierarchy_disabled() {
    let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";

    if !Path::new(pdf_path).exists() {
        eprintln!("Test PDF not found at: {}", pdf_path);
        return;
    }

    let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");

    // Create extraction config with hierarchy detection disabled
    let config = ExtractionConfig {
        pages: Some(PageConfig {
            extract_pages: true,
            ..Default::default()
        }),
        pdf_options: Some(PdfConfig {
            extract_images: false,
            extract_tables: true,
            passwords: None,
            extract_metadata: true,
            hierarchy: Some(HierarchyConfig {
                enabled: false,
                k_clusters: 6,
                include_bbox: true,
                ocr_coverage_threshold: None,
            }),
            extract_annotations: false,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
        .await
        .expect("PDF extraction failed");

    // Verify that pages were extracted
    assert!(result.pages.is_some(), "Pages should be extracted");

    let pages = result.pages.as_ref().expect("Operation failed");
    assert!(!pages.is_empty(), "At least one page should be extracted");

    // Check that the first page does NOT have hierarchy information when disabled
    let first_page = &pages[0];
    assert!(
        first_page.hierarchy.is_none(),
        "First page should not have hierarchy when hierarchy extraction is disabled"
    );

    eprintln!("Hierarchy disabled test passed!");
}

/// Test different hierarchy configurations
/// Note: This test is ignored due to Pdfium initialization constraints (can only initialize once).
#[tokio::test]
#[ignore]
async fn test_hierarchy_with_explicit_disabled() {
    let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";

    if !Path::new(pdf_path).exists() {
        eprintln!("Test PDF not found at: {}", pdf_path);
        return;
    }

    let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");

    // Create extraction config with hierarchy extraction explicitly disabled
    let config = ExtractionConfig {
        pages: Some(PageConfig {
            extract_pages: true,
            ..Default::default()
        }),
        pdf_options: Some(PdfConfig {
            extract_images: false,
            extract_tables: true,
            passwords: None,
            extract_metadata: true,
            hierarchy: Some(HierarchyConfig {
                enabled: false,
                k_clusters: 6,
                include_bbox: true,
                ocr_coverage_threshold: None,
            }),
            extract_annotations: false,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
        .await
        .expect("PDF extraction failed");

    // Verify that pages were extracted
    assert!(result.pages.is_some(), "Pages should be extracted");

    let pages = result.pages.as_ref().expect("Operation failed");
    assert!(!pages.is_empty(), "At least one page should be extracted");

    // Check that the first page does NOT have hierarchy information when disabled
    let first_page = &pages[0];
    assert!(
        first_page.hierarchy.is_none(),
        "First page should not have hierarchy when hierarchy extraction is disabled"
    );

    eprintln!("Hierarchy with explicit disabled test passed!");
}

/// Test hierarchy extraction with different cluster configurations.
/// Note: This test is ignored due to Pdfium initialization constraints (can only initialize once).
#[tokio::test]
#[ignore]
async fn test_hierarchy_different_k_clusters() {
    let pdf_path = "../../test_documents/pdf/embedded_images_tables.pdf";

    if !Path::new(pdf_path).exists() {
        eprintln!("Test PDF not found at: {}", pdf_path);
        return;
    }

    let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");

    // Test with different k values
    for k in &[2, 4, 6] {
        let config = ExtractionConfig {
            pages: Some(PageConfig {
                extract_pages: true,
                ..Default::default()
            }),
            pdf_options: Some(PdfConfig {
                extract_images: false,
                extract_tables: true,
                passwords: None,
                extract_metadata: true,
                hierarchy: Some(HierarchyConfig {
                    enabled: true,
                    k_clusters: *k,
                    include_bbox: true,
                    ocr_coverage_threshold: None,
                }),
                extract_annotations: false,
                ..Default::default()
            }),
            ..Default::default()
        };

        let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
            .await
            .expect("PDF extraction failed");

        assert!(result.pages.is_some(), "Pages should be extracted");

        let pages = result.pages.as_ref().expect("Operation failed");
        assert!(!pages.is_empty(), "At least one page should be extracted");

        let first_page = &pages[0];
        assert!(
            first_page.hierarchy.is_some(),
            "Hierarchy should be present with k={}",
            k
        );

        let hierarchy = first_page.hierarchy.as_ref().expect("Operation failed");
        eprintln!("K={}: {} hierarchy blocks extracted", k, hierarchy.block_count);
        assert!(hierarchy.block_count > 0, "Should have blocks with k={}", k);
    }

    eprintln!("Different k_clusters test passed!");
}