199 lines
7.1 KiB
Rust
199 lines
7.1 KiB
Rust
|
|
//! TODO: Restored from 245539484 alef-migration cleanup. Currently exercises
|
|||
|
|
//! pub(crate) APIs that the migration deliberately narrowed; gated until
|
|||
|
|
//! either (a) these APIs are re-exposed publicly, or (b) the test is
|
|||
|
|
//! rewritten against the public extraction surface.
|
|||
|
|
|
|||
|
|
#![cfg(any())]
|
|||
|
|
|
|||
|
|
// Original content preserved below; recompiled once gating cfg drops.
|
|||
|
|
// Disabled by the file-level cfg(any()) above.
|
|||
|
|
|
|||
|
|
/*
|
|||
|
|
//! End-to-end integration test for XLSX metadata extraction
|
|||
|
|
#![cfg(feature = "excel")]
|
|||
|
|
|
|||
|
|
use kreuzberg::extraction::excel::{excel_to_markdown, excel_to_text, read_excel_file};
|
|||
|
|
|
|||
|
|
#[test]
|
|||
|
|
fn test_xlsx_full_metadata_extraction() {
|
|||
|
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|||
|
|
.parent()
|
|||
|
|
.expect("Operation failed")
|
|||
|
|
.parent()
|
|||
|
|
.expect("Operation failed");
|
|||
|
|
let test_file = workspace_root.join("test_documents/xlsx/excel_tiny_excel.xlsx");
|
|||
|
|
|
|||
|
|
if !test_file.exists() {
|
|||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
let file_path = test_file.to_str().expect("File path should be valid UTF-8");
|
|||
|
|
let result = read_excel_file(file_path).expect("Should extract XLSX successfully");
|
|||
|
|
|
|||
|
|
assert!(!result.sheets.is_empty(), "Should have at least one sheet");
|
|||
|
|
|
|||
|
|
assert!(result.metadata.contains_key("sheet_count"), "Should have sheet count");
|
|||
|
|
|
|||
|
|
println!("✅ XLSX metadata extraction test passed!");
|
|||
|
|
println!(" Found {} metadata fields", result.metadata.len());
|
|||
|
|
for (key, value) in &result.metadata {
|
|||
|
|
println!(" {}: {}", key, value);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
#[test]
|
|||
|
|
fn test_xlsx_multi_sheet_metadata() {
|
|||
|
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|||
|
|
.parent()
|
|||
|
|
.expect("Operation failed")
|
|||
|
|
.parent()
|
|||
|
|
.expect("Operation failed");
|
|||
|
|
let test_file = workspace_root.join("test_documents/xlsx/excel_multi_sheet.xlsx");
|
|||
|
|
|
|||
|
|
if !test_file.exists() {
|
|||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
let file_path = test_file.to_str().expect("File path should be valid UTF-8");
|
|||
|
|
let result = read_excel_file(file_path).expect("Should extract multi-sheet XLSX successfully");
|
|||
|
|
|
|||
|
|
assert!(
|
|||
|
|
result.sheets.len() > 1,
|
|||
|
|
"Should have multiple sheets, got {}",
|
|||
|
|
result.sheets.len()
|
|||
|
|
);
|
|||
|
|
|
|||
|
|
assert!(
|
|||
|
|
result.metadata.contains_key("sheet_names"),
|
|||
|
|
"Should have sheet_names metadata"
|
|||
|
|
);
|
|||
|
|
|
|||
|
|
println!("✅ XLSX multi-sheet metadata extraction test passed!");
|
|||
|
|
println!(" Found {} sheets", result.sheets.len());
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
#[test]
|
|||
|
|
fn test_xlsx_minimal_metadata_extraction() {
|
|||
|
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|||
|
|
.parent()
|
|||
|
|
.expect("Operation failed")
|
|||
|
|
.parent()
|
|||
|
|
.expect("Operation failed");
|
|||
|
|
let test_file = workspace_root.join("test_documents/xlsx/tables_small_test01.xlsx");
|
|||
|
|
|
|||
|
|
if !test_file.exists() {
|
|||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
let file_path = test_file.to_str().expect("File path should be valid UTF-8");
|
|||
|
|
let result = read_excel_file(file_path).expect("Should extract XLSX successfully");
|
|||
|
|
|
|||
|
|
assert!(!result.sheets.is_empty(), "Content should not be empty");
|
|||
|
|
assert!(
|
|||
|
|
result.metadata.contains_key("sheet_count"),
|
|||
|
|
"Should have basic sheet metadata"
|
|||
|
|
);
|
|||
|
|
|
|||
|
|
println!("✅ XLSX minimal metadata extraction test passed!");
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Test for issue #331: OOM with XLSX files containing Excel Solver add-in data
|
|||
|
|
///
|
|||
|
|
/// This test reproduces the issue where Excel Solver stores configuration data
|
|||
|
|
/// in cells at extreme positions (XFD1048550-1048575 = column 16384, rows near 1M).
|
|||
|
|
/// The sheet dimension is set to "A1:XFD1048575", which could cause Kreuzberg
|
|||
|
|
/// to attempt allocating memory for ~17 trillion cells (16384 × 1048575).
|
|||
|
|
///
|
|||
|
|
/// Expected behavior: Should handle extreme dimensions gracefully without OOM.
|
|||
|
|
/// The file is only 6.8KB and contains minimal actual data.
|
|||
|
|
#[test]
|
|||
|
|
fn test_xlsx_excel_solver_extreme_dimensions_no_oom() {
|
|||
|
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|||
|
|
.parent()
|
|||
|
|
.expect("Operation failed")
|
|||
|
|
.parent()
|
|||
|
|
.expect("Operation failed");
|
|||
|
|
let test_file = workspace_root.join("test_documents/test_fixtures/xlsx-oom-repro/kreuzberg-oom-repro.xlsx");
|
|||
|
|
|
|||
|
|
if !test_file.exists() {
|
|||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
|||
|
|
println!("Run: node test_documents/test_fixtures/xlsx-oom-repro/generate-oom-xlsx.mjs");
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
let file_path = test_file.to_str().expect("File path should be valid UTF-8");
|
|||
|
|
|
|||
|
|
// This should NOT cause OOM even though dimension claims A1:XFD1048575
|
|||
|
|
// The actual data is minimal (only ~26 cells with Solver metadata)
|
|||
|
|
let result = read_excel_file(file_path).expect("Should extract XLSX with extreme dimensions without OOM");
|
|||
|
|
|
|||
|
|
// Verify we got the actual data, not a massive allocation
|
|||
|
|
assert!(!result.sheets.is_empty(), "Should have at least one sheet");
|
|||
|
|
|
|||
|
|
// The file has normal cells A1, B1 plus Solver cells at extreme positions
|
|||
|
|
// Verify we extracted something reasonable, not 17 trillion cells
|
|||
|
|
let sheet = &result.sheets[0];
|
|||
|
|
assert!(
|
|||
|
|
sheet.markdown.len() < 10000,
|
|||
|
|
"Sheet markdown content should be small (< 10000 chars), not massive. Got {} chars",
|
|||
|
|
sheet.markdown.len()
|
|||
|
|
);
|
|||
|
|
|
|||
|
|
// Verify metadata was extracted
|
|||
|
|
assert!(
|
|||
|
|
result.metadata.contains_key("sheet_count"),
|
|||
|
|
"Should have sheet_count metadata"
|
|||
|
|
);
|
|||
|
|
|
|||
|
|
println!("✅ XLSX Excel Solver extreme dimensions test passed!");
|
|||
|
|
println!(
|
|||
|
|
" Sheet markdown length: {} chars (reasonable size)",
|
|||
|
|
sheet.markdown.len()
|
|||
|
|
);
|
|||
|
|
println!(" Successfully handled dimension A1:XFD1048575 without OOM");
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Regression test for #405: XLSX extraction with output_format=Markdown
|
|||
|
|
/// should produce markdown tables with pipe delimiters and separator rows,
|
|||
|
|
/// not plain space-separated text.
|
|||
|
|
#[test]
|
|||
|
|
fn test_xlsx_markdown_vs_plain_output() {
|
|||
|
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|||
|
|
.parent()
|
|||
|
|
.expect("Operation failed")
|
|||
|
|
.parent()
|
|||
|
|
.expect("Operation failed");
|
|||
|
|
let test_file = workspace_root.join("test_documents/xlsx/excel_multi_sheet.xlsx");
|
|||
|
|
|
|||
|
|
if !test_file.exists() {
|
|||
|
|
println!("Skipping test: Test file not found at {:?}", test_file);
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
let file_path = test_file.to_str().expect("File path should be valid UTF-8");
|
|||
|
|
let workbook = read_excel_file(file_path).expect("Should extract XLSX successfully");
|
|||
|
|
|
|||
|
|
// excel_to_markdown should produce tables with | delimiters
|
|||
|
|
let md_content = excel_to_markdown(&workbook);
|
|||
|
|
assert!(
|
|||
|
|
md_content.contains("| "),
|
|||
|
|
"Markdown output should contain table pipe delimiters"
|
|||
|
|
);
|
|||
|
|
assert!(
|
|||
|
|
md_content.contains("---"),
|
|||
|
|
"Markdown output should contain separator rows"
|
|||
|
|
);
|
|||
|
|
|
|||
|
|
// excel_to_text should produce space-separated text (no pipes)
|
|||
|
|
let text_content = excel_to_text(&workbook);
|
|||
|
|
assert!(
|
|||
|
|
!text_content.contains("| "),
|
|||
|
|
"Plain text output should not contain table pipe delimiters"
|
|||
|
|
);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
*/
|