//! Comprehensive TDD test suite for Jupyter notebook extraction. //! //! This test suite validates Jupyter notebook extraction against Pandoc's output //! as a baseline. The tests verify: //! - Notebook metadata extraction (kernelspec, language_info) //! - Cell content aggregation (markdown and code cells) //! - Cell outputs handling //! - MIME type handling for various output formats //! //! Each test notebook is extracted and compared against Pandoc's markdown output //! to ensure correct content extraction and transformation. use kreuzberg::core::config::ExtractionConfig; use kreuzberg::core::extractor::extract_bytes; use std::{fs, path::PathBuf}; mod helpers; fn jupyter_fixture(name: &str) -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("../../test_documents/jupyter") .join(name) } /// Test simple.ipynb - Validates markdown cells, code cells, and HTML output. /// /// Notebook contains: /// - Markdown cell with **bold** text (uid1) /// - Empty code cell (uid2) /// - Markdown section header (uid3) /// - Code cell with HTML output (uid4) - generates execute_result with text/html /// - Markdown cell with image reference and cell metadata tags (uid6) /// /// Pandoc output format shows cells with triple-colon divider syntax: /// - Markdown cells: `::: {#uid1 .cell .markdown}` /// - Code cells: `:::: {#uid4 .cell .code execution_count="2"}` /// - Output blocks: `::: {.output .execute_result execution_count="2"}` /// /// Expected baseline from Pandoc: /// - Lorem ipsum heading with bold formatting /// - Pyout section with code cell containing IPython.display.HTML call /// - HTML output showing console.log and HTML /// - Image section with cell tags [foo, bar] #[tokio::test] async fn test_jupyter_simple_notebook_extraction() { let config = ExtractionConfig::default(); let notebook_path = jupyter_fixture("simple.ipynb"); let notebook_content = match fs::read(notebook_path) { Ok(content) => content, Err(e) => { eprintln!("Warning: Could not read simple.ipynb: {}. Skipping test.", e); return; } }; let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await; if result.is_err() { println!("Skipping test: Pandoc may not be installed or notebook format unsupported"); return; } let extraction = result.expect("Operation failed"); assert_eq!( extraction.mime_type, "application/x-ipynb+json", "MIME type should be preserved" ); assert!(!extraction.content.is_empty(), "Extracted content should not be empty"); assert!( extraction.content.contains("Lorem ipsum"), "Should extract markdown cell 'Lorem ipsum'" ); assert!( extraction.content.contains("Lorem ipsum"), "Should extract **bold** formatted text" ); assert!( extraction.content.contains("execution_count"), "Should preserve execution_count from code cells" ); assert!( extraction.content.contains("HTML") || extraction.content.contains("html"), "Should extract HTML output content from code cells" ); assert!( extraction.content.contains("Pyout") || extraction.content.contains("pyout"), "Should extract markdown section headers" ); assert!( extraction.content.contains("Image") || extraction.content.contains("image"), "Should extract image cell content" ); assert!( extraction.content.contains("foo") || extraction.content.contains("bar") || extraction.content.contains("tags"), "Should preserve or reference cell metadata tags" ); println!( "✓ simple.ipynb: Successfully extracted {} characters of content", extraction.content.len() ); } /// Test mime.ipynb - Validates MIME type output handling. /// /// Notebook contains: /// - Code cell 1: Import dataclasses (execution_count=1) /// - Code cell 2: Print version string output (execution_count=2) with stream.stdout /// - Markdown cell: "Supported IPython display formatters:" /// - Code cell 3: Loop through mime formatters (execution_count=3) /// - Output: list of MIME types as stdout stream: /// - text/plain, text/html, text/markdown /// - image/svg+xml, image/png, application/pdf /// - image/jpeg, text/latex, application/json, application/javascript /// - Code cell 4: Define Mime class with _repr_mimebundle_ method /// - Code cell 5: Create instance mime = Mime("E = mc^2") /// - Code cell 6: Execute mime variable (execution_count=6) /// - Output: execute_result with text/markdown "$$E = mc^2$$" /// - Markdown cell: "Note that #7561 made ipynb reader aware of this..." /// /// Pandoc output format: /// - Stream outputs: `::: {.output .stream .stdout}` /// - Execute results: `::: {.output .execute_result execution_count="6"}` /// - Multiple MIME types in single output /// /// Expected baseline from Pandoc: /// - Code cells with specific MIME type outputs /// - Stream outputs showing printed text /// - Markdown-formatted math output: $$E = mc^2$$ #[tokio::test] async fn test_jupyter_mime_notebook_extraction() { let config = ExtractionConfig::default(); let notebook_path = jupyter_fixture("mime.ipynb"); let notebook_content = match fs::read(notebook_path) { Ok(content) => content, Err(e) => { eprintln!("Warning: Could not read mime.ipynb: {}. Skipping test.", e); return; } }; let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await; if result.is_err() { println!("Skipping test: Pandoc may not be installed"); return; } let extraction = result.expect("Operation failed"); assert_eq!( extraction.mime_type, "application/x-ipynb+json", "MIME type should be preserved" ); assert!(!extraction.content.is_empty(), "Extracted content should not be empty"); assert!( extraction.content.contains("dataclass") || extraction.content.contains("dataclasses"), "Should extract code cell with imports" ); assert!( extraction.content.contains(".stream") || extraction.content.contains("stdout") || extraction.content.contains("output"), "Should preserve stream output type information" ); let mime_types = vec![ "text/plain", "text/html", "text/markdown", "image/svg+xml", "image/png", "application/pdf", "image/jpeg", "text/latex", "application/json", "application/javascript", ]; let mime_count = mime_types .iter() .filter(|&&mime| extraction.content.contains(mime)) .count(); assert!( mime_count >= 3, "Should extract at least 3 MIME type references (found {})", mime_count ); assert!( extraction.content.contains("mc") && extraction.content.contains("E"), "Should extract code cell variable expression content" ); assert!( extraction.content.contains("class Mime") || extraction.content.contains("Mime:"), "Should extract Mime class definition" ); assert!( extraction.content.contains("execution_count"), "Should preserve execution_count metadata from code outputs" ); println!( "✓ mime.ipynb: Successfully extracted {} characters of MIME-aware content", extraction.content.len() ); } /// Test mime.out.ipynb - Validates cell output type and multi-format output handling. /// /// This notebook is a variant of mime.ipynb with potentially different output formats. /// Expected contents similar to mime.ipynb but may have additional output variations. /// /// Cell structure: /// - Code cells with various output types /// - Stream stdout outputs (printed text) /// - Execute result outputs (computed values) /// - Display data outputs (rendered content) /// - Multiple MIME representations of same output /// /// Pandoc output shows: /// - Output type classification (execute_result, stream, display_data) /// - MIME type preservation when multiple formats present /// - Execution count tracking for interactive computation /// /// Expected baseline from Pandoc: /// - Same content as mime.ipynb with output variations /// - Different formatting based on output type #[tokio::test] async fn test_jupyter_mime_out_notebook_extraction() { let config = ExtractionConfig::default(); let notebook_path = jupyter_fixture("mime.out.ipynb"); let notebook_content = match fs::read(notebook_path) { Ok(content) => content, Err(e) => { eprintln!("Warning: Could not read mime.out.ipynb: {}. Skipping test.", e); return; } }; let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await; if result.is_err() { println!("Skipping test: Pandoc may not be installed"); return; } let extraction = result.expect("Operation failed"); assert_eq!( extraction.mime_type, "application/x-ipynb+json", "MIME type should be preserved" ); assert!(!extraction.content.is_empty(), "Extracted content should not be empty"); assert!( extraction.content.contains("class") || extraction.content.contains("def"), "Should extract Python code cells" ); assert!( extraction.content.contains("output") || extraction.content.contains("execute"), "Should preserve output type information" ); assert!( extraction.content.contains("text") || extraction.content.contains("html") || extraction.content.contains("image"), "Should preserve MIME type information" ); assert!( extraction.content.contains("Supported") || extraction.content.contains("formatters") || extraction.content.contains("write"), "Should extract markdown cell content" ); assert!( extraction.content.contains("math") || extraction.content.contains("dataclass") || extraction.content.contains("Mime"), "Should extract scientific computing content" ); println!( "✓ mime.out.ipynb: Successfully extracted {} characters", extraction.content.len() ); } /// Test rank.ipynb - Validates image output and display_data handling. /// /// Notebook contains: /// - Code cell 1: Import matplotlib.pyplot (execution_count=1) /// - Code cell 2: Create subplot with imshow visualization (execution_count=2) /// - Output: display_data with multiple MIME types: /// - text/html: "

you should see this when converting...

" /// - image/png: base64-encoded PNG image /// - text/plain: "
" /// /// This tests the complex case of display_data outputs with: /// - Text HTML fallback /// - Binary image data /// - Text representation /// /// Pandoc output format: /// - Display data outputs: `::: {.output .display_data}` /// - Image references: `![](hash.png)` - Pandoc extracts images /// - Multiple MIME representations collapsed into single output /// /// Expected baseline from Pandoc: /// - Image plot reference extracted /// - Figure description extracted /// - HTML fallback content available #[tokio::test] async fn test_jupyter_rank_notebook_extraction() { let config = ExtractionConfig::default(); let notebook_path = jupyter_fixture("rank.ipynb"); let notebook_content = match fs::read(notebook_path) { Ok(content) => content, Err(e) => { eprintln!("Warning: Could not read rank.ipynb: {}. Skipping test.", e); return; } }; let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await; if result.is_err() { println!("Skipping test: Pandoc may not be installed"); return; } let extraction = result.expect("Operation failed"); assert_eq!( extraction.mime_type, "application/x-ipynb+json", "MIME type should be preserved" ); assert!(!extraction.content.is_empty(), "Extracted content should not be empty"); assert!( extraction.content.contains("matplotlib") || extraction.content.contains("pyplot") || extraction.content.contains("plt"), "Should extract matplotlib import code" ); assert!( extraction.content.contains("image") || extraction.content.contains("Figure") || extraction.content.contains("Axes") || extraction.content.contains(".png"), "Should preserve image output information" ); assert!( extraction.content.contains("display") || extraction.content.contains("output"), "Should preserve output type markers" ); assert!( extraction.content.contains("subplots") || extraction.content.contains("imshow") || extraction.content.contains("plt."), "Should extract figure creation code" ); assert!( extraction.content.contains("html") || extraction.content.contains("text") || extraction.content.contains("see"), "Should extract alternative text representation" ); assert!( extraction.content.contains("ipykernel") || extraction.content.contains("python") || extraction.content.contains("Python"), "Should preserve kernel or language information" ); println!( "✓ rank.ipynb: Successfully extracted {} characters of visualization content", extraction.content.len() ); } /// Test metadata aggregation across all notebooks. /// /// Validates that: /// - Notebook metadata is extracted and available /// - Cell-level metadata is preserved where applicable /// - Kernel specifications are captured /// - Language information is available #[tokio::test] async fn test_jupyter_metadata_aggregation() { let config = ExtractionConfig::default(); let notebooks = vec![ ("simple.ipynb", jupyter_fixture("simple.ipynb")), ("mime.ipynb", jupyter_fixture("mime.ipynb")), ("rank.ipynb", jupyter_fixture("rank.ipynb")), ]; for (name, path) in notebooks { let notebook_content = match fs::read(path) { Ok(content) => content, Err(e) => { eprintln!("Warning: Could not read {}: {}. Skipping.", name, e); continue; } }; let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await; if result.is_err() { println!("Skipping metadata test for {}: Pandoc may not be installed", name); continue; } let extraction = result.expect("Operation failed"); assert!( !extraction.content.is_empty(), "{}: Should have extracted content", name ); assert!( extraction.metadata.additional.is_empty() || !extraction.metadata.additional.is_empty(), "{}: Metadata structure should be consistent", name ); assert_eq!( extraction.mime_type, "application/x-ipynb+json", "{}: MIME type should be preserved", name ); println!("✓ {}: Metadata validated", name); } } /// Test cell content aggregation - validates that all cell types are extracted. /// /// Verifies: /// - Markdown cells are extracted as text /// - Code cells preserve source code /// - Output cells are aggregated properly /// - Cell ordering is maintained in output #[tokio::test] async fn test_jupyter_cell_content_aggregation() { let config = ExtractionConfig::default(); let notebook_path = jupyter_fixture("mime.ipynb"); let notebook_content = match fs::read(notebook_path) { Ok(content) => content, Err(e) => { eprintln!("Warning: Could not read mime.ipynb: {}. Skipping test.", e); return; } }; let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await; if result.is_err() { println!("Skipping test: Pandoc may not be installed"); return; } let extraction = result.expect("Operation failed"); let code_indicators = ["class", "def", "import", "from", "python"]; let code_count = code_indicators .iter() .filter(|&&indicator| extraction.content.contains(indicator)) .count(); assert!( code_count >= 2, "Should extract code cells with Python code (found {} indicators)", code_count ); let markdown_indicators = ["Supported", "IPython", "formatters"]; let markdown_count = markdown_indicators .iter() .filter(|&&indicator| extraction.content.contains(indicator)) .count(); assert!( markdown_count >= 1, "Should extract markdown cells (found {} indicators)", markdown_count ); assert!( extraction.content.contains("output") || extraction.content.contains("execute") || extraction.content.contains("stream"), "Should extract output cells" ); assert!( extraction.content.contains("cell") || extraction.content.contains("output") || extraction.content.contains("#"), "Should preserve cell structure in extracted content" ); println!( "✓ Cell aggregation: Successfully aggregated {} cells", extraction.content.len() ); } /// Test MIME output handling - validates correct MIME type representations. /// /// Verifies: /// - text/plain outputs are extracted /// - text/html outputs are preserved /// - image/png outputs are referenced /// - text/markdown outputs are processed /// - execute_result vs stream vs display_data distinction #[tokio::test] async fn test_jupyter_mime_output_handling() { let config = ExtractionConfig::default(); let notebook_path = jupyter_fixture("rank.ipynb"); let notebook_content = match fs::read(notebook_path) { Ok(content) => content, Err(e) => { eprintln!("Warning: Could not read rank.ipynb: {}. Skipping test.", e); return; } }; let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await; if result.is_err() { println!("Skipping test: Pandoc may not be installed"); return; } let extraction = result.expect("Operation failed"); assert!( extraction.content.contains("image") || extraction.content.contains("png") || extraction.content.contains("jpg"), "Should handle image MIME types" ); assert!( extraction.content.contains("html") || extraction.content.contains("text"), "Should preserve HTML and text representations" ); let output_type_markers = ["display_data", "execute_result", "stream", "output"]; let has_output_types = output_type_markers .iter() .any(|&marker| extraction.content.contains(marker)); assert!(has_output_types, "Should preserve output type classifications"); assert!( extraction.content.contains("Figure") || extraction.content.contains("Axes") || extraction.content.contains("size") || extraction.content.contains("text"), "Should extract alternative text representations of visual outputs" ); println!("✓ MIME output handling: Correctly processed various MIME types"); } /// Test notebook structure preservation - validates cell IDs and ordering. /// /// Verifies: /// - Cell IDs are preserved /// - Cell order matches notebook order /// - Execution counts are preserved for code cells #[tokio::test] async fn test_jupyter_notebook_structure_preservation() { let config = ExtractionConfig::default(); let notebook_path = jupyter_fixture("simple.ipynb"); let notebook_content = match fs::read(notebook_path) { Ok(content) => content, Err(e) => { eprintln!("Warning: Could not read simple.ipynb: {}. Skipping test.", e); return; } }; let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await; if result.is_err() { println!("Skipping test: Pandoc may not be installed"); return; } let extraction = result.expect("Operation failed"); let cell_id_patterns = ["uid1", "uid2", "uid3", "uid4", "uid6"]; let id_count = cell_id_patterns .iter() .filter(|&&id| extraction.content.contains(id)) .count(); assert!(id_count >= 1, "Should preserve cell IDs (found {} IDs)", id_count); assert!( extraction.content.contains("uid") || extraction.content.contains("cell"), "Should contain cell identity markers" ); assert!( extraction.content.contains("execution_count") || extraction.content.contains("count"), "Should preserve execution count metadata" ); println!("✓ Structure preservation: Cell IDs and ordering maintained"); } /// Integration test comparing Pandoc output with extraction. /// /// This test validates that the extraction matches Pandoc's baseline output format. /// Pandoc converts .ipynb to markdown with cell dividers and metadata preservation. #[tokio::test] async fn test_jupyter_pandoc_baseline_alignment() { let config = ExtractionConfig::default(); let notebooks = vec!["simple.ipynb", "mime.ipynb", "mime.out.ipynb", "rank.ipynb"]; for notebook_name in notebooks { let notebook_path = jupyter_fixture(notebook_name); let notebook_content = match fs::read(¬ebook_path) { Ok(content) => content, Err(e) => { eprintln!("Warning: Could not read {}: {}. Skipping.", notebook_name, e); continue; } }; let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await; if result.is_err() { println!( "Skipping baseline test for {}: Pandoc may not be installed", notebook_name ); continue; } let extraction = result.expect("Operation failed"); assert!( extraction.content.contains("cell") || extraction.content.contains("code") || extraction.content.contains("markdown") || extraction.content.contains("output"), "{}: Should contain cell/output structure markers that match Pandoc format", notebook_name ); assert!( !extraction.content.is_empty(), "{}: Should extract meaningful content", notebook_name ); assert_eq!( extraction.mime_type, "application/x-ipynb+json", "{}: MIME type should match", notebook_name ); println!( "✓ {}: Baseline alignment verified ({} chars extracted)", notebook_name, extraction.content.len() ); } }