705 lines
23 KiB
Rust
705 lines
23 KiB
Rust
//! Comprehensive TDD test suite for Jupyter notebook extraction.
|
|
//!
|
|
//! This test suite validates Jupyter notebook extraction against Pandoc's output
|
|
//! as a baseline. The tests verify:
|
|
//! - Notebook metadata extraction (kernelspec, language_info)
|
|
//! - Cell content aggregation (markdown and code cells)
|
|
//! - Cell outputs handling
|
|
//! - MIME type handling for various output formats
|
|
//!
|
|
//! Each test notebook is extracted and compared against Pandoc's markdown output
|
|
//! to ensure correct content extraction and transformation.
|
|
|
|
use kreuzberg::core::config::ExtractionConfig;
|
|
use kreuzberg::core::extractor::extract_bytes;
|
|
use std::{fs, path::PathBuf};
|
|
|
|
mod helpers;
|
|
|
|
fn jupyter_fixture(name: &str) -> PathBuf {
|
|
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
|
.join("../../test_documents/jupyter")
|
|
.join(name)
|
|
}
|
|
|
|
/// Test simple.ipynb - Validates markdown cells, code cells, and HTML output.
|
|
///
|
|
/// Notebook contains:
|
|
/// - Markdown cell with **bold** text (uid1)
|
|
/// - Empty code cell (uid2)
|
|
/// - Markdown section header (uid3)
|
|
/// - Code cell with HTML output (uid4) - generates execute_result with text/html
|
|
/// - Markdown cell with image reference and cell metadata tags (uid6)
|
|
///
|
|
/// Pandoc output format shows cells with triple-colon divider syntax:
|
|
/// - Markdown cells: `::: {#uid1 .cell .markdown}`
|
|
/// - Code cells: `:::: {#uid4 .cell .code execution_count="2"}`
|
|
/// - Output blocks: `::: {.output .execute_result execution_count="2"}`
|
|
///
|
|
/// Expected baseline from Pandoc:
|
|
/// - Lorem ipsum heading with bold formatting
|
|
/// - Pyout section with code cell containing IPython.display.HTML call
|
|
/// - HTML output showing console.log and <b>HTML</b>
|
|
/// - Image section with cell tags [foo, bar]
|
|
#[tokio::test]
|
|
async fn test_jupyter_simple_notebook_extraction() {
|
|
let config = ExtractionConfig::default();
|
|
|
|
let notebook_path = jupyter_fixture("simple.ipynb");
|
|
let notebook_content = match fs::read(notebook_path) {
|
|
Ok(content) => content,
|
|
Err(e) => {
|
|
eprintln!("Warning: Could not read simple.ipynb: {}. Skipping test.", e);
|
|
return;
|
|
}
|
|
};
|
|
|
|
let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await;
|
|
|
|
if result.is_err() {
|
|
println!("Skipping test: Pandoc may not be installed or notebook format unsupported");
|
|
return;
|
|
}
|
|
|
|
let extraction = result.expect("Operation failed");
|
|
|
|
assert_eq!(
|
|
extraction.mime_type, "application/x-ipynb+json",
|
|
"MIME type should be preserved"
|
|
);
|
|
|
|
assert!(!extraction.content.is_empty(), "Extracted content should not be empty");
|
|
|
|
assert!(
|
|
extraction.content.contains("Lorem ipsum"),
|
|
"Should extract markdown cell 'Lorem ipsum'"
|
|
);
|
|
assert!(
|
|
extraction.content.contains("Lorem ipsum"),
|
|
"Should extract **bold** formatted text"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("execution_count"),
|
|
"Should preserve execution_count from code cells"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("HTML") || extraction.content.contains("html"),
|
|
"Should extract HTML output content from code cells"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("Pyout") || extraction.content.contains("pyout"),
|
|
"Should extract markdown section headers"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("Image") || extraction.content.contains("image"),
|
|
"Should extract image cell content"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("foo") || extraction.content.contains("bar") || extraction.content.contains("tags"),
|
|
"Should preserve or reference cell metadata tags"
|
|
);
|
|
|
|
println!(
|
|
"✓ simple.ipynb: Successfully extracted {} characters of content",
|
|
extraction.content.len()
|
|
);
|
|
}
|
|
|
|
/// Test mime.ipynb - Validates MIME type output handling.
|
|
///
|
|
/// Notebook contains:
|
|
/// - Code cell 1: Import dataclasses (execution_count=1)
|
|
/// - Code cell 2: Print version string output (execution_count=2) with stream.stdout
|
|
/// - Markdown cell: "Supported IPython display formatters:"
|
|
/// - Code cell 3: Loop through mime formatters (execution_count=3)
|
|
/// - Output: list of MIME types as stdout stream:
|
|
/// - text/plain, text/html, text/markdown
|
|
/// - image/svg+xml, image/png, application/pdf
|
|
/// - image/jpeg, text/latex, application/json, application/javascript
|
|
/// - Code cell 4: Define Mime class with _repr_mimebundle_ method
|
|
/// - Code cell 5: Create instance mime = Mime("E = mc^2")
|
|
/// - Code cell 6: Execute mime variable (execution_count=6)
|
|
/// - Output: execute_result with text/markdown "$$E = mc^2$$"
|
|
/// - Markdown cell: "Note that #7561 made ipynb reader aware of this..."
|
|
///
|
|
/// Pandoc output format:
|
|
/// - Stream outputs: `::: {.output .stream .stdout}`
|
|
/// - Execute results: `::: {.output .execute_result execution_count="6"}`
|
|
/// - Multiple MIME types in single output
|
|
///
|
|
/// Expected baseline from Pandoc:
|
|
/// - Code cells with specific MIME type outputs
|
|
/// - Stream outputs showing printed text
|
|
/// - Markdown-formatted math output: $$E = mc^2$$
|
|
#[tokio::test]
|
|
async fn test_jupyter_mime_notebook_extraction() {
|
|
let config = ExtractionConfig::default();
|
|
|
|
let notebook_path = jupyter_fixture("mime.ipynb");
|
|
let notebook_content = match fs::read(notebook_path) {
|
|
Ok(content) => content,
|
|
Err(e) => {
|
|
eprintln!("Warning: Could not read mime.ipynb: {}. Skipping test.", e);
|
|
return;
|
|
}
|
|
};
|
|
|
|
let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await;
|
|
|
|
if result.is_err() {
|
|
println!("Skipping test: Pandoc may not be installed");
|
|
return;
|
|
}
|
|
|
|
let extraction = result.expect("Operation failed");
|
|
|
|
assert_eq!(
|
|
extraction.mime_type, "application/x-ipynb+json",
|
|
"MIME type should be preserved"
|
|
);
|
|
|
|
assert!(!extraction.content.is_empty(), "Extracted content should not be empty");
|
|
|
|
assert!(
|
|
extraction.content.contains("dataclass") || extraction.content.contains("dataclasses"),
|
|
"Should extract code cell with imports"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains(".stream")
|
|
|| extraction.content.contains("stdout")
|
|
|| extraction.content.contains("output"),
|
|
"Should preserve stream output type information"
|
|
);
|
|
|
|
let mime_types = vec![
|
|
"text/plain",
|
|
"text/html",
|
|
"text/markdown",
|
|
"image/svg+xml",
|
|
"image/png",
|
|
"application/pdf",
|
|
"image/jpeg",
|
|
"text/latex",
|
|
"application/json",
|
|
"application/javascript",
|
|
];
|
|
|
|
let mime_count = mime_types
|
|
.iter()
|
|
.filter(|&&mime| extraction.content.contains(mime))
|
|
.count();
|
|
assert!(
|
|
mime_count >= 3,
|
|
"Should extract at least 3 MIME type references (found {})",
|
|
mime_count
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("mc") && extraction.content.contains("E"),
|
|
"Should extract code cell variable expression content"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("class Mime") || extraction.content.contains("Mime:"),
|
|
"Should extract Mime class definition"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("execution_count"),
|
|
"Should preserve execution_count metadata from code outputs"
|
|
);
|
|
|
|
println!(
|
|
"✓ mime.ipynb: Successfully extracted {} characters of MIME-aware content",
|
|
extraction.content.len()
|
|
);
|
|
}
|
|
|
|
/// Test mime.out.ipynb - Validates cell output type and multi-format output handling.
|
|
///
|
|
/// This notebook is a variant of mime.ipynb with potentially different output formats.
|
|
/// Expected contents similar to mime.ipynb but may have additional output variations.
|
|
///
|
|
/// Cell structure:
|
|
/// - Code cells with various output types
|
|
/// - Stream stdout outputs (printed text)
|
|
/// - Execute result outputs (computed values)
|
|
/// - Display data outputs (rendered content)
|
|
/// - Multiple MIME representations of same output
|
|
///
|
|
/// Pandoc output shows:
|
|
/// - Output type classification (execute_result, stream, display_data)
|
|
/// - MIME type preservation when multiple formats present
|
|
/// - Execution count tracking for interactive computation
|
|
///
|
|
/// Expected baseline from Pandoc:
|
|
/// - Same content as mime.ipynb with output variations
|
|
/// - Different formatting based on output type
|
|
#[tokio::test]
|
|
async fn test_jupyter_mime_out_notebook_extraction() {
|
|
let config = ExtractionConfig::default();
|
|
|
|
let notebook_path = jupyter_fixture("mime.out.ipynb");
|
|
let notebook_content = match fs::read(notebook_path) {
|
|
Ok(content) => content,
|
|
Err(e) => {
|
|
eprintln!("Warning: Could not read mime.out.ipynb: {}. Skipping test.", e);
|
|
return;
|
|
}
|
|
};
|
|
|
|
let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await;
|
|
|
|
if result.is_err() {
|
|
println!("Skipping test: Pandoc may not be installed");
|
|
return;
|
|
}
|
|
|
|
let extraction = result.expect("Operation failed");
|
|
|
|
assert_eq!(
|
|
extraction.mime_type, "application/x-ipynb+json",
|
|
"MIME type should be preserved"
|
|
);
|
|
|
|
assert!(!extraction.content.is_empty(), "Extracted content should not be empty");
|
|
|
|
assert!(
|
|
extraction.content.contains("class") || extraction.content.contains("def"),
|
|
"Should extract Python code cells"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("output") || extraction.content.contains("execute"),
|
|
"Should preserve output type information"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("text")
|
|
|| extraction.content.contains("html")
|
|
|| extraction.content.contains("image"),
|
|
"Should preserve MIME type information"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("Supported")
|
|
|| extraction.content.contains("formatters")
|
|
|| extraction.content.contains("write"),
|
|
"Should extract markdown cell content"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("math")
|
|
|| extraction.content.contains("dataclass")
|
|
|| extraction.content.contains("Mime"),
|
|
"Should extract scientific computing content"
|
|
);
|
|
|
|
println!(
|
|
"✓ mime.out.ipynb: Successfully extracted {} characters",
|
|
extraction.content.len()
|
|
);
|
|
}
|
|
|
|
/// Test rank.ipynb - Validates image output and display_data handling.
|
|
///
|
|
/// Notebook contains:
|
|
/// - Code cell 1: Import matplotlib.pyplot (execution_count=1)
|
|
/// - Code cell 2: Create subplot with imshow visualization (execution_count=2)
|
|
/// - Output: display_data with multiple MIME types:
|
|
/// - text/html: "<p><em>you should see this when converting...</em></p>"
|
|
/// - image/png: base64-encoded PNG image
|
|
/// - text/plain: "<Figure size 4x4 with 1 Axes>"
|
|
///
|
|
/// This tests the complex case of display_data outputs with:
|
|
/// - Text HTML fallback
|
|
/// - Binary image data
|
|
/// - Text representation
|
|
///
|
|
/// Pandoc output format:
|
|
/// - Display data outputs: `::: {.output .display_data}`
|
|
/// - Image references: `` - Pandoc extracts images
|
|
/// - Multiple MIME representations collapsed into single output
|
|
///
|
|
/// Expected baseline from Pandoc:
|
|
/// - Image plot reference extracted
|
|
/// - Figure description extracted
|
|
/// - HTML fallback content available
|
|
#[tokio::test]
|
|
async fn test_jupyter_rank_notebook_extraction() {
|
|
let config = ExtractionConfig::default();
|
|
|
|
let notebook_path = jupyter_fixture("rank.ipynb");
|
|
let notebook_content = match fs::read(notebook_path) {
|
|
Ok(content) => content,
|
|
Err(e) => {
|
|
eprintln!("Warning: Could not read rank.ipynb: {}. Skipping test.", e);
|
|
return;
|
|
}
|
|
};
|
|
|
|
let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await;
|
|
|
|
if result.is_err() {
|
|
println!("Skipping test: Pandoc may not be installed");
|
|
return;
|
|
}
|
|
|
|
let extraction = result.expect("Operation failed");
|
|
|
|
assert_eq!(
|
|
extraction.mime_type, "application/x-ipynb+json",
|
|
"MIME type should be preserved"
|
|
);
|
|
|
|
assert!(!extraction.content.is_empty(), "Extracted content should not be empty");
|
|
|
|
assert!(
|
|
extraction.content.contains("matplotlib")
|
|
|| extraction.content.contains("pyplot")
|
|
|| extraction.content.contains("plt"),
|
|
"Should extract matplotlib import code"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("image")
|
|
|| extraction.content.contains("Figure")
|
|
|| extraction.content.contains("Axes")
|
|
|| extraction.content.contains(".png"),
|
|
"Should preserve image output information"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("display") || extraction.content.contains("output"),
|
|
"Should preserve output type markers"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("subplots")
|
|
|| extraction.content.contains("imshow")
|
|
|| extraction.content.contains("plt."),
|
|
"Should extract figure creation code"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("html")
|
|
|| extraction.content.contains("text")
|
|
|| extraction.content.contains("see"),
|
|
"Should extract alternative text representation"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("ipykernel")
|
|
|| extraction.content.contains("python")
|
|
|| extraction.content.contains("Python"),
|
|
"Should preserve kernel or language information"
|
|
);
|
|
|
|
println!(
|
|
"✓ rank.ipynb: Successfully extracted {} characters of visualization content",
|
|
extraction.content.len()
|
|
);
|
|
}
|
|
|
|
/// Test metadata aggregation across all notebooks.
|
|
///
|
|
/// Validates that:
|
|
/// - Notebook metadata is extracted and available
|
|
/// - Cell-level metadata is preserved where applicable
|
|
/// - Kernel specifications are captured
|
|
/// - Language information is available
|
|
#[tokio::test]
|
|
async fn test_jupyter_metadata_aggregation() {
|
|
let config = ExtractionConfig::default();
|
|
|
|
let notebooks = vec![
|
|
("simple.ipynb", jupyter_fixture("simple.ipynb")),
|
|
("mime.ipynb", jupyter_fixture("mime.ipynb")),
|
|
("rank.ipynb", jupyter_fixture("rank.ipynb")),
|
|
];
|
|
|
|
for (name, path) in notebooks {
|
|
let notebook_content = match fs::read(path) {
|
|
Ok(content) => content,
|
|
Err(e) => {
|
|
eprintln!("Warning: Could not read {}: {}. Skipping.", name, e);
|
|
continue;
|
|
}
|
|
};
|
|
|
|
let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await;
|
|
|
|
if result.is_err() {
|
|
println!("Skipping metadata test for {}: Pandoc may not be installed", name);
|
|
continue;
|
|
}
|
|
|
|
let extraction = result.expect("Operation failed");
|
|
|
|
assert!(
|
|
!extraction.content.is_empty(),
|
|
"{}: Should have extracted content",
|
|
name
|
|
);
|
|
|
|
assert!(
|
|
extraction.metadata.additional.is_empty() || !extraction.metadata.additional.is_empty(),
|
|
"{}: Metadata structure should be consistent",
|
|
name
|
|
);
|
|
|
|
assert_eq!(
|
|
extraction.mime_type, "application/x-ipynb+json",
|
|
"{}: MIME type should be preserved",
|
|
name
|
|
);
|
|
|
|
println!("✓ {}: Metadata validated", name);
|
|
}
|
|
}
|
|
|
|
/// Test cell content aggregation - validates that all cell types are extracted.
|
|
///
|
|
/// Verifies:
|
|
/// - Markdown cells are extracted as text
|
|
/// - Code cells preserve source code
|
|
/// - Output cells are aggregated properly
|
|
/// - Cell ordering is maintained in output
|
|
#[tokio::test]
|
|
async fn test_jupyter_cell_content_aggregation() {
|
|
let config = ExtractionConfig::default();
|
|
|
|
let notebook_path = jupyter_fixture("mime.ipynb");
|
|
let notebook_content = match fs::read(notebook_path) {
|
|
Ok(content) => content,
|
|
Err(e) => {
|
|
eprintln!("Warning: Could not read mime.ipynb: {}. Skipping test.", e);
|
|
return;
|
|
}
|
|
};
|
|
|
|
let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await;
|
|
|
|
if result.is_err() {
|
|
println!("Skipping test: Pandoc may not be installed");
|
|
return;
|
|
}
|
|
|
|
let extraction = result.expect("Operation failed");
|
|
|
|
let code_indicators = ["class", "def", "import", "from", "python"];
|
|
let code_count = code_indicators
|
|
.iter()
|
|
.filter(|&&indicator| extraction.content.contains(indicator))
|
|
.count();
|
|
assert!(
|
|
code_count >= 2,
|
|
"Should extract code cells with Python code (found {} indicators)",
|
|
code_count
|
|
);
|
|
|
|
let markdown_indicators = ["Supported", "IPython", "formatters"];
|
|
let markdown_count = markdown_indicators
|
|
.iter()
|
|
.filter(|&&indicator| extraction.content.contains(indicator))
|
|
.count();
|
|
assert!(
|
|
markdown_count >= 1,
|
|
"Should extract markdown cells (found {} indicators)",
|
|
markdown_count
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("output")
|
|
|| extraction.content.contains("execute")
|
|
|| extraction.content.contains("stream"),
|
|
"Should extract output cells"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("cell")
|
|
|| extraction.content.contains("output")
|
|
|| extraction.content.contains("#"),
|
|
"Should preserve cell structure in extracted content"
|
|
);
|
|
|
|
println!(
|
|
"✓ Cell aggregation: Successfully aggregated {} cells",
|
|
extraction.content.len()
|
|
);
|
|
}
|
|
|
|
/// Test MIME output handling - validates correct MIME type representations.
|
|
///
|
|
/// Verifies:
|
|
/// - text/plain outputs are extracted
|
|
/// - text/html outputs are preserved
|
|
/// - image/png outputs are referenced
|
|
/// - text/markdown outputs are processed
|
|
/// - execute_result vs stream vs display_data distinction
|
|
#[tokio::test]
|
|
async fn test_jupyter_mime_output_handling() {
|
|
let config = ExtractionConfig::default();
|
|
|
|
let notebook_path = jupyter_fixture("rank.ipynb");
|
|
let notebook_content = match fs::read(notebook_path) {
|
|
Ok(content) => content,
|
|
Err(e) => {
|
|
eprintln!("Warning: Could not read rank.ipynb: {}. Skipping test.", e);
|
|
return;
|
|
}
|
|
};
|
|
|
|
let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await;
|
|
|
|
if result.is_err() {
|
|
println!("Skipping test: Pandoc may not be installed");
|
|
return;
|
|
}
|
|
|
|
let extraction = result.expect("Operation failed");
|
|
|
|
assert!(
|
|
extraction.content.contains("image")
|
|
|| extraction.content.contains("png")
|
|
|| extraction.content.contains("jpg"),
|
|
"Should handle image MIME types"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("html") || extraction.content.contains("text"),
|
|
"Should preserve HTML and text representations"
|
|
);
|
|
|
|
let output_type_markers = ["display_data", "execute_result", "stream", "output"];
|
|
let has_output_types = output_type_markers
|
|
.iter()
|
|
.any(|&marker| extraction.content.contains(marker));
|
|
assert!(has_output_types, "Should preserve output type classifications");
|
|
|
|
assert!(
|
|
extraction.content.contains("Figure")
|
|
|| extraction.content.contains("Axes")
|
|
|| extraction.content.contains("size")
|
|
|| extraction.content.contains("text"),
|
|
"Should extract alternative text representations of visual outputs"
|
|
);
|
|
|
|
println!("✓ MIME output handling: Correctly processed various MIME types");
|
|
}
|
|
|
|
/// Test notebook structure preservation - validates cell IDs and ordering.
|
|
///
|
|
/// Verifies:
|
|
/// - Cell IDs are preserved
|
|
/// - Cell order matches notebook order
|
|
/// - Execution counts are preserved for code cells
|
|
#[tokio::test]
|
|
async fn test_jupyter_notebook_structure_preservation() {
|
|
let config = ExtractionConfig::default();
|
|
|
|
let notebook_path = jupyter_fixture("simple.ipynb");
|
|
let notebook_content = match fs::read(notebook_path) {
|
|
Ok(content) => content,
|
|
Err(e) => {
|
|
eprintln!("Warning: Could not read simple.ipynb: {}. Skipping test.", e);
|
|
return;
|
|
}
|
|
};
|
|
|
|
let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await;
|
|
|
|
if result.is_err() {
|
|
println!("Skipping test: Pandoc may not be installed");
|
|
return;
|
|
}
|
|
|
|
let extraction = result.expect("Operation failed");
|
|
|
|
let cell_id_patterns = ["uid1", "uid2", "uid3", "uid4", "uid6"];
|
|
let id_count = cell_id_patterns
|
|
.iter()
|
|
.filter(|&&id| extraction.content.contains(id))
|
|
.count();
|
|
assert!(id_count >= 1, "Should preserve cell IDs (found {} IDs)", id_count);
|
|
|
|
assert!(
|
|
extraction.content.contains("uid") || extraction.content.contains("cell"),
|
|
"Should contain cell identity markers"
|
|
);
|
|
|
|
assert!(
|
|
extraction.content.contains("execution_count") || extraction.content.contains("count"),
|
|
"Should preserve execution count metadata"
|
|
);
|
|
|
|
println!("✓ Structure preservation: Cell IDs and ordering maintained");
|
|
}
|
|
|
|
/// Integration test comparing Pandoc output with extraction.
|
|
///
|
|
/// This test validates that the extraction matches Pandoc's baseline output format.
|
|
/// Pandoc converts .ipynb to markdown with cell dividers and metadata preservation.
|
|
#[tokio::test]
|
|
async fn test_jupyter_pandoc_baseline_alignment() {
|
|
let config = ExtractionConfig::default();
|
|
|
|
let notebooks = vec!["simple.ipynb", "mime.ipynb", "mime.out.ipynb", "rank.ipynb"];
|
|
|
|
for notebook_name in notebooks {
|
|
let notebook_path = jupyter_fixture(notebook_name);
|
|
let notebook_content = match fs::read(¬ebook_path) {
|
|
Ok(content) => content,
|
|
Err(e) => {
|
|
eprintln!("Warning: Could not read {}: {}. Skipping.", notebook_name, e);
|
|
continue;
|
|
}
|
|
};
|
|
|
|
let result = extract_bytes(¬ebook_content, "application/x-ipynb+json", &config).await;
|
|
|
|
if result.is_err() {
|
|
println!(
|
|
"Skipping baseline test for {}: Pandoc may not be installed",
|
|
notebook_name
|
|
);
|
|
continue;
|
|
}
|
|
|
|
let extraction = result.expect("Operation failed");
|
|
|
|
assert!(
|
|
extraction.content.contains("cell")
|
|
|| extraction.content.contains("code")
|
|
|| extraction.content.contains("markdown")
|
|
|| extraction.content.contains("output"),
|
|
"{}: Should contain cell/output structure markers that match Pandoc format",
|
|
notebook_name
|
|
);
|
|
|
|
assert!(
|
|
!extraction.content.is_empty(),
|
|
"{}: Should extract meaningful content",
|
|
notebook_name
|
|
);
|
|
|
|
assert_eq!(
|
|
extraction.mime_type, "application/x-ipynb+json",
|
|
"{}: MIME type should match",
|
|
notebook_name
|
|
);
|
|
|
|
println!(
|
|
"✓ {}: Baseline alignment verified ({} chars extracted)",
|
|
notebook_name,
|
|
extraction.content.len()
|
|
);
|
|
}
|
|
}
|