Files
fil/crates/kreuzberg/tests/typst_extractor_tests.rs

646 lines
20 KiB
Rust
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
//! Comprehensive TDD test suite for Typst document extraction.
//!
//! This test suite validates Typst document extraction against expected outputs.
//! The tests verify:
//! - Document metadata extraction (title, author, date, keywords)
//! - Heading hierarchy parsing (=, ==, ===, etc.)
//! - Inline formatting (bold, italic, code)
//! - Table extraction and parsing
//! - List handling (ordered and unordered)
//! - Link extraction
//! - Mathematical notation preservation
//!
//! Each test document is extracted and validated for correct content extraction.
#![cfg(feature = "office")]
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_bytes;
use std::{fs, path::PathBuf};
/// Builds the path of the Typst fixture called `name`.
///
/// The result is the crate's `CARGO_MANIFEST_DIR` (resolved at compile time)
/// followed by `../../test_documents/typst` and then `name`. The path is only
/// assembled here; whether the file exists is left to the caller.
fn typst_fixture(name: &str) -> PathBuf {
    let mut fixture_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    fixture_path.push("../../test_documents/typst");
    fixture_path.push(name);
    fixture_path
}
/// Test simple.typ - basic Typst document with fundamental formatting.
///
/// The fixture is described as containing:
/// - Document metadata: title, author, date
/// - Headings: "Introduction", "Subsection", "Features", "Lists", "Code",
///   "Tables", "Links", "Conclusion"
/// - Inline formatting: *bold*, _italic_, `inline code`
/// - Unordered list with 3 items
/// - Code snippet
/// - 2x2 table with headers
/// - Link to Typst website
///
/// Verifies that the MIME type is echoed back, the content is non-empty, the
/// title and authors metadata are present, each of the eight heading patterns
/// occurs exactly once, and bold/list text survives extraction.
///
/// The test returns early without failing when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_simple_typst_document_extraction() {
    let config = ExtractionConfig::default();
    let content = match fs::read(typst_fixture("simple.typ")) {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("Warning: Could not read simple.typ: {}. Skipping test.", e);
            return;
        }
    };

    // A single match replaces the former `is_err()` check + `expect()` pair.
    let extraction = match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };
    let text = &extraction.content;

    assert_eq!(extraction.mime_type, "text/x-typst", "MIME type should be preserved");
    assert!(!text.is_empty(), "Extracted content should not be empty");
    assert!(
        extraction.metadata.title.is_some(),
        "Document title should be extracted from #set document()"
    );
    assert!(
        extraction.metadata.authors.is_some(),
        "Document author should be extracted"
    );

    // Heading text must be present regardless of how the markers are rendered.
    for (needle, message) in [
        ("Introduction", "Should extract 'Introduction' heading"),
        ("Features", "Should extract 'Features' heading"),
        ("Conclusion", "Should extract 'Conclusion' heading"),
    ] {
        assert!(text.contains(needle), "{}", message);
    }

    // NOTE(review): these are substring patterns, so "= Features" also matches
    // inside "== Features". The counts pin the number of occurrences, not the
    // exact heading level. The fixture's real levels for "Features" and
    // "Conclusion" should be confirmed against simple.typ.
    for (pattern, message) in [
        ("= Introduction", "Should extract 'Introduction' (level 1)"),
        ("== Subsection", "Should extract 'Subsection' (level 2)"),
        ("= Features", "Should extract 'Features' (level 1)"),
        ("== Lists", "Should extract 'Lists' (level 2)"),
        ("== Code", "Should extract 'Code' (level 2)"),
        ("== Tables", "Should extract 'Tables' (level 2)"),
        ("== Links", "Should extract 'Links' (level 2)"),
        ("= Conclusion", "Should extract 'Conclusion' (level 1)"),
    ] {
        assert_eq!(text.matches(pattern).count(), 1, "{}", message);
    }

    assert!(
        text.contains("*") || text.contains("bold"),
        "Should preserve bold formatting or text"
    );
    assert!(
        text.contains("-") || text.contains("First") || text.contains("item"),
        "Should extract list content"
    );

    println!(
        "✓ simple.typ: Successfully extracted {} characters with all 8 headings",
        text.len()
    );
}
/// Test minimal.typ - minimal Typst document.
///
/// The fixture is described as containing a single level 1 heading
/// ("Hello World") and simple text content.
///
/// Verifies that the extracted content is non-empty and mentions "Hello" or
/// "World". The test returns early without failing when the fixture cannot be
/// read or when `extract_bytes` returns an error.
#[tokio::test]
async fn test_minimal_typst_document_extraction() {
    let config = ExtractionConfig::default();
    let content = match fs::read(typst_fixture("minimal.typ")) {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("Warning: Could not read minimal.typ: {}. Skipping test.", e);
            return;
        }
    };

    // A single match replaces the former `is_err()` check + `expect()` pair.
    let extraction = match extract_bytes(&content, "application/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };
    let text = &extraction.content;

    assert!(!text.is_empty(), "Minimal document should extract content");
    assert!(
        text.contains("Hello") || text.contains("World"),
        "Should extract heading content"
    );

    println!("✓ minimal.typ: Successfully extracted {} characters", text.len());
}
/// Test headings.typ - document focusing on heading hierarchy.
///
/// The fixture is described as containing six heading levels
/// (=, ==, ===, ====, =====, ======) with content under each.
///
/// For every level the test first checks that either the marker pattern
/// (e.g. `== Level 2`) or the plain title (e.g. `Level 2 Heading`) is present,
/// then checks that the marker pattern occurs exactly once.
///
/// The test returns early without failing when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_heading_hierarchy_extraction() {
    let config = ExtractionConfig::default();
    let content = match fs::read(typst_fixture("headings.typ")) {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("Warning: Could not read headings.typ: {}. Skipping test.", e);
            return;
        }
    };

    // A single match replaces the former `is_err()` check + `expect()` pair.
    let extraction = match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };
    let text = &extraction.content;

    assert!(!text.is_empty(), "Document should extract content");

    // (marker pattern, plain title, level number)
    let headings = [
        ("= Level 1", "Level 1 Heading", 1),
        ("== Level 2", "Level 2 Heading", 2),
        ("=== Level 3", "Level 3 Heading", 3),
        ("==== Level 4", "Level 4 Heading", 4),
        ("===== Level 5", "Level 5 Heading", 5),
        ("====== Level 6", "Level 6 Heading", 6),
    ];

    // Presence checks run for all levels before any count check, so the first
    // failing assertion is the same one the unrolled version would report.
    for (marker, title, level) in headings {
        assert!(
            text.contains(marker) || text.contains(title),
            "Should extract level {} heading",
            level
        );
    }
    for (marker, _, level) in headings {
        assert_eq!(
            text.matches(marker).count(),
            1,
            "Should extract exactly one level {} heading",
            level
        );
    }

    println!(
        "✓ headings.typ: Successfully extracted {} characters with heading structure",
        text.len()
    );
}
/// Test metadata.typ - document with comprehensive metadata.
///
/// The fixture is described as containing `#set document()` with title,
/// author, subject and keywords, followed by content sections.
///
/// Metadata checks are conditional: the title and author assertions only run
/// when the corresponding key is present in `metadata.additional`, and the
/// keywords assertion only runs when `metadata.keywords` is `Some`. The
/// content non-empty assertion is unconditional.
///
/// The test returns early without failing when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_metadata_extraction() {
    let config = ExtractionConfig::default();
    let content = match fs::read(typst_fixture("metadata.typ")) {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("Warning: Could not read metadata.typ: {}. Skipping test.", e);
            return;
        }
    };

    // A single match replaces the former `is_err()` check + `expect()` pair.
    let extraction = match extract_bytes(&content, "application/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };

    // NOTE(review): other tests in this file read `metadata.title` and
    // `metadata.authors` directly. If the extractor does not also mirror those
    // values into `additional`, the two checks below never execute. Confirm
    // which location the extractor populates.
    if let Some(title) = extraction.metadata.additional.get("title") {
        let rendered = title.to_string();
        assert!(
            rendered.contains("Metadata") || rendered.contains("Example"),
            "Title should contain expected text"
        );
    }
    if let Some(author) = extraction.metadata.additional.get("author") {
        let rendered = author.to_string();
        assert!(
            rendered.contains("John") || rendered.contains("Doe"),
            "Author should contain expected text"
        );
    }
    if let Some(keywords) = &extraction.metadata.keywords {
        assert!(!keywords.is_empty(), "Keywords should be present");
    }

    assert!(!extraction.content.is_empty(), "Document should extract content");

    println!(
        "✓ metadata.typ: Successfully extracted metadata and {} characters of content",
        extraction.content.len()
    );
}
/// Test advanced.typ - complex Typst document with multiple features.
///
/// The fixture is described as containing:
/// - Metadata: title, author, keywords, date
/// - Heading numbering configuration
/// - Mathematical notation (inline and display)
/// - Nested heading levels (level 1, 2, 3, 4)
/// - Code blocks (Python example)
/// - Complex tables with 3 columns and 4 rows
/// - Multiple paragraph sections
/// - Links with text
/// - Multiple formatting combinations
///
/// Each feature is checked loosely: the assertion passes when any one of a
/// small set of marker strings appears in the extracted content. The heading
/// hierarchy check only requires at least three `=` characters overall.
///
/// The test returns early without failing when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_advanced_typst_document_extraction() {
    let config = ExtractionConfig::default();
    let content = match fs::read(typst_fixture("advanced.typ")) {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("Warning: Could not read advanced.typ: {}. Skipping test.", e);
            return;
        }
    };

    // A single match replaces the former `is_err()` check + `expect()` pair.
    let extraction = match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };
    let text = &extraction.content;
    // True when at least one of `needles` occurs in the extracted content.
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| text.contains(*needle));

    assert!(extraction.metadata.title.is_some(), "Title should be extracted");
    assert!(!text.is_empty(), "Advanced document should extract content");
    assert!(
        mentions_any(&["$", "equation", "math"]),
        "Should extract or preserve mathematical notation"
    );
    assert!(
        mentions_any(&["Mathematical", "Formatting", "Features"]),
        "Should extract section headings"
    );
    assert!(
        mentions_any(&["python", "def", "fibonacci", "```"]),
        "Should extract code block content"
    );

    // Counts every '=' in the output, not only those that start a heading.
    let level_count = text.matches("=").count();
    assert!(level_count >= 3, "Should preserve nested heading hierarchy");

    assert!(
        mentions_any(&["Name", "Alice", "Table"]),
        "Should extract table content"
    );
    assert!(
        mentions_any(&["example", "link", "http"]),
        "Should extract link content"
    );

    println!(
        "✓ advanced.typ: Successfully extracted {} characters with complex formatting",
        text.len()
    );
}
/// Test typst-reader.typ - Pandoc test file.
///
/// The fixture is described as a document from the Pandoc test suite that
/// demonstrates Typst reader functionality.
///
/// Verifies that the extracted content is non-empty and contains either an
/// `=` character or the word "Fibonacci". The test returns early without
/// failing when the fixture cannot be read or when `extract_bytes` returns an
/// error.
#[tokio::test]
async fn test_typst_reader_extraction() {
    let config = ExtractionConfig::default();
    let content = match fs::read(typst_fixture("typst-reader.typ")) {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("Warning: Could not read typst-reader.typ: {}. Skipping test.", e);
            return;
        }
    };

    // A single match replaces the former `is_err()` check + `expect()` pair.
    let extraction = match extract_bytes(&content, "application/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };
    let text = &extraction.content;

    assert!(!text.is_empty(), "Should extract content from Pandoc test file");
    assert!(
        text.contains("=") || text.contains("Fibonacci"),
        "Should extract heading or content from test file"
    );

    println!("✓ typst-reader.typ: Successfully extracted {} characters", text.len());
}
/// Test undergradmath.typ - Pandoc test file with complex math.
///
/// The fixture is described as a document from the Pandoc test suite with
/// extensive mathematical notation and complex formatting.
///
/// Verifies that the content is non-empty and that it contains `=`, contains
/// "Typst", or is longer than 100 bytes. If `metadata.additional` holds a
/// "title" entry, its string rendering must be non-empty.
///
/// The test returns early without failing when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_undergradmath_extraction() {
    let config = ExtractionConfig::default();
    let content = match fs::read(typst_fixture("undergradmath.typ")) {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("Warning: Could not read undergradmath.typ: {}. Skipping test.", e);
            return;
        }
    };

    // A single match replaces the former `is_err()` check + `expect()` pair.
    let extraction = match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };
    let text = &extraction.content;

    assert!(!text.is_empty(), "Should extract content from complex math document");
    if let Some(title) = extraction.metadata.additional.get("title") {
        assert!(!title.to_string().is_empty(), "Title should be extracted");
    }
    assert!(
        text.contains("=") || text.contains("Typst") || text.len() > 100,
        "Should extract document structure or content"
    );

    println!(
        "✓ undergradmath.typ: Successfully extracted {} characters from math document",
        text.len()
    );
}
/// Test extraction of the same document under different MIME types.
///
/// Feeds simple.typ to `extract_bytes` as `application/x-typst`,
/// `text/x-typst` and `text/plain`. For every MIME type where extraction
/// succeeds, the content must be non-empty; MIME types for which extraction
/// returns an error are silently ignored.
///
/// The test returns early without failing when the fixture cannot be read.
#[tokio::test]
async fn test_typst_mime_type_variants() {
    let config = ExtractionConfig::default();
    let content = match fs::read(typst_fixture("simple.typ")) {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("Warning: Could not read simple.typ: {}. Skipping test.", e);
            return;
        }
    };

    // A fixed literal list needs no heap allocation, so an array replaces `vec!`.
    for mime_type in ["application/x-typst", "text/x-typst", "text/plain"] {
        let outcome = extract_bytes(&content, mime_type, &config).await;
        if let Ok(extraction) = outcome {
            assert!(
                !extraction.content.is_empty(),
                "Should extract content with MIME type: {}",
                mime_type
            );
            println!(
                "✓ MIME type '{}': Successfully extracted {} characters",
                mime_type,
                extraction.content.len()
            );
        }
    }
}
/// Test formatting preservation.
///
/// Extracts simple.typ and checks, for bold, italic and code in turn, that
/// either the markup character (`*`, `_`, or a backtick) or the corresponding
/// word ("bold", "italic", "code") appears in the extracted content.
///
/// The test returns early without failing when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_formatting_preservation() {
    let config = ExtractionConfig::default();
    let content = match fs::read(typst_fixture("simple.typ")) {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("Warning: Could not read simple.typ: {}. Skipping test.", e);
            return;
        }
    };

    // A single match replaces the former `is_err()` check + `expect()` pair.
    let extraction = match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };
    let text = &extraction.content;

    for (marker, word, message) in [
        ("*", "bold", "Should preserve bold formatting or text"),
        ("_", "italic", "Should preserve italic formatting or text"),
        ("`", "code", "Should preserve code formatting or text"),
    ] {
        assert!(text.contains(marker) || text.contains(word), "{}", message);
    }

    println!("✓ Formatting preservation: All markers/content found in extracted text");
}
/// Test large document handling.
///
/// Extracts undergradmath.typ and checks only that the resulting content is
/// non-empty.
///
/// The test returns early without failing when the fixture cannot be read or
/// when `extract_bytes` returns an error.
#[tokio::test]
async fn test_large_document_extraction() {
    let config = ExtractionConfig::default();
    let content = match fs::read(typst_fixture("undergradmath.typ")) {
        Ok(bytes) => bytes,
        Err(e) => {
            eprintln!("Warning: Could not read undergradmath.typ: {}. Skipping test.", e);
            return;
        }
    };

    // A single match replaces the former `is_err()` check + `expect()` pair.
    let extraction = match extract_bytes(&content, "text/x-typst", &config).await {
        Ok(extraction) => extraction,
        Err(_) => {
            println!("Skipping test: Typst extractor may not be available");
            return;
        }
    };

    assert!(
        !extraction.content.is_empty(),
        "Should extract content from large document"
    );

    println!(
        "✓ Large document: Extracted {} bytes of content from source file",
        extraction.content.len()
    );
}
/// Test handling of empty input.
///
/// Passes a zero-length byte string to `extract_bytes` as `text/x-typst`.
/// Both a successful extraction and an error are accepted and merely logged.
/// The test contains no assertions, so it only fails if the call panics.
#[tokio::test]
async fn test_empty_content_handling() {
    let config = ExtractionConfig::default();
    let empty_content = b"";

    match extract_bytes(empty_content, "text/x-typst", &config).await {
        Err(e) => println!("✓ Empty content: Resulted in expected error: {}", e),
        Ok(extraction) => println!(
            "✓ Empty content: Handled gracefully, extracted {} bytes",
            extraction.content.len()
        ),
    }
}
/// Test the Typst extractor's priority.
///
/// Asserts that `TypstExtractor::priority()` reports exactly 50.
#[tokio::test]
async fn test_typst_extractor_priority() {
    use kreuzberg::extractors::TypstExtractor;
    use kreuzberg::plugins::DocumentExtractor;

    let priority = TypstExtractor.priority();

    assert_eq!(priority, 50, "Typst extractor should have priority 50");
    println!("✓ Typst extractor priority: {}", priority);
}
/// Test supported MIME types.
///
/// Asserts that the list returned by `TypstExtractor::supported_mime_types()`
/// includes both `application/x-typst` and `text/x-typst`.
#[tokio::test]
async fn test_supported_mime_types() {
    use kreuzberg::extractors::TypstExtractor;
    use kreuzberg::plugins::DocumentExtractor;

    let extractor = TypstExtractor;
    let mime_types = extractor.supported_mime_types();

    for (expected, message) in [
        ("application/x-typst", "Should support application/x-typst"),
        ("text/x-typst", "Should support text/x-typst"),
    ] {
        assert!(mime_types.contains(&expected), "{}", message);
    }

    println!("✓ Supported MIME types: {:?}", mime_types);
}