202 lines
6.2 KiB
Rust
202 lines
6.2 KiB
Rust
#![cfg(feature = "office")]
|
|
|
|
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
|
|
use std::path::PathBuf;
|
|
|
|
/// Helper to get absolute path to test documents
|
|
fn test_file_path(filename: &str) -> PathBuf {
|
|
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
|
PathBuf::from(manifest_dir)
|
|
.parent()
|
|
.expect("Operation failed")
|
|
.parent()
|
|
.expect("Operation failed")
|
|
.join("test_documents")
|
|
.join("fictionbook")
|
|
.join(filename)
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fictionbook_extract_metadata_title() {
|
|
let path = test_file_path("meta.fb2");
|
|
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
|
.await
|
|
.expect("Failed to extract FB2 file");
|
|
|
|
assert!(
|
|
result.content.contains("Book title"),
|
|
"Book title should be extracted from FB2 content"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fictionbook_extract_metadata_genre() {
|
|
let path = test_file_path("meta.fb2");
|
|
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
|
.await
|
|
.expect("Failed to extract FB2 file");
|
|
|
|
assert!(result.metadata.subject.is_none());
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fictionbook_extract_content_sections() {
|
|
let path = test_file_path("titles.fb2");
|
|
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
|
.await
|
|
.expect("Failed to extract FB2 file");
|
|
|
|
assert!(
|
|
result.content.contains("Simple title"),
|
|
"Section titles should be extracted"
|
|
);
|
|
assert!(
|
|
result.content.contains("Emphasized"),
|
|
"Section with emphasis should be extracted"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fictionbook_extract_section_hierarchy() {
|
|
let path = test_file_path("basic.fb2");
|
|
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
|
.await
|
|
.expect("Failed to extract FB2 file");
|
|
|
|
assert!(
|
|
result.content.contains("Top-level title"),
|
|
"Top-level section should be extracted"
|
|
);
|
|
assert!(result.content.contains("Section"), "Nested section should be extracted");
|
|
assert!(
|
|
result.content.contains("Subsection"),
|
|
"Nested subsection should be extracted"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fictionbook_extract_inline_markup() {
|
|
let path = test_file_path("emphasis.fb2");
|
|
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
|
.await
|
|
.expect("Failed to extract FB2 file");
|
|
|
|
let content = result.content.to_lowercase();
|
|
assert!(content.contains("plain"), "Plain text should be extracted");
|
|
assert!(content.contains("strong"), "Strong emphasis should be extracted");
|
|
assert!(content.contains("emphasis"), "Emphasis should be extracted");
|
|
assert!(content.contains("strikethrough"), "Strikethrough should be extracted");
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fictionbook_extract_emphasis() {
|
|
let path = test_file_path("basic.fb2");
|
|
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
|
.await
|
|
.expect("Failed to extract FB2 file");
|
|
|
|
assert!(
|
|
result.content.contains("emphasized"),
|
|
"Emphasized text should be extracted"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fictionbook_extract_strong() {
|
|
let path = test_file_path("basic.fb2");
|
|
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
|
.await
|
|
.expect("Failed to extract FB2 file");
|
|
|
|
assert!(result.content.contains("strong"), "Strong text should be extracted");
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fictionbook_extract_code() {
|
|
let path = test_file_path("basic.fb2");
|
|
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
|
.await
|
|
.expect("Failed to extract FB2 file");
|
|
|
|
assert!(result.content.contains("verbatim"), "Code content should be extracted");
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fictionbook_extract_blockquote() {
|
|
let path = test_file_path("basic.fb2");
|
|
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
|
.await
|
|
.expect("Failed to extract FB2 file");
|
|
|
|
assert!(result.content.contains("Blockquote"), "Blockquote should be extracted");
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fictionbook_extract_tables() {
|
|
let path = test_file_path("tables.fb2");
|
|
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
|
.await
|
|
.expect("Failed to extract FB2 file");
|
|
|
|
assert!(
|
|
!result.content.is_empty(),
|
|
"Content should be extracted from file with tables"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fictionbook_markdown_formatting_preservation() {
|
|
let path = test_file_path("emphasis.fb2");
|
|
let config = ExtractionConfig {
|
|
output_format: OutputFormat::Markdown,
|
|
..Default::default()
|
|
};
|
|
let result = kreuzberg::extract_file(&path, None, &config)
|
|
.await
|
|
.expect("Failed to extract FB2 file");
|
|
|
|
let md = &result.content;
|
|
assert!(
|
|
md.contains("**strong**"),
|
|
"Strong text should be formatted as **bold** in markdown"
|
|
);
|
|
assert!(
|
|
md.contains("*emphasis*"),
|
|
"Emphasis text should be formatted as *italic* in markdown"
|
|
);
|
|
assert!(
|
|
md.contains("~~deleted~~"),
|
|
"Strikethrough text should be formatted as ~~strikethrough~~ in markdown"
|
|
);
|
|
assert!(
|
|
md.contains("`code`"),
|
|
"Code text should be wrapped in backticks in markdown"
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_fictionbook_formatting_in_body_paragraphs() {
|
|
let path = test_file_path("basic.fb2");
|
|
let config = ExtractionConfig {
|
|
output_format: OutputFormat::Markdown,
|
|
..Default::default()
|
|
};
|
|
let result = kreuzberg::extract_file(&path, None, &config)
|
|
.await
|
|
.expect("Failed to extract FB2 file");
|
|
|
|
let md = &result.content;
|
|
assert!(
|
|
md.contains("*emphasized*"),
|
|
"Emphasis formatting should be preserved in body content"
|
|
);
|
|
assert!(
|
|
md.contains("**strong**"),
|
|
"Strong formatting should be preserved in body content"
|
|
);
|
|
assert!(
|
|
md.contains("`verbatim`"),
|
|
"Code formatting should be preserved in body content"
|
|
);
|
|
}
|