This commit is contained in:
201
crates/kreuzberg/tests/fictionbook_extractor_tests.rs
Normal file
201
crates/kreuzberg/tests/fictionbook_extractor_tests.rs
Normal file
@@ -0,0 +1,201 @@
|
||||
#![cfg(feature = "office")]
|
||||
|
||||
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Helper to get absolute path to test documents
|
||||
fn test_file_path(filename: &str) -> PathBuf {
|
||||
let manifest_dir = env!("CARGO_MANIFEST_DIR");
|
||||
PathBuf::from(manifest_dir)
|
||||
.parent()
|
||||
.expect("Operation failed")
|
||||
.parent()
|
||||
.expect("Operation failed")
|
||||
.join("test_documents")
|
||||
.join("fictionbook")
|
||||
.join(filename)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fictionbook_extract_metadata_title() {
|
||||
let path = test_file_path("meta.fb2");
|
||||
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
||||
.await
|
||||
.expect("Failed to extract FB2 file");
|
||||
|
||||
assert!(
|
||||
result.content.contains("Book title"),
|
||||
"Book title should be extracted from FB2 content"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fictionbook_extract_metadata_genre() {
|
||||
let path = test_file_path("meta.fb2");
|
||||
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
||||
.await
|
||||
.expect("Failed to extract FB2 file");
|
||||
|
||||
assert!(result.metadata.subject.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fictionbook_extract_content_sections() {
|
||||
let path = test_file_path("titles.fb2");
|
||||
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
||||
.await
|
||||
.expect("Failed to extract FB2 file");
|
||||
|
||||
assert!(
|
||||
result.content.contains("Simple title"),
|
||||
"Section titles should be extracted"
|
||||
);
|
||||
assert!(
|
||||
result.content.contains("Emphasized"),
|
||||
"Section with emphasis should be extracted"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fictionbook_extract_section_hierarchy() {
|
||||
let path = test_file_path("basic.fb2");
|
||||
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
||||
.await
|
||||
.expect("Failed to extract FB2 file");
|
||||
|
||||
assert!(
|
||||
result.content.contains("Top-level title"),
|
||||
"Top-level section should be extracted"
|
||||
);
|
||||
assert!(result.content.contains("Section"), "Nested section should be extracted");
|
||||
assert!(
|
||||
result.content.contains("Subsection"),
|
||||
"Nested subsection should be extracted"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fictionbook_extract_inline_markup() {
|
||||
let path = test_file_path("emphasis.fb2");
|
||||
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
||||
.await
|
||||
.expect("Failed to extract FB2 file");
|
||||
|
||||
let content = result.content.to_lowercase();
|
||||
assert!(content.contains("plain"), "Plain text should be extracted");
|
||||
assert!(content.contains("strong"), "Strong emphasis should be extracted");
|
||||
assert!(content.contains("emphasis"), "Emphasis should be extracted");
|
||||
assert!(content.contains("strikethrough"), "Strikethrough should be extracted");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fictionbook_extract_emphasis() {
|
||||
let path = test_file_path("basic.fb2");
|
||||
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
||||
.await
|
||||
.expect("Failed to extract FB2 file");
|
||||
|
||||
assert!(
|
||||
result.content.contains("emphasized"),
|
||||
"Emphasized text should be extracted"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fictionbook_extract_strong() {
|
||||
let path = test_file_path("basic.fb2");
|
||||
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
||||
.await
|
||||
.expect("Failed to extract FB2 file");
|
||||
|
||||
assert!(result.content.contains("strong"), "Strong text should be extracted");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fictionbook_extract_code() {
|
||||
let path = test_file_path("basic.fb2");
|
||||
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
||||
.await
|
||||
.expect("Failed to extract FB2 file");
|
||||
|
||||
assert!(result.content.contains("verbatim"), "Code content should be extracted");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fictionbook_extract_blockquote() {
|
||||
let path = test_file_path("basic.fb2");
|
||||
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
||||
.await
|
||||
.expect("Failed to extract FB2 file");
|
||||
|
||||
assert!(result.content.contains("Blockquote"), "Blockquote should be extracted");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fictionbook_extract_tables() {
|
||||
let path = test_file_path("tables.fb2");
|
||||
let result = kreuzberg::extract_file(&path, None, &ExtractionConfig::default())
|
||||
.await
|
||||
.expect("Failed to extract FB2 file");
|
||||
|
||||
assert!(
|
||||
!result.content.is_empty(),
|
||||
"Content should be extracted from file with tables"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fictionbook_markdown_formatting_preservation() {
|
||||
let path = test_file_path("emphasis.fb2");
|
||||
let config = ExtractionConfig {
|
||||
output_format: OutputFormat::Markdown,
|
||||
..Default::default()
|
||||
};
|
||||
let result = kreuzberg::extract_file(&path, None, &config)
|
||||
.await
|
||||
.expect("Failed to extract FB2 file");
|
||||
|
||||
let md = &result.content;
|
||||
assert!(
|
||||
md.contains("**strong**"),
|
||||
"Strong text should be formatted as **bold** in markdown"
|
||||
);
|
||||
assert!(
|
||||
md.contains("*emphasis*"),
|
||||
"Emphasis text should be formatted as *italic* in markdown"
|
||||
);
|
||||
assert!(
|
||||
md.contains("~~deleted~~"),
|
||||
"Strikethrough text should be formatted as ~~strikethrough~~ in markdown"
|
||||
);
|
||||
assert!(
|
||||
md.contains("`code`"),
|
||||
"Code text should be wrapped in backticks in markdown"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_fictionbook_formatting_in_body_paragraphs() {
|
||||
let path = test_file_path("basic.fb2");
|
||||
let config = ExtractionConfig {
|
||||
output_format: OutputFormat::Markdown,
|
||||
..Default::default()
|
||||
};
|
||||
let result = kreuzberg::extract_file(&path, None, &config)
|
||||
.await
|
||||
.expect("Failed to extract FB2 file");
|
||||
|
||||
let md = &result.content;
|
||||
assert!(
|
||||
md.contains("*emphasized*"),
|
||||
"Emphasis formatting should be preserved in body content"
|
||||
);
|
||||
assert!(
|
||||
md.contains("**strong**"),
|
||||
"Strong formatting should be preserved in body content"
|
||||
);
|
||||
assert!(
|
||||
md.contains("`verbatim`"),
|
||||
"Code formatting should be preserved in body content"
|
||||
);
|
||||
}
|
||||
Reference in New Issue
Block a user