//! Integration tests for image path resolution in markup extractors.
|
|
|
|
use kreuzberg::ExtractionConfig;
|
|
use kreuzberg::ImageExtractionConfig;
|
|
use std::path::PathBuf;
|
|
|
|
fn fixtures_dir() -> PathBuf {
|
|
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/path_resolution/fixtures")
|
|
}
|
|
|
|
fn config_with_images() -> ExtractionConfig {
|
|
ExtractionConfig {
|
|
images: Some(ImageExtractionConfig {
|
|
extract_images: true,
|
|
..Default::default()
|
|
}),
|
|
..Default::default()
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_markdown_resolves_images() {
|
|
let path = fixtures_dir().join("markdown_with_images.md");
|
|
let config = config_with_images();
|
|
let result = kreuzberg::extract_file(&path, None, &config).await.unwrap();
|
|
|
|
let images = result.images.as_ref().expect("should have images");
|
|
// Should resolve the 2 local images but NOT the https:// URL
|
|
assert_eq!(images.len(), 2, "expected 2 resolved images, got {}", images.len());
|
|
|
|
// Verify image data is non-empty
|
|
for img in images {
|
|
assert!(!img.data.is_empty(), "image data should not be empty");
|
|
assert_eq!(img.format, "png");
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_markdown_bytes_no_resolution() {
|
|
let path = fixtures_dir().join("markdown_with_images.md");
|
|
let content = std::fs::read(&path).unwrap();
|
|
let config = config_with_images();
|
|
let result = kreuzberg::extract_bytes(&content, "text/markdown", &config)
|
|
.await
|
|
.unwrap();
|
|
|
|
// extract_bytes has no file path context, so no image resolution should happen
|
|
let image_count = result.images.as_ref().map_or(0, |imgs| imgs.len());
|
|
assert_eq!(image_count, 0, "extract_bytes should not resolve local images");
|
|
}
|
|
|
|
/// Extracting the LaTeX fixture from a file path must yield exactly two images.
///
/// No MIME type is passed, so the extractor is chosen by kreuzberg itself.
/// Compiled only when the `office` feature is enabled.
#[cfg(feature = "office")]
#[tokio::test]
async fn test_latex_resolves_images() {
    let fixture = fixtures_dir().join("latex_with_images.tex");
    let settings = config_with_images();
    let extraction = kreuzberg::extract_file(&fixture, None, &settings)
        .await
        .unwrap();

    let resolved = extraction.images.as_ref().expect("should have images");
    let count = resolved.len();
    assert_eq!(count, 2, "expected 2 resolved images, got {}", count);
}
|
|
|
|
/// Extracting the reStructuredText fixture from a file path must yield exactly
/// two images.
///
/// The MIME type `text/x-rst` is passed explicitly. Compiled only when the
/// `office` feature is enabled.
#[cfg(feature = "office")]
#[tokio::test]
async fn test_rst_resolves_images() {
    let fixture = fixtures_dir().join("rst_with_images.rst");
    let settings = config_with_images();
    let extraction = kreuzberg::extract_file(&fixture, Some("text/x-rst"), &settings)
        .await
        .unwrap();

    let resolved = extraction.images.as_ref().expect("should have images");
    let count = resolved.len();
    assert_eq!(count, 2, "expected 2 resolved images, got {}", count);
}
|
|
|
|
/// Extracting the Org-mode fixture from a file path must yield exactly two
/// images.
///
/// The MIME type `text/x-org` is passed explicitly. Compiled only when the
/// `office` feature is enabled.
#[cfg(feature = "office")]
#[tokio::test]
async fn test_orgmode_resolves_images() {
    let fixture = fixtures_dir().join("orgmode_with_images.org");
    let settings = config_with_images();
    let extraction = kreuzberg::extract_file(&fixture, Some("text/x-org"), &settings)
        .await
        .unwrap();

    let resolved = extraction.images.as_ref().expect("should have images");
    let count = resolved.len();
    assert_eq!(count, 2, "expected 2 resolved images, got {}", count);
}
|
|
|
|
/// Extracting the Typst fixture from a file path must yield exactly two images.
///
/// The MIME type `application/x-typst` is passed explicitly. Compiled only
/// when the `office` feature is enabled.
#[cfg(feature = "office")]
#[tokio::test]
async fn test_typst_resolves_images() {
    let fixture = fixtures_dir().join("typst_with_images.typ");
    let settings = config_with_images();
    let extraction = kreuzberg::extract_file(&fixture, Some("application/x-typst"), &settings)
        .await
        .unwrap();

    let resolved = extraction.images.as_ref().expect("should have images");
    let count = resolved.len();
    assert_eq!(count, 2, "expected 2 resolved images, got {}", count);
}
|
|
|
|
#[tokio::test]
|
|
async fn test_djot_resolves_images() {
|
|
let path = fixtures_dir().join("djot_with_images.djot");
|
|
let config = config_with_images();
|
|
let result = kreuzberg::extract_file(&path, Some("text/djot"), &config)
|
|
.await
|
|
.unwrap();
|
|
|
|
let images = result.images.as_ref().expect("should have images");
|
|
assert_eq!(images.len(), 2, "expected 2 resolved images, got {}", images.len());
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_traversal_blocked() {
|
|
// Create a temp markdown file that references a traversal path
|
|
let tmp_dir = std::env::temp_dir().join("kreuzberg_path_test");
|
|
std::fs::create_dir_all(&tmp_dir).unwrap();
|
|
let md_path = tmp_dir.join("traversal.md");
|
|
std::fs::write(&md_path, "\n\n").unwrap();
|
|
|
|
let config = config_with_images();
|
|
let result = kreuzberg::extract_file(&md_path, Some("text/markdown"), &config)
|
|
.await
|
|
.unwrap();
|
|
|
|
// Neither should resolve: traversal is blocked, and images/ doesn't exist in tmp
|
|
let image_count = result.images.as_ref().map_or(0, |imgs| imgs.len());
|
|
assert_eq!(image_count, 0, "traversal paths should not resolve to images");
|
|
|
|
// Cleanup
|
|
let _ = std::fs::remove_dir_all(&tmp_dir);
|
|
}
|