Files
fil/crates/kreuzberg/tests/rendering_integration_tests.rs

405 lines
13 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
//! Integration tests for the new rendering layer.
//!
//! These tests construct `InternalDocument` instances via the builder API,
//! run them through `derive_extraction_result`, and verify the rendered
//! output in each supported `OutputFormat`.
//!
//! Usage:
//! cargo test -p kreuzberg --test rendering_integration_tests
mod helpers;
use kreuzberg::core::config::OutputFormat;
use kreuzberg::extraction::derive::derive_extraction_result;
use kreuzberg::types::document_structure::{AnnotationKind, TextAnnotation};
use kreuzberg::types::internal_builder::InternalDocumentBuilder;
// ============================================================================
// Helpers
// ============================================================================
/// Build a rich document containing a heading, paragraph, list, code block,
/// and table — the structural elements every format must handle.
fn build_rich_document() -> kreuzberg::types::internal::InternalDocument {
let mut b = InternalDocumentBuilder::new("test");
b.push_heading(1, "Main Heading", None, None);
b.push_paragraph("This is a paragraph with some text.", vec![], None, None);
b.push_list(false);
b.push_list_item("First item", false, vec![], None, None);
b.push_list_item("Second item", false, vec![], None, None);
b.push_list_item("Third item", false, vec![], None, None);
b.end_list();
b.push_code("fn main() {\n println!(\"hello\");\n}", Some("rust"), None, None);
b.push_table_from_cells(
&[
vec!["Name".to_string(), "Value".to_string()],
vec!["alpha".to_string(), "1".to_string()],
vec!["beta".to_string(), "2".to_string()],
],
None,
None,
);
b.build()
}
/// Derive an `ExtractionResult` from a document in the given format.
fn derive(
doc: kreuzberg::types::internal::InternalDocument,
format: OutputFormat,
) -> kreuzberg::types::ExtractionResult {
derive_extraction_result(doc, false, format)
}
/// Return the "effective content" — `formatted_content` when present,
/// otherwise the plain-text `content`.
fn effective_content(result: &kreuzberg::types::ExtractionResult) -> &str {
result.formatted_content.as_deref().unwrap_or(&result.content)
}
// ============================================================================
// 1. Markdown output preserves structure
// ============================================================================
#[tokio::test]
async fn test_markdown_output_preserves_structure() {
let doc = build_rich_document();
let result = derive(doc, OutputFormat::Markdown);
let md = effective_content(&result);
// Heading
assert!(
md.contains("# Main Heading"),
"Markdown should contain an ATX heading, got:\n{md}"
);
// Paragraph
assert!(
md.contains("This is a paragraph"),
"Markdown should contain the paragraph text"
);
// List items
assert!(md.contains("First item"), "Markdown should contain list items");
// Code block
assert!(md.contains("```"), "Markdown should contain a fenced code block");
assert!(md.contains("fn main()"), "Markdown code block should contain the code");
// Table (pipe-delimited)
assert!(md.contains('|'), "Markdown should contain pipe-delimited table syntax");
assert!(md.contains("Name"), "Markdown table should contain header cells");
}
// ============================================================================
// 2. Djot output format through pipeline
// ============================================================================
#[tokio::test]
async fn test_djot_output_preserves_structure() {
let doc = build_rich_document();
let result = derive(doc, OutputFormat::Djot);
let djot = effective_content(&result);
// Djot headings use `#` just like markdown
assert!(
djot.contains("# Main Heading"),
"Djot should contain a heading, got:\n{djot}"
);
// Paragraph text
assert!(
djot.contains("This is a paragraph"),
"Djot should contain the paragraph text"
);
// Code (djot uses ``` fences too)
assert!(djot.contains("fn main()"), "Djot should contain the code content");
}
// ============================================================================
// 3. HTML output format through pipeline
// ============================================================================
#[tokio::test]
async fn test_html_output_preserves_structure() {
let doc = build_rich_document();
let result = derive(doc, OutputFormat::Html);
let html = effective_content(&result);
assert!(html.contains("<h1"), "HTML should contain an h1 tag, got:\n{html}");
assert!(html.contains("Main Heading"), "HTML h1 should contain the heading text");
assert!(html.contains("<p"), "HTML should contain paragraph tags");
assert!(html.contains("<li"), "HTML should contain list item tags");
assert!(
html.contains("<code") || html.contains("<pre"),
"HTML should contain code/pre tags"
);
assert!(
html.contains("<table") || html.contains("<th") || html.contains("<td"),
"HTML should contain table markup"
);
}
// ============================================================================
// 4. Plain text output through pipeline
// ============================================================================
#[tokio::test]
async fn test_plain_text_output_has_no_formatting() {
let doc = build_rich_document();
let result = derive(doc, OutputFormat::Plain);
// For plain text the formatted_content should be None
assert!(
result.formatted_content.is_none(),
"Plain format should not set formatted_content"
);
let plain = &result.content;
// Should contain the words
assert!(plain.contains("Main Heading"), "Plain text should contain heading text");
assert!(
plain.contains("This is a paragraph"),
"Plain text should contain paragraph text"
);
assert!(plain.contains("First item"), "Plain text should contain list items");
assert!(plain.contains("fn main()"), "Plain text should contain code content");
// Should NOT contain markdown/html formatting
assert!(!plain.contains("<h1"), "Plain text should not contain HTML tags");
assert!(
!plain.lines().any(|l| l.starts_with("# ")),
"Plain text should not contain markdown heading syntax"
);
}
// ============================================================================
// 5. Format switching consistency
// ============================================================================
#[tokio::test]
async fn test_format_switching_consistency() {
// Render the same logical document to every format and check that the
// plain-text words are present in all of them.
let formats = [
OutputFormat::Plain,
OutputFormat::Markdown,
OutputFormat::Djot,
OutputFormat::Html,
];
let expected_words = [
"Main Heading",
"paragraph",
"First item",
"Second item",
"Third item",
"fn main()",
"alpha",
"beta",
];
for format in &formats {
let doc = build_rich_document();
let result = derive(doc, format.clone());
let text = effective_content(&result);
for word in &expected_words {
assert!(
text.contains(word),
"Format {format:?} should contain \"{word}\" but output was:\n{text}"
);
}
}
}
// ============================================================================
// 6. Footnote rendering end-to-end
// ============================================================================
#[tokio::test]
async fn test_footnote_rendering_end_to_end() {
let mut b = InternalDocumentBuilder::new("test");
b.push_paragraph("Text with a footnote reference.", vec![], None, None);
b.push_footnote_ref("1", "fn1", None);
b.push_paragraph("More text after the reference.", vec![], None, None);
b.push_footnote_definition("This is the footnote content.", "fn1", None);
let doc = b.build();
let result = derive(doc, OutputFormat::Markdown);
let md = effective_content(&result);
// The footnote reference marker should appear
assert!(
md.contains("[^") || md.contains("fn1") || md.contains("[1]"),
"Markdown should contain a footnote reference marker, got:\n{md}"
);
// The footnote definition content should appear
assert!(
md.contains("This is the footnote content"),
"Markdown should contain the footnote definition text, got:\n{md}"
);
}
// ============================================================================
// 7. Annotation rendering end-to-end
// ============================================================================
#[tokio::test]
async fn test_annotation_rendering_end_to_end() {
let mut b = InternalDocumentBuilder::new("test");
// "Hello bold world" with "bold" annotated as Bold (bytes 6..10)
let bold_text = "Hello bold world";
b.push_paragraph(
bold_text,
vec![TextAnnotation {
start: 6,
end: 10,
kind: AnnotationKind::Bold,
}],
None,
None,
);
// "Some italic text" with "italic" annotated as Italic (bytes 5..11)
let italic_text = "Some italic text";
b.push_paragraph(
italic_text,
vec![TextAnnotation {
start: 5,
end: 11,
kind: AnnotationKind::Italic,
}],
None,
None,
);
// "Click here for info" with "here" as a Link (bytes 6..10)
let link_text = "Click here for info";
b.push_paragraph(
link_text,
vec![TextAnnotation {
start: 6,
end: 10,
kind: AnnotationKind::Link {
url: "https://example.com".to_string(),
title: None,
},
}],
None,
None,
);
let doc = b.build();
let result = derive(doc, OutputFormat::Markdown);
let md = effective_content(&result);
// Bold: **bold**
assert!(
md.contains("**bold**"),
"Markdown should render bold annotation as **bold**, got:\n{md}"
);
// Italic: *italic*
assert!(
md.contains("*italic*"),
"Markdown should render italic annotation as *italic*, got:\n{md}"
);
// Link: [here](https://example.com)
assert!(
md.contains("[here](https://example.com)"),
"Markdown should render link annotation, got:\n{md}"
);
}
// ============================================================================
// 8. Empty document handling
// ============================================================================
#[tokio::test]
async fn test_empty_document_handling() {
let formats = [
OutputFormat::Plain,
OutputFormat::Markdown,
OutputFormat::Djot,
OutputFormat::Html,
];
for format in &formats {
let b = InternalDocumentBuilder::new("test");
let doc = b.build();
// Should not panic
let result = derive(doc, format.clone());
// Content can be empty or whitespace-only — the key is no panic
let text = effective_content(&result);
assert!(
text.len() < 100,
"Empty document in {format:?} should produce minimal output, got {} chars",
text.len()
);
}
}
// ============================================================================
// 9. Large document performance smoke test
// ============================================================================
#[tokio::test]
async fn test_large_document_renders_without_timeout() {
let mut b = InternalDocumentBuilder::new("test");
b.push_heading(1, "Large Document", None, None);
for i in 0..1000 {
b.push_paragraph(
&format!("Paragraph number {i} with some filler text to make it realistic."),
vec![],
None,
None,
);
}
let _doc = b.build();
// Render to each format — this is a smoke test; we just verify it completes.
let formats = [
OutputFormat::Plain,
OutputFormat::Markdown,
OutputFormat::Djot,
OutputFormat::Html,
];
for format in &formats {
// Clone the doc elements into a new document for each format
let mut b2 = InternalDocumentBuilder::new("test");
b2.push_heading(1, "Large Document", None, None);
for i in 0..1000 {
b2.push_paragraph(
&format!("Paragraph number {i} with some filler text to make it realistic."),
vec![],
None,
None,
);
}
let doc2 = b2.build();
let result = derive(doc2, format.clone());
let text = effective_content(&result);
assert!(
text.contains("Paragraph number 0"),
"{format:?}: should contain first paragraph"
);
assert!(
text.contains("Paragraph number 999"),
"{format:?}: should contain last paragraph"
);
assert!(
text.len() > 50_000,
"{format:?}: 1000-paragraph document should produce substantial output, got {} bytes",
text.len()
);
}
}