Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg/tests/ocr_content_integrity.rs
+++ b/crates/kreuzberg/tests/ocr_content_integrity.rs
@@ -0,0 +1,154 @@
+//! Regression test for https://github.com/kreuzberg-dev/kreuzberg/issues/706
+//!
+//! Tesseract OCR was producing corrupted page content: the top-level `content`
+//! field contained the coherent HOCR-rendered text followed by a word-by-word
+//! dump of every OcrText element, effectively doubling the output.
+//!
+//! Root cause: `inject_ocr_elements_from_vec` pushed each OcrElement into
+//! `InternalDocument::elements` as an `ElementKind::OcrText`. The rendering
+//! pipeline (`render_plain`) iterated those elements and appended every word
+//! token back into `content`, on top of the already-rendered HOCR string.
+//!
+//! Fix: OCR elements are now stored directly in `InternalDocument::prebuilt_ocr_elements`
+//! (bypassing the rendering pipeline) and page content is set via
+//! `InternalDocument::prebuilt_pages` (bypassing the word-grouped fallback in
+//! `build_pages`).
+
+#![cfg(feature = "ocr")]
+
+mod helpers;
+
+use helpers::*;
+use kreuzberg::core::config::{ExtractionConfig, OcrConfig, PageConfig};
+use kreuzberg::extract_file_sync;
+
+/// Content must not be doubled when OCR is enabled.
+///
+/// Before the fix, `content` contained the HOCR-rendered paragraph text
+/// immediately followed by a word-by-word dump of every OcrText element,
+/// roughly doubling the word count.  After the fix the two representations
+/// must be absent: `content` should equal approximately what is in `pages[0].content`.
+#[test]
+fn test_ocr_content_not_doubled() {
+    if skip_if_missing("images/test_hello_world.png") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/test_hello_world.png");
+    let config = ExtractionConfig {
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            language: "eng".to_string(),
+            ..Default::default()
+        }),
+        pages: Some(PageConfig {
+            extract_pages: true,
+            ..Default::default()
+        }),
+        use_cache: false,
+        ..Default::default()
+    };
+
+    let result = extract_file_sync(&file_path, None, &config).expect("OCR extraction must succeed");
+
+    let content_words: Vec<&str> = result.content.split_whitespace().collect();
+
+    // If content is empty there is no text to double — skip the assertion.
+    if content_words.is_empty() {
+        return;
+    }
+
+    // The pages array must be populated.
+    let pages = result
+        .pages
+        .as_ref()
+        .expect("pages must be populated when extract_pages=true");
+    assert!(!pages.is_empty(), "at least one page must be present");
+
+    let page_content = &pages[0].content;
+    let page_words: Vec<&str> = page_content.split_whitespace().collect();
+
+    // Core invariant: top-level content word count must be close to page content
+    // word count.  Before the fix, content was roughly 2× the page content because
+    // the word-element dump was appended after the HOCR text.
+    //
+    // Allow a 30 % margin to absorb minor whitespace / formatting differences.
+    if !page_words.is_empty() {
+        let ratio = content_words.len() as f64 / page_words.len() as f64;
+        assert!(
+            ratio <= 1.3,
+            "content word count ({}) is more than 30% larger than pages[0].content word count ({}). \
+             This indicates doubled output — word-token dump appended after HOCR text (issue #706). \
+             ratio = {:.2}",
+            content_words.len(),
+            page_words.len(),
+            ratio,
+        );
+    }
+
+    // Secondary invariant: the content string must NOT contain the page content
+    // verbatim twice in a row (i.e. the string is not literally concatenated with
+    // itself).
+    if page_content.trim().len() > 4 {
+        let trimmed = page_content.trim();
+        let doubled = format!("{trimmed}{trimmed}");
+        assert!(
+            !result.content.contains(doubled.as_str()),
+            "content appears to contain page text concatenated with itself — doubled output (issue #706)",
+        );
+    }
+}
+
+/// Page content must match the top-level content (after trimming) when there
+/// is only one page, for any image that produces non-empty OCR output.
+#[test]
+fn test_ocr_page_content_matches_top_level_content() {
+    if skip_if_missing("images/ocr_image.jpg") {
+        return;
+    }
+
+    let file_path = get_test_file_path("images/ocr_image.jpg");
+    let config = ExtractionConfig {
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            language: "eng".to_string(),
+            ..Default::default()
+        }),
+        pages: Some(PageConfig {
+            extract_pages: true,
+            ..Default::default()
+        }),
+        use_cache: false,
+        ..Default::default()
+    };
+
+    let result = extract_file_sync(&file_path, None, &config).expect("OCR extraction must succeed");
+
+    if result.content.trim().is_empty() {
+        // No text detected — nothing to assert.
+        return;
+    }
+
+    let pages = result
+        .pages
+        .as_ref()
+        .expect("pages must be populated when extract_pages=true");
+    assert!(!pages.is_empty(), "at least one page must be present");
+
+    // For a single-page image the page content must not be dramatically shorter
+    // than the top-level content. Before the fix, top-level content was bloated
+    // with the word dump while page content was absent or minimal.
+    let top_words = result.content.split_whitespace().count();
+    let page_words = pages[0].content.split_whitespace().count();
+
+    if top_words > 0 && page_words > 0 {
+        let ratio = top_words as f64 / page_words.max(1) as f64;
+        assert!(
+            ratio <= 1.3,
+            "top-level content ({} words) is more than 30% larger than pages[0].content ({} words). \
+             Indicates word-dump appended to top-level content but missing from page — issue #706.",
+            top_words,
+            page_words,
+        );
+    }
+}