155 lines
5.7 KiB
Rust
155 lines
5.7 KiB
Rust
|
|
//! Regression test for https://github.com/kreuzberg-dev/kreuzberg/issues/706
|
|||
|
|
//!
|
|||
|
|
//! Tesseract OCR was producing corrupted page content: the top-level `content`
|
|||
|
|
//! field contained the coherent HOCR-rendered text followed by a word-by-word
|
|||
|
|
//! dump of every OcrText element, effectively doubling the output.
|
|||
|
|
//!
|
|||
|
|
//! Root cause: `inject_ocr_elements_from_vec` pushed each OcrElement into
|
|||
|
|
//! `InternalDocument::elements` as an `ElementKind::OcrText`. The rendering
|
|||
|
|
//! pipeline (`render_plain`) iterated those elements and appended every word
|
|||
|
|
//! token back into `content`, on top of the already-rendered HOCR string.
|
|||
|
|
//!
|
|||
|
|
//! Fix: OCR elements are now stored directly in `InternalDocument::prebuilt_ocr_elements`
|
|||
|
|
//! (bypassing the rendering pipeline) and page content is set via
|
|||
|
|
//! `InternalDocument::prebuilt_pages` (bypassing the word-grouped fallback in
|
|||
|
|
//! `build_pages`).
|
|||
|
|
|
|||
|
|
#![cfg(feature = "ocr")]
|
|||
|
|
|
|||
|
|
mod helpers;
|
|||
|
|
|
|||
|
|
use helpers::*;
|
|||
|
|
use kreuzberg::core::config::{ExtractionConfig, OcrConfig, PageConfig};
|
|||
|
|
use kreuzberg::extract_file_sync;
|
|||
|
|
|
|||
|
|
/// Content must not be doubled when OCR is enabled.
|
|||
|
|
///
|
|||
|
|
/// Before the fix, `content` contained the HOCR-rendered paragraph text
|
|||
|
|
/// immediately followed by a word-by-word dump of every OcrText element,
|
|||
|
|
/// roughly doubling the word count. After the fix the two representations
|
|||
|
|
/// must be absent: `content` should equal approximately what is in `pages[0].content`.
|
|||
|
|
#[test]
|
|||
|
|
fn test_ocr_content_not_doubled() {
|
|||
|
|
if skip_if_missing("images/test_hello_world.png") {
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
let file_path = get_test_file_path("images/test_hello_world.png");
|
|||
|
|
let config = ExtractionConfig {
|
|||
|
|
ocr: Some(OcrConfig {
|
|||
|
|
backend: "tesseract".to_string(),
|
|||
|
|
language: "eng".to_string(),
|
|||
|
|
..Default::default()
|
|||
|
|
}),
|
|||
|
|
pages: Some(PageConfig {
|
|||
|
|
extract_pages: true,
|
|||
|
|
..Default::default()
|
|||
|
|
}),
|
|||
|
|
use_cache: false,
|
|||
|
|
..Default::default()
|
|||
|
|
};
|
|||
|
|
|
|||
|
|
let result = extract_file_sync(&file_path, None, &config).expect("OCR extraction must succeed");
|
|||
|
|
|
|||
|
|
let content_words: Vec<&str> = result.content.split_whitespace().collect();
|
|||
|
|
|
|||
|
|
// If content is empty there is no text to double — skip the assertion.
|
|||
|
|
if content_words.is_empty() {
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// The pages array must be populated.
|
|||
|
|
let pages = result
|
|||
|
|
.pages
|
|||
|
|
.as_ref()
|
|||
|
|
.expect("pages must be populated when extract_pages=true");
|
|||
|
|
assert!(!pages.is_empty(), "at least one page must be present");
|
|||
|
|
|
|||
|
|
let page_content = &pages[0].content;
|
|||
|
|
let page_words: Vec<&str> = page_content.split_whitespace().collect();
|
|||
|
|
|
|||
|
|
// Core invariant: top-level content word count must be close to page content
|
|||
|
|
// word count. Before the fix, content was roughly 2× the page content because
|
|||
|
|
// the word-element dump was appended after the HOCR text.
|
|||
|
|
//
|
|||
|
|
// Allow a 30 % margin to absorb minor whitespace / formatting differences.
|
|||
|
|
if !page_words.is_empty() {
|
|||
|
|
let ratio = content_words.len() as f64 / page_words.len() as f64;
|
|||
|
|
assert!(
|
|||
|
|
ratio <= 1.3,
|
|||
|
|
"content word count ({}) is more than 30% larger than pages[0].content word count ({}). \
|
|||
|
|
This indicates doubled output — word-token dump appended after HOCR text (issue #706). \
|
|||
|
|
ratio = {:.2}",
|
|||
|
|
content_words.len(),
|
|||
|
|
page_words.len(),
|
|||
|
|
ratio,
|
|||
|
|
);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Secondary invariant: the content string must NOT contain the page content
|
|||
|
|
// verbatim twice in a row (i.e. the string is not literally concatenated with
|
|||
|
|
// itself).
|
|||
|
|
if page_content.trim().len() > 4 {
|
|||
|
|
let trimmed = page_content.trim();
|
|||
|
|
let doubled = format!("{trimmed}{trimmed}");
|
|||
|
|
assert!(
|
|||
|
|
!result.content.contains(doubled.as_str()),
|
|||
|
|
"content appears to contain page text concatenated with itself — doubled output (issue #706)",
|
|||
|
|
);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Page content must match the top-level content (after trimming) when there
|
|||
|
|
/// is only one page, for any image that produces non-empty OCR output.
|
|||
|
|
#[test]
|
|||
|
|
fn test_ocr_page_content_matches_top_level_content() {
|
|||
|
|
if skip_if_missing("images/ocr_image.jpg") {
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|||
|
|
let config = ExtractionConfig {
|
|||
|
|
ocr: Some(OcrConfig {
|
|||
|
|
backend: "tesseract".to_string(),
|
|||
|
|
language: "eng".to_string(),
|
|||
|
|
..Default::default()
|
|||
|
|
}),
|
|||
|
|
pages: Some(PageConfig {
|
|||
|
|
extract_pages: true,
|
|||
|
|
..Default::default()
|
|||
|
|
}),
|
|||
|
|
use_cache: false,
|
|||
|
|
..Default::default()
|
|||
|
|
};
|
|||
|
|
|
|||
|
|
let result = extract_file_sync(&file_path, None, &config).expect("OCR extraction must succeed");
|
|||
|
|
|
|||
|
|
if result.content.trim().is_empty() {
|
|||
|
|
// No text detected — nothing to assert.
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
let pages = result
|
|||
|
|
.pages
|
|||
|
|
.as_ref()
|
|||
|
|
.expect("pages must be populated when extract_pages=true");
|
|||
|
|
assert!(!pages.is_empty(), "at least one page must be present");
|
|||
|
|
|
|||
|
|
// For a single-page image the page content must not be dramatically shorter
|
|||
|
|
// than the top-level content. Before the fix, top-level content was bloated
|
|||
|
|
// with the word dump while page content was absent or minimal.
|
|||
|
|
let top_words = result.content.split_whitespace().count();
|
|||
|
|
let page_words = pages[0].content.split_whitespace().count();
|
|||
|
|
|
|||
|
|
if top_words > 0 && page_words > 0 {
|
|||
|
|
let ratio = top_words as f64 / page_words.max(1) as f64;
|
|||
|
|
assert!(
|
|||
|
|
ratio <= 1.3,
|
|||
|
|
"top-level content ({} words) is more than 30% larger than pages[0].content ({} words). \
|
|||
|
|
Indicates word-dump appended to top-level content but missing from page — issue #706.",
|
|||
|
|
top_words,
|
|||
|
|
page_words,
|
|||
|
|
);
|
|||
|
|
}
|
|||
|
|
}
|