Files
fil/crates/kreuzberg/tests/pdf_glyph_spacing_issue_962.rs

420 lines
16 KiB
Rust
Raw Normal View History

2026-06-01 23:40:55 +02:00
//! Regression tests for issue #962 — glyph-spaced PDF text fragmented one
//! character per line.
//!
//! When a PDF positions each character via a separate `BT … ET` block with a
//! sinusoidal y-jitter, pdf_oxide's ColumnAware reading order groups spans by
//! y-level rather than reading order, producing single-character spans that each
//! land on their own output line. Microsoft Word triggers this pattern for
//! "broken image" placeholder text
//! (`Het afbeelding onderdeel met relatie-id … is niet aangetroffen`).
//!
//! Fix: `oxide/text.rs` detects the fragmentation signature (≥ 3 same-line
//! x-disorder events among short spans; see `pdf::structure::constants`) and
//! rebuilds page text from span positions: sort by y-descending, group by
//! y-proximity, sort each group by x, insert spaces at word gaps.
#![cfg(feature = "pdf")]
use kreuzberg::{ExtractionConfig, extract_bytes_sync};
/// Build a minimal single-page PDF whose content stream places each character
/// of a fixed test word in its own `BT … ET` block via an absolute `Tm`
/// operator.
///
/// Glyph `i` is drawn at `x = X_START + i * X_STEP` and
/// `y = Y_BASE + sin(TAU * i / JITTER_PERIOD) * jitter_pt`, i.e. the baseline
/// oscillates sinusoidally with amplitude `jitter_pt` and a period of
/// `JITTER_PERIOD` glyphs, replicating the pattern Microsoft Word emits for
/// broken-image placeholder text.
///
/// pdf_oxide's ColumnAware mode groups these single-character spans by y-level,
/// producing out-of-reading-order output that the fragmentation repair path
/// detects and corrects.
///
/// The surrounding PDF structure is produced by [`assemble_single_page_pdf`].
fn make_glyph_jitter_pdf(jitter_pt: f32) -> Vec<u8> {
    const TEXT: &str = "Hetafbeeldingisnietsaangetroffen";
    const FONT_SIZE: f32 = 12.0;
    const JITTER_PERIOD: usize = 6;
    const X_START: f32 = 72.0;
    const X_STEP: f32 = 7.0;
    const Y_BASE: f32 = 700.0;

    let mut stream = String::new();
    for (i, ch) in TEXT.chars().enumerate() {
        let x = X_START + i as f32 * X_STEP;
        let angle = std::f64::consts::TAU * i as f64 / JITTER_PERIOD as f64;
        let y = Y_BASE + angle.sin() as f32 * jitter_pt;

        stream.push_str(&format!("BT /F1 {FONT_SIZE} Tf 1 0 0 1 {x:.2} {y:.4} Tm ("));
        // `(`, `)` and `\` are special inside a PDF literal string and must be
        // preceded by a backslash.
        if matches!(ch, '(' | ')' | '\\') {
            stream.push('\\');
        }
        stream.push(ch);
        stream.push_str(") Tj ET\n");
    }

    assemble_single_page_pdf(&stream)
}

/// Count the lines of `text` that contain exactly one character once leading
/// and trailing whitespace is trimmed.
fn count_single_char_lines(text: &str) -> usize {
    text.lines().filter(|l| l.trim().chars().count() == 1).count()
}

/// Shared helper: build a minimal single-page PDF from a ready-made content
/// stream string.
///
/// The file consists of five indirect objects — catalog (1), page tree (2),
/// page (3), content stream (4) and a Helvetica Type1 font registered as `/F1`
/// (5) — followed by a classic cross-reference table and trailer. The page uses
/// a 612 × 792 pt media box.
///
/// Every cross-reference entry is exactly 20 bytes long (`SP LF` terminator),
/// as the PDF specification requires, so parsers that seek to
/// `xref_offset + 20 * n` read the table correctly.
fn assemble_single_page_pdf(stream: &str) -> Vec<u8> {
    // `/Length` counts only the stream data; the newline before `endstream`
    // is an end-of-line marker and is not part of the data.
    let contents = format!("<< /Length {} >>\nstream\n{stream}\nendstream", stream.len());

    // Object bodies in object-number order (object N is `objects[N - 1]`).
    let objects: [&str; 5] = [
        "<< /Type /Catalog /Pages 2 0 R >>",
        "<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
        "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]\n /Contents 4 0 R \
         /Resources << /Font << /F1 5 0 R >> >> >>",
        &contents,
        "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
    ];

    // Everything written is UTF-8 text, so a `String` can be used as the byte
    // buffer: `String::len` is the byte offset needed for the xref table.
    let mut pdf = String::from("%PDF-1.4\n");
    let mut offsets = Vec::with_capacity(objects.len());
    for (index, body) in objects.iter().enumerate() {
        offsets.push(pdf.len());
        pdf.push_str(&format!("{} 0 obj\n{body}\nendobj\n", index + 1));
    }

    // The xref section covers object 0 (head of the free list) plus the objects above.
    let entry_count = objects.len() + 1;
    let xref_off = pdf.len();
    pdf.push_str(&format!("xref\n0 {entry_count}\n"));
    pdf.push_str("0000000000 65535 f \n");
    for off in &offsets {
        pdf.push_str(&format!("{off:010} 00000 n \n"));
    }
    pdf.push_str(&format!(
        "trailer\n<< /Size {entry_count} /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n"
    ));

    pdf.into_bytes()
}
/// Fixture: two text lines 30 pt apart vertically, which the original note
/// describes as well above the coalesce threshold. Used to check that
/// multi-line content is still reported as two lines.
///
/// * y = 700: "FirstLine"
/// * y = 670: "SecondLine"
///
/// No jitter; each line is a single `BT … ET` block positioned with an
/// absolute `Tm`.
fn make_two_line_pdf() -> Vec<u8> {
    let blocks = [
        "BT /F1 12 Tf 1 0 0 1 72.00 700.00 Tm (FirstLine) Tj ET\n",
        "BT /F1 12 Tf 1 0 0 1 72.00 670.00 Tm (SecondLine) Tj ET\n",
    ];
    assemble_single_page_pdf(&blocks.concat())
}
/// Fixture: two words on one baseline with a wide horizontal gap between them
/// (the original note gives the space-insertion threshold as
/// `font_size * 0.5`). Used to check that a separator is emitted between words.
///
/// "Hello" starts at x = 72 and "World" at x = 300, both at y = 700 with no
/// jitter. Each word is positioned with an absolute `Tm` so the span positions
/// are unambiguous.
fn make_word_gap_pdf() -> Vec<u8> {
    const CONTENT: &str = concat!(
        "BT /F1 12 Tf 1 0 0 1 72.00 700.00 Tm (Hello) Tj ET\n",
        "BT /F1 12 Tf 1 0 0 1 300.00 700.00 Tm (World) Tj ET\n",
    );
    assemble_single_page_pdf(CONTENT)
}
/// Fixture: ordinary word-level text — one `BT … ET` block containing the whole
/// sentence in a single `Tj`, with no per-glyph positioning and no jitter.
///
/// Per the original note, the fragmentation detector
/// (`is_fragmented_span_list`) is expected to return false for this input, so
/// the extracted content must come through unchanged.
fn make_normal_prose_pdf() -> Vec<u8> {
    const CONTENT: &str = "BT /F1 12 Tf 72 700 Td (The quick brown fox) Tj ET\n";
    assemble_single_page_pdf(CONTENT)
}
/// 3.5 pt jitter: the extracted text may contain at most 4 single-character
/// lines. A handful of isolated characters is tolerated; a long run of them
/// (the original report saw 18) means the regression is back.
#[test]
fn test_3_5pt_jitter_coalesced() {
    let fixture = make_glyph_jitter_pdf(3.5);
    let config = ExtractionConfig::default();
    let extracted = extract_bytes_sync(&fixture, "application/pdf", &config)
        .expect("3.5 pt jitter PDF should extract without error");

    let content = extracted.content.trim();
    let single_char_lines = count_single_char_lines(content);
    assert!(
        single_char_lines < 5,
        "issue #962 regression (3.5 pt): {single_char_lines} single-char lines after fix.\n\
         Content: {content:?}"
    );
}
/// 4.0 pt jitter: held to the same bound as the 3.5 pt case — fewer than 5
/// single-character lines in the extracted text.
#[test]
fn test_4_0pt_jitter_coalesced() {
    let fixture = make_glyph_jitter_pdf(4.0);
    let config = ExtractionConfig::default();
    let extracted = extract_bytes_sync(&fixture, "application/pdf", &config)
        .expect("4.0 pt jitter PDF should extract without error");

    let content = extracted.content.trim();
    let single_char_lines = count_single_char_lines(content);
    assert!(
        single_char_lines < 5,
        "issue #962 regression (4.0 pt): {single_char_lines} single-char lines after fix.\n\
         Content: {content:?}"
    );
}
/// 3.0 pt jitter: the original note says this amplitude was already coalesced
/// before the fix (attributed there to pdfium), so the repair path must not
/// make it worse. Checks the same single-character-line bound as the other
/// jitter tests and additionally that some text is produced at all.
#[test]
fn test_3_0pt_jitter_unchanged() {
    let fixture = make_glyph_jitter_pdf(3.0);
    let config = ExtractionConfig::default();
    let extracted = extract_bytes_sync(&fixture, "application/pdf", &config)
        .expect("3.0 pt jitter PDF should extract without error");

    let content = extracted.content.trim();
    let single_char_lines = count_single_char_lines(content);
    assert!(
        single_char_lines < 5,
        "3.0 pt jitter (already-coalesced) regressed: {single_char_lines} single-char lines.\n\
         Content: {content:?}"
    );
    assert!(!content.is_empty(), "3.0 pt jitter PDF must produce non-empty content");
}
/// Smoke test: every generated jitter fixture must extract successfully and
/// yield non-blank content.
#[test]
fn test_all_fixtures_loadable() {
    let cases: [(&str, f32); 3] = [("3.5pt", 3.5), ("4.0pt", 4.0), ("3.0pt", 3.0)];
    let config = ExtractionConfig::default();

    for (label, jitter) in cases {
        let fixture = make_glyph_jitter_pdf(jitter);
        let outcome = extract_bytes_sync(&fixture, "application/pdf", &config);
        assert!(
            outcome.is_ok(),
            "[{label}] extraction should not error: {:?}",
            outcome.err()
        );

        // Cannot fail: the assertion above already established `Ok`.
        let extracted = outcome.unwrap();
        let has_text = !extracted.content.trim().is_empty();
        assert!(has_text, "[{label}] must produce non-empty content");
    }
}
/// The rebuilt text must preserve reading order, not merely reduce line count.
///
/// The fixture word is "Hetafbeeldingisnietsaangetroffen" (32 chars). The
/// repair path may inject whitespace between glyphs, so all whitespace is
/// removed before comparing. The assertion then requires the leading
/// `Hetafbeelding` portion to appear contiguously and in order; the remainder
/// of the word is not checked here.
#[test]
fn test_coalesced_content_is_coherent() {
    let fixture = make_glyph_jitter_pdf(3.5);
    let config = ExtractionConfig::default();
    let extracted = extract_bytes_sync(&fixture, "application/pdf", &config)
        .expect("3.5 pt jitter PDF should extract without error");
    let content = extracted.content.trim();

    // Concatenating the whitespace-separated pieces drops every whitespace
    // character (spaces and line breaks alike) while keeping glyph order.
    let squeezed: String = content.split_whitespace().collect();
    assert!(
        squeezed.contains("Hetafbeelding"),
        "coalesced content must contain the leading chars of the original word; got: {content:?}"
    );
}
/// Two genuine text lines separated by a 30 pt vertical gap must still come
/// out as (at least) two lines — the repair path must not merge them.
#[test]
fn test_two_line_pdf_stays_two_lines() {
    let fixture = make_two_line_pdf();
    let config = ExtractionConfig::default();
    let extracted = extract_bytes_sync(&fixture, "application/pdf", &config)
        .expect("two-line PDF should extract without error");
    let content = extracted.content.trim();

    for expected in ["FirstLine", "SecondLine"] {
        assert!(
            content.contains(expected),
            "output must contain '{expected}'; got: {content:?}"
        );
    }

    // Both strings being present is not enough: they must sit on separate lines.
    let line_count = content.lines().count();
    assert!(
        line_count >= 2,
        "two-line PDF must produce at least 2 output lines; got {line_count}: {content:?}"
    );
}
/// Two words on the same baseline with a wide horizontal gap must be emitted
/// with a separator between them rather than glued together.
#[test]
fn test_word_gap_produces_space() {
    let fixture = make_word_gap_pdf();
    let config = ExtractionConfig::default();
    let extracted = extract_bytes_sync(&fixture, "application/pdf", &config)
        .expect("word-gap PDF should extract without error");
    let content = extracted.content.trim();

    for word in ["Hello", "World"] {
        assert!(
            content.contains(word),
            "output must contain '{word}'; got: {content:?}"
        );
    }

    // Either a single space or a line break is accepted as the separator.
    let separated = ["Hello World", "Hello\nWorld"]
        .into_iter()
        .any(|joined| content.contains(joined));
    assert!(
        separated,
        "output must have 'Hello World' or 'Hello\\nWorld'; got: {content:?}"
    );
}
/// Ordinary word-level prose must pass through intact: non-empty, still
/// containing the word "quick", and with fewer than 2 single-character lines.
#[test]
fn test_normal_prose_not_disturbed() {
    let fixture = make_normal_prose_pdf();
    let config = ExtractionConfig::default();
    let extracted =
        extract_bytes_sync(&fixture, "application/pdf", &config).expect("normal prose should extract");
    let content = extracted.content.trim();

    assert!(!content.is_empty(), "normal prose must produce non-empty content");
    assert!(content.contains("quick"), "must include 'quick'; got: {content:?}");

    let fragmented_lines = count_single_char_lines(content);
    assert!(fragmented_lines < 2, "prose must not fragment; got: {content:?}");
}
/// The repair must also take effect when per-page extraction is switched on
/// (`PageConfig::extract_pages = true`), and the result must then carry page
/// data.
#[test]
fn test_fix_applies_with_page_tracking() {
    use kreuzberg::PageConfig;

    let page_config = PageConfig {
        extract_pages: true,
        ..Default::default()
    };
    let config = ExtractionConfig {
        pages: Some(page_config),
        ..Default::default()
    };

    let fixture = make_glyph_jitter_pdf(3.5);
    let extracted =
        extract_bytes_sync(&fixture, "application/pdf", &config).expect("page tracking extract");
    let content = extracted.content.trim();

    let fragmented_lines = count_single_char_lines(content);
    assert!(fragmented_lines < 5, "page tracking fix failed; got: {content:?}");
    assert!(extracted.pages.is_some(), "page tracking must populate pages");
}
/// 5 pt jitter — the largest amplitude exercised in this file — must be
/// coalesced too: fewer than 5 single-character lines may remain.
#[test]
fn test_5pt_jitter_coalesced() {
    let fixture = make_glyph_jitter_pdf(5.0);
    let config = ExtractionConfig::default();
    let extracted = extract_bytes_sync(&fixture, "application/pdf", &config).expect("5pt extract");
    let content = extracted.content.trim();

    let fragmented_lines = count_single_char_lines(content);
    assert!(fragmented_lines < 5, "5pt jitter not coalesced; got: {content:?}");
}
/// Negative regression: content that really is one character per line (e.g. a
/// vertical column label, a stacked formula subscript, or a CJK-like layout)
/// must survive extraction as separate lines — the fragmentation repair path
/// must NOT activate.
///
/// The fixture stacks five single-character spans at the same x with 20 pt
/// between baselines. According to the original note that is well above both
/// the MAX_GLYPH_JITTER_PT detection ceiling (5 pt) and the COALESCE_THRESHOLD
/// (5 pt), so no same-line x-disorder events can occur and reconstruction is
/// skipped. This guards against false positives on poetry, code columns, and
/// similar layouts.
#[test]
fn test_genuine_single_char_lines_not_collapsed() {
    // 'A'..='E' at y = 700, 680, 660, 640, 620 — one glyph per `BT … ET` block.
    let stream: String = ('A'..='E')
        .zip((620..=700).rev().step_by(20))
        .map(|(glyph, y)| format!("BT /F1 12 Tf 1 0 0 1 72.00 {y}.00 Tm ({glyph}) Tj ET\n"))
        .collect();

    let fixture = assemble_single_page_pdf(&stream);
    let config = ExtractionConfig::default();
    let extracted = extract_bytes_sync(&fixture, "application/pdf", &config)
        .expect("single-char-per-line PDF should extract without error");
    let content = extracted.content.trim();

    // Nothing may be lost …
    for ch in 'A'..='E' {
        assert!(content.contains(ch), "output must contain '{ch}'; got: {content:?}");
    }

    // … and nothing may be merged: each glyph keeps its own line.
    let line_count = content.lines().count();
    assert!(
        line_count >= 5,
        "genuine single-char-per-line content must not be collapsed; \
         got {line_count} lines: {content:?}"
    );
}