crates/kreuzberg/tests/pdf_glyph_spacing_issue_962.rs

//! Regression tests for issue #962 — glyph-spaced PDF text fragmented one
//! character per line.
//!
//! When a PDF positions each character via a separate `BT … ET` block with a
//! sinusoidal y-jitter, pdf_oxide's ColumnAware reading order groups spans by
//! y-level rather than reading order, producing single-character spans that each
//! land on their own output line. Microsoft Word triggers this pattern for
//! "broken image" placeholder text
//! (`Het afbeelding onderdeel met relatie-id … is niet aangetroffen`).
//!
//! Fix: `oxide/text.rs` detects the fragmentation signature (≥ 3 same-line
//! x-disorder events among short spans; see `pdf::structure::constants`) and
//! rebuilds page text from span positions: sort by y-descending, group by
//! y-proximity, sort each group by x, insert spaces at word gaps.

#![cfg(feature = "pdf")]

use kreuzberg::{ExtractionConfig, extract_bytes_sync};

/// Build a minimal single-page PDF whose content stream places each character
/// of the built-in `TEXT` constant in its own `BT … ET` block via an absolute
/// `Tm` operator.
///
/// The x-coordinate advances by `X_STEP` per glyph; the y-coordinate oscillates
/// sinusoidally around `Y_BASE` with amplitude `jitter_pt` and period
/// `JITTER_PERIOD` glyphs. Per the issue #962 report this replicates the
/// pattern Microsoft Word emits for broken-image placeholder text, which the
/// fragmentation repair path is expected to detect and correct.
///
/// The file layout (catalog, pages, page, content stream, Helvetica font,
/// xref table, trailer) is the same one `assemble_single_page_pdf` produces.
/// Every xref entry is exactly 20 bytes long, as the PDF specification
/// requires (`nnnnnnnnnn ggggg n` followed by a two-byte end-of-line).
///
/// Returns the raw bytes of the PDF.
fn make_glyph_jitter_pdf(jitter_pt: f32) -> Vec<u8> {
    use std::fmt::Write as _;

    const TEXT: &str = "Hetafbeeldingisnietsaangetroffen";
    const FONT_SIZE: f32 = 12.0;
    const JITTER_PERIOD: usize = 6;
    const X_START: f32 = 72.0;
    const X_STEP: f32 = 7.0;
    const Y_BASE: f32 = 700.0;

    // One `BT … ET` block per glyph, each with its own absolute position.
    let mut stream = String::new();
    for (i, ch) in TEXT.chars().enumerate() {
        let x = X_START + i as f32 * X_STEP;
        let angle = std::f64::consts::TAU * i as f64 / JITTER_PERIOD as f64;
        let y = Y_BASE + angle.sin() as f32 * jitter_pt;
        // Parentheses and backslashes delimit/escape PDF literal strings and
        // must themselves be backslash-escaped.
        let escape = if matches!(ch, '(' | ')' | '\\') { "\\" } else { "" };
        writeln!(
            stream,
            "BT /F1 {FONT_SIZE} Tf 1 0 0 1 {x:.2} {y:.4} Tm ({escape}{ch}) Tj ET"
        )
        .expect("formatting into a String cannot fail");
    }

    // Indirect objects 1..=5, in file order.
    let objects = [
        "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n".to_owned(),
        "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n".to_owned(),
        concat!(
            "3 0 obj\n",
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]\n",
            "   /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\n",
            "endobj\n",
        )
        .to_owned(),
        format!(
            "4 0 obj\n<< /Length {} >>\nstream\n{stream}\nendstream\nendobj\n",
            stream.len()
        ),
        "5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n".to_owned(),
    ];

    let mut pdf: Vec<u8> = b"%PDF-1.4\n".to_vec();

    // Record each object's byte offset while appending it. Entries end in
    // SP LF so that every one of them is exactly 20 bytes.
    let mut xref = String::from("xref\n0 6\n0000000000 65535 f \n");
    for object in &objects {
        writeln!(xref, "{:010} 00000 n ", pdf.len()).expect("formatting into a String cannot fail");
        pdf.extend_from_slice(object.as_bytes());
    }

    let xref_off = pdf.len();
    write!(
        xref,
        "trailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n"
    )
    .expect("formatting into a String cannot fail");
    pdf.extend_from_slice(xref.as_bytes());

    pdf
}

/// Count the lines of `text` that hold exactly one character once leading and
/// trailing whitespace has been trimmed.
fn count_single_char_lines(text: &str) -> usize {
    let mut total = 0;
    for line in text.lines() {
        let mut glyphs = line.trim().chars();
        // Exactly one char: the first `next()` yields, the second does not.
        if glyphs.next().is_some() && glyphs.next().is_none() {
            total += 1;
        }
    }
    total
}

/// Shared helper: build a minimal single-page PDF from a ready-made content
/// stream string.
///
/// The file contains five indirect objects (catalog, page tree, one 612×792
/// page, the content stream, and a Type1 Helvetica font registered as `/F1`),
/// followed by a classic cross-reference table and trailer. `stream` is
/// embedded verbatim as the page's content stream; `/Length` is its byte length.
///
/// Every xref entry is exactly 20 bytes long, as the PDF specification
/// requires (`nnnnnnnnnn ggggg n` followed by a two-byte end-of-line).
///
/// Returns the raw bytes of the PDF.
fn assemble_single_page_pdf(stream: &str) -> Vec<u8> {
    use std::fmt::Write as _;

    // Indirect objects 1..=5, in file order.
    let objects = [
        "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n".to_owned(),
        "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n".to_owned(),
        concat!(
            "3 0 obj\n",
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]\n",
            "   /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\n",
            "endobj\n",
        )
        .to_owned(),
        format!(
            "4 0 obj\n<< /Length {} >>\nstream\n{stream}\nendstream\nendobj\n",
            stream.len()
        ),
        "5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n".to_owned(),
    ];

    let mut pdf: Vec<u8> = b"%PDF-1.4\n".to_vec();

    // Record each object's byte offset while appending it. Entries end in
    // SP LF so that every one of them is exactly 20 bytes.
    let mut xref = String::from("xref\n0 6\n0000000000 65535 f \n");
    for object in &objects {
        writeln!(xref, "{:010} 00000 n ", pdf.len()).expect("formatting into a String cannot fail");
        pdf.extend_from_slice(object.as_bytes());
    }

    let xref_off = pdf.len();
    write!(
        xref,
        "trailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n"
    )
    .expect("formatting into a String cannot fail");
    pdf.extend_from_slice(xref.as_bytes());

    pdf
}

/// Fixture: two genuinely separate text lines, 30 pt apart vertically.
///
/// * `FirstLine` is drawn at y = 700.
/// * `SecondLine` is drawn at y = 670.
///
/// Each line is a single `BT … ET` block positioned with an absolute `Tm`;
/// there is no jitter. Used to check that real multi-line content is still
/// reported as two lines.
fn make_two_line_pdf() -> Vec<u8> {
    const CONTENT: &str = concat!(
        "BT /F1 12 Tf 1 0 0 1 72.00 700.00 Tm (FirstLine) Tj ET\n",
        "BT /F1 12 Tf 1 0 0 1 72.00 670.00 Tm (SecondLine) Tj ET\n",
    );
    assemble_single_page_pdf(CONTENT)
}

/// Fixture: two words on one baseline separated by a wide horizontal gap.
///
/// `Hello` starts at x = 72 and `World` at x = 300, both at y = 700 with no
/// jitter. Each word is its own `BT … ET` block positioned with an absolute
/// `Tm`. Used to check that a separator is emitted between the words.
fn make_word_gap_pdf() -> Vec<u8> {
    const CONTENT: &str = concat!(
        "BT /F1 12 Tf 1 0 0 1 72.00 700.00 Tm (Hello) Tj ET\n",
        "BT /F1 12 Tf 1 0 0 1 300.00 700.00 Tm (World) Tj ET\n",
    );
    assemble_single_page_pdf(CONTENT)
}

/// Fixture: ordinary word-level text with no per-glyph `Tj` and no jitter.
///
/// The whole sentence is shown by a single `Tj` inside one `BT … ET` block.
/// The fragmentation detector (`is_fragmented_span_list`) is expected to
/// return false for this input, leaving the content unchanged.
fn make_normal_prose_pdf() -> Vec<u8> {
    const CONTENT: &str = "BT /F1 12 Tf 72 700 Td (The quick brown fox) Tj ET\n";
    assemble_single_page_pdf(CONTENT)
}

/// Issue #962 regression guard at 3.5 pt jitter: the extracted text may contain
/// at most 4 single-character lines. A few isolated characters are tolerated;
/// long runs of one-character lines are not.
#[test]
fn test_3_5pt_jitter_coalesced() {
    let fixture = make_glyph_jitter_pdf(3.5);
    let options = ExtractionConfig::default();
    let extraction = extract_bytes_sync(&fixture, "application/pdf", &options)
        .expect("3.5 pt jitter PDF should extract without error");

    let content = extraction.content.trim();
    let single_char_lines = count_single_char_lines(content);

    assert!(
        single_char_lines < 5,
        "issue #962 regression (3.5 pt): {single_char_lines} single-char lines after fix.\n\
         Content: {content:?}"
    );
}

/// Issue #962 regression guard at 4.0 pt jitter: same bound as the 3.5 pt case
/// (fewer than 5 single-character lines).
#[test]
fn test_4_0pt_jitter_coalesced() {
    let fixture = make_glyph_jitter_pdf(4.0);
    let options = ExtractionConfig::default();
    let extraction = extract_bytes_sync(&fixture, "application/pdf", &options)
        .expect("4.0 pt jitter PDF should extract without error");

    let content = extraction.content.trim();
    let single_char_lines = count_single_char_lines(content);

    assert!(
        single_char_lines < 5,
        "issue #962 regression (4.0 pt): {single_char_lines} single-char lines after fix.\n\
         Content: {content:?}"
    );
}

/// 3.0 pt jitter was already coalesced before the fix (by pdfium, per the
/// original note); the repair path must leave that result intact.
///
/// Checks that fewer than 5 single-character lines are produced and that the
/// trimmed content is non-empty.
#[test]
fn test_3_0pt_jitter_unchanged() {
    let fixture = make_glyph_jitter_pdf(3.0);
    let options = ExtractionConfig::default();
    let extraction = extract_bytes_sync(&fixture, "application/pdf", &options)
        .expect("3.0 pt jitter PDF should extract without error");

    let content = extraction.content.trim();
    let single_char_lines = count_single_char_lines(content);

    assert!(
        single_char_lines < 5,
        "3.0 pt jitter (already-coalesced) regressed: {single_char_lines} single-char lines.\n\
         Content: {content:?}"
    );
    assert!(!content.is_empty(), "3.0 pt jitter PDF must produce non-empty content");
}

/// Smoke test: extraction must neither panic nor return an error for the
/// 3.5 pt, 4.0 pt and 3.0 pt jitter fixtures, and each must yield non-empty
/// content after trimming.
#[test]
fn test_all_fixtures_loadable() {
    let config = ExtractionConfig::default();
    for (label, jitter) in [("3.5pt", 3.5f32), ("4.0pt", 4.0), ("3.0pt", 3.0)] {
        let pdf = make_glyph_jitter_pdf(jitter);
        // Unwrap once, reporting the fixture label and the error itself on
        // failure, instead of asserting `is_ok()` and then unwrapping.
        let extracted = extract_bytes_sync(&pdf, "application/pdf", &config)
            .unwrap_or_else(|err| panic!("[{label}] extraction should not error: {err:?}"));
        assert!(
            !extracted.content.trim().is_empty(),
            "[{label}] must produce non-empty content"
        );
    }
}

/// The coalesced text must contain the fixture's leading characters in order.
///
/// The fixture text is "Hetafbeeldingisnietsaangetroffen" (32 chars). The
/// rebuilt output may have whitespace injected between glyphs, so all
/// whitespace is stripped before comparing. Only the leading `Hetafbeelding`
/// run is asserted here, not the full word.
#[test]
fn test_coalesced_content_is_coherent() {
    let fixture = make_glyph_jitter_pdf(3.5);
    let options = ExtractionConfig::default();
    let extraction = extract_bytes_sync(&fixture, "application/pdf", &options)
        .expect("3.5 pt jitter PDF should extract without error");

    let content = extraction.content.trim();
    // Remove every whitespace character the gap heuristic may have inserted.
    let mut no_spaces = String::new();
    for c in content.chars() {
        if !c.is_whitespace() {
            no_spaces.push(c);
        }
    }
    assert!(
        no_spaces.contains("Hetafbeelding"),
        "coalesced content must contain the leading chars of the original word; got: {content:?}"
    );
}

/// Two real text lines 30 pt apart must still come out as separate lines:
/// both strings must be present and the output must span at least two lines.
#[test]
fn test_two_line_pdf_stays_two_lines() {
    let fixture = make_two_line_pdf();
    let options = ExtractionConfig::default();
    let extraction = extract_bytes_sync(&fixture, "application/pdf", &options)
        .expect("two-line PDF should extract without error");

    let content = extraction.content.trim();
    for expected in ["FirstLine", "SecondLine"] {
        assert!(
            content.contains(expected),
            "output must contain '{expected}'; got: {content:?}"
        );
    }

    // Merging both strings onto one output line would be a regression.
    let line_count = content.lines().count();
    assert!(
        line_count >= 2,
        "two-line PDF must produce at least 2 output lines; got {line_count}: {content:?}"
    );
}

/// Two words on the same baseline with a wide x-gap must be separated in the
/// output. Either a single space or a newline between `Hello` and `World` is
/// accepted.
#[test]
fn test_word_gap_produces_space() {
    let fixture = make_word_gap_pdf();
    let options = ExtractionConfig::default();
    let extraction = extract_bytes_sync(&fixture, "application/pdf", &options)
        .expect("word-gap PDF should extract without error");

    let content = extraction.content.trim();
    for expected in ["Hello", "World"] {
        assert!(
            content.contains(expected),
            "output must contain '{expected}'; got: {content:?}"
        );
    }

    // The words must not be glued together: require an explicit separator.
    let separated = ["Hello World", "Hello\nWorld"]
        .iter()
        .any(|joined| content.contains(*joined));
    assert!(
        separated,
        "output must have 'Hello World' or 'Hello\\nWorld'; got: {content:?}"
    );
}

/// Ordinary word-level prose must pass through intact: non-empty output that
/// contains `quick` and has fewer than 2 single-character lines.
#[test]
fn test_normal_prose_not_disturbed() {
    let fixture = make_normal_prose_pdf();
    let options = ExtractionConfig::default();
    let extraction =
        extract_bytes_sync(&fixture, "application/pdf", &options).expect("normal prose should extract");

    let content = extraction.content.trim();
    assert!(!content.is_empty(), "normal prose must produce non-empty content");
    assert!(content.contains("quick"), "must include 'quick'; got: {content:?}");

    let lone_chars = count_single_char_lines(content);
    assert!(lone_chars < 2, "prose must not fragment; got: {content:?}");
}

/// The repair must also take effect when per-page extraction is enabled:
/// fewer than 5 single-character lines, and `pages` must be populated.
#[test]
fn test_fix_applies_with_page_tracking() {
    use kreuzberg::PageConfig;

    let page_options = PageConfig {
        extract_pages: true,
        ..Default::default()
    };
    let options = ExtractionConfig {
        pages: Some(page_options),
        ..Default::default()
    };

    let fixture = make_glyph_jitter_pdf(3.5);
    let extraction =
        extract_bytes_sync(&fixture, "application/pdf", &options).expect("page tracking extract");

    let content = extraction.content.trim();
    let lone_chars = count_single_char_lines(content);
    assert!(lone_chars < 5, "page tracking fix failed; got: {content:?}");
    assert!(extraction.pages.is_some(), "page tracking must populate pages");
}

/// 5 pt jitter must be coalesced as well: fewer than 5 single-character lines.
#[test]
fn test_5pt_jitter_coalesced() {
    let fixture = make_glyph_jitter_pdf(5.0);
    let options = ExtractionConfig::default();
    let extraction = extract_bytes_sync(&fixture, "application/pdf", &options).expect("5pt extract");

    let content = extraction.content.trim();
    let lone_chars = count_single_char_lines(content);
    assert!(lone_chars < 5, "5pt jitter not coalesced; got: {content:?}");
}

/// Negative regression: content that genuinely has one character per line
/// (e.g. a vertical column label, a stacked formula subscript, or a CJK-like
/// layout) must survive extraction without being merged. The fragmentation
/// repair path must NOT activate.
///
/// The fixture stacks five single-character spans 20 pt apart. Per the
/// original note that is well above both the MAX_GLYPH_JITTER_PT detection
/// ceiling (5 pt) and the COALESCE_THRESHOLD (5 pt), so no same-line
/// x-disorder events can occur and reconstruction is skipped. This guards
/// against false positives on poetry, code columns, and similar layouts.
#[test]
fn test_genuine_single_char_lines_not_collapsed() {
    // A, B, C, D, E at x = 72, stepping down 20 pt per glyph from y = 700.
    const CONTENT: &str = concat!(
        "BT /F1 12 Tf 1 0 0 1 72.00 700.00 Tm (A) Tj ET\n",
        "BT /F1 12 Tf 1 0 0 1 72.00 680.00 Tm (B) Tj ET\n",
        "BT /F1 12 Tf 1 0 0 1 72.00 660.00 Tm (C) Tj ET\n",
        "BT /F1 12 Tf 1 0 0 1 72.00 640.00 Tm (D) Tj ET\n",
        "BT /F1 12 Tf 1 0 0 1 72.00 620.00 Tm (E) Tj ET\n",
    );
    let fixture = assemble_single_page_pdf(CONTENT);
    let options = ExtractionConfig::default();
    let extraction = extract_bytes_sync(&fixture, "application/pdf", &options)
        .expect("single-char-per-line PDF should extract without error");

    let content = extraction.content.trim();

    // Every glyph must survive extraction.
    for ch in ["A", "B", "C", "D", "E"] {
        assert!(content.contains(ch), "output must contain '{ch}'; got: {content:?}");
    }

    // The glyphs must stay on separate lines: at least 5 lines are required.
    let line_count = content.lines().count();
    assert!(
        line_count >= 5,
        "genuine single-char-per-line content must not be collapsed; \
         got {line_count} lines: {content:?}"
    );
}
Nomad changes 2026-06-01 23:40:55 +02:00			`//! Regression tests for issue #962 — glyph-spaced PDF text fragmented one`
			`//! character per line.`
			`//!`
			//! When a PDF positions each character via a separate `BT … ET` block with a
			`//! sinusoidal y-jitter, pdf_oxide's ColumnAware reading order groups spans by`
			`//! y-level rather than reading order, producing single-character spans that each`
			`//! land on their own output line. Microsoft Word triggers this pattern for`
			`//! "broken image" placeholder text`
			//! (`Het afbeelding onderdeel met relatie-id … is niet aangetroffen`).
			`//!`
			//! Fix: `oxide/text.rs` detects the fragmentation signature (≥ 3 same-line
			//! x-disorder events among short spans; see `pdf::structure::constants`) and
			`//! rebuilds page text from span positions: sort by y-descending, group by`
			`//! y-proximity, sort each group by x, insert spaces at word gaps.`

			`#![cfg(feature = "pdf")]`

			`use kreuzberg::{ExtractionConfig, extract_bytes_sync};`

			`/// Build a minimal but valid single-page PDF whose content stream places each`
			/// character of `text` in its own `BT … ET` block via an absolute `Tm`
			`/// operator. The y-coordinate oscillates sinusoidally with amplitude`
			/// `jitter_pt` and period `JITTER_PERIOD`, replicating the pattern Microsoft
			`/// Word emits for broken-image placeholder text.`
			`///`
			`/// pdf_oxide's ColumnAware mode groups these single-character spans by y-level,`
			`/// producing out-of-reading-order output that the fragmentation repair path detects`
			`/// and corrects.`
			`fn make_glyph_jitter_pdf(jitter_pt: f32) -> Vec<u8> {`
			`const TEXT: &str = "Hetafbeeldingisnietsaangetroffen";`
			`const FONT_SIZE: f32 = 12.0;`
			`const JITTER_PERIOD: usize = 6;`
			`const X_START: f32 = 72.0;`
			`const X_STEP: f32 = 7.0;`
			`const Y_BASE: f32 = 700.0;`

			`let mut stream = String::new();`
			`for (i, ch) in TEXT.chars().enumerate() {`
			`let x = X_START + i as f32 * X_STEP;`
			`let angle = std::f64::consts::TAU * i as f64 / JITTER_PERIOD as f64;`
			`let y = Y_BASE + angle.sin() as f32 * jitter_pt;`
			`let escaped = match ch {`
			`'(' => "\\(".to_string(),`
			`')' => "\\)".to_string(),`
			`'\\' => "\\\\".to_string(),`
			`c => c.to_string(),`
			`};`
			`stream.push_str(&format!(`
			`"BT /F1 {FONT_SIZE} Tf 1 0 0 1 {x:.2} {y:.4} Tm ({escaped}) Tj ET\n"`
			`));`
			`}`

			`// Assemble PDF object by object, recording byte offsets for the xref table.`
			`let mut pdf: Vec<u8> = Vec::new();`

			`macro_rules! push {`
			`($s:expr) => {`
			`pdf.extend_from_slice($s.as_bytes())`
			`};`
			`}`

			`push!("%PDF-1.4\n");`

			`let off1 = pdf.len();`
			`push!("1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");`

			`let off2 = pdf.len();`
			`push!("2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n");`

			`let off3 = pdf.len();`
			`push!(`
			`"3 0 obj\n\`
			`<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]\`
			`\n /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\n\`
			`endobj\n"`
			`);`

			`let off4 = pdf.len();`
			`let stream_bytes = stream.as_bytes();`
			`push!(format!("4 0 obj\n<< /Length {} >>\nstream\n", stream_bytes.len()));`
			`pdf.extend_from_slice(stream_bytes);`
			`push!("\nendstream\nendobj\n");`

			`let off5 = pdf.len();`
			`push!("5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n");`

			`let xref_off = pdf.len();`
			`push!(format!(`
			`"xref\n0 6\n\`
			`0000000000 65535 f \r\n\`
			`{:010} 00000 n \r\n\`
			`{:010} 00000 n \r\n\`
			`{:010} 00000 n \r\n\`
			`{:010} 00000 n \r\n\`
			`{:010} 00000 n \r\n",`
			`off1, off2, off3, off4, off5`
			`));`
			`push!(format!(`
			`"trailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n"`
			`));`

			`pdf`
			`}`

			`fn count_single_char_lines(text: &str) -> usize {`
			`text.lines().filter(\|l\| l.trim().chars().count() == 1).count()`
			`}`

			`/// Shared helper: build a minimal single-page PDF from a ready-made content stream string.`
			`fn assemble_single_page_pdf(stream: &str) -> Vec<u8> {`
			`let mut pdf: Vec<u8> = Vec::new();`

			`macro_rules! push {`
			`($s:expr) => {`
			`pdf.extend_from_slice($s.as_bytes())`
			`};`
			`}`

			`push!("%PDF-1.4\n");`

			`let off1 = pdf.len();`
			`push!("1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");`

			`let off2 = pdf.len();`
			`push!("2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n");`

			`let off3 = pdf.len();`
			`push!(`
			`"3 0 obj\n\`
			`<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]\`
			`\n /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\n\`
			`endobj\n"`
			`);`

			`let off4 = pdf.len();`
			`let stream_bytes = stream.as_bytes();`
			`push!(format!("4 0 obj\n<< /Length {} >>\nstream\n", stream_bytes.len()));`
			`pdf.extend_from_slice(stream_bytes);`
			`push!("\nendstream\nendobj\n");`

			`let off5 = pdf.len();`
			`push!("5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n");`

			`let xref_off = pdf.len();`
			`push!(format!(`
			`"xref\n0 6\n\`
			`0000000000 65535 f \r\n\`
			`{:010} 00000 n \r\n\`
			`{:010} 00000 n \r\n\`
			`{:010} 00000 n \r\n\`
			`{:010} 00000 n \r\n\`
			`{:010} 00000 n \r\n",`
			`off1, off2, off3, off4, off5`
			`));`
			`push!(format!(`
			`"trailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF\n"`
			`));`

			`pdf`
			`}`

			`/// A PDF with two clearly separate text lines (y gap = 30 pt, well above the coalesce threshold).`
			`/// Used to verify that multi-line content stays on two lines after the fix.`
			`///`
			`/// Line 1 at y=700: "FirstLine"`
			`/// Line 2 at y=670: "SecondLine"`
			`/// No jitter — one BT block per line, absolute Tm positioning.`
			`fn make_two_line_pdf() -> Vec<u8> {`
			`let stream = "BT /F1 12 Tf 1 0 0 1 72.00 700.00 Tm (FirstLine) Tj ET\n\`
			`BT /F1 12 Tf 1 0 0 1 72.00 670.00 Tm (SecondLine) Tj ET\n";`
			`assemble_single_page_pdf(stream)`
			`}`

			`/// A PDF with two words on the same line separated by a large x-gap (> font_size * 0.5).`
			`/// Used to verify space insertion between words.`
			`///`
			`/// "Hello" starting at x=72, "World" starting at x=300.`
			`/// All chars at same y, no jitter. Uses absolute Tm positioning so pdf_oxide can`
			`/// correctly determine each span's position.`
			`fn make_word_gap_pdf() -> Vec<u8> {`
			`let stream = "BT /F1 12 Tf 1 0 0 1 72.00 700.00 Tm (Hello) Tj ET\n\`
			`BT /F1 12 Tf 1 0 0 1 300.00 700.00 Tm (World) Tj ET\n";`
			`assemble_single_page_pdf(stream)`
			`}`

			`/// A PDF with normal word-level text (no per-glyph Tj, no jitter).`
			/// `is_fragmented_span_list` must return false and content must be unchanged.
			`fn make_normal_prose_pdf() -> Vec<u8> {`
			`// Single BT block — all words in one run; no glyph-level fragmentation.`
			`let stream = "BT /F1 12 Tf 72 700 Td (The quick brown fox) Tj ET\n";`
			`assemble_single_page_pdf(stream)`
			`}`

			`/// After the fix, 3.5 pt jitter must produce no more than 4 single-character`
			`/// lines (a few isolated chars are acceptable — runs of 18 are not).`
			`#[test]`
			`fn test_3_5pt_jitter_coalesced() {`
			`let pdf = make_glyph_jitter_pdf(3.5);`
			`let config = ExtractionConfig::default();`
			`let result =`
			`extract_bytes_sync(&pdf, "application/pdf", &config).expect("3.5 pt jitter PDF should extract without error");`

			`let content = result.content.trim().to_string();`
			`let single_char_lines = count_single_char_lines(&content);`

			`assert!(`
			`single_char_lines < 5,`
			`"issue #962 regression (3.5 pt): {single_char_lines} single-char lines after fix.\n\`
			`Content: {content:?}"`
			`);`
			`}`

			`/// 4.0 pt jitter: same guarantee as 3.5 pt.`
			`#[test]`
			`fn test_4_0pt_jitter_coalesced() {`
			`let pdf = make_glyph_jitter_pdf(4.0);`
			`let config = ExtractionConfig::default();`
			`let result =`
			`extract_bytes_sync(&pdf, "application/pdf", &config).expect("4.0 pt jitter PDF should extract without error");`

			`let content = result.content.trim().to_string();`
			`let single_char_lines = count_single_char_lines(&content);`

			`assert!(`
			`single_char_lines < 5,`
			`"issue #962 regression (4.0 pt): {single_char_lines} single-char lines after fix.\n\`
			`Content: {content:?}"`
			`);`
			`}`

			`/// 3.0 pt jitter: pdfium already coalesces these — the fix must not disturb them.`
			`#[test]`
			`fn test_3_0pt_jitter_unchanged() {`
			`let pdf = make_glyph_jitter_pdf(3.0);`
			`let config = ExtractionConfig::default();`
			`let result =`
			`extract_bytes_sync(&pdf, "application/pdf", &config).expect("3.0 pt jitter PDF should extract without error");`

			`let content = result.content.trim().to_string();`
			`let single_char_lines = count_single_char_lines(&content);`

			`assert!(`
			`single_char_lines < 5,`
			`"3.0 pt jitter (already-coalesced) regressed: {single_char_lines} single-char lines.\n\`
			`Content: {content:?}"`
			`);`
			`assert!(!content.is_empty(), "3.0 pt jitter PDF must produce non-empty content");`
			`}`

			`/// The fix must not panic or return an error on any generated fixture.`
			`#[test]`
			`fn test_all_fixtures_loadable() {`
			`let config = ExtractionConfig::default();`
			`for (label, jitter) in [("3.5pt", 3.5f32), ("4.0pt", 4.0), ("3.0pt", 3.0)] {`
			`let pdf = make_glyph_jitter_pdf(jitter);`
			`let result = extract_bytes_sync(&pdf, "application/pdf", &config);`
			`assert!(`
			`result.is_ok(),`
			`"[{label}] extraction should not error: {:?}",`
			`result.err()`
			`);`
			`let r = result.unwrap();`
			`assert!(!r.content.trim().is_empty(), "[{label}] must produce non-empty content");`
			`}`
			`}`

			`/// The coalesced text must actually contain the expected characters in order.`
			`///`
			`/// TEXT = "Hetafbeeldingisnietsaangetroffen" (32 chars). After rebuilding from`
			`/// char positions the characters must all be present; spaces may be injected`
			`/// between some chars but the non-space characters must spell out the word.`
			`#[test]`
			`fn test_coalesced_content_is_coherent() {`
			`let pdf = make_glyph_jitter_pdf(3.5);`
			`let config = ExtractionConfig::default();`
			`let result =`
			`extract_bytes_sync(&pdf, "application/pdf", &config).expect("3.5 pt jitter PDF should extract without error");`

			`let content = result.content.trim().to_string();`
			`// Drop spaces injected by the gap-detection heuristic and check the chars are present.`
			`let no_spaces: String = content.chars().filter(\|c\| !c.is_whitespace()).collect();`
			`assert!(`
			`no_spaces.contains("Hetafbeelding"),`
			`"coalesced content must contain the leading chars of the original word; got: {content:?}"`
			`);`
			`}`

			`/// Two real text lines (30pt y gap) must remain as two separate lines after the fix.`
			`#[test]`
			`fn test_two_line_pdf_stays_two_lines() {`
			`let pdf = make_two_line_pdf();`
			`let config = ExtractionConfig::default();`
			`let result =`
			`extract_bytes_sync(&pdf, "application/pdf", &config).expect("two-line PDF should extract without error");`

			`let content = result.content.trim().to_string();`
			`assert!(`
			`content.contains("FirstLine"),`
			`"output must contain 'FirstLine'; got: {content:?}"`
			`);`
			`assert!(`
			`content.contains("SecondLine"),`
			`"output must contain 'SecondLine'; got: {content:?}"`
			`);`
			`// The two lines must be separated (not merged into one line).`
			`let line_count = content.lines().count();`
			`assert!(`
			`line_count >= 2,`
			`"two-line PDF must produce at least 2 output lines; got {line_count}: {content:?}"`
			`);`
			`}`

			`/// Two words with a large x-gap on the same line must have a space between them.`
			`#[test]`
			`fn test_word_gap_produces_space() {`
			`let pdf = make_word_gap_pdf();`
			`let config = ExtractionConfig::default();`
			`let result =`
			`extract_bytes_sync(&pdf, "application/pdf", &config).expect("word-gap PDF should extract without error");`

			`let content = result.content.trim().to_string();`
			`assert!(`
			`content.contains("Hello"),`
			`"output must contain 'Hello'; got: {content:?}"`
			`);`
			`assert!(`
			`content.contains("World"),`
			`"output must contain 'World'; got: {content:?}"`
			`);`
			`// Both words must appear with some separator (space or newline) between them.`
			`assert!(`
			`content.contains("Hello World") \|\| content.contains("Hello\nWorld"),`
			`"output must have 'Hello World' or 'Hello\\nWorld'; got: {content:?}"`
			`);`
			`}`

			`/// Normal word-level prose PDF must not be disturbed.`
			`#[test]`
			`fn test_normal_prose_not_disturbed() {`
			`let pdf = make_normal_prose_pdf();`
			`let config = ExtractionConfig::default();`
			`let result = extract_bytes_sync(&pdf, "application/pdf", &config).expect("normal prose should extract");`
			`let content = result.content.trim().to_string();`
			`assert!(!content.is_empty(), "normal prose must produce non-empty content");`
			`assert!(content.contains("quick"), "must include 'quick'; got: {content:?}");`
			`assert!(`
			`count_single_char_lines(&content) < 2,`
			`"prose must not fragment; got: {content:?}"`
			`);`
			`}`

			`/// Fix must apply when page tracking is enabled.`
			`#[test]`
			`fn test_fix_applies_with_page_tracking() {`
			`use kreuzberg::PageConfig;`
			`let pdf = make_glyph_jitter_pdf(3.5);`
			`let config = ExtractionConfig {`
			`pages: Some(PageConfig {`
			`extract_pages: true,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`};`
			`let result = extract_bytes_sync(&pdf, "application/pdf", &config).expect("page tracking extract");`
			`let content = result.content.trim().to_string();`
			`assert!(`
			`count_single_char_lines(&content) < 5,`
			`"page tracking fix failed; got: {content:?}"`
			`);`
			`assert!(result.pages.is_some(), "page tracking must populate pages");`
			`}`

			`/// 5pt jitter must also be coalesced.`
			`#[test]`
			`fn test_5pt_jitter_coalesced() {`
			`let pdf = make_glyph_jitter_pdf(5.0);`
			`let config = ExtractionConfig::default();`
			`let result = extract_bytes_sync(&pdf, "application/pdf", &config).expect("5pt extract");`
			`let content = result.content.trim().to_string();`
			`assert!(`
			`count_single_char_lines(&content) < 5,`
			`"5pt jitter not coalesced; got: {content:?}"`
			`);`
			`}`

			`/// Negative regression: a PDF with genuine single-character-per-line content`
			`/// (e.g. a vertical column label, formula subscript stack, or CJK-like layout)`
			`/// must round-trip unchanged — the fragmentation repair path must NOT activate.`
			`///`
			`/// Uses 20 pt y-spacing between single-character spans, which is well above the`
			`/// MAX_GLYPH_JITTER_PT detection ceiling (5 pt) and above the COALESCE_THRESHOLD`
			`/// (5 pt), so no same-line x-disorder events can occur and reconstruction is skipped.`
			`/// This guards against false positives on poetry, code columns, and similar layouts.`
			`#[test]`
			`fn test_genuine_single_char_lines_not_collapsed() {`
			`// Five stacked single-character spans at 20 pt y-intervals — genuinely one char per line.`
			`let stream = "BT /F1 12 Tf 1 0 0 1 72.00 700.00 Tm (A) Tj ET\n\`
			`BT /F1 12 Tf 1 0 0 1 72.00 680.00 Tm (B) Tj ET\n\`
			`BT /F1 12 Tf 1 0 0 1 72.00 660.00 Tm (C) Tj ET\n\`
			`BT /F1 12 Tf 1 0 0 1 72.00 640.00 Tm (D) Tj ET\n\`
			`BT /F1 12 Tf 1 0 0 1 72.00 620.00 Tm (E) Tj ET\n";`
			`let pdf = assemble_single_page_pdf(stream);`
			`let config = ExtractionConfig::default();`
			`let result = extract_bytes_sync(&pdf, "application/pdf", &config)`
			`.expect("single-char-per-line PDF should extract without error");`

			`let content = result.content.trim().to_string();`
			`// All five characters must be present.`
			`for ch in ["A", "B", "C", "D", "E"] {`
			`assert!(content.contains(ch), "output must contain '{ch}'; got: {content:?}");`
			`}`
			`// Characters must NOT be collapsed onto a single line; expect ≥ 5 separate lines.`
			`let line_count = content.lines().count();`
			`assert!(`
			`line_count >= 5,`
			`"genuine single-char-per-line content must not be collapsed; \`
			`got {line_count} lines: {content:?}"`
			`);`
			`}`