Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,542 @@
//! Diff two [`ExtractionResult`] values.
//!
//! This module is gated behind the `diff` Cargo feature. Enable it by adding
//! `kreuzberg = { features = ["diff"] }` to your `Cargo.toml`.
//!
//! # Example
//!
//! ```rust,no_run
//! use kreuzberg::{ExtractionResult, diff::{compare, DiffOptions}};
//!
//! # fn main() {
//! let a = ExtractionResult::default();
//! let b = ExtractionResult::default();
//! let opts = DiffOptions::default();
//! let result = compare(&a, &b, &opts);
//! assert!(result.content_diff.is_empty());
//! # }
//! ```
pub mod types;
pub use types::{
CellChange, DiffHunk, DiffLine, DiffOptions, EmbeddedChanges, EmbeddedDiff, ExtractionDiff, TableDiff,
};
use similar::{ChangeTag, DiffOp, TextDiff};
use crate::types::extraction::{ArchiveEntry, ExtractionResult};
use crate::types::tables::Table;
/// Default number of context lines on each side of a changed region.
const CONTEXT_LINES: usize = 3;
/// Compare two extraction results and return a structured diff.
///
/// The comparison is purely structural — no I/O, no side effects. All fields
/// of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
///
/// # Arguments
///
/// * `a` — the "before" extraction result
/// * `b` — the "after" extraction result
/// * `opts` — controls which sections are compared and optional truncation
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::{ExtractionResult, diff::{compare, DiffOptions}};
///
/// # fn main() {
/// let mut a = ExtractionResult::default();
/// let mut b = ExtractionResult::default();
/// a.content = "Hello world".to_string();
/// b.content = "Hello Rust".to_string();
///
/// let diff = compare(&a, &b, &DiffOptions::default());
/// assert_eq!(diff.content_diff.len(), 1);
/// # }
/// ```
pub fn compare(a: &ExtractionResult, b: &ExtractionResult, opts: &DiffOptions) -> ExtractionDiff {
let content_diff = diff_content(&a.content, &b.content, opts);
let (tables_added, tables_removed, tables_changed) = diff_tables(&a.tables, &b.tables);
let metadata_changed = if opts.include_metadata {
diff_metadata(&a.metadata, &b.metadata)
} else {
serde_json::Value::Null
};
let embedded_changes = if opts.include_embedded {
diff_embedded(a.children.as_deref(), b.children.as_deref(), opts)
} else {
EmbeddedChanges {
added: vec![],
removed: vec![],
changed: vec![],
}
};
ExtractionDiff {
content_diff,
tables_added,
tables_removed,
tables_changed,
metadata_changed,
embedded_changes,
}
}
// ── Content diff ─────────────────────────────────────────────────────────────
fn diff_content(a: &str, b: &str, opts: &DiffOptions) -> Vec<DiffHunk> {
let a_text = apply_truncation(a, opts.max_content_chars);
let b_text = apply_truncation(b, opts.max_content_chars);
let a_ref: &str = a_text.as_deref().unwrap_or(a);
let b_ref: &str = b_text.as_deref().unwrap_or(b);
let text_diff = TextDiff::from_lines(a_ref, b_ref);
if text_diff.ratio() == 1.0 {
return vec![];
}
let mut hunks = Vec::new();
for group in text_diff.grouped_ops(CONTEXT_LINES) {
let hunk_from_line = hunk_old_start(&group);
let hunk_to_line = hunk_new_start(&group);
let hunk_from_count = hunk_old_len(&group);
let hunk_to_count = hunk_new_len(&group);
let mut lines = Vec::new();
for op in &group {
for change in text_diff.iter_changes(op) {
let text = change.value().trim_end_matches('\n').to_string();
let line = match change.tag() {
ChangeTag::Equal => DiffLine::Context(text),
ChangeTag::Insert => DiffLine::Added(text),
ChangeTag::Delete => DiffLine::Removed(text),
};
lines.push(line);
}
}
if !lines.is_empty() {
hunks.push(DiffHunk {
from_line: hunk_from_line,
from_count: hunk_from_count,
to_line: hunk_to_line,
to_count: hunk_to_count,
lines,
});
}
}
hunks
}
fn hunk_old_start(ops: &[DiffOp]) -> usize {
ops.first().map_or(0, |op| op.old_range().start)
}
fn hunk_new_start(ops: &[DiffOp]) -> usize {
ops.first().map_or(0, |op| op.new_range().start)
}
fn hunk_old_len(ops: &[DiffOp]) -> usize {
let start = ops.first().map_or(0, |op| op.old_range().start);
let end = ops.last().map_or(0, |op| op.old_range().end);
end.saturating_sub(start)
}
fn hunk_new_len(ops: &[DiffOp]) -> usize {
let start = ops.first().map_or(0, |op| op.new_range().start);
let end = ops.last().map_or(0, |op| op.new_range().end);
end.saturating_sub(start)
}
fn apply_truncation(text: &str, limit: Option<usize>) -> Option<String> {
limit.map(|n| {
let mut boundary = n.min(text.len());
while !text.is_char_boundary(boundary) {
boundary -= 1;
}
text[..boundary].to_string()
})
}
// ── Table diff ────────────────────────────────────────────────────────────────
fn diff_tables(a_tables: &[Table], b_tables: &[Table]) -> (Vec<Table>, Vec<Table>, Vec<TableDiff>) {
let min_len = a_tables.len().min(b_tables.len());
let mut tables_changed = Vec::new();
for idx in 0..min_len {
let a_t = &a_tables[idx];
let b_t = &b_tables[idx];
if tables_same_shape(a_t, b_t) {
let cell_changes = diff_cells(a_t, b_t);
if !cell_changes.is_empty() {
tables_changed.push(TableDiff {
from_index: idx,
to_index: idx,
cell_changes,
});
}
} else {
// Different shape — treat the pair as remove + add.
// The "removed" side is reported in tables_removed and "added" in tables_added.
// We handle this by falling through to the asymmetric slice handling below.
// But we need to signal that these shouldn't be counted as "paired" — so we
// emit them as add + remove even though they share the same index.
tables_changed.push(TableDiff {
from_index: idx,
to_index: idx,
// No cell-level changes: shapes differ; report as a structural replacement.
cell_changes: vec![],
});
}
}
let tables_removed: Vec<Table> = if a_tables.len() > b_tables.len() {
a_tables[min_len..].to_vec()
} else {
vec![]
};
let tables_added: Vec<Table> = if b_tables.len() > a_tables.len() {
b_tables[min_len..].to_vec()
} else {
vec![]
};
(tables_added, tables_removed, tables_changed)
}
/// Two tables are considered the same shape if and only if their row and column counts match.
///
/// Header content is NOT compared — column reordering with the same dimensions will produce
/// per-cell `CellChange` entries for every cell whose value differs, not a structural replacement.
///
/// TODO: smarter shape-matching that aligns tables by header names (instead of positional
/// index) is a follow-up; for now dimensions-only is the v1 default.
fn tables_same_shape(a: &Table, b: &Table) -> bool {
if a.cells.len() != b.cells.len() {
return false;
}
let a_cols = a.cells.first().map_or(0, Vec::len);
let b_cols = b.cells.first().map_or(0, Vec::len);
a_cols == b_cols
}
fn diff_cells(a: &Table, b: &Table) -> Vec<CellChange> {
let mut changes = Vec::new();
for (row_idx, (a_row, b_row)) in a.cells.iter().zip(b.cells.iter()).enumerate() {
for (col_idx, (a_cell, b_cell)) in a_row.iter().zip(b_row.iter()).enumerate() {
if a_cell != b_cell {
changes.push(CellChange {
row: row_idx,
col: col_idx,
from: a_cell.clone(),
to: b_cell.clone(),
});
}
}
}
changes
}
// ── Metadata diff ─────────────────────────────────────────────────────────────
fn diff_metadata(a: &crate::types::metadata::Metadata, b: &crate::types::metadata::Metadata) -> serde_json::Value {
let a_val = serde_json::to_value(a).unwrap_or(serde_json::Value::Null);
let b_val = serde_json::to_value(b).unwrap_or(serde_json::Value::Null);
let a_obj = a_val.as_object().cloned().unwrap_or_default();
let b_obj = b_val.as_object().cloned().unwrap_or_default();
let mut added = serde_json::Map::new();
let mut removed = serde_json::Map::new();
let mut changed = serde_json::Map::new();
for (key, b_value) in &b_obj {
match a_obj.get(key) {
None => {
added.insert(key.clone(), b_value.clone());
}
Some(a_value) if a_value != b_value => {
changed.insert(key.clone(), serde_json::json!({ "from": a_value, "to": b_value }));
}
_ => {}
}
}
for (key, a_value) in &a_obj {
if !b_obj.contains_key(key) {
removed.insert(key.clone(), a_value.clone());
}
}
serde_json::json!({ "added": added, "removed": removed, "changed": changed })
}
// ── Embedded diff ─────────────────────────────────────────────────────────────
fn diff_embedded(
a_children: Option<&[ArchiveEntry]>,
b_children: Option<&[ArchiveEntry]>,
opts: &DiffOptions,
) -> EmbeddedChanges {
let a_entries = a_children.unwrap_or(&[]);
let b_entries = b_children.unwrap_or(&[]);
let mut added = Vec::new();
let mut removed = Vec::new();
let mut changed = Vec::new();
for b_entry in b_entries {
match a_entries.iter().find(|e| e.path == b_entry.path) {
None => added.push(b_entry.clone()),
Some(a_entry) => {
let child_diff = compare(&a_entry.result, &b_entry.result, opts);
if is_nonempty_diff(&child_diff) {
changed.push(EmbeddedDiff {
path: b_entry.path.clone(),
diff: Box::new(child_diff),
});
}
}
}
}
for a_entry in a_entries {
if !b_entries.iter().any(|e| e.path == a_entry.path) {
removed.push(a_entry.clone());
}
}
EmbeddedChanges {
added,
removed,
changed,
}
}
fn is_nonempty_diff(diff: &ExtractionDiff) -> bool {
!diff.content_diff.is_empty()
|| !diff.tables_added.is_empty()
|| !diff.tables_removed.is_empty()
|| !diff.tables_changed.is_empty()
|| !diff.embedded_changes.added.is_empty()
|| !diff.embedded_changes.removed.is_empty()
|| !diff.embedded_changes.changed.is_empty()
|| is_nonempty_metadata_diff(&diff.metadata_changed)
}
fn is_nonempty_metadata_diff(val: &serde_json::Value) -> bool {
if val.is_null() {
return false;
}
let empty_obj = serde_json::json!({ "added": {}, "removed": {}, "changed": {} });
val != &empty_obj
}
// ── Tests ─────────────────────────────────────────────────────────────────────
#[cfg(all(test, feature = "diff"))]
mod tests {
use super::*;
use crate::types::{extraction::ExtractionResult, tables::Table};
fn empty_result() -> ExtractionResult {
ExtractionResult::default()
}
fn result_with_content(content: &str) -> ExtractionResult {
ExtractionResult {
content: content.to_string(),
..Default::default()
}
}
fn result_with_tables(tables: Vec<Table>) -> ExtractionResult {
ExtractionResult {
tables,
..Default::default()
}
}
fn simple_table(cells: Vec<Vec<&str>>) -> Table {
Table {
cells: cells
.into_iter()
.map(|row| row.into_iter().map(str::to_string).collect())
.collect(),
markdown: String::new(),
page_number: 1,
bounding_box: None,
}
}
// ── identical inputs ──────────────────────────────────────────────────────
#[test]
fn should_produce_empty_diff_for_identical_inputs() {
let a = empty_result();
let b = empty_result();
let diff = compare(&a, &b, &DiffOptions::default());
assert!(diff.content_diff.is_empty());
assert!(diff.tables_added.is_empty());
assert!(diff.tables_removed.is_empty());
assert!(diff.tables_changed.is_empty());
assert!(diff.embedded_changes.added.is_empty());
assert!(diff.embedded_changes.removed.is_empty());
assert!(diff.embedded_changes.changed.is_empty());
}
#[test]
fn should_produce_empty_diff_for_both_empty_results() {
let diff = compare(
&ExtractionResult::default(),
&ExtractionResult::default(),
&DiffOptions::default(),
);
assert!(!is_nonempty_diff(&diff));
}
// ── content diff ─────────────────────────────────────────────────────────
#[test]
fn should_produce_one_hunk_for_single_line_change() {
let a = result_with_content("Hello world");
let b = result_with_content("Hello Rust");
let diff = compare(&a, &b, &DiffOptions::default());
assert_eq!(diff.content_diff.len(), 1, "expected exactly one hunk");
let hunk = &diff.content_diff[0];
let has_removed = hunk
.lines
.iter()
.any(|l| matches!(l, DiffLine::Removed(t) if t == "Hello world"));
let has_added = hunk
.lines
.iter()
.any(|l| matches!(l, DiffLine::Added(t) if t == "Hello Rust"));
assert!(has_removed, "expected 'Hello world' as Removed line");
assert!(has_added, "expected 'Hello Rust' as Added line");
}
#[test]
fn should_report_correct_line_numbers_for_single_line_change() {
let a = result_with_content("line one\nline two\nline three");
let b = result_with_content("line one\nline TWO\nline three");
let diff = compare(&a, &b, &DiffOptions::default());
assert_eq!(diff.content_diff.len(), 1);
let hunk = &diff.content_diff[0];
// With 3-line context the hunk expands to include surrounding lines.
// Three-line text with change at line 1 (0-indexed): context pulls the
// hunk start back to line 0 (beginning of file).
assert_eq!(hunk.from_line, 0);
assert_eq!(hunk.to_line, 0);
// All 3 lines appear: one context, one changed, one context.
assert_eq!(hunk.from_count, 3);
assert_eq!(hunk.to_count, 3);
// The hunk must contain the changed lines.
let has_removed = hunk
.lines
.iter()
.any(|l| matches!(l, DiffLine::Removed(t) if t == "line two"));
let has_added = hunk
.lines
.iter()
.any(|l| matches!(l, DiffLine::Added(t) if t == "line TWO"));
assert!(has_removed, "expected 'line two' as Removed line");
assert!(has_added, "expected 'line TWO' as Added line");
}
#[test]
fn should_produce_empty_content_diff_when_content_identical_but_tables_differ() {
let mut a = result_with_tables(vec![simple_table(vec![vec!["A", "B"]])]);
a.content = "same text".to_string();
let mut b = result_with_tables(vec![simple_table(vec![vec!["A", "C"]])]);
b.content = "same text".to_string();
let diff = compare(&a, &b, &DiffOptions::default());
assert!(
diff.content_diff.is_empty(),
"content is identical; no content hunks expected"
);
assert!(!diff.tables_changed.is_empty(), "table change expected");
}
// ── table diff ───────────────────────────────────────────────────────────
#[test]
fn should_detect_single_cell_change_in_same_table() {
let a = result_with_tables(vec![simple_table(vec![vec!["A", "B"], vec!["C", "D"]])]);
let b = result_with_tables(vec![simple_table(vec![vec!["A", "B"], vec!["C", "X"]])]);
let diff = compare(&a, &b, &DiffOptions::default());
assert_eq!(diff.tables_changed.len(), 1);
let table_diff = &diff.tables_changed[0];
assert_eq!(table_diff.cell_changes.len(), 1);
let change = &table_diff.cell_changes[0];
assert_eq!(change.row, 1);
assert_eq!(change.col, 1);
assert_eq!(change.from, "D");
assert_eq!(change.to, "X");
}
#[test]
fn should_put_extra_table_in_tables_added() {
let a = result_with_tables(vec![simple_table(vec![vec!["A"]])]);
let b = result_with_tables(vec![simple_table(vec![vec!["A"]]), simple_table(vec![vec!["NEW"]])]);
let diff = compare(&a, &b, &DiffOptions::default());
assert_eq!(diff.tables_added.len(), 1);
assert_eq!(diff.tables_added[0].cells[0][0], "NEW");
assert!(diff.tables_removed.is_empty());
}
#[test]
fn should_put_missing_table_in_tables_removed() {
let a = result_with_tables(vec![simple_table(vec![vec!["A"]]), simple_table(vec![vec!["OLD"]])]);
let b = result_with_tables(vec![simple_table(vec![vec!["A"]])]);
let diff = compare(&a, &b, &DiffOptions::default());
assert_eq!(diff.tables_removed.len(), 1);
assert_eq!(diff.tables_removed[0].cells[0][0], "OLD");
assert!(diff.tables_added.is_empty());
}
// ── embedded diff ─────────────────────────────────────────────────────────
#[test]
fn should_detect_added_embedded_child() {
let a = empty_result();
let mut b = empty_result();
b.children = Some(vec![ArchiveEntry {
path: "doc.txt".to_string(),
mime_type: "text/plain".to_string(),
result: Box::new(result_with_content("hello")),
}]);
let diff = compare(&a, &b, &DiffOptions::default());
assert_eq!(diff.embedded_changes.added.len(), 1);
assert_eq!(diff.embedded_changes.added[0].path, "doc.txt");
assert!(diff.embedded_changes.removed.is_empty());
}
#[test]
fn should_detect_removed_embedded_child() {
let mut a = empty_result();
a.children = Some(vec![ArchiveEntry {
path: "old.txt".to_string(),
mime_type: "text/plain".to_string(),
result: Box::new(result_with_content("old")),
}]);
let b = empty_result();
let diff = compare(&a, &b, &DiffOptions::default());
assert_eq!(diff.embedded_changes.removed.len(), 1);
assert_eq!(diff.embedded_changes.removed[0].path, "old.txt");
assert!(diff.embedded_changes.added.is_empty());
}
}