This commit is contained in:
542
crates/kreuzberg/src/diff/mod.rs
Normal file
542
crates/kreuzberg/src/diff/mod.rs
Normal file
@@ -0,0 +1,542 @@
|
||||
//! Diff two [`ExtractionResult`] values.
|
||||
//!
|
||||
//! This module is gated behind the `diff` Cargo feature. Enable it by adding
|
||||
//! `kreuzberg = { features = ["diff"] }` to your `Cargo.toml`.
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```rust,no_run
|
||||
//! use kreuzberg::{ExtractionResult, diff::{compare, DiffOptions}};
|
||||
//!
|
||||
//! # fn main() {
|
||||
//! let a = ExtractionResult::default();
|
||||
//! let b = ExtractionResult::default();
|
||||
//! let opts = DiffOptions::default();
|
||||
//! let result = compare(&a, &b, &opts);
|
||||
//! assert!(result.content_diff.is_empty());
|
||||
//! # }
|
||||
//! ```
|
||||
|
||||
pub mod types;
|
||||
|
||||
pub use types::{
|
||||
CellChange, DiffHunk, DiffLine, DiffOptions, EmbeddedChanges, EmbeddedDiff, ExtractionDiff, TableDiff,
|
||||
};
|
||||
|
||||
use similar::{ChangeTag, DiffOp, TextDiff};
|
||||
|
||||
use crate::types::extraction::{ArchiveEntry, ExtractionResult};
|
||||
use crate::types::tables::Table;
|
||||
|
||||
/// Number of unchanged context lines kept on each side of a changed region
/// when diff operations are grouped into hunks.
const CONTEXT_LINES: usize = 3;
|
||||
|
||||
/// Compare two extraction results and return a structured diff.
|
||||
///
|
||||
/// The comparison is purely structural — no I/O, no side effects. All fields
|
||||
/// of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `a` — the "before" extraction result
|
||||
/// * `b` — the "after" extraction result
|
||||
/// * `opts` — controls which sections are compared and optional truncation
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,no_run
|
||||
/// use kreuzberg::{ExtractionResult, diff::{compare, DiffOptions}};
|
||||
///
|
||||
/// # fn main() {
|
||||
/// let mut a = ExtractionResult::default();
|
||||
/// let mut b = ExtractionResult::default();
|
||||
/// a.content = "Hello world".to_string();
|
||||
/// b.content = "Hello Rust".to_string();
|
||||
///
|
||||
/// let diff = compare(&a, &b, &DiffOptions::default());
|
||||
/// assert_eq!(diff.content_diff.len(), 1);
|
||||
/// # }
|
||||
/// ```
|
||||
pub fn compare(a: &ExtractionResult, b: &ExtractionResult, opts: &DiffOptions) -> ExtractionDiff {
|
||||
let content_diff = diff_content(&a.content, &b.content, opts);
|
||||
let (tables_added, tables_removed, tables_changed) = diff_tables(&a.tables, &b.tables);
|
||||
let metadata_changed = if opts.include_metadata {
|
||||
diff_metadata(&a.metadata, &b.metadata)
|
||||
} else {
|
||||
serde_json::Value::Null
|
||||
};
|
||||
let embedded_changes = if opts.include_embedded {
|
||||
diff_embedded(a.children.as_deref(), b.children.as_deref(), opts)
|
||||
} else {
|
||||
EmbeddedChanges {
|
||||
added: vec![],
|
||||
removed: vec![],
|
||||
changed: vec![],
|
||||
}
|
||||
};
|
||||
|
||||
ExtractionDiff {
|
||||
content_diff,
|
||||
tables_added,
|
||||
tables_removed,
|
||||
tables_changed,
|
||||
metadata_changed,
|
||||
embedded_changes,
|
||||
}
|
||||
}
|
||||
|
||||
// ── Content diff ─────────────────────────────────────────────────────────────
|
||||
|
||||
fn diff_content(a: &str, b: &str, opts: &DiffOptions) -> Vec<DiffHunk> {
|
||||
let a_text = apply_truncation(a, opts.max_content_chars);
|
||||
let b_text = apply_truncation(b, opts.max_content_chars);
|
||||
let a_ref: &str = a_text.as_deref().unwrap_or(a);
|
||||
let b_ref: &str = b_text.as_deref().unwrap_or(b);
|
||||
|
||||
let text_diff = TextDiff::from_lines(a_ref, b_ref);
|
||||
|
||||
if text_diff.ratio() == 1.0 {
|
||||
return vec![];
|
||||
}
|
||||
|
||||
let mut hunks = Vec::new();
|
||||
for group in text_diff.grouped_ops(CONTEXT_LINES) {
|
||||
let hunk_from_line = hunk_old_start(&group);
|
||||
let hunk_to_line = hunk_new_start(&group);
|
||||
let hunk_from_count = hunk_old_len(&group);
|
||||
let hunk_to_count = hunk_new_len(&group);
|
||||
let mut lines = Vec::new();
|
||||
|
||||
for op in &group {
|
||||
for change in text_diff.iter_changes(op) {
|
||||
let text = change.value().trim_end_matches('\n').to_string();
|
||||
let line = match change.tag() {
|
||||
ChangeTag::Equal => DiffLine::Context(text),
|
||||
ChangeTag::Insert => DiffLine::Added(text),
|
||||
ChangeTag::Delete => DiffLine::Removed(text),
|
||||
};
|
||||
lines.push(line);
|
||||
}
|
||||
}
|
||||
|
||||
if !lines.is_empty() {
|
||||
hunks.push(DiffHunk {
|
||||
from_line: hunk_from_line,
|
||||
from_count: hunk_from_count,
|
||||
to_line: hunk_to_line,
|
||||
to_count: hunk_to_count,
|
||||
lines,
|
||||
});
|
||||
}
|
||||
}
|
||||
hunks
|
||||
}
|
||||
|
||||
fn hunk_old_start(ops: &[DiffOp]) -> usize {
|
||||
ops.first().map_or(0, |op| op.old_range().start)
|
||||
}
|
||||
|
||||
fn hunk_new_start(ops: &[DiffOp]) -> usize {
|
||||
ops.first().map_or(0, |op| op.new_range().start)
|
||||
}
|
||||
|
||||
fn hunk_old_len(ops: &[DiffOp]) -> usize {
|
||||
let start = ops.first().map_or(0, |op| op.old_range().start);
|
||||
let end = ops.last().map_or(0, |op| op.old_range().end);
|
||||
end.saturating_sub(start)
|
||||
}
|
||||
|
||||
fn hunk_new_len(ops: &[DiffOp]) -> usize {
|
||||
let start = ops.first().map_or(0, |op| op.new_range().start);
|
||||
let end = ops.last().map_or(0, |op| op.new_range().end);
|
||||
end.saturating_sub(start)
|
||||
}
|
||||
|
||||
/// Return the first `limit` characters of `text` as an owned string.
///
/// * `limit == None` — no truncation requested; returns `None` so the caller
///   can keep using the original borrowed text.
/// * `limit == Some(n)` — returns `Some` with at most `n` Unicode scalar
///   values (`char`s) taken from the start of `text`. Text that is already
///   short enough is returned whole.
///
/// The limit is counted in characters rather than bytes, so multi-byte UTF-8
/// text is never cut shorter than requested and never split inside a
/// character.
fn apply_truncation(text: &str, limit: Option<usize>) -> Option<String> {
    limit.map(|max_chars| {
        // Byte offset where character number `max_chars` starts; if the text
        // has fewer characters than that, keep all of it.
        let end = text
            .char_indices()
            .nth(max_chars)
            .map_or(text.len(), |(byte_offset, _)| byte_offset);
        text[..end].to_string()
    })
}
|
||||
|
||||
// ── Table diff ────────────────────────────────────────────────────────────────
|
||||
|
||||
fn diff_tables(a_tables: &[Table], b_tables: &[Table]) -> (Vec<Table>, Vec<Table>, Vec<TableDiff>) {
|
||||
let min_len = a_tables.len().min(b_tables.len());
|
||||
let mut tables_changed = Vec::new();
|
||||
|
||||
for idx in 0..min_len {
|
||||
let a_t = &a_tables[idx];
|
||||
let b_t = &b_tables[idx];
|
||||
|
||||
if tables_same_shape(a_t, b_t) {
|
||||
let cell_changes = diff_cells(a_t, b_t);
|
||||
if !cell_changes.is_empty() {
|
||||
tables_changed.push(TableDiff {
|
||||
from_index: idx,
|
||||
to_index: idx,
|
||||
cell_changes,
|
||||
});
|
||||
}
|
||||
} else {
|
||||
// Different shape — treat the pair as remove + add.
|
||||
// The "removed" side is reported in tables_removed and "added" in tables_added.
|
||||
// We handle this by falling through to the asymmetric slice handling below.
|
||||
// But we need to signal that these shouldn't be counted as "paired" — so we
|
||||
// emit them as add + remove even though they share the same index.
|
||||
tables_changed.push(TableDiff {
|
||||
from_index: idx,
|
||||
to_index: idx,
|
||||
// No cell-level changes: shapes differ; report as a structural replacement.
|
||||
cell_changes: vec![],
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let tables_removed: Vec<Table> = if a_tables.len() > b_tables.len() {
|
||||
a_tables[min_len..].to_vec()
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
let tables_added: Vec<Table> = if b_tables.len() > a_tables.len() {
|
||||
b_tables[min_len..].to_vec()
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
|
||||
(tables_added, tables_removed, tables_changed)
|
||||
}
|
||||
|
||||
/// Two tables are considered the same shape if and only if their row and column counts match.
|
||||
///
|
||||
/// Header content is NOT compared — column reordering with the same dimensions will produce
|
||||
/// per-cell `CellChange` entries for every cell whose value differs, not a structural replacement.
|
||||
///
|
||||
/// TODO: smarter shape-matching that aligns tables by header names (instead of positional
|
||||
/// index) is a follow-up; for now dimensions-only is the v1 default.
|
||||
fn tables_same_shape(a: &Table, b: &Table) -> bool {
|
||||
if a.cells.len() != b.cells.len() {
|
||||
return false;
|
||||
}
|
||||
let a_cols = a.cells.first().map_or(0, Vec::len);
|
||||
let b_cols = b.cells.first().map_or(0, Vec::len);
|
||||
a_cols == b_cols
|
||||
}
|
||||
|
||||
fn diff_cells(a: &Table, b: &Table) -> Vec<CellChange> {
|
||||
let mut changes = Vec::new();
|
||||
for (row_idx, (a_row, b_row)) in a.cells.iter().zip(b.cells.iter()).enumerate() {
|
||||
for (col_idx, (a_cell, b_cell)) in a_row.iter().zip(b_row.iter()).enumerate() {
|
||||
if a_cell != b_cell {
|
||||
changes.push(CellChange {
|
||||
row: row_idx,
|
||||
col: col_idx,
|
||||
from: a_cell.clone(),
|
||||
to: b_cell.clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
changes
|
||||
}
|
||||
|
||||
// ── Metadata diff ─────────────────────────────────────────────────────────────
|
||||
|
||||
fn diff_metadata(a: &crate::types::metadata::Metadata, b: &crate::types::metadata::Metadata) -> serde_json::Value {
|
||||
let a_val = serde_json::to_value(a).unwrap_or(serde_json::Value::Null);
|
||||
let b_val = serde_json::to_value(b).unwrap_or(serde_json::Value::Null);
|
||||
|
||||
let a_obj = a_val.as_object().cloned().unwrap_or_default();
|
||||
let b_obj = b_val.as_object().cloned().unwrap_or_default();
|
||||
|
||||
let mut added = serde_json::Map::new();
|
||||
let mut removed = serde_json::Map::new();
|
||||
let mut changed = serde_json::Map::new();
|
||||
|
||||
for (key, b_value) in &b_obj {
|
||||
match a_obj.get(key) {
|
||||
None => {
|
||||
added.insert(key.clone(), b_value.clone());
|
||||
}
|
||||
Some(a_value) if a_value != b_value => {
|
||||
changed.insert(key.clone(), serde_json::json!({ "from": a_value, "to": b_value }));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
for (key, a_value) in &a_obj {
|
||||
if !b_obj.contains_key(key) {
|
||||
removed.insert(key.clone(), a_value.clone());
|
||||
}
|
||||
}
|
||||
|
||||
serde_json::json!({ "added": added, "removed": removed, "changed": changed })
|
||||
}
|
||||
|
||||
// ── Embedded diff ─────────────────────────────────────────────────────────────
|
||||
|
||||
fn diff_embedded(
|
||||
a_children: Option<&[ArchiveEntry]>,
|
||||
b_children: Option<&[ArchiveEntry]>,
|
||||
opts: &DiffOptions,
|
||||
) -> EmbeddedChanges {
|
||||
let a_entries = a_children.unwrap_or(&[]);
|
||||
let b_entries = b_children.unwrap_or(&[]);
|
||||
|
||||
let mut added = Vec::new();
|
||||
let mut removed = Vec::new();
|
||||
let mut changed = Vec::new();
|
||||
|
||||
for b_entry in b_entries {
|
||||
match a_entries.iter().find(|e| e.path == b_entry.path) {
|
||||
None => added.push(b_entry.clone()),
|
||||
Some(a_entry) => {
|
||||
let child_diff = compare(&a_entry.result, &b_entry.result, opts);
|
||||
if is_nonempty_diff(&child_diff) {
|
||||
changed.push(EmbeddedDiff {
|
||||
path: b_entry.path.clone(),
|
||||
diff: Box::new(child_diff),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for a_entry in a_entries {
|
||||
if !b_entries.iter().any(|e| e.path == a_entry.path) {
|
||||
removed.push(a_entry.clone());
|
||||
}
|
||||
}
|
||||
|
||||
EmbeddedChanges {
|
||||
added,
|
||||
removed,
|
||||
changed,
|
||||
}
|
||||
}
|
||||
|
||||
fn is_nonempty_diff(diff: &ExtractionDiff) -> bool {
|
||||
!diff.content_diff.is_empty()
|
||||
|| !diff.tables_added.is_empty()
|
||||
|| !diff.tables_removed.is_empty()
|
||||
|| !diff.tables_changed.is_empty()
|
||||
|| !diff.embedded_changes.added.is_empty()
|
||||
|| !diff.embedded_changes.removed.is_empty()
|
||||
|| !diff.embedded_changes.changed.is_empty()
|
||||
|| is_nonempty_metadata_diff(&diff.metadata_changed)
|
||||
}
|
||||
|
||||
/// Report whether a metadata diff value records any change.
///
/// JSON `null` and the object `{ "added": {}, "removed": {}, "changed": {} }`
/// both count as "no change"; every other value counts as a change.
fn is_nonempty_metadata_diff(val: &serde_json::Value) -> bool {
    !val.is_null() && *val != serde_json::json!({ "added": {}, "removed": {}, "changed": {} })
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(all(test, feature = "diff"))]
mod tests {
    use super::*;
    use crate::types::{extraction::ExtractionResult, tables::Table};

    // ── fixtures ──────────────────────────────────────────────────────────────

    /// An extraction result with every field at its default value.
    fn empty_result() -> ExtractionResult {
        ExtractionResult::default()
    }

    /// A default extraction result whose `content` is set to `content`.
    fn result_with_content(content: &str) -> ExtractionResult {
        ExtractionResult {
            content: String::from(content),
            ..Default::default()
        }
    }

    /// A default extraction result carrying the given tables.
    fn result_with_tables(tables: Vec<Table>) -> ExtractionResult {
        ExtractionResult {
            tables,
            ..Default::default()
        }
    }

    /// A table on page 1 with the given cell grid, no markdown and no bounding box.
    fn simple_table(cells: Vec<Vec<&str>>) -> Table {
        Table {
            cells: cells
                .into_iter()
                .map(|row| row.into_iter().map(String::from).collect())
                .collect(),
            markdown: String::new(),
            page_number: 1,
            bounding_box: None,
        }
    }

    /// A plain-text archive entry at `path` whose nested result has `content`.
    fn text_entry(path: &str, content: &str) -> ArchiveEntry {
        ArchiveEntry {
            path: path.to_string(),
            mime_type: "text/plain".to_string(),
            result: Box::new(result_with_content(content)),
        }
    }

    /// True when `hunk` contains a removed line equal to `text`.
    fn has_removed_line(hunk: &DiffHunk, text: &str) -> bool {
        hunk.lines
            .iter()
            .any(|line| matches!(line, DiffLine::Removed(t) if t == text))
    }

    /// True when `hunk` contains an added line equal to `text`.
    fn has_added_line(hunk: &DiffHunk, text: &str) -> bool {
        hunk.lines
            .iter()
            .any(|line| matches!(line, DiffLine::Added(t) if t == text))
    }

    // ── identical inputs ──────────────────────────────────────────────────────

    #[test]
    fn should_produce_empty_diff_for_identical_inputs() {
        let before = empty_result();
        let after = empty_result();

        let diff = compare(&before, &after, &DiffOptions::default());

        assert!(diff.content_diff.is_empty());
        assert!(diff.tables_added.is_empty());
        assert!(diff.tables_removed.is_empty());
        assert!(diff.tables_changed.is_empty());
        assert!(diff.embedded_changes.added.is_empty());
        assert!(diff.embedded_changes.removed.is_empty());
        assert!(diff.embedded_changes.changed.is_empty());
    }

    #[test]
    fn should_produce_empty_diff_for_both_empty_results() {
        let before = ExtractionResult::default();
        let after = ExtractionResult::default();

        let diff = compare(&before, &after, &DiffOptions::default());

        assert!(!is_nonempty_diff(&diff));
    }

    // ── content diff ─────────────────────────────────────────────────────────

    #[test]
    fn should_produce_one_hunk_for_single_line_change() {
        let before = result_with_content("Hello world");
        let after = result_with_content("Hello Rust");

        let diff = compare(&before, &after, &DiffOptions::default());

        assert_eq!(diff.content_diff.len(), 1, "expected exactly one hunk");
        let hunk = &diff.content_diff[0];
        assert!(
            has_removed_line(hunk, "Hello world"),
            "expected 'Hello world' as Removed line"
        );
        assert!(has_added_line(hunk, "Hello Rust"), "expected 'Hello Rust' as Added line");
    }

    #[test]
    fn should_report_correct_line_numbers_for_single_line_change() {
        let before = result_with_content("line one\nline two\nline three");
        let after = result_with_content("line one\nline TWO\nline three");

        let diff = compare(&before, &after, &DiffOptions::default());

        assert_eq!(diff.content_diff.len(), 1);
        let hunk = &diff.content_diff[0];

        // The change sits on line 1 (0-indexed) of a three-line text; the
        // 3-line context window pulls the hunk start back to line 0.
        assert_eq!(hunk.from_line, 0);
        assert_eq!(hunk.to_line, 0);

        // All three lines are covered: context, changed line, context.
        assert_eq!(hunk.from_count, 3);
        assert_eq!(hunk.to_count, 3);

        // The changed line must show up on both sides.
        assert!(has_removed_line(hunk, "line two"), "expected 'line two' as Removed line");
        assert!(has_added_line(hunk, "line TWO"), "expected 'line TWO' as Added line");
    }

    #[test]
    fn should_produce_empty_content_diff_when_content_identical_but_tables_differ() {
        let mut before = result_with_tables(vec![simple_table(vec![vec!["A", "B"]])]);
        before.content = "same text".to_string();
        let mut after = result_with_tables(vec![simple_table(vec![vec!["A", "C"]])]);
        after.content = "same text".to_string();

        let diff = compare(&before, &after, &DiffOptions::default());

        assert!(
            diff.content_diff.is_empty(),
            "content is identical; no content hunks expected"
        );
        assert!(!diff.tables_changed.is_empty(), "table change expected");
    }

    // ── table diff ───────────────────────────────────────────────────────────

    #[test]
    fn should_detect_single_cell_change_in_same_table() {
        let before = result_with_tables(vec![simple_table(vec![vec!["A", "B"], vec!["C", "D"]])]);
        let after = result_with_tables(vec![simple_table(vec![vec!["A", "B"], vec!["C", "X"]])]);

        let diff = compare(&before, &after, &DiffOptions::default());

        assert_eq!(diff.tables_changed.len(), 1);
        let cell_changes = &diff.tables_changed[0].cell_changes;
        assert_eq!(cell_changes.len(), 1);

        let change = &cell_changes[0];
        assert_eq!(change.row, 1);
        assert_eq!(change.col, 1);
        assert_eq!(change.from, "D");
        assert_eq!(change.to, "X");
    }

    #[test]
    fn should_put_extra_table_in_tables_added() {
        let before = result_with_tables(vec![simple_table(vec![vec!["A"]])]);
        let after = result_with_tables(vec![simple_table(vec![vec!["A"]]), simple_table(vec![vec!["NEW"]])]);

        let diff = compare(&before, &after, &DiffOptions::default());

        assert_eq!(diff.tables_added.len(), 1);
        assert_eq!(diff.tables_added[0].cells[0][0], "NEW");
        assert!(diff.tables_removed.is_empty());
    }

    #[test]
    fn should_put_missing_table_in_tables_removed() {
        let before = result_with_tables(vec![simple_table(vec![vec!["A"]]), simple_table(vec![vec!["OLD"]])]);
        let after = result_with_tables(vec![simple_table(vec![vec!["A"]])]);

        let diff = compare(&before, &after, &DiffOptions::default());

        assert_eq!(diff.tables_removed.len(), 1);
        assert_eq!(diff.tables_removed[0].cells[0][0], "OLD");
        assert!(diff.tables_added.is_empty());
    }

    // ── embedded diff ─────────────────────────────────────────────────────────

    #[test]
    fn should_detect_added_embedded_child() {
        let before = empty_result();
        let mut after = empty_result();
        after.children = Some(vec![text_entry("doc.txt", "hello")]);

        let diff = compare(&before, &after, &DiffOptions::default());

        assert_eq!(diff.embedded_changes.added.len(), 1);
        assert_eq!(diff.embedded_changes.added[0].path, "doc.txt");
        assert!(diff.embedded_changes.removed.is_empty());
    }

    #[test]
    fn should_detect_removed_embedded_child() {
        let mut before = empty_result();
        before.children = Some(vec![text_entry("old.txt", "old")]);
        let after = empty_result();

        let diff = compare(&before, &after, &DiffOptions::default());

        assert_eq!(diff.embedded_changes.removed.len(), 1);
        assert_eq!(diff.embedded_changes.removed[0].path, "old.txt");
        assert!(diff.embedded_changes.added.is_empty());
    }
}
|
||||
127
crates/kreuzberg/src/diff/types.rs
Normal file
127
crates/kreuzberg/src/diff/types.rs
Normal file
@@ -0,0 +1,127 @@
|
||||
//! Types for extraction result diffs.
|
||||
//!
|
||||
//! The canonical definitions of `DiffLine` and `CellChange` live in
|
||||
//! `crate::types::revisions` so that `RevisionDelta` can reference them
|
||||
//! unconditionally without the `diff` feature gate. They are re-exported
|
||||
//! here so the `crate::diff::DiffLine` path continues to work for callers
|
||||
//! who import them through the `diff` feature.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::types::{extraction::ArchiveEntry, tables::Table};
|
||||
|
||||
// Re-export from the unconditional types module so the `diff` feature's
|
||||
// public path (`kreuzberg::diff::DiffLine`, `kreuzberg::diff::CellChange`)
|
||||
// remains stable. The canonical definitions are in `crate::types::revisions`.
|
||||
pub use crate::types::revisions::{CellChange, DiffLine};
|
||||
|
||||
/// Options controlling how two `ExtractionResult` values are compared.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
||||
pub struct DiffOptions {
|
||||
/// Include metadata changes in the diff. Default: `true`.
|
||||
pub include_metadata: bool,
|
||||
/// Include embedded-children changes in the diff. Default: `true`.
|
||||
pub include_embedded: bool,
|
||||
/// Truncate content to this many characters before diffing.
|
||||
///
|
||||
/// Useful for very large documents where only the first N characters matter.
|
||||
/// `None` means no truncation.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub max_content_chars: Option<usize>,
|
||||
}
|
||||
|
||||
impl Default for DiffOptions {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
include_metadata: true,
|
||||
include_embedded: true,
|
||||
max_content_chars: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The complete diff between two `ExtractionResult` values.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
||||
pub struct ExtractionDiff {
|
||||
/// Unified-diff hunks for the `content` field.
|
||||
///
|
||||
/// Empty when the content is identical.
|
||||
pub content_diff: Vec<DiffHunk>,
|
||||
|
||||
/// Tables present in `b` but not in `a` (by index position, excess right-side tables).
|
||||
pub tables_added: Vec<Table>,
|
||||
|
||||
/// Tables present in `a` but not in `b` (by index position, excess left-side tables).
|
||||
pub tables_removed: Vec<Table>,
|
||||
|
||||
/// Cell-level changes for table pairs that share the same index and dimensions.
|
||||
pub tables_changed: Vec<TableDiff>,
|
||||
|
||||
/// Metadata difference, encoded as a JSON object with three top-level keys:
|
||||
/// `added` (keys present in `b` but not `a`), `removed` (keys present in `a`
|
||||
/// but not `b`), and `changed` (keys whose values differ — each entry is
|
||||
/// `{ "from": <value-in-a>, "to": <value-in-b> }`).
|
||||
///
|
||||
/// This is NOT RFC 6902 JSON Patch — we deliberately chose a flatter shape
|
||||
/// to avoid pulling in a json-patch crate. If you need RFC 6902 semantics
|
||||
/// (with JSON Pointer paths) feed `a.metadata` and `b.metadata` to your
|
||||
/// preferred json-patch impl directly.
|
||||
pub metadata_changed: serde_json::Value,
|
||||
|
||||
/// Changes to embedded archive children.
|
||||
pub embedded_changes: EmbeddedChanges,
|
||||
}
|
||||
|
||||
/// A single contiguous hunk in a unified diff.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
||||
pub struct DiffHunk {
|
||||
/// Starting line number in the old content (0-indexed).
|
||||
pub from_line: usize,
|
||||
/// Number of lines from the old content in this hunk.
|
||||
pub from_count: usize,
|
||||
/// Starting line number in the new content (0-indexed).
|
||||
pub to_line: usize,
|
||||
/// Number of lines from the new content in this hunk.
|
||||
pub to_count: usize,
|
||||
/// Lines that make up this hunk.
|
||||
pub lines: Vec<DiffLine>,
|
||||
}
|
||||
|
||||
/// Cell-level changes for a pair of tables that share the same index.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
||||
pub struct TableDiff {
|
||||
/// Zero-based index of the table in both `a.tables` and `b.tables`.
|
||||
pub from_index: usize,
|
||||
/// Zero-based index in `b.tables` (equal to `from_index` for same-dimension tables).
|
||||
pub to_index: usize,
|
||||
/// Cell-level changes within the table.
|
||||
pub cell_changes: Vec<CellChange>,
|
||||
}
|
||||
|
||||
/// Changes to embedded archive children between two results.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
||||
pub struct EmbeddedChanges {
|
||||
/// Children present in `b` but not in `a` (matched by `path`).
|
||||
pub added: Vec<ArchiveEntry>,
|
||||
/// Children present in `a` but not in `b` (matched by `path`).
|
||||
pub removed: Vec<ArchiveEntry>,
|
||||
/// Children present in both but with differing content (matched by `path`).
|
||||
///
|
||||
/// Each entry holds the diff of the nested `ExtractionResult`.
|
||||
pub changed: Vec<EmbeddedDiff>,
|
||||
}
|
||||
|
||||
/// Diff for a single embedded archive entry that appears in both results.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
||||
pub struct EmbeddedDiff {
|
||||
/// Archive-relative path identifying this entry.
|
||||
pub path: String,
|
||||
/// The recursive diff of the entry's extraction result.
|
||||
pub diff: Box<ExtractionDiff>,
|
||||
}
|
||||
Reference in New Issue
Block a user