Detailed changes
@@ -172,6 +172,10 @@ pub struct ExampleScore {
pub wrong_editable_region: Option<bool>,
#[serde(default)]
pub has_isolated_whitespace_changes: bool,
+ #[serde(default)]
+ pub inserted_tokens: usize,
+ #[serde(default)]
+ pub deleted_tokens: usize,
}
impl Example {
@@ -3,6 +3,7 @@ use collections::HashMap;
use crate::{
example::ActualCursor,
reorder_patch::{Patch, PatchLine},
+ word_diff::{DiffOp, diff_tokens, tokenize},
};
pub type Counts = HashMap<String, usize>;
@@ -486,6 +487,91 @@ pub fn is_editable_region_correct(actual_patch: &str) -> bool {
true
}
+#[derive(Debug, Default, Clone)]
+pub struct TokenChangeCounts {
+ pub inserted_tokens: usize,
+ pub deleted_tokens: usize,
+}
+
+/// Counts the number of inserted and deleted tokens in a unified diff patch.
+///
+/// Tokens are words and whitespace sequences (as defined by `word_diff::tokenize`).
+/// Within each hunk, the old (`-`) and new (`+`) lines are compared at the token level
+/// using an LCS-based diff, so modified lines only count the actually changed tokens
+/// rather than the entire line.
+pub fn count_patch_token_changes(patch: &str) -> TokenChangeCounts {
+ let mut counts = TokenChangeCounts::default();
+ let mut old_lines: Vec<&str> = Vec::new();
+ let mut new_lines: Vec<&str> = Vec::new();
+
+ let flush =
+ |old_lines: &mut Vec<&str>, new_lines: &mut Vec<&str>, counts: &mut TokenChangeCounts| {
+ if old_lines.is_empty() && new_lines.is_empty() {
+ return;
+ }
+
+ let old_text: String = old_lines
+ .iter()
+ .map(|line| if line.len() > 1 { &line[1..] } else { "" })
+ .collect::<Vec<_>>()
+ .join("\n");
+
+ let new_text: String = new_lines
+ .iter()
+ .map(|line| if line.len() > 1 { &line[1..] } else { "" })
+ .collect::<Vec<_>>()
+ .join("\n");
+
+ let old_tokens = tokenize(&old_text);
+ let new_tokens = tokenize(&new_text);
+ let ops = diff_tokens(&old_tokens, &new_tokens);
+
+ for op in ops {
+ match op {
+ DiffOp::Equal(..) => {}
+ DiffOp::Delete(start, end) => {
+ counts.deleted_tokens += end - start;
+ }
+ DiffOp::Insert(start, end) => {
+ counts.inserted_tokens += end - start;
+ }
+ DiffOp::Replace {
+ old_start,
+ old_end,
+ new_start,
+ new_end,
+ } => {
+ counts.deleted_tokens += old_end - old_start;
+ counts.inserted_tokens += new_end - new_start;
+ }
+ }
+ }
+
+ old_lines.clear();
+ new_lines.clear();
+ };
+
+ for line in patch.lines() {
+ if line.starts_with("---")
+ || line.starts_with("+++")
+ || line.starts_with("@@")
+ || line.starts_with("diff ")
+ || line.starts_with("index ")
+ {
+ flush(&mut old_lines, &mut new_lines, &mut counts);
+ } else if line.starts_with('-') {
+ old_lines.push(line);
+ } else if line.starts_with('+') {
+ new_lines.push(line);
+ } else {
+ flush(&mut old_lines, &mut new_lines, &mut counts);
+ }
+ }
+
+ flush(&mut old_lines, &mut new_lines, &mut counts);
+ counts
+}
+
#[cfg(test)]
mod test_optimization {
use super::*;
@@ -977,4 +1063,116 @@ index abc123..def456 100644
let cursor = cursor_on_line(2);
assert!(has_isolated_whitespace_changes(patch, Some(&cursor)));
}
+
+ #[test]
+ fn test_count_patch_token_changes_real_world_rename() {
+ // Real-world patch that was reported as returning 0 tokens
+ let patch = "--- a/sip_call\\README.md\n+++ b/sip_call\\README.md\n@@ -1,1 +1,1 @@\n-# \n+# SIP Call\n";
+ let counts = count_patch_token_changes(patch);
+ // "# " vs "# SIP Call" — the "SIP" and "Call" tokens (and a whitespace token) are inserted
+ assert!(
+ counts.inserted_tokens > 0,
+ "expected inserted tokens > 0, got {}",
+ counts.inserted_tokens
+ );
+ assert_eq!(counts.deleted_tokens, 0);
+ }
+
+ #[test]
+ fn test_count_patch_token_changes_real_world_expansion() {
+ // Real-world patch: single token expanded to multiple lines
+ let patch = "--- a/task1/src/app/app.html\n+++ b/task1/src/app/app.html\n@@ -1,7 +1,9 @@\n <style>\n- m\n+ main {\n+ \n+ }\n </style>\n \n <main>\n \n </main>\n";
+ let counts = count_patch_token_changes(patch);
+ assert!(
+ counts.inserted_tokens > 0,
+ "expected inserted tokens > 0, got {}",
+ counts.inserted_tokens
+ );
+ assert!(
+ counts.deleted_tokens > 0,
+ "expected deleted tokens > 0, got {}",
+ counts.deleted_tokens
+ );
+ }
+
+ #[test]
+ fn test_count_patch_token_changes_simple_replacement() {
+ let patch = indoc! {"
+ @@ -1,3 +1,3 @@
+ fn main() {
+ - println!(\"hello\");
+ + println!(\"world\");
+ }
+ "};
+ let counts = count_patch_token_changes(patch);
+ assert_eq!(counts.deleted_tokens, 1, "deleted: \"hello\"");
+ assert_eq!(counts.inserted_tokens, 1, "inserted: \"world\"");
+ }
+
+ #[test]
+ fn test_count_patch_token_changes_insertion_only() {
+ let patch = indoc! {"
+ @@ -1,2 +1,3 @@
+ fn main() {
+ + println!(\"hello\");
+ }
+ "};
+ let counts = count_patch_token_changes(patch);
+ assert_eq!(counts.deleted_tokens, 0);
+ assert!(counts.inserted_tokens > 0);
+ }
+
+ #[test]
+ fn test_count_patch_token_changes_deletion_only() {
+ let patch = indoc! {"
+ @@ -1,3 +1,2 @@
+ fn main() {
+ - println!(\"hello\");
+ }
+ "};
+ let counts = count_patch_token_changes(patch);
+ assert!(counts.deleted_tokens > 0);
+ assert_eq!(counts.inserted_tokens, 0);
+ }
+
+ #[test]
+ fn test_count_patch_token_changes_empty_patch() {
+ let patch = "";
+ let counts = count_patch_token_changes(patch);
+ assert_eq!(counts.deleted_tokens, 0);
+ assert_eq!(counts.inserted_tokens, 0);
+ }
+
+ #[test]
+ fn test_count_patch_token_changes_multiple_hunks() {
+ let patch = indoc! {"
+ @@ -1,3 +1,3 @@
+ fn main() {
+ - let x = 1;
+ + let x = 2;
+ }
+ @@ -10,3 +10,3 @@
+ fn other() {
+ - let y = 3;
+ + let y = 4;
+ }
+ "};
+ let counts = count_patch_token_changes(patch);
+ assert_eq!(counts.deleted_tokens, 2, "deleted: \"1\" and \"3\"");
+ assert_eq!(counts.inserted_tokens, 2, "inserted: \"2\" and \"4\"");
+ }
+
+ #[test]
+ fn test_count_patch_token_changes_multiword_change() {
+ let patch = indoc! {"
+ @@ -1,1 +1,1 @@
+ -hello world foo
+ +hello bar baz
+ "};
+ let counts = count_patch_token_changes(patch);
+ // "world" and "foo" deleted, "bar" and "baz" inserted
+ // (whitespace tokens between them may also count)
+ assert!(counts.deleted_tokens >= 2);
+ assert!(counts.inserted_tokens >= 2);
+ }
}
@@ -76,6 +76,8 @@ pub async fn run_scoring(
cursor_exact_match: None,
wrong_editable_region: None,
has_isolated_whitespace_changes: false,
+ inserted_tokens: 0,
+ deleted_tokens: 0,
};
let prompt_inputs = example.prompt_inputs.as_ref().unwrap();
@@ -95,10 +97,15 @@ pub async fn run_scoring(
continue;
};
+ let token_changes = metrics::count_patch_token_changes(&actual_patch);
+
let actual_text = match apply_diff_to_string(&actual_patch, original_text) {
Ok(text) => text,
Err(_) => {
- scores.push(zero_scores.clone());
+ let mut s = zero_scores.clone();
+ s.inserted_tokens = token_changes.inserted_tokens;
+ s.deleted_tokens = token_changes.deleted_tokens;
+ scores.push(s);
continue;
}
};
@@ -181,6 +188,8 @@ pub async fn run_scoring(
cursor_exact_match,
wrong_editable_region,
has_isolated_whitespace_changes,
+ inserted_tokens: token_changes.inserted_tokens,
+ deleted_tokens: token_changes.deleted_tokens,
});
}
@@ -238,6 +247,9 @@ pub fn print_report(examples: &[Example]) {
let mut wrong_editable_region_count: usize = 0;
let mut wrong_editable_region_total: usize = 0;
let mut isolated_whitespace_count: usize = 0;
+ let mut patch_inserted_tokens: Vec<usize> = Vec::new();
+ let mut patch_deleted_tokens: Vec<usize> = Vec::new();
+ let mut predictions_with_patch: usize = 0;
for example in examples {
for (score_idx, score) in example.score.iter().enumerate() {
@@ -321,6 +333,18 @@ pub fn print_report(examples: &[Example]) {
isolated_whitespace_count += 1;
}
+ // Accumulate token change metrics (only for predictions that produced a patch)
+ let has_patch = example
+ .predictions
+ .get(score_idx)
+ .and_then(|p| p.actual_patch.as_ref())
+ .is_some_and(|p| !p.is_empty());
+ if has_patch {
+ predictions_with_patch += 1;
+ patch_inserted_tokens.push(score.inserted_tokens);
+ patch_deleted_tokens.push(score.deleted_tokens);
+ }
+
// Accumulate cursor metrics
if let Some(exact_match) = score.cursor_exact_match {
cursor_total += 1;
@@ -421,11 +445,70 @@ pub fn print_report(examples: &[Example]) {
if total_scores > 0 {
println!("Isolated whitespace changes: {}", isolated_ws_str);
}
+
+ // Print token change percentile summary (only for predictions with a patch)
+ if !patch_inserted_tokens.is_empty() {
+ patch_inserted_tokens.sort_unstable();
+ patch_deleted_tokens.sort_unstable();
+ let mut patch_total_tokens: Vec<usize> = patch_inserted_tokens
+ .iter()
+ .zip(patch_deleted_tokens.iter())
+ .map(|(i, d)| i + d)
+ .collect();
+ patch_total_tokens.sort_unstable();
+
+ let patch_rate = predictions_with_patch as f32 / total_scores as f32 * 100.0;
+ println!();
+ println!(
+ "Token changes ({}/{} predictions produced a patch, {:.1}% — table includes only those)",
+ predictions_with_patch, total_scores, patch_rate
+ );
+ println!(
+ "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
+ "", "p25", "p50", "p75", "p90", "p99"
+ );
+ println!("{}", "─".repeat(LINE_WIDTH));
+ println!(
+ "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
+ "Inserted tokens",
+ percentile(&patch_inserted_tokens, 25),
+ percentile(&patch_inserted_tokens, 50),
+ percentile(&patch_inserted_tokens, 75),
+ percentile(&patch_inserted_tokens, 90),
+ percentile(&patch_inserted_tokens, 99),
+ );
+ println!(
+ "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
+ "Deleted tokens",
+ percentile(&patch_deleted_tokens, 25),
+ percentile(&patch_deleted_tokens, 50),
+ percentile(&patch_deleted_tokens, 75),
+ percentile(&patch_deleted_tokens, 90),
+ percentile(&patch_deleted_tokens, 99),
+ );
+ println!(
+ "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}",
+ "Total tokens",
+ percentile(&patch_total_tokens, 25),
+ percentile(&patch_total_tokens, 50),
+ percentile(&patch_total_tokens, 75),
+ percentile(&patch_total_tokens, 90),
+ percentile(&patch_total_tokens, 99),
+ );
+ }
}
println!("\n");
}
+fn percentile(sorted_values: &[usize], p: usize) -> usize {
+ if sorted_values.is_empty() {
+ return 0;
+ }
+ let idx = (p as f64 / 100.0 * (sorted_values.len() as f64 - 1.0)).round() as usize;
+ sorted_values[idx.min(sorted_values.len() - 1)]
+}
+
fn truncate_name(name: &str, max_len: usize) -> String {
if name.len() <= max_len {
name.to_string()
@@ -6,6 +6,7 @@
//! TODO: Port Python code to generate chronologically-ordered commits
use crate::FailedHandling;
use crate::reorder_patch::{Patch, PatchLine, extract_edits, locate_edited_line};
+use crate::word_diff::tokenize;
/// Find the largest valid UTF-8 char boundary at or before `index` in `s`.
fn floor_char_boundary(s: &str, index: usize) -> usize {
@@ -413,37 +414,6 @@ pub fn split_ordered_commit(commit: &str, split_pos: usize) -> (String, String)
(source_str, target_str)
}
-/// Tokenize text into words and non-word characters.
-fn tokenize(text: &str) -> Vec<String> {
- let mut tokens = Vec::new();
- let mut current = String::new();
-
- for ch in text.chars() {
- if ch.is_alphanumeric() {
- current.push(ch);
- } else if ch == '_' {
- // Include underscore with the current word, then flush
- current.push(ch);
- if !current.is_empty() {
- tokens.push(std::mem::take(&mut current));
- }
- } else {
- // Punctuation or whitespace - flush current word first
- if !current.is_empty() {
- tokens.push(std::mem::take(&mut current));
- }
- // Each punctuation/whitespace is its own token
- tokens.push(ch.to_string());
- }
- }
-
- if !current.is_empty() {
- tokens.push(current);
- }
-
- tokens
-}
-
/// Calculate the weight for a split position based on the character at that position.
///
/// Higher weights indicate more natural pause points (e.g., after punctuation,
@@ -647,12 +617,8 @@ pub fn imitate_human_edits(
let src_tokens = tokenize(&src_line);
let tgt_tokens = tokenize(&tgt_line);
- // Convert to slices for similar
- let src_refs: Vec<&str> = src_tokens.iter().map(|s| s.as_str()).collect();
- let tgt_refs: Vec<&str> = tgt_tokens.iter().map(|s| s.as_str()).collect();
-
// Use similar to get diff operations
- let diff = TextDiff::from_slices(&src_refs, &tgt_refs);
+ let diff = TextDiff::from_slices(&src_tokens, &tgt_tokens);
// Build weights for each possible split position
let mut position_weights: Vec<u32> = Vec::new();
@@ -665,12 +631,12 @@ pub fn imitate_human_edits(
match op.tag() {
DiffTag::Equal => {
for i in op.old_range() {
- current_text.push_str(&src_tokens[i]);
+ current_text.push_str(src_tokens[i]);
}
}
DiffTag::Replace => {
- let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
- let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
+ let ins: String = op.new_range().map(|i| tgt_tokens[i]).collect();
+ let del: String = op.old_range().map(|i| src_tokens[i]).collect();
// For insertion part
for ch in ins.chars() {
@@ -686,7 +652,7 @@ pub fn imitate_human_edits(
}
}
DiffTag::Insert => {
- let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
+ let ins: String = op.new_range().map(|i| tgt_tokens[i]).collect();
for ch in ins.chars() {
current_text.push(ch);
let weight = position_weight(&current_text, current_text.len());
@@ -694,7 +660,7 @@ pub fn imitate_human_edits(
}
}
DiffTag::Delete => {
- let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
+ let del: String = op.old_range().map(|i| src_tokens[i]).collect();
for _ in del.chars() {
// Weight deletions lower
position_weights.push(2);
@@ -719,14 +685,14 @@ pub fn imitate_human_edits(
match op.tag() {
DiffTag::Equal => {
for i in op.old_range() {
- new_src.push_str(&src_tokens[i]);
+ new_src.push_str(src_tokens[i]);
}
last_old_end = op.old_range().end;
}
DiffTag::Replace => {
// Handle replace as delete + insert
- let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
- let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
+ let del: String = op.old_range().map(|i| src_tokens[i]).collect();
+ let ins: String = op.new_range().map(|i| tgt_tokens[i]).collect();
let repl_len = del.len() + ins.len();
if edit_index + repl_len >= split_index {
// Split within this replace operation
@@ -750,7 +716,7 @@ pub fn imitate_human_edits(
}
}
DiffTag::Insert => {
- let repl: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect();
+ let repl: String = op.new_range().map(|i| tgt_tokens[i]).collect();
if edit_index + repl.len() >= split_index {
let offset = split_index - edit_index;
let safe_offset = floor_char_boundary(&repl, offset);
@@ -763,7 +729,7 @@ pub fn imitate_human_edits(
}
}
DiffTag::Delete => {
- let repl: String = op.old_range().map(|i| src_tokens[i].as_str()).collect();
+ let repl: String = op.old_range().map(|i| src_tokens[i]).collect();
if edit_index + repl.len() >= split_index {
let offset = split_index - edit_index;
let safe_offset = floor_char_boundary(&repl, offset);
@@ -797,10 +763,10 @@ pub fn imitate_human_edits(
// Add remainder of source if similar enough to target remainder
let remainder_src: String = (last_old_end..src_tokens.len())
- .map(|i| src_tokens[i].as_str())
+ .map(|i| src_tokens[i])
.collect();
let remainder_tgt: String = (last_old_end..tgt_tokens.len())
- .filter_map(|i| tgt_tokens.get(i).map(|s| s.as_str()))
+ .filter_map(|i| tgt_tokens.get(i).copied())
.collect();
let ratio = fuzzy_ratio(&remainder_src, &remainder_tgt);
@@ -1104,13 +1070,13 @@ mod tests {
assert_eq!(tokens, vec!["hello", " ", "world"]);
let tokens = tokenize("foo_bar123 + baz");
- assert_eq!(tokens, vec!["foo_", "bar123", " ", "+", " ", "baz"]);
+ assert_eq!(tokens, vec!["foo_bar123", " ", "+", " ", "baz"]);
let tokens = tokenize("print(\"hello\")");
assert_eq!(tokens, vec!["print", "(", "\"", "hello", "\"", ")"]);
let tokens = tokenize("hello_world");
- assert_eq!(tokens, vec!["hello_", "world"]);
+ assert_eq!(tokens, vec!["hello_world"]);
let tokens = tokenize("fn();");
assert_eq!(tokens, vec!["fn", "(", ")", ";"]);
@@ -1,5 +1,7 @@
//! Word-diff utilities for converting unified diffs to word-diff format.
+use similar::{DiffTag, TextDiff};
+
/// Convert unified diff to word-diff format.
///
/// This transforms line-based diffs into word-level diffs where:
@@ -129,29 +131,38 @@ fn compute_word_diff(old_text: &str, new_text: &str) -> String {
result
}
-/// Tokenize text into words and whitespace sequences.
-fn tokenize(text: &str) -> Vec<&str> {
+/// Classify a character into one of three token classes:
+/// - 0: identifier (alphanumeric or `_`)
+/// - 1: whitespace
+/// - 2: punctuation (everything else, each character becomes its own token)
+fn char_class(ch: char) -> u8 {
+ if ch.is_alphanumeric() || ch == '_' {
+ 0
+ } else if ch.is_whitespace() {
+ 1
+ } else {
+ 2
+ }
+}
+
+/// Tokenize text into identifier words, whitespace runs, and individual punctuation characters.
+///
+/// This splitting aligns with the syntactic atoms of source code so that the
+/// LCS-based diff can produce fine-grained, meaningful change regions.
+pub(crate) fn tokenize(text: &str) -> Vec<&str> {
let mut tokens = Vec::new();
let mut chars = text.char_indices().peekable();
while let Some((start, ch)) = chars.next() {
- if ch.is_whitespace() {
- // Collect contiguous whitespace
- let mut end = start + ch.len_utf8();
- while let Some(&(_, next_ch)) = chars.peek() {
- if next_ch.is_whitespace() {
- end += next_ch.len_utf8();
- chars.next();
- } else {
- break;
- }
- }
- tokens.push(&text[start..end]);
+ let class = char_class(ch);
+ if class == 2 {
+ // Punctuation: each character is a separate token
+ tokens.push(&text[start..start + ch.len_utf8()]);
} else {
- // Collect contiguous non-whitespace
+ // Identifier or whitespace: collect contiguous run of same class
let mut end = start + ch.len_utf8();
while let Some(&(_, next_ch)) = chars.peek() {
- if !next_ch.is_whitespace() {
+ if char_class(next_ch) == class {
end += next_ch.len_utf8();
chars.next();
} else {
@@ -166,7 +177,7 @@ fn tokenize(text: &str) -> Vec<&str> {
}
#[derive(Debug)]
-enum DiffOp {
+pub(crate) enum DiffOp {
Equal(usize, usize),
Delete(usize, usize),
Insert(usize, usize),
@@ -178,130 +189,28 @@ enum DiffOp {
},
}
-/// Compute diff operations between two token sequences using a simple LCS-based algorithm.
-fn diff_tokens<'a>(old: &[&'a str], new: &[&'a str]) -> Vec<DiffOp> {
- // Build LCS table
- let m = old.len();
- let n = new.len();
-
- if m == 0 && n == 0 {
- return vec![];
- }
- if m == 0 {
- return vec![DiffOp::Insert(0, n)];
- }
- if n == 0 {
- return vec![DiffOp::Delete(0, m)];
- }
-
- // LCS dynamic programming
- let mut dp = vec![vec![0usize; n + 1]; m + 1];
- for i in 1..=m {
- for j in 1..=n {
- if old[i - 1] == new[j - 1] {
- dp[i][j] = dp[i - 1][j - 1] + 1;
- } else {
- dp[i][j] = dp[i - 1][j].max(dp[i][j - 1]);
- }
- }
- }
-
- // Backtrack to find operations
- let mut ops = Vec::new();
- let mut i = m;
- let mut j = n;
-
- // We'll collect in reverse order, then reverse at the end
- let mut stack: Vec<(usize, usize, bool)> = Vec::new(); // (index, end, is_old)
-
- while i > 0 || j > 0 {
- if i > 0 && j > 0 && old[i - 1] == new[j - 1] {
- stack.push((i - 1, i, true)); // Equal marker (using old index)
- stack.push((j - 1, j, false)); // Paired with new index
- i -= 1;
- j -= 1;
- } else if j > 0 && (i == 0 || dp[i][j - 1] >= dp[i - 1][j]) {
- // Insert from new
- stack.push((j - 1, j, false));
- j -= 1;
- } else {
- // Delete from old
- stack.push((i - 1, i, true));
- i -= 1;
- }
- }
-
- // Process the stack to build proper DiffOps
- // This is a simplified approach - just iterate through and build ops
- let mut old_idx = 0;
- let mut new_idx = 0;
-
- while old_idx < m || new_idx < n {
- // Find next matching pair
- let mut old_match = None;
- let mut new_match = None;
-
- for oi in old_idx..m {
- for ni in new_idx..n {
- if old[oi] == new[ni] {
- old_match = Some(oi);
- new_match = Some(ni);
- break;
- }
- }
- if old_match.is_some() {
- break;
- }
- }
-
- match (old_match, new_match) {
- (Some(om), Some(nm)) => {
- // Handle any deletions/insertions before the match
- if old_idx < om && new_idx < nm {
- ops.push(DiffOp::Replace {
- old_start: old_idx,
- old_end: om,
- new_start: new_idx,
- new_end: nm,
- });
- } else if old_idx < om {
- ops.push(DiffOp::Delete(old_idx, om));
- } else if new_idx < nm {
- ops.push(DiffOp::Insert(new_idx, nm));
- }
-
- // Find the extent of the equal sequence
- let mut eq_end_old = om;
- let mut eq_end_new = nm;
- while eq_end_old < m && eq_end_new < n && old[eq_end_old] == new[eq_end_new] {
- eq_end_old += 1;
- eq_end_new += 1;
- }
-
- ops.push(DiffOp::Equal(om, eq_end_old));
- old_idx = eq_end_old;
- new_idx = eq_end_new;
- }
- _ => {
- // No more matches, handle remaining
- if old_idx < m && new_idx < n {
- ops.push(DiffOp::Replace {
- old_start: old_idx,
- old_end: m,
- new_start: new_idx,
- new_end: n,
- });
- } else if old_idx < m {
- ops.push(DiffOp::Delete(old_idx, m));
- } else if new_idx < n {
- ops.push(DiffOp::Insert(new_idx, n));
- }
- break;
+/// Compute diff operations between two token sequences using `similar`'s Myers diff.
+pub(crate) fn diff_tokens<'a>(old: &[&'a str], new: &[&'a str]) -> Vec<DiffOp> {
+ let diff = TextDiff::from_slices(old, new);
+ diff.ops()
+ .iter()
+ .map(|op| {
+ let tag = op.tag();
+ let old_range = op.old_range();
+ let new_range = op.new_range();
+ match tag {
+ DiffTag::Equal => DiffOp::Equal(old_range.start, old_range.end),
+ DiffTag::Delete => DiffOp::Delete(old_range.start, old_range.end),
+ DiffTag::Insert => DiffOp::Insert(new_range.start, new_range.end),
+ DiffTag::Replace => DiffOp::Replace {
+ old_start: old_range.start,
+ old_end: old_range.end,
+ new_start: new_range.start,
+ new_end: new_range.end,
+ },
}
- }
- }
-
- ops
+ })
+ .collect()
}
#[cfg(test)]
@@ -315,6 +224,24 @@ mod tests {
let tokens = tokenize(" multiple spaces ");
assert_eq!(tokens, vec![" ", "multiple", " ", "spaces", " "]);
+
+ let tokens = tokenize("self.name");
+ assert_eq!(tokens, vec!["self", ".", "name"]);
+
+ let tokens = tokenize("foo(bar, baz)");
+ assert_eq!(tokens, vec!["foo", "(", "bar", ",", " ", "baz", ")"]);
+
+ let tokens = tokenize("hello_world");
+ assert_eq!(tokens, vec!["hello_world"]);
+
+ let tokens = tokenize("fn();");
+ assert_eq!(tokens, vec!["fn", "(", ")", ";"]);
+
+ let tokens = tokenize("foo_bar123 + baz");
+ assert_eq!(tokens, vec!["foo_bar123", " ", "+", " ", "baz"]);
+
+ let tokens = tokenize("print(\"hello\")");
+ assert_eq!(tokens, vec!["print", "(", "\"", "hello", "\"", ")"]);
}
#[test]