From ae293de057c82b1b0c709aa44924d87821d7c7dc Mon Sep 17 00:00:00 2001 From: Oleksiy Syvokon Date: Tue, 17 Feb 2026 09:40:10 +0200 Subject: [PATCH] ep: Add metrics for inserted/deleted tokens (#49330) Other changes: - Changed tokenization to more code-aware tokenization from split-commit - Fixed word-diff implementation which was inefficient and sometimes incorrect Release Notes: - N/A --- crates/edit_prediction_cli/src/example.rs | 4 + crates/edit_prediction_cli/src/metrics.rs | 198 +++++++++++++++++ crates/edit_prediction_cli/src/score.rs | 85 ++++++- .../edit_prediction_cli/src/split_commit.rs | 66 ++---- crates/edit_prediction_cli/src/word_diff.rs | 207 ++++++------------ 5 files changed, 369 insertions(+), 191 deletions(-) diff --git a/crates/edit_prediction_cli/src/example.rs b/crates/edit_prediction_cli/src/example.rs index 5fd81afd30a6e3f9e643702361a8cf80b8b47b60..6fad9389dd6f4aa203c1ba9a13fe9e03089ee784 100644 --- a/crates/edit_prediction_cli/src/example.rs +++ b/crates/edit_prediction_cli/src/example.rs @@ -172,6 +172,10 @@ pub struct ExampleScore { pub wrong_editable_region: Option, #[serde(default)] pub has_isolated_whitespace_changes: bool, + #[serde(default)] + pub inserted_tokens: usize, + #[serde(default)] + pub deleted_tokens: usize, } impl Example { diff --git a/crates/edit_prediction_cli/src/metrics.rs b/crates/edit_prediction_cli/src/metrics.rs index 2b649bbc63e3d0f8f50afd1b79e5c97816e46dda..fc870c36c9c62f4d74486ddd4b2d35176b00bb5c 100644 --- a/crates/edit_prediction_cli/src/metrics.rs +++ b/crates/edit_prediction_cli/src/metrics.rs @@ -3,6 +3,7 @@ use collections::HashMap; use crate::{ example::ActualCursor, reorder_patch::{Patch, PatchLine}, + word_diff::{DiffOp, diff_tokens, tokenize}, }; pub type Counts = HashMap; @@ -486,6 +487,91 @@ pub fn is_editable_region_correct(actual_patch: &str) -> bool { true } +#[derive(Debug, Default, Clone)] +pub struct TokenChangeCounts { + pub inserted_tokens: usize, + pub deleted_tokens: usize, +} + +/// 
Counts the number of inserted and deleted tokens in a unified diff patch. +/// +/// Tokens are identifiers, whitespace runs, and single punctuation chars (see `word_diff::tokenize`). +/// Within each hunk, the old (`-`) and new (`+`) lines are compared at the token level +/// using an LCS-based diff, so modified lines only count the actually changed tokens +/// rather than the entire line. +pub fn count_patch_token_changes(patch: &str) -> TokenChangeCounts { + let mut counts = TokenChangeCounts::default(); + let mut old_lines: Vec<&str> = Vec::new(); + let mut new_lines: Vec<&str> = Vec::new(); + + let flush = + |old_lines: &mut Vec<&str>, new_lines: &mut Vec<&str>, counts: &mut TokenChangeCounts| { + if old_lines.is_empty() && new_lines.is_empty() { + return; + } + + let old_text: String = old_lines + .iter() + .map(|line| if line.len() > 1 { &line[1..] } else { "" }) + .collect::<Vec<_>>() + .join("\n"); + + let new_text: String = new_lines + .iter() + .map(|line| if line.len() > 1 { &line[1..] } else { "" }) + .collect::<Vec<_>>() + .join("\n"); + + let old_tokens = tokenize(&old_text); + let new_tokens = tokenize(&new_text); + let ops = diff_tokens(&old_tokens, &new_tokens); + + for op in ops { + match op { + DiffOp::Equal(..) 
=> {} + DiffOp::Delete(start, end) => { + counts.deleted_tokens += end - start; + } + DiffOp::Insert(start, end) => { + counts.inserted_tokens += end - start; + } + DiffOp::Replace { + old_start, + old_end, + new_start, + new_end, + } => { + counts.deleted_tokens += old_end - old_start; + counts.inserted_tokens += new_end - new_start; + } + } + } + + old_lines.clear(); + new_lines.clear(); + }; + + for line in patch.lines() { + if line.starts_with("---") + || line.starts_with("+++") + || line.starts_with("@@") + || line.starts_with("diff ") + || line.starts_with("index ") + { + flush(&mut old_lines, &mut new_lines, &mut counts); + } else if line.starts_with('-') { + old_lines.push(line); + } else if line.starts_with('+') { + new_lines.push(line); + } else { + flush(&mut old_lines, &mut new_lines, &mut counts); + } + } + + flush(&mut old_lines, &mut new_lines, &mut counts); + counts +} + #[cfg(test)] mod test_optimization { use super::*; @@ -977,4 +1063,116 @@ index abc123..def456 100644 let cursor = cursor_on_line(2); assert!(has_isolated_whitespace_changes(patch, Some(&cursor))); } + + #[test] + fn test_count_patch_token_changes_real_world_rename() { + // Real-world patch that was reported as returning 0 tokens + let patch = "--- a/sip_call\\README.md\n+++ b/sip_call\\README.md\n@@ -1,1 +1,1 @@\n-# \n+# SIP Call\n"; + let counts = count_patch_token_changes(patch); + // "# " vs "# SIP Call" — the "SIP" and "Call" tokens (and a whitespace token) are inserted + assert!( + counts.inserted_tokens > 0, + "expected inserted tokens > 0, got {}", + counts.inserted_tokens + ); + assert_eq!(counts.deleted_tokens, 0); + } + + #[test] + fn test_count_patch_token_changes_real_world_expansion() { + // Real-world patch: single token expanded to multiple lines + let patch = "--- a/task1/src/app/app.html\n+++ b/task1/src/app/app.html\n@@ -1,7 +1,9 @@\n \n \n
\n \n
\n"; + let counts = count_patch_token_changes(patch); + assert!( + counts.inserted_tokens > 0, + "expected inserted tokens > 0, got {}", + counts.inserted_tokens + ); + assert!( + counts.deleted_tokens > 0, + "expected deleted tokens > 0, got {}", + counts.deleted_tokens + ); + } + + #[test] + fn test_count_patch_token_changes_simple_replacement() { + let patch = indoc! {" + @@ -1,3 +1,3 @@ + fn main() { + - println!(\"hello\"); + + println!(\"world\"); + } + "}; + let counts = count_patch_token_changes(patch); + assert_eq!(counts.deleted_tokens, 1, "deleted: \"hello\""); + assert_eq!(counts.inserted_tokens, 1, "inserted: \"world\""); + } + + #[test] + fn test_count_patch_token_changes_insertion_only() { + let patch = indoc! {" + @@ -1,2 +1,3 @@ + fn main() { + + println!(\"hello\"); + } + "}; + let counts = count_patch_token_changes(patch); + assert_eq!(counts.deleted_tokens, 0); + assert!(counts.inserted_tokens > 0); + } + + #[test] + fn test_count_patch_token_changes_deletion_only() { + let patch = indoc! {" + @@ -1,3 +1,2 @@ + fn main() { + - println!(\"hello\"); + } + "}; + let counts = count_patch_token_changes(patch); + assert!(counts.deleted_tokens > 0); + assert_eq!(counts.inserted_tokens, 0); + } + + #[test] + fn test_count_patch_token_changes_empty_patch() { + let patch = ""; + let counts = count_patch_token_changes(patch); + assert_eq!(counts.deleted_tokens, 0); + assert_eq!(counts.inserted_tokens, 0); + } + + #[test] + fn test_count_patch_token_changes_multiple_hunks() { + let patch = indoc! {" + @@ -1,3 +1,3 @@ + fn main() { + - let x = 1; + + let x = 2; + } + @@ -10,3 +10,3 @@ + fn other() { + - let y = 3; + + let y = 4; + } + "}; + let counts = count_patch_token_changes(patch); + assert_eq!(counts.deleted_tokens, 2, "deleted: \"1\" and \"3\""); + assert_eq!(counts.inserted_tokens, 2, "inserted: \"2\" and \"4\""); + } + + #[test] + fn test_count_patch_token_changes_multiword_change() { + let patch = indoc! 
{" + @@ -1,1 +1,1 @@ + -hello world foo + +hello bar baz + "}; + let counts = count_patch_token_changes(patch); + // "world" and "foo" deleted, "bar" and "baz" inserted + // (whitespace tokens between them may also count) + assert!(counts.deleted_tokens >= 2); + assert!(counts.inserted_tokens >= 2); + } } diff --git a/crates/edit_prediction_cli/src/score.rs b/crates/edit_prediction_cli/src/score.rs index 099b6fa1b85521a5a1b250ddfc5ef414b0eacd49..d1514f7bf93e124407c1c1557743fa16e0cd240c 100644 --- a/crates/edit_prediction_cli/src/score.rs +++ b/crates/edit_prediction_cli/src/score.rs @@ -76,6 +76,8 @@ pub async fn run_scoring( cursor_exact_match: None, wrong_editable_region: None, has_isolated_whitespace_changes: false, + inserted_tokens: 0, + deleted_tokens: 0, }; let prompt_inputs = example.prompt_inputs.as_ref().unwrap(); @@ -95,10 +97,15 @@ pub async fn run_scoring( continue; }; + let token_changes = metrics::count_patch_token_changes(&actual_patch); + let actual_text = match apply_diff_to_string(&actual_patch, original_text) { Ok(text) => text, Err(_) => { - scores.push(zero_scores.clone()); + let mut s = zero_scores.clone(); + s.inserted_tokens = token_changes.inserted_tokens; + s.deleted_tokens = token_changes.deleted_tokens; + scores.push(s); continue; } }; @@ -181,6 +188,8 @@ pub async fn run_scoring( cursor_exact_match, wrong_editable_region, has_isolated_whitespace_changes, + inserted_tokens: token_changes.inserted_tokens, + deleted_tokens: token_changes.deleted_tokens, }); } @@ -238,6 +247,9 @@ pub fn print_report(examples: &[Example]) { let mut wrong_editable_region_count: usize = 0; let mut wrong_editable_region_total: usize = 0; let mut isolated_whitespace_count: usize = 0; + let mut patch_inserted_tokens: Vec = Vec::new(); + let mut patch_deleted_tokens: Vec = Vec::new(); + let mut predictions_with_patch: usize = 0; for example in examples { for (score_idx, score) in example.score.iter().enumerate() { @@ -321,6 +333,18 @@ pub fn 
print_report(examples: &[Example]) { isolated_whitespace_count += 1; } + // Accumulate token change metrics (only for predictions that produced a patch) + let has_patch = example + .predictions + .get(score_idx) + .and_then(|p| p.actual_patch.as_ref()) + .is_some_and(|p| !p.is_empty()); + if has_patch { + predictions_with_patch += 1; + patch_inserted_tokens.push(score.inserted_tokens); + patch_deleted_tokens.push(score.deleted_tokens); + } + // Accumulate cursor metrics if let Some(exact_match) = score.cursor_exact_match { cursor_total += 1; @@ -421,11 +445,70 @@ pub fn print_report(examples: &[Example]) { if total_scores > 0 { println!("Isolated whitespace changes: {}", isolated_ws_str); } + + // Print token change percentile summary (only for predictions with a patch) + if !patch_inserted_tokens.is_empty() { + patch_inserted_tokens.sort_unstable(); + patch_deleted_tokens.sort_unstable(); + let mut patch_total_tokens: Vec = patch_inserted_tokens + .iter() + .zip(patch_deleted_tokens.iter()) + .map(|(i, d)| i + d) + .collect(); + patch_total_tokens.sort_unstable(); + + let patch_rate = predictions_with_patch as f32 / total_scores as f32 * 100.0; + println!(); + println!( + "Token changes ({}/{} predictions produced a patch, {:.1}% — table includes only those)", + predictions_with_patch, total_scores, patch_rate + ); + println!( + "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}", + "", "p25", "p50", "p75", "p90", "p99" + ); + println!("{}", "─".repeat(LINE_WIDTH)); + println!( + "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}", + "Inserted tokens", + percentile(&patch_inserted_tokens, 25), + percentile(&patch_inserted_tokens, 50), + percentile(&patch_inserted_tokens, 75), + percentile(&patch_inserted_tokens, 90), + percentile(&patch_inserted_tokens, 99), + ); + println!( + "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}", + "Deleted tokens", + percentile(&patch_deleted_tokens, 25), + percentile(&patch_deleted_tokens, 50), + percentile(&patch_deleted_tokens, 75), + 
percentile(&patch_deleted_tokens, 90), + percentile(&patch_deleted_tokens, 99), + ); + println!( + "{:<20} {:>8} {:>8} {:>8} {:>8} {:>8}", + "Total tokens", + percentile(&patch_total_tokens, 25), + percentile(&patch_total_tokens, 50), + percentile(&patch_total_tokens, 75), + percentile(&patch_total_tokens, 90), + percentile(&patch_total_tokens, 99), + ); + } } println!("\n"); } +fn percentile(sorted_values: &[usize], p: usize) -> usize { + if sorted_values.is_empty() { + return 0; + } + let idx = (p as f64 / 100.0 * (sorted_values.len() as f64 - 1.0)).round() as usize; + sorted_values[idx.min(sorted_values.len() - 1)] +} + fn truncate_name(name: &str, max_len: usize) -> String { if name.len() <= max_len { name.to_string() diff --git a/crates/edit_prediction_cli/src/split_commit.rs b/crates/edit_prediction_cli/src/split_commit.rs index b54da11c2dfa77bb531da06c581bf02a5fd615d0..08b327f1f58bebd45b30c48485286616cd152527 100644 --- a/crates/edit_prediction_cli/src/split_commit.rs +++ b/crates/edit_prediction_cli/src/split_commit.rs @@ -6,6 +6,7 @@ //! TODO: Port Python code to generate chronologically-ordered commits use crate::FailedHandling; use crate::reorder_patch::{Patch, PatchLine, extract_edits, locate_edited_line}; +use crate::word_diff::tokenize; /// Find the largest valid UTF-8 char boundary at or before `index` in `s`. fn floor_char_boundary(s: &str, index: usize) -> usize { @@ -413,37 +414,6 @@ pub fn split_ordered_commit(commit: &str, split_pos: usize) -> (String, String) (source_str, target_str) } -/// Tokenize text into words and non-word characters. 
-fn tokenize(text: &str) -> Vec { - let mut tokens = Vec::new(); - let mut current = String::new(); - - for ch in text.chars() { - if ch.is_alphanumeric() { - current.push(ch); - } else if ch == '_' { - // Include underscore with the current word, then flush - current.push(ch); - if !current.is_empty() { - tokens.push(std::mem::take(&mut current)); - } - } else { - // Punctuation or whitespace - flush current word first - if !current.is_empty() { - tokens.push(std::mem::take(&mut current)); - } - // Each punctuation/whitespace is its own token - tokens.push(ch.to_string()); - } - } - - if !current.is_empty() { - tokens.push(current); - } - - tokens -} - /// Calculate the weight for a split position based on the character at that position. /// /// Higher weights indicate more natural pause points (e.g., after punctuation, @@ -647,12 +617,8 @@ pub fn imitate_human_edits( let src_tokens = tokenize(&src_line); let tgt_tokens = tokenize(&tgt_line); - // Convert to slices for similar - let src_refs: Vec<&str> = src_tokens.iter().map(|s| s.as_str()).collect(); - let tgt_refs: Vec<&str> = tgt_tokens.iter().map(|s| s.as_str()).collect(); - // Use similar to get diff operations - let diff = TextDiff::from_slices(&src_refs, &tgt_refs); + let diff = TextDiff::from_slices(&src_tokens, &tgt_tokens); // Build weights for each possible split position let mut position_weights: Vec = Vec::new(); @@ -665,12 +631,12 @@ pub fn imitate_human_edits( match op.tag() { DiffTag::Equal => { for i in op.old_range() { - current_text.push_str(&src_tokens[i]); + current_text.push_str(src_tokens[i]); } } DiffTag::Replace => { - let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect(); - let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect(); + let ins: String = op.new_range().map(|i| tgt_tokens[i]).collect(); + let del: String = op.old_range().map(|i| src_tokens[i]).collect(); // For insertion part for ch in ins.chars() { @@ -686,7 +652,7 @@ pub fn 
imitate_human_edits( } } DiffTag::Insert => { - let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect(); + let ins: String = op.new_range().map(|i| tgt_tokens[i]).collect(); for ch in ins.chars() { current_text.push(ch); let weight = position_weight(&current_text, current_text.len()); @@ -694,7 +660,7 @@ pub fn imitate_human_edits( } } DiffTag::Delete => { - let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect(); + let del: String = op.old_range().map(|i| src_tokens[i]).collect(); for _ in del.chars() { // Weight deletions lower position_weights.push(2); @@ -719,14 +685,14 @@ pub fn imitate_human_edits( match op.tag() { DiffTag::Equal => { for i in op.old_range() { - new_src.push_str(&src_tokens[i]); + new_src.push_str(src_tokens[i]); } last_old_end = op.old_range().end; } DiffTag::Replace => { // Handle replace as delete + insert - let del: String = op.old_range().map(|i| src_tokens[i].as_str()).collect(); - let ins: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect(); + let del: String = op.old_range().map(|i| src_tokens[i]).collect(); + let ins: String = op.new_range().map(|i| tgt_tokens[i]).collect(); let repl_len = del.len() + ins.len(); if edit_index + repl_len >= split_index { // Split within this replace operation @@ -750,7 +716,7 @@ pub fn imitate_human_edits( } } DiffTag::Insert => { - let repl: String = op.new_range().map(|i| tgt_tokens[i].as_str()).collect(); + let repl: String = op.new_range().map(|i| tgt_tokens[i]).collect(); if edit_index + repl.len() >= split_index { let offset = split_index - edit_index; let safe_offset = floor_char_boundary(&repl, offset); @@ -763,7 +729,7 @@ pub fn imitate_human_edits( } } DiffTag::Delete => { - let repl: String = op.old_range().map(|i| src_tokens[i].as_str()).collect(); + let repl: String = op.old_range().map(|i| src_tokens[i]).collect(); if edit_index + repl.len() >= split_index { let offset = split_index - edit_index; let safe_offset = floor_char_boundary(&repl, 
offset); @@ -797,10 +763,10 @@ pub fn imitate_human_edits( // Add remainder of source if similar enough to target remainder let remainder_src: String = (last_old_end..src_tokens.len()) - .map(|i| src_tokens[i].as_str()) + .map(|i| src_tokens[i]) .collect(); let remainder_tgt: String = (last_old_end..tgt_tokens.len()) - .filter_map(|i| tgt_tokens.get(i).map(|s| s.as_str())) + .filter_map(|i| tgt_tokens.get(i).copied()) .collect(); let ratio = fuzzy_ratio(&remainder_src, &remainder_tgt); @@ -1104,13 +1070,13 @@ mod tests { assert_eq!(tokens, vec!["hello", " ", "world"]); let tokens = tokenize("foo_bar123 + baz"); - assert_eq!(tokens, vec!["foo_", "bar123", " ", "+", " ", "baz"]); + assert_eq!(tokens, vec!["foo_bar123", " ", "+", " ", "baz"]); let tokens = tokenize("print(\"hello\")"); assert_eq!(tokens, vec!["print", "(", "\"", "hello", "\"", ")"]); let tokens = tokenize("hello_world"); - assert_eq!(tokens, vec!["hello_", "world"]); + assert_eq!(tokens, vec!["hello_world"]); let tokens = tokenize("fn();"); assert_eq!(tokens, vec!["fn", "(", ")", ";"]); diff --git a/crates/edit_prediction_cli/src/word_diff.rs b/crates/edit_prediction_cli/src/word_diff.rs index b5db40d52e5b15e57cf3f6b92f4a1c0a0bbc13da..72026d5715d312ded5702e6571a8cd52f02185c1 100644 --- a/crates/edit_prediction_cli/src/word_diff.rs +++ b/crates/edit_prediction_cli/src/word_diff.rs @@ -1,5 +1,7 @@ //! Word-diff utilities for converting unified diffs to word-diff format. +use similar::{DiffTag, TextDiff}; + /// Convert unified diff to word-diff format. /// /// This transforms line-based diffs into word-level diffs where: @@ -129,29 +131,38 @@ fn compute_word_diff(old_text: &str, new_text: &str) -> String { result } -/// Tokenize text into words and whitespace sequences. 
-fn tokenize(text: &str) -> Vec<&str> { +/// Classify a character into one of three token classes: +/// - 0: identifier (alphanumeric or `_`) +/// - 1: whitespace +/// - 2: punctuation (everything else, each character becomes its own token) +fn char_class(ch: char) -> u8 { + if ch.is_alphanumeric() || ch == '_' { + 0 + } else if ch.is_whitespace() { + 1 + } else { + 2 + } +} + +/// Tokenize text into identifier words, whitespace runs, and individual punctuation characters. +/// +/// This splitting aligns with the syntactic atoms of source code so that the +/// LCS-based diff can produce fine-grained, meaningful change regions. +pub(crate) fn tokenize(text: &str) -> Vec<&str> { let mut tokens = Vec::new(); let mut chars = text.char_indices().peekable(); while let Some((start, ch)) = chars.next() { - if ch.is_whitespace() { - // Collect contiguous whitespace - let mut end = start + ch.len_utf8(); - while let Some(&(_, next_ch)) = chars.peek() { - if next_ch.is_whitespace() { - end += next_ch.len_utf8(); - chars.next(); - } else { - break; - } - } - tokens.push(&text[start..end]); + let class = char_class(ch); + if class == 2 { + // Punctuation: each character is a separate token + tokens.push(&text[start..start + ch.len_utf8()]); } else { - // Collect contiguous non-whitespace + // Identifier or whitespace: collect contiguous run of same class let mut end = start + ch.len_utf8(); while let Some(&(_, next_ch)) = chars.peek() { - if !next_ch.is_whitespace() { + if char_class(next_ch) == class { end += next_ch.len_utf8(); chars.next(); } else { @@ -166,7 +177,7 @@ fn tokenize(text: &str) -> Vec<&str> { } #[derive(Debug)] -enum DiffOp { +pub(crate) enum DiffOp { Equal(usize, usize), Delete(usize, usize), Insert(usize, usize), @@ -178,130 +189,28 @@ enum DiffOp { }, } -/// Compute diff operations between two token sequences using a simple LCS-based algorithm. 
-fn diff_tokens<'a>(old: &[&'a str], new: &[&'a str]) -> Vec { - // Build LCS table - let m = old.len(); - let n = new.len(); - - if m == 0 && n == 0 { - return vec![]; - } - if m == 0 { - return vec![DiffOp::Insert(0, n)]; - } - if n == 0 { - return vec![DiffOp::Delete(0, m)]; - } - - // LCS dynamic programming - let mut dp = vec![vec![0usize; n + 1]; m + 1]; - for i in 1..=m { - for j in 1..=n { - if old[i - 1] == new[j - 1] { - dp[i][j] = dp[i - 1][j - 1] + 1; - } else { - dp[i][j] = dp[i - 1][j].max(dp[i][j - 1]); - } - } - } - - // Backtrack to find operations - let mut ops = Vec::new(); - let mut i = m; - let mut j = n; - - // We'll collect in reverse order, then reverse at the end - let mut stack: Vec<(usize, usize, bool)> = Vec::new(); // (index, end, is_old) - - while i > 0 || j > 0 { - if i > 0 && j > 0 && old[i - 1] == new[j - 1] { - stack.push((i - 1, i, true)); // Equal marker (using old index) - stack.push((j - 1, j, false)); // Paired with new index - i -= 1; - j -= 1; - } else if j > 0 && (i == 0 || dp[i][j - 1] >= dp[i - 1][j]) { - // Insert from new - stack.push((j - 1, j, false)); - j -= 1; - } else { - // Delete from old - stack.push((i - 1, i, true)); - i -= 1; - } - } - - // Process the stack to build proper DiffOps - // This is a simplified approach - just iterate through and build ops - let mut old_idx = 0; - let mut new_idx = 0; - - while old_idx < m || new_idx < n { - // Find next matching pair - let mut old_match = None; - let mut new_match = None; - - for oi in old_idx..m { - for ni in new_idx..n { - if old[oi] == new[ni] { - old_match = Some(oi); - new_match = Some(ni); - break; - } - } - if old_match.is_some() { - break; - } - } - - match (old_match, new_match) { - (Some(om), Some(nm)) => { - // Handle any deletions/insertions before the match - if old_idx < om && new_idx < nm { - ops.push(DiffOp::Replace { - old_start: old_idx, - old_end: om, - new_start: new_idx, - new_end: nm, - }); - } else if old_idx < om { - 
ops.push(DiffOp::Delete(old_idx, om)); - } else if new_idx < nm { - ops.push(DiffOp::Insert(new_idx, nm)); - } - - // Find the extent of the equal sequence - let mut eq_end_old = om; - let mut eq_end_new = nm; - while eq_end_old < m && eq_end_new < n && old[eq_end_old] == new[eq_end_new] { - eq_end_old += 1; - eq_end_new += 1; - } - - ops.push(DiffOp::Equal(om, eq_end_old)); - old_idx = eq_end_old; - new_idx = eq_end_new; - } - _ => { - // No more matches, handle remaining - if old_idx < m && new_idx < n { - ops.push(DiffOp::Replace { - old_start: old_idx, - old_end: m, - new_start: new_idx, - new_end: n, - }); - } else if old_idx < m { - ops.push(DiffOp::Delete(old_idx, m)); - } else if new_idx < n { - ops.push(DiffOp::Insert(new_idx, n)); - } - break; +/// Compute diff operations between two token sequences using `similar`'s Myers diff. +pub(crate) fn diff_tokens<'a>(old: &[&'a str], new: &[&'a str]) -> Vec<DiffOp> { + let diff = TextDiff::from_slices(old, new); + diff.ops() + .iter() + .map(|op| { + let tag = op.tag(); + let old_range = op.old_range(); + let new_range = op.new_range(); + match tag { + DiffTag::Equal => DiffOp::Equal(old_range.start, old_range.end), + DiffTag::Delete => DiffOp::Delete(old_range.start, old_range.end), + DiffTag::Insert => DiffOp::Insert(new_range.start, new_range.end), + DiffTag::Replace => DiffOp::Replace { + old_start: old_range.start, + old_end: old_range.end, + new_start: new_range.start, + new_end: new_range.end, + }, } - } - } - - ops + }) + .collect() } #[cfg(test)] @@ -315,6 +224,24 @@ mod tests { let tokens = tokenize(" multiple spaces "); assert_eq!(tokens, vec![" ", "multiple", " ", "spaces", " "]); + + let tokens = tokenize("self.name"); + assert_eq!(tokens, vec!["self", ".", "name"]); + + let tokens = tokenize("foo(bar, baz)"); + assert_eq!(tokens, vec!["foo", "(", "bar", ",", " ", "baz", ")"]); + + let tokens = tokenize("hello_world"); + assert_eq!(tokens, vec!["hello_world"]); + + let tokens = tokenize("fn();"); + 
assert_eq!(tokens, vec!["fn", "(", ")", ";"]); + + let tokens = tokenize("foo_bar123 + baz"); + assert_eq!(tokens, vec!["foo_bar123", " ", "+", " ", "baz"]); + + let tokens = tokenize("print(\"hello\")"); + assert_eq!(tokens, vec!["print", "(", "\"", "hello", "\"", ")"]); } #[test]