From 1252513037a8a40ca510858bed2aa9b085b1e306 Mon Sep 17 00:00:00 2001
From: Oleksiy Syvokon
Date: Wed, 6 May 2026 20:03:01 +0300
Subject: [PATCH] ep: Drop dependency on language/gpui (#55917)

Release Notes:

- N/A

Co-authored-by: Ben Kunkle
---
 Cargo.lock                                  |   2 +-
 crates/edit_prediction_metrics/Cargo.toml   |   2 +-
 .../edit_prediction_metrics/src/reversal.rs | 153 +++++++++++++++++-
 3 files changed, 151 insertions(+), 6 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f23f4064eae79b77dcc386c91c1d53b437129d51..d9a42436e3d6c40e92716fe7adfd4534aa4ce395 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5396,8 +5396,8 @@ dependencies = [
 name = "edit_prediction_metrics"
 version = "0.1.0"
 dependencies = [
+ "imara-diff",
  "indoc",
- "language",
  "pretty_assertions",
  "serde",
  "serde_json",
diff --git a/crates/edit_prediction_metrics/Cargo.toml b/crates/edit_prediction_metrics/Cargo.toml
index 62184f2f6f93da035c3b2196620dd568653df746..ba4990d9381a8c912873d056fe8044d44bf4bbc6 100644
--- a/crates/edit_prediction_metrics/Cargo.toml
+++ b/crates/edit_prediction_metrics/Cargo.toml
@@ -12,7 +12,7 @@ workspace = true
 path = "src/edit_prediction_metrics.rs"
 
 [dependencies]
-language.workspace = true
+imara-diff.workspace = true
 serde.workspace = true
 serde_json = "1.0"
 similar = "2.7.0"
diff --git a/crates/edit_prediction_metrics/src/reversal.rs b/crates/edit_prediction_metrics/src/reversal.rs
index d6263d84c3fce53888ac96689c4c64513bb96c80..fb84f57aa4e0d9fa4fad09c61b0aa9b1056b8b9d 100644
--- a/crates/edit_prediction_metrics/src/reversal.rs
+++ b/crates/edit_prediction_metrics/src/reversal.rs
@@ -1,10 +1,155 @@
+use std::iter;
 use std::ops::Range;
 use std::path::Path;
 use std::sync::Arc;
 
-use language::{char_diff, text_diff};
+use crate::tokenize::tokenize;
+use imara_diff::{
+    Algorithm, diff,
+    intern::{InternedInput, Token},
+    sources::lines_with_terminator,
+};
 use zeta_prompt::udiff::apply_diff_to_string;
 
+fn text_diff(old_text: &str, new_text: &str) -> Vec<(Range<usize>, Arc<str>)> {
+    let empty: Arc<str> = Arc::default();
+    let mut edits = Vec::new();
+    let mut hunk_input = InternedInput::default();
+    let input = InternedInput::new(
+        lines_with_terminator(old_text),
+        lines_with_terminator(new_text),
+    );
+
+    diff_internal(&input, &mut |old_byte_range,
+                                new_byte_range,
+                                old_rows,
+                                new_rows| {
+        if should_perform_token_diff_within_hunk(
+            &old_byte_range,
+            &new_byte_range,
+            &old_rows,
+            &new_rows,
+        ) {
+            let old_offset = old_byte_range.start;
+            let new_offset = new_byte_range.start;
+            hunk_input.clear();
+            hunk_input.update_before(tokenize(&old_text[old_byte_range]).into_iter());
+            hunk_input.update_after(tokenize(&new_text[new_byte_range]).into_iter());
+            diff_internal(&hunk_input, &mut |old_byte_range, new_byte_range, _, _| {
+                let old_byte_range =
+                    old_offset + old_byte_range.start..old_offset + old_byte_range.end;
+                let new_byte_range =
+                    new_offset + new_byte_range.start..new_offset + new_byte_range.end;
+                let replacement_text = if new_byte_range.is_empty() {
+                    empty.clone()
+                } else {
+                    new_text[new_byte_range].into()
+                };
+                edits.push((old_byte_range, replacement_text));
+            });
+        } else {
+            let replacement_text = if new_byte_range.is_empty() {
+                empty.clone()
+            } else {
+                new_text[new_byte_range].into()
+            };
+            edits.push((old_byte_range, replacement_text));
+        }
+    });
+
+    edits
+}
+
+fn char_diff<'a>(old_text: &'a str, new_text: &'a str) -> Vec<(Range<usize>, &'a str)> {
+    let mut input: InternedInput<&str> = InternedInput::default();
+    input.update_before(tokenize_chars(old_text));
+    input.update_after(tokenize_chars(new_text));
+    let mut edits = Vec::new();
+
+    diff_internal(&input, &mut |old_byte_range, new_byte_range, _, _| {
+        let replacement = if new_byte_range.is_empty() {
+            ""
+        } else {
+            &new_text[new_byte_range]
+        };
+        edits.push((old_byte_range, replacement));
+    });
+
+    edits
+}
+
+fn should_perform_token_diff_within_hunk(
+    old_byte_range: &Range<usize>,
+    new_byte_range: &Range<usize>,
+    old_row_range: &Range<u32>,
+    new_row_range: &Range<u32>,
+) -> bool {
+    const MAX_TOKEN_DIFF_LEN: usize = 512;
+    const MAX_TOKEN_DIFF_LINE_COUNT: usize = 8;
+
+    !old_byte_range.is_empty()
+        && !new_byte_range.is_empty()
+        && old_byte_range.len() <= MAX_TOKEN_DIFF_LEN
+        && new_byte_range.len() <= MAX_TOKEN_DIFF_LEN
+        && old_row_range.len() <= MAX_TOKEN_DIFF_LINE_COUNT
+        && new_row_range.len() <= MAX_TOKEN_DIFF_LINE_COUNT
+}
+
+fn diff_internal(
+    input: &InternedInput<&str>,
+    on_change: &mut dyn FnMut(Range<usize>, Range<usize>, Range<u32>, Range<u32>),
+) {
+    let mut old_offset = 0;
+    let mut new_offset = 0;
+    let mut old_token_ix = 0;
+    let mut new_token_ix = 0;
+
+    diff(
+        Algorithm::Histogram,
+        input,
+        |old_tokens: Range<u32>, new_tokens: Range<u32>| {
+            old_offset += token_len(
+                input,
+                &input.before[old_token_ix as usize..old_tokens.start as usize],
+            );
+            new_offset += token_len(
+                input,
+                &input.after[new_token_ix as usize..new_tokens.start as usize],
+            );
+            let old_len = token_len(
+                input,
+                &input.before[old_tokens.start as usize..old_tokens.end as usize],
+            );
+            let new_len = token_len(
+                input,
+                &input.after[new_tokens.start as usize..new_tokens.end as usize],
+            );
+            let old_byte_range = old_offset..old_offset + old_len;
+            let new_byte_range = new_offset..new_offset + new_len;
+            old_token_ix = old_tokens.end;
+            new_token_ix = new_tokens.end;
+            old_offset = old_byte_range.end;
+            new_offset = new_byte_range.end;
+            on_change(old_byte_range, new_byte_range, old_tokens, new_tokens);
+        },
+    );
+}
+
+fn tokenize_chars(text: &str) -> impl Iterator<Item = &str> {
+    let mut chars = text.char_indices();
+    iter::from_fn(move || {
+        let (start, character) = chars.next()?;
+        Some(&text[start..start + character.len_utf8()])
+    })
+}
+
+fn token_len(input: &InternedInput<&str>, tokens: &[Token]) -> usize {
+    tokens
+        .iter()
+        .map(|token| input.interner[*token].len())
+        .sum()
+}
+
 fn apply_diff_to_string_lenient(diff_str: &str, text: &str) -> String {
     let hunks = parse_diff_hunks(diff_str);
     let mut result = text.to_string();
@@ -651,7 +796,7 @@ pub fn compute_prediction_reversal_ratio_from_history(
 mod tests {
     use super::*;
     use indoc::indoc;
-    use zeta_prompt::udiff::apply_diff_to_string;
+    use zeta_prompt::udiff::{apply_diff_to_string, unified_diff_with_context};
     use zeta_prompt::{ExcerptRanges, ZetaPromptInput};
 
     fn compute_prediction_reversal_ratio(
@@ -1008,8 +1153,8 @@ mod tests {
             last line
         "};
 
-        // unified_diff doesn't include file headers, but apply_diff_to_string needs them
-        let diff_body = language::unified_diff(original, modified);
+        // unified_diff_with_context doesn't include file headers, but apply_diff_to_string needs them
+        let diff_body = unified_diff_with_context(original, modified, 0, 0, 3);
         let forward_diff = format!("--- a/file\n+++ b/file\n{}", diff_body);
         let reversed_diff = reverse_diff(&forward_diff);
 