From 0fdad0c0d6216618b59e269a04fbe81e5b758bfa Mon Sep 17 00:00:00 2001 From: Max Brunsfeld Date: Wed, 19 Feb 2025 16:56:01 -0800 Subject: [PATCH] Use line-based and word-based diff when reloading and formatting buffers (#25129) Closes https://github.com/zed-industries/zed/issues/10122 Closes https://github.com/zed-industries/zed/issues/25034 When formatting buffers or reloading them after they change on disk, we performed a diff between the buffer's current contents and the new content. We need this diff in order to preserve the positions of cursors and other decorations when updating the buffer's text. In order to handle changes within lines, we would previously compute a *character-wise* diff. This was extremely expensive for large files. This PR gets rid of the character-wise diff, and instead performs a normal line-wise diff. Then, for certain replace hunks, we compute a secondary word-based diff. Also, I've switched to the [`imara-diff`](https://github.com/pascalkuthe/imara-diff) crate, instead of `similar`. Release Notes: - Fixed a hang that could occur when large files were changed on disk or formatted. 
--- Cargo.lock | 22 +- Cargo.toml | 2 +- crates/assistant/Cargo.toml | 1 - crates/assistant/src/inline_assistant.rs | 63 ++---- crates/assistant2/Cargo.toml | 6 +- crates/assistant2/src/buffer_codegen.rs | 63 ++---- crates/editor/Cargo.toml | 1 - crates/editor/src/editor.rs | 63 ++---- crates/editor/src/element.rs | 3 +- crates/language/Cargo.toml | 2 +- crates/language/src/buffer.rs | 58 +---- crates/language/src/buffer_tests.rs | 37 ++- crates/language/src/language.rs | 2 + crates/language/src/text_diff.rs | 274 +++++++++++++++++++++++ crates/project/Cargo.toml | 1 - crates/project/src/lsp_store.rs | 58 ++--- crates/project/src/project_tests.rs | 30 ++- crates/zeta/Cargo.toml | 1 - crates/zeta/src/zeta.rs | 77 +------ 19 files changed, 425 insertions(+), 339 deletions(-) create mode 100644 crates/language/src/text_diff.rs diff --git a/Cargo.lock b/Cargo.lock index 6e674784adf89e3d5fc1a565a60489272660703a..412008d1612a167c30a4547ba39baac7543f7701 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -416,7 +416,6 @@ dependencies = [ "serde", "serde_json_lenient", "settings", - "similar", "smol", "streaming_diff", "telemetry", @@ -482,7 +481,6 @@ dependencies = [ "serde", "serde_json", "settings", - "similar", "smol", "streaming_diff", "telemetry_events", @@ -4064,7 +4062,6 @@ dependencies = [ "serde", "serde_json", "settings", - "similar", "smallvec", "smol", "snippet", @@ -6376,6 +6373,15 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edcd27d72f2f071c64249075f42e205ff93c9a4c5f6c6da53e79ed9f9832c285" +[[package]] +name = "imara-diff" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17d34b7d42178945f775e84bc4c36dde7c1c6cdfea656d3354d009056f2bb3d2" +dependencies = [ + "hashbrown 0.15.2", +] + [[package]] name = "imgref" version = "1.11.0" @@ -6868,6 +6874,7 @@ dependencies = [ "globset", "gpui", "http_client", + "imara-diff", "indoc", "itertools 0.14.0", "log", @@ -6882,7 
+6889,6 @@ dependencies = [ "serde", "serde_json", "settings", - "similar", "smallvec", "smol", "streaming-iterator", @@ -10129,7 +10135,6 @@ dependencies = [ "sha2", "shellexpand 2.1.2", "shlex", - "similar", "smol", "snippet", "snippet_provider", @@ -12270,12 +12275,6 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" -[[package]] -name = "similar" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad1d488a557b235fc46dae55512ffbfc429d2482b08b4d9435ab07384ca8aec" - [[package]] name = "simple_asn1" version = "0.6.2" @@ -17140,7 +17139,6 @@ dependencies = [ "serde", "serde_json", "settings", - "similar", "telemetry", "telemetry_events", "theme", diff --git a/Cargo.toml b/Cargo.toml index 4b787d5e75a8529eb58c1387c9adb042dd834341..c93c56486d6adeb2b73b2d5feb0fbf61a6645ccc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -421,6 +421,7 @@ hyper = "0.14" http = "1.1" ignore = "0.4.22" image = "0.25.1" +imara-diff = "0.1.8" indexmap = { version = "2.7.0", features = ["serde"] } indoc = "2" inventory = "0.3.19" @@ -499,7 +500,6 @@ sha2 = "0.10" shellexpand = "2.1.0" shlex = "1.3.0" signal-hook = "0.3.17" -similar = "1.3" simplelog = "0.12.2" smallvec = { version = "1.6", features = ["union"] } smol = "2.0" diff --git a/crates/assistant/Cargo.toml b/crates/assistant/Cargo.toml index 2ad0e7e0820a1034c282eef715733be5c761aae6..1bb7cbc1ae116395ba1e44d4119578dc443a16b2 100644 --- a/crates/assistant/Cargo.toml +++ b/crates/assistant/Cargo.toml @@ -59,7 +59,6 @@ search.workspace = true semantic_index.workspace = true serde.workspace = true settings.workspace = true -similar.workspace = true smol.workspace = true streaming_diff.workspace = true telemetry.workspace = true diff --git a/crates/assistant/src/inline_assistant.rs b/crates/assistant/src/inline_assistant.rs index 
554db126457c77c33adf03959f429105faaeb0e2..cb4ada5f08896e498559fca04ba7fea5d0a7ef2f 100644 --- a/crates/assistant/src/inline_assistant.rs +++ b/crates/assistant/src/inline_assistant.rs @@ -30,7 +30,7 @@ use gpui::{ EventEmitter, FocusHandle, Focusable, FontWeight, Global, HighlightStyle, Subscription, Task, TextStyle, UpdateGlobal, WeakEntity, Window, }; -use language::{Buffer, IndentKind, Point, Selection, TransactionId}; +use language::{line_diff, Buffer, IndentKind, Point, Selection, TransactionId}; use language_model::{ LanguageModel, LanguageModelRegistry, LanguageModelRequest, LanguageModelRequestMessage, LanguageModelTextStream, Role, @@ -3350,52 +3350,29 @@ impl CodegenAlternative { ) .collect::(); - let mut old_row = old_range.start.row; - let mut new_row = new_range.start.row; - let batch_diff = - similar::TextDiff::from_lines(old_text.as_str(), new_text.as_str()); - + let old_start_row = old_range.start.row; + let new_start_row = new_range.start.row; let mut deleted_row_ranges: Vec<(Anchor, RangeInclusive)> = Vec::new(); let mut inserted_row_ranges = Vec::new(); - for change in batch_diff.iter_all_changes() { - let line_count = change.value().lines().count() as u32; - match change.tag() { - similar::ChangeTag::Equal => { - old_row += line_count; - new_row += line_count; - } - similar::ChangeTag::Delete => { - let old_end_row = old_row + line_count - 1; - let new_row = new_snapshot.anchor_before(Point::new(new_row, 0)); - - if let Some((_, last_deleted_row_range)) = - deleted_row_ranges.last_mut() - { - if *last_deleted_row_range.end() + 1 == old_row { - *last_deleted_row_range = - *last_deleted_row_range.start()..=old_end_row; - } else { - deleted_row_ranges.push((new_row, old_row..=old_end_row)); - } - } else { - deleted_row_ranges.push((new_row, old_row..=old_end_row)); - } - - old_row += line_count; - } - similar::ChangeTag::Insert => { - let new_end_row = new_row + line_count - 1; - let start = new_snapshot.anchor_before(Point::new(new_row, 0)); - let 
end = new_snapshot.anchor_before(Point::new( - new_end_row, - new_snapshot.line_len(MultiBufferRow(new_end_row)), - )); - inserted_row_ranges.push(start..end); - new_row += line_count; - } + for (old_rows, new_rows) in line_diff(&old_text, &new_text) { + let old_rows = old_start_row + old_rows.start..old_start_row + old_rows.end; + let new_rows = new_start_row + new_rows.start..new_start_row + new_rows.end; + if !old_rows.is_empty() { + deleted_row_ranges.push(( + new_snapshot.anchor_before(Point::new(new_rows.start, 0)), + old_rows.start..=old_rows.end - 1, + )); + } + if !new_rows.is_empty() { + let start = new_snapshot.anchor_before(Point::new(new_rows.start, 0)); + let new_end_row = new_rows.end - 1; + let end = new_snapshot.anchor_before(Point::new( + new_end_row, + new_snapshot.line_len(MultiBufferRow(new_end_row)), + )); + inserted_row_ranges.push(start..end); } } - (deleted_row_ranges, inserted_row_ranges) }) .await; diff --git a/crates/assistant2/Cargo.toml b/crates/assistant2/Cargo.toml index f53595cd0c067f6060817e300fe7eff0dba13d0d..9a74a5e2fec07bc54c76a7671f5a72315fb02af4 100644 --- a/crates/assistant2/Cargo.toml +++ b/crates/assistant2/Cargo.toml @@ -62,7 +62,6 @@ rope.workspace = true serde.workspace = true serde_json.workspace = true settings.workspace = true -similar.workspace = true smol.workspace = true streaming_diff.workspace = true telemetry_events.workspace = true @@ -79,5 +78,10 @@ workspace.workspace = true zed_actions.workspace = true [dev-dependencies] +editor = { workspace = true, features = ["test-support"] } +gpui = { workspace = true, "features" = ["test-support"] } +language = { workspace = true, "features" = ["test-support"] } +language_model = { workspace = true, "features" = ["test-support"] } +project = { workspace = true, features = ["test-support"] } rand.workspace = true indoc.workspace = true diff --git a/crates/assistant2/src/buffer_codegen.rs b/crates/assistant2/src/buffer_codegen.rs index 
9c36cc69dcb829b8d3f6c3f3f338020f78c92e00..4e62f9549dc820f1af70e3b626909fed36488676 100644 --- a/crates/assistant2/src/buffer_codegen.rs +++ b/crates/assistant2/src/buffer_codegen.rs @@ -7,7 +7,7 @@ use collections::HashSet; use editor::{Anchor, AnchorRangeExt, MultiBuffer, MultiBufferSnapshot, ToOffset as _, ToPoint}; use futures::{channel::mpsc, future::LocalBoxFuture, join, SinkExt, Stream, StreamExt}; use gpui::{App, AppContext as _, Context, Entity, EventEmitter, Subscription, Task}; -use language::{Buffer, IndentKind, Point, TransactionId}; +use language::{line_diff, Buffer, IndentKind, Point, TransactionId}; use language_model::{ LanguageModel, LanguageModelRegistry, LanguageModelRequest, LanguageModelRequestMessage, LanguageModelTextStream, Role, @@ -827,52 +827,29 @@ impl CodegenAlternative { ) .collect::(); - let mut old_row = old_range.start.row; - let mut new_row = new_range.start.row; - let batch_diff = - similar::TextDiff::from_lines(old_text.as_str(), new_text.as_str()); - + let old_start_row = old_range.start.row; + let new_start_row = new_range.start.row; let mut deleted_row_ranges: Vec<(Anchor, RangeInclusive)> = Vec::new(); let mut inserted_row_ranges = Vec::new(); - for change in batch_diff.iter_all_changes() { - let line_count = change.value().lines().count() as u32; - match change.tag() { - similar::ChangeTag::Equal => { - old_row += line_count; - new_row += line_count; - } - similar::ChangeTag::Delete => { - let old_end_row = old_row + line_count - 1; - let new_row = new_snapshot.anchor_before(Point::new(new_row, 0)); - - if let Some((_, last_deleted_row_range)) = - deleted_row_ranges.last_mut() - { - if *last_deleted_row_range.end() + 1 == old_row { - *last_deleted_row_range = - *last_deleted_row_range.start()..=old_end_row; - } else { - deleted_row_ranges.push((new_row, old_row..=old_end_row)); - } - } else { - deleted_row_ranges.push((new_row, old_row..=old_end_row)); - } - - old_row += line_count; - } - similar::ChangeTag::Insert => { - 
let new_end_row = new_row + line_count - 1; - let start = new_snapshot.anchor_before(Point::new(new_row, 0)); - let end = new_snapshot.anchor_before(Point::new( - new_end_row, - new_snapshot.line_len(MultiBufferRow(new_end_row)), - )); - inserted_row_ranges.push(start..end); - new_row += line_count; - } + for (old_rows, new_rows) in line_diff(&old_text, &new_text) { + let old_rows = old_start_row + old_rows.start..old_start_row + old_rows.end; + let new_rows = new_start_row + new_rows.start..new_start_row + new_rows.end; + if !old_rows.is_empty() { + deleted_row_ranges.push(( + new_snapshot.anchor_before(Point::new(new_rows.start, 0)), + old_rows.start..=old_rows.end - 1, + )); + } + if !new_rows.is_empty() { + let start = new_snapshot.anchor_before(Point::new(new_rows.start, 0)); + let new_end_row = new_rows.end - 1; + let end = new_snapshot.anchor_before(Point::new( + new_end_row, + new_snapshot.line_len(MultiBufferRow(new_end_row)), + )); + inserted_row_ranges.push(start..end); } } - (deleted_row_ranges, inserted_row_ranges) }) .await; diff --git a/crates/editor/Cargo.toml b/crates/editor/Cargo.toml index abc8f0b6f0fc15c19d6bf71ea650d1bd11f87d0c..08d626c25fa7f6c8a45ad17105e2cf776f54f4f4 100644 --- a/crates/editor/Cargo.toml +++ b/crates/editor/Cargo.toml @@ -66,7 +66,6 @@ schemars.workspace = true serde.workspace = true serde_json.workspace = true settings.workspace = true -similar.workspace = true smallvec.workspace = true smol.workspace = true snippet.workspace = true diff --git a/crates/editor/src/editor.rs b/crates/editor/src/editor.rs index 1afd49fba20ff7b16c9b5e64cec8ec4e25b0855f..e30b369bd34f9ed90b278fb627eedb4d1c63468d 100644 --- a/crates/editor/src/editor.rs +++ b/crates/editor/src/editor.rs @@ -100,10 +100,10 @@ use language::{ language_settings::{ self, all_language_settings, language_settings, InlayHintSettings, RewrapBehavior, }, - point_from_lsp, AutoindentMode, BracketPair, Buffer, Capability, CharKind, CodeLabel, - CursorShape, Diagnostic, 
DiskState, EditPredictionsMode, EditPreview, HighlightedText, - IndentKind, IndentSize, Language, OffsetRangeExt, Point, Selection, SelectionGoal, TextObject, - TransactionId, TreeSitterOptions, + point_from_lsp, text_diff_with_options, AutoindentMode, BracketPair, Buffer, Capability, + CharKind, CodeLabel, CursorShape, Diagnostic, DiffOptions, DiskState, EditPredictionsMode, + EditPreview, HighlightedText, IndentKind, IndentSize, Language, OffsetRangeExt, Point, + Selection, SelectionGoal, TextObject, TransactionId, TreeSitterOptions, }; use language::{point_to_lsp, BufferRow, CharClassifier, Runnable, RunnableRange}; use linked_editing_ranges::refresh_linked_ranges; @@ -112,7 +112,6 @@ use persistence::DB; pub use proposed_changes_editor::{ ProposedChangeLocation, ProposedChangesEditor, ProposedChangesEditorToolbar, }; -use similar::{ChangeTag, TextDiff}; use std::iter::Peekable; use task::{ResolvedTask, TaskTemplate, TaskVariables}; @@ -202,7 +201,7 @@ pub(crate) const CURSORS_VISIBLE_FOR: Duration = Duration::from_millis(2000); #[doc(hidden)] pub const CODE_ACTIONS_DEBOUNCE_TIMEOUT: Duration = Duration::from_millis(250); -pub(crate) const FORMAT_TIMEOUT: Duration = Duration::from_secs(2); +pub(crate) const FORMAT_TIMEOUT: Duration = Duration::from_secs(5); pub(crate) const SCROLL_CENTER_TOP_BOTTOM_DEBOUNCE_TIMEOUT: Duration = Duration::from_secs(1); pub(crate) const EDIT_PREDICTION_KEY_CONTEXT: &str = "edit_prediction"; @@ -7829,6 +7828,7 @@ impl Editor { } let start = Point::new(start_row, 0); + let start_offset = start.to_offset(&buffer); let end = Point::new(end_row, buffer.line_len(MultiBufferRow(end_row))); let selection_text = buffer.text_for_range(start..end).collect::(); let Some(lines_without_prefixes) = selection_text @@ -7858,44 +7858,21 @@ impl Editor { // TODO: should always use char-based diff while still supporting cursor behavior that // matches vim. 
- let diff = match is_vim_mode { - IsVimMode::Yes => TextDiff::from_lines(&selection_text, &wrapped_text), - IsVimMode::No => TextDiff::from_chars(&selection_text, &wrapped_text), - }; - let mut offset = start.to_offset(&buffer); - let mut moved_since_edit = true; - - for change in diff.iter_all_changes() { - let value = change.value(); - match change.tag() { - ChangeTag::Equal => { - offset += value.len(); - moved_since_edit = true; - } - ChangeTag::Delete => { - let start = buffer.anchor_after(offset); - let end = buffer.anchor_before(offset + value.len()); - - if moved_since_edit { - edits.push((start..end, String::new())); - } else { - edits.last_mut().unwrap().0.end = end; - } - - offset += value.len(); - moved_since_edit = false; - } - ChangeTag::Insert => { - if moved_since_edit { - let anchor = buffer.anchor_after(offset); - edits.push((anchor..anchor, value.to_string())); - } else { - edits.last_mut().unwrap().1.push_str(value); - } + let mut diff_options = DiffOptions::default(); + if is_vim_mode == IsVimMode::Yes { + diff_options.max_word_diff_len = 0; + diff_options.max_word_diff_line_count = 0; + } else { + diff_options.max_word_diff_len = usize::MAX; + diff_options.max_word_diff_line_count = usize::MAX; + } - moved_since_edit = false; - } - } + for (old_range, new_text) in + text_diff_with_options(&selection_text, &wrapped_text, diff_options) + { + let edit_start = buffer.anchor_after(start_offset + old_range.start); + let edit_end = buffer.anchor_after(start_offset + old_range.end); + edits.push((edit_start..edit_end, new_text)); } rewrapped_row_ranges.push(start_row..=end_row); diff --git a/crates/editor/src/element.rs b/crates/editor/src/element.rs index 93d60624e2d2d8468b7721ebd713ee386d77f10c..ec99900ba8d1faf198b235bdb6d9ae144052d5c6 100644 --- a/crates/editor/src/element.rs +++ b/crates/editor/src/element.rs @@ -8409,7 +8409,6 @@ mod tests { use gpui::{TestAppContext, VisualTestContext}; use language::language_settings; use log::info; - use 
similar::DiffableStr; use std::num::NonZeroU32; use util::test::sample_text; @@ -8709,7 +8708,7 @@ mod tests { state .line_numbers .get(&MultiBufferRow(0)) - .and_then(|line_number| line_number.shaped_line.text.as_str()), + .map(|line_number| line_number.shaped_line.text.as_ref()), Some("1") ); } diff --git a/crates/language/Cargo.toml b/crates/language/Cargo.toml index 9e5fd55c967375702e8fe2ea3a2fe6103a113007..c2e4c354540f6f6bca87d8e50c0ee202f2740e9e 100644 --- a/crates/language/Cargo.toml +++ b/crates/language/Cargo.toml @@ -37,6 +37,7 @@ fuzzy.workspace = true globset.workspace = true gpui.workspace = true http_client.workspace = true +imara-diff.workspace = true itertools.workspace = true log.workspace = true lsp.workspace = true @@ -49,7 +50,6 @@ schemars.workspace = true serde.workspace = true serde_json.workspace = true settings.workspace = true -similar.workspace = true smallvec.workspace = true smol.workspace = true streaming-iterator.workspace = true diff --git a/crates/language/src/buffer.rs b/crates/language/src/buffer.rs index c23eeae533389c226421ccc23055abc18ee70b35..192519367ba1c7a317c5ccba3dea62cd97942b92 100644 --- a/crates/language/src/buffer.rs +++ b/crates/language/src/buffer.rs @@ -12,6 +12,7 @@ use crate::{ SyntaxMapMatches, SyntaxSnapshot, ToTreeSitterPoint, }, task_context::RunnableRange, + text_diff::text_diff, LanguageScope, Outline, OutlineConfig, RunnableCapture, RunnableTag, TextObject, TreeSitterOptions, }; @@ -32,7 +33,6 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use serde_json::Value; use settings::WorktreeId; -use similar::{ChangeTag, TextDiff}; use smallvec::SmallVec; use smol::future::yield_now; use std::{ @@ -1792,61 +1792,7 @@ impl Buffer { let old_text = old_text.to_string(); let line_ending = LineEnding::detect(&new_text); LineEnding::normalize(&mut new_text); - - let diff = TextDiff::from_chars(old_text.as_str(), new_text.as_str()); - let empty: Arc = Arc::default(); - - let mut edits = Vec::new(); - 
let mut old_offset = 0; - let mut new_offset = 0; - let mut last_edit: Option<(Range, Range)> = None; - for change in diff.iter_all_changes().map(Some).chain([None]) { - if let Some(change) = &change { - let len = change.value().len(); - match change.tag() { - ChangeTag::Equal => { - old_offset += len; - new_offset += len; - } - ChangeTag::Delete => { - let old_end_offset = old_offset + len; - if let Some((last_old_range, _)) = &mut last_edit { - last_old_range.end = old_end_offset; - } else { - last_edit = - Some((old_offset..old_end_offset, new_offset..new_offset)); - } - old_offset = old_end_offset; - } - ChangeTag::Insert => { - let new_end_offset = new_offset + len; - if let Some((_, last_new_range)) = &mut last_edit { - last_new_range.end = new_end_offset; - } else { - last_edit = - Some((old_offset..old_offset, new_offset..new_end_offset)); - } - new_offset = new_end_offset; - } - } - } - - if let Some((old_range, new_range)) = &last_edit { - if old_offset > old_range.end - || new_offset > new_range.end - || change.is_none() - { - let text = if new_range.is_empty() { - empty.clone() - } else { - new_text[new_range.clone()].into() - }; - edits.push((old_range.clone(), text)); - last_edit.take(); - } - } - } - + let edits = text_diff(&old_text, &new_text); Diff { base_version, line_ending, diff --git a/crates/language/src/buffer_tests.rs b/crates/language/src/buffer_tests.rs index 84aef374ca5e87ea3346f3e1aabc6f7fe54d37b4..1b3b39dcc42719a6ea96a877a9b15fcb1c27c589 100644 --- a/crates/language/src/buffer_tests.rs +++ b/crates/language/src/buffer_tests.rs @@ -25,6 +25,7 @@ use text::{BufferId, LineEnding}; use text::{Point, ToPoint}; use theme::ActiveTheme; use unindent::Unindent as _; +use util::test::marked_text_offsets; use util::{assert_set_eq, post_inc, test::marked_text_ranges, RandomCharIter}; pub static TRAILING_WHITESPACE_REGEX: LazyLock = LazyLock::new(|| { @@ -354,24 +355,44 @@ fn test_edit_events(cx: &mut gpui::App) { #[gpui::test] async fn 
test_apply_diff(cx: &mut TestAppContext) { - let text = "a\nbb\nccc\ndddd\neeeee\nffffff\n"; + let (text, offsets) = marked_text_offsets( + "one two three\nfour fiˇve six\nseven eightˇ nine\nten eleven twelve\n", + ); let buffer = cx.new(|cx| Buffer::local(text, cx)); - let anchor = buffer.update(cx, |buffer, _| buffer.anchor_before(Point::new(3, 3))); + let anchors = buffer.update(cx, |buffer, _| { + offsets + .iter() + .map(|offset| buffer.anchor_before(offset)) + .collect::>() + }); + + let (text, offsets) = marked_text_offsets( + "one two three\n{\nfour FIVEˇ six\n}\nseven AND EIGHTˇ nine\nten eleven twelve\n", + ); - let text = "a\nccc\ndddd\nffffff\n"; - let diff = buffer.update(cx, |b, cx| b.diff(text.into(), cx)).await; + let diff = buffer.update(cx, |b, cx| b.diff(text.clone(), cx)).await; buffer.update(cx, |buffer, cx| { buffer.apply_diff(diff, cx).unwrap(); assert_eq!(buffer.text(), text); - assert_eq!(anchor.to_point(buffer), Point::new(2, 3)); + let actual_offsets = anchors + .iter() + .map(|anchor| anchor.to_offset(buffer)) + .collect::>(); + assert_eq!(actual_offsets, offsets); }); - let text = "a\n1\n\nccc\ndd2dd\nffffff\n"; - let diff = buffer.update(cx, |b, cx| b.diff(text.into(), cx)).await; + let (text, offsets) = + marked_text_offsets("one two three\n{\nˇ}\nseven AND EIGHTEENˇ nine\nten eleven twelve\n"); + + let diff = buffer.update(cx, |b, cx| b.diff(text.clone(), cx)).await; buffer.update(cx, |buffer, cx| { buffer.apply_diff(diff, cx).unwrap(); assert_eq!(buffer.text(), text); - assert_eq!(anchor.to_point(buffer), Point::new(4, 4)); + let actual_offsets = anchors + .iter() + .map(|anchor| anchor.to_offset(buffer)) + .collect::>(); + assert_eq!(actual_offsets, offsets); }); } diff --git a/crates/language/src/language.rs b/crates/language/src/language.rs index b6c5bf6225616db7d8ff89e31f8e67f9a662b5f3..89efb27ec4c7197c8792a2864e895ceb7e2f8466 100644 --- a/crates/language/src/language.rs +++ b/crates/language/src/language.rs @@ -15,6 +15,7 @@ 
mod outline; pub mod proto; mod syntax_map; mod task_context; +mod text_diff; mod toolchain; #[cfg(test)] @@ -62,6 +63,7 @@ use std::{num::NonZeroU32, sync::OnceLock}; use syntax_map::{QueryCursorHandle, SyntaxSnapshot}; use task::RunnableTag; pub use task_context::{ContextProvider, RunnableRange}; +pub use text_diff::{line_diff, text_diff, text_diff_with_options, unified_diff, DiffOptions}; use theme::SyntaxTheme; pub use toolchain::{LanguageToolchainStore, Toolchain, ToolchainList, ToolchainLister}; use tree_sitter::{self, wasmtime, Query, QueryCursor, WasmStore}; diff --git a/crates/language/src/text_diff.rs b/crates/language/src/text_diff.rs new file mode 100644 index 0000000000000000000000000000000000000000..83c4aba3bb275917ec274f74c264fa01d3f15989 --- /dev/null +++ b/crates/language/src/text_diff.rs @@ -0,0 +1,274 @@ +use crate::{CharClassifier, CharKind, LanguageScope}; +use imara_diff::{ + diff, + intern::{InternedInput, Token}, + sources::lines_with_terminator, + Algorithm, UnifiedDiffBuilder, +}; +use std::{iter, ops::Range, sync::Arc}; + +const MAX_WORD_DIFF_LEN: usize = 512; +const MAX_WORD_DIFF_LINE_COUNT: usize = 8; + +/// Computes a diff between two strings, returning a unified diff string. +pub fn unified_diff(old_text: &str, new_text: &str) -> String { + let input = InternedInput::new(old_text, new_text); + diff( + Algorithm::Histogram, + &input, + UnifiedDiffBuilder::new(&input), + ) +} + +/// Computes a diff between two strings, returning a vector of old and new row +/// ranges. +pub fn line_diff(old_text: &str, new_text: &str) -> Vec<(Range, Range)> { + let mut edits = Vec::new(); + let input = InternedInput::new( + lines_with_terminator(old_text), + lines_with_terminator(new_text), + ); + diff_internal(&input, |_, _, old_rows, new_rows| { + edits.push((old_rows, new_rows)); + }); + edits +} + +/// Computes a diff between two strings, returning a vector of edits. 
+/// +/// The edits are represented as tuples of byte ranges and replacement strings. +/// +/// Internally, this function first performs a line-based diff, and then performs a second +/// word-based diff within hunks that replace small numbers of lines. +pub fn text_diff(old_text: &str, new_text: &str) -> Vec<(Range, Arc)> { + text_diff_with_options(old_text, new_text, DiffOptions::default()) +} + +pub struct DiffOptions { + pub language_scope: Option, + pub max_word_diff_len: usize, + pub max_word_diff_line_count: usize, +} + +impl Default for DiffOptions { + fn default() -> Self { + Self { + language_scope: Default::default(), + max_word_diff_len: MAX_WORD_DIFF_LEN, + max_word_diff_line_count: MAX_WORD_DIFF_LINE_COUNT, + } + } +} + +/// Computes a diff between two strings, using a specific language scope's +/// word characters for word-level diffing. +pub fn text_diff_with_options( + old_text: &str, + new_text: &str, + options: DiffOptions, +) -> Vec<(Range, Arc)> { + let empty: Arc = Arc::default(); + let mut edits = Vec::new(); + let mut hunk_input = InternedInput::default(); + let input = InternedInput::new( + lines_with_terminator(old_text), + lines_with_terminator(new_text), + ); + diff_internal( + &input, + |old_byte_range, new_byte_range, old_rows, new_rows| { + if should_perform_word_diff_within_hunk( + &old_rows, + &old_byte_range, + &new_rows, + &new_byte_range, + &options, + ) { + let old_offset = old_byte_range.start; + let new_offset = new_byte_range.start; + hunk_input.clear(); + hunk_input.update_before(tokenize( + &old_text[old_byte_range.clone()], + options.language_scope.clone(), + )); + hunk_input.update_after(tokenize( + &new_text[new_byte_range.clone()], + options.language_scope.clone(), + )); + diff_internal(&hunk_input, |old_byte_range, new_byte_range, _, _| { + let old_byte_range = + old_offset + old_byte_range.start..old_offset + old_byte_range.end; + let new_byte_range = + new_offset + new_byte_range.start..new_offset + 
new_byte_range.end; + let replacement_text = if new_byte_range.is_empty() { + empty.clone() + } else { + new_text[new_byte_range.clone()].into() + }; + edits.push((old_byte_range, replacement_text)); + }); + } else { + let replacement_text = if new_byte_range.is_empty() { + empty.clone() + } else { + new_text[new_byte_range.clone()].into() + }; + edits.push((old_byte_range.clone(), replacement_text)); + } + }, + ); + edits +} + +fn should_perform_word_diff_within_hunk( + old_row_range: &Range, + old_byte_range: &Range, + new_row_range: &Range, + new_byte_range: &Range, + options: &DiffOptions, +) -> bool { + !old_byte_range.is_empty() + && !new_byte_range.is_empty() + && old_byte_range.len() <= options.max_word_diff_len + && new_byte_range.len() <= options.max_word_diff_len + && old_row_range.len() <= options.max_word_diff_line_count + && new_row_range.len() <= options.max_word_diff_line_count +} + +fn diff_internal( + input: &InternedInput<&str>, + mut on_change: impl FnMut(Range, Range, Range, Range), +) { + let mut old_offset = 0; + let mut new_offset = 0; + let mut old_token_ix = 0; + let mut new_token_ix = 0; + diff( + Algorithm::Histogram, + input, + |old_tokens: Range, new_tokens: Range| { + old_offset += token_len( + &input, + &input.before[old_token_ix as usize..old_tokens.start as usize], + ); + new_offset += token_len( + &input, + &input.after[new_token_ix as usize..new_tokens.start as usize], + ); + let old_len = token_len( + &input, + &input.before[old_tokens.start as usize..old_tokens.end as usize], + ); + let new_len = token_len( + &input, + &input.after[new_tokens.start as usize..new_tokens.end as usize], + ); + let old_byte_range = old_offset..old_offset + old_len; + let new_byte_range = new_offset..new_offset + new_len; + old_token_ix = old_tokens.end; + new_token_ix = new_tokens.end; + old_offset = old_byte_range.end; + new_offset = new_byte_range.end; + on_change(old_byte_range, new_byte_range, old_tokens, new_tokens); + }, + ); +} + +fn 
tokenize(text: &str, language_scope: Option) -> impl Iterator { + let classifier = CharClassifier::new(language_scope).for_completion(true); + let mut chars = text.char_indices(); + let mut prev = None; + let mut start_ix = 0; + iter::from_fn(move || { + while let Some((ix, c)) = chars.next() { + let mut token = None; + let kind = classifier.kind(c); + if let Some((prev_char, prev_kind)) = prev { + if kind != prev_kind || (kind == CharKind::Punctuation && c != prev_char) { + token = Some(&text[start_ix..ix]); + start_ix = ix; + } + } + prev = Some((c, kind)); + if token.is_some() { + return token; + } + } + if start_ix < text.len() { + let token = &text[start_ix..]; + start_ix = text.len(); + return Some(token); + } + None + }) +} + +fn token_len(input: &InternedInput<&str>, tokens: &[Token]) -> usize { + tokens + .iter() + .map(|token| input.interner[*token].len()) + .sum() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tokenize() { + let text = ""; + assert_eq!(tokenize(text, None).collect::>(), Vec::<&str>::new()); + + let text = " "; + assert_eq!(tokenize(text, None).collect::>(), vec![" "]); + + let text = "one"; + assert_eq!(tokenize(text, None).collect::>(), vec!["one"]); + + let text = "one\n"; + assert_eq!(tokenize(text, None).collect::>(), vec!["one", "\n"]); + + let text = "one.two(three)"; + assert_eq!( + tokenize(text, None).collect::>(), + vec!["one", ".", "two", "(", "three", ")"] + ); + + let text = "one two three()"; + assert_eq!( + tokenize(text, None).collect::>(), + vec!["one", " ", "two", " ", "three", "(", ")"] + ); + + let text = " one\n two three"; + assert_eq!( + tokenize(text, None).collect::>(), + vec![" ", "one", "\n ", "two", " ", "three"] + ); + } + + #[test] + fn test_text_diff() { + let old_text = "one two three"; + let new_text = "one TWO three"; + assert_eq!(text_diff(old_text, new_text), [(4..7, "TWO".into()),]); + + let old_text = "one\ntwo\nthree\n"; + let new_text = "one\ntwo\nAND\nTHEN\nthree\n"; + 
assert_eq!( + text_diff(old_text, new_text), + [(8..8, "AND\nTHEN\n".into()),] + ); + + let old_text = "one two\nthree four five\nsix seven eight nine\nten\n"; + let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n"; + assert_eq!( + text_diff(old_text, new_text), + [ + (14..18, "FOUR".into()), + (28..33, "SEVEN".into()), + (49..49, "ELEVEN\n".into()) + ] + ); + } +} diff --git a/crates/project/Cargo.toml b/crates/project/Cargo.toml index ea5a852f781223f3626d86963812eff87a949014..d2d571a81c1f3ad70542a0f31aee11812a0c0817 100644 --- a/crates/project/Cargo.toml +++ b/crates/project/Cargo.toml @@ -64,7 +64,6 @@ settings.workspace = true sha2.workspace = true shellexpand.workspace = true shlex.workspace = true -similar = "1.3" smol.workspace = true snippet.workspace = true snippet_provider.workspace = true diff --git a/crates/project/src/lsp_store.rs b/crates/project/src/lsp_store.rs index 427ebd1b5962a1eab60aa10640f80df1f9a3b3b8..632b96c8b787a03c396554f5bf7550033ce529b2 100644 --- a/crates/project/src/lsp_store.rs +++ b/crates/project/src/lsp_store.rs @@ -63,7 +63,6 @@ use rpc::{ use serde::Serialize; use settings::{Settings, SettingsLocation, SettingsStore}; use sha2::{Digest, Sha256}; -use similar::{ChangeTag, TextDiff}; use smol::channel::Sender; use snippet::Snippet; use std::{ @@ -110,15 +109,13 @@ pub enum LspFormatTarget { Ranges(BTreeMap>>), } -// proto::RegisterBufferWithLanguageServer {} - pub type OpenLspBufferHandle = Entity>; // Currently, formatting operations are represented differently depending on // whether they come from a language server or an external command. 
#[derive(Debug)] pub enum FormatOperation { - Lsp(Vec<(Range, String)>), + Lsp(Vec<(Range, Arc)>), External(Diff), Prettier(Diff), } @@ -1388,7 +1385,7 @@ impl LocalLspStore { language_server: &Arc, settings: &LanguageSettings, cx: &mut AsyncApp, - ) -> Result, String)>> { + ) -> Result, Arc)>> { let capabilities = &language_server.capabilities(); let range_formatting_provider = capabilities.document_range_formatting_provider.as_ref(); if range_formatting_provider.map_or(false, |provider| provider == &OneOf::Left(false)) { @@ -1459,7 +1456,7 @@ impl LocalLspStore { language_server: &Arc, settings: &LanguageSettings, cx: &mut AsyncApp, - ) -> Result, String)>> { + ) -> Result, Arc)>> { let uri = lsp::Url::from_file_path(abs_path) .map_err(|_| anyhow!("failed to convert abs path to uri"))?; let text_document = lsp::TextDocumentIdentifier::new(uri); @@ -2144,7 +2141,7 @@ impl LocalLspStore { server_id: LanguageServerId, version: Option, cx: &mut Context, - ) -> Task, String)>>> { + ) -> Task, Arc)>>> { let snapshot = self.buffer_snapshot_for_lsp_version(buffer, server_id, version, cx); cx.background_spawn(async move { let snapshot = snapshot?; @@ -2192,48 +2189,23 @@ impl LocalLspStore { // we can identify the changes more precisely, preserving the locations // of any anchors positioned in the unchanged regions. 
if range.end.row > range.start.row { - let mut offset = range.start.to_offset(&snapshot); + let offset = range.start.to_offset(&snapshot); let old_text = snapshot.text_for_range(range).collect::(); - - let diff = TextDiff::from_lines(old_text.as_str(), &new_text); - let mut moved_since_edit = true; - for change in diff.iter_all_changes() { - let tag = change.tag(); - let value = change.value(); - match tag { - ChangeTag::Equal => { - offset += value.len(); - moved_since_edit = true; - } - ChangeTag::Delete => { - let start = snapshot.anchor_after(offset); - let end = snapshot.anchor_before(offset + value.len()); - if moved_since_edit { - edits.push((start..end, String::new())); - } else { - edits.last_mut().unwrap().0.end = end; - } - offset += value.len(); - moved_since_edit = false; - } - ChangeTag::Insert => { - if moved_since_edit { - let anchor = snapshot.anchor_after(offset); - edits.push((anchor..anchor, value.to_string())); - } else { - edits.last_mut().unwrap().1.push_str(value); - } - moved_since_edit = false; - } - } - } + let range_edits = language::text_diff(old_text.as_str(), &new_text); + edits.extend(range_edits.into_iter().map(|(range, replacement)| { + ( + snapshot.anchor_after(offset + range.start) + ..snapshot.anchor_before(offset + range.end), + replacement, + ) + })); } else if range.end == range.start { let anchor = snapshot.anchor_after(range.start); - edits.push((anchor..anchor, new_text)); + edits.push((anchor..anchor, new_text.into())); } else { let edit_start = snapshot.anchor_after(range.start); let edit_end = snapshot.anchor_before(range.end); - edits.push((edit_start..edit_end, new_text)); + edits.push((edit_start..edit_end, new_text.into())); } } diff --git a/crates/project/src/project_tests.rs b/crates/project/src/project_tests.rs index 1a3d753a21ef62d9df6c65d5ba9e2da5e85e2f22..150fc9fa56c3daf59c6929f75f4cd97c12897c11 100644 --- a/crates/project/src/project_tests.rs +++ b/crates/project/src/project_tests.rs @@ -25,7 +25,11 @@ use 
std::{mem, num::NonZeroU32, ops::Range, task::Poll}; use task::{ResolvedTask, TaskContext}; use unindent::Unindent as _; use util::{ - assert_set_eq, path, paths::PathMatcher, separator, test::TempTree, uri, TryFutureExt as _, + assert_set_eq, path, + paths::PathMatcher, + separator, + test::{marked_text_offsets, TempTree}, + uri, TryFutureExt as _, }; #[gpui::test] @@ -3625,7 +3629,8 @@ async fn test_buffer_is_dirty(cx: &mut gpui::TestAppContext) { async fn test_buffer_file_changes_on_disk(cx: &mut gpui::TestAppContext) { init_test(cx); - let initial_contents = "aaa\nbbbbb\nc\n"; + let (initial_contents, initial_offsets) = + marked_text_offsets("one twoˇ\nthree ˇfourˇ five\nsixˇ seven\n"); let fs = FakeFs::new(cx.executor()); fs.insert_tree( path!("/dir"), @@ -3640,8 +3645,9 @@ async fn test_buffer_file_changes_on_disk(cx: &mut gpui::TestAppContext) { .await .unwrap(); - let anchors = (0..3) - .map(|row| buffer.update(cx, |b, _| b.anchor_before(Point::new(row, 1)))) + let anchors = initial_offsets + .iter() + .map(|offset| buffer.update(cx, |b, _| b.anchor_before(offset))) .collect::>(); // Change the file on disk, adding two new lines of text, and removing @@ -3650,10 +3656,12 @@ async fn test_buffer_file_changes_on_disk(cx: &mut gpui::TestAppContext) { assert!(!buffer.is_dirty()); assert!(!buffer.has_conflict()); }); - let new_contents = "AAAA\naaa\nBB\nbbbbb\n"; + + let (new_contents, new_offsets) = + marked_text_offsets("oneˇ\nthree ˇFOURˇ five\nsixtyˇ seven\n"); fs.save( path!("/dir/the-file").as_ref(), - &new_contents.into(), + &new_contents.as_str().into(), LineEnding::Unix, ) .await @@ -3668,14 +3676,11 @@ async fn test_buffer_file_changes_on_disk(cx: &mut gpui::TestAppContext) { assert!(!buffer.is_dirty()); assert!(!buffer.has_conflict()); - let anchor_positions = anchors + let anchor_offsets = anchors .iter() - .map(|anchor| anchor.to_point(&*buffer)) + .map(|anchor| anchor.to_offset(&*buffer)) .collect::>(); - assert_eq!( - anchor_positions, - 
[Point::new(1, 1), Point::new(3, 1), Point::new(3, 5)] - ); + assert_eq!(anchor_offsets, new_offsets); }); // Modify the buffer @@ -3698,6 +3703,7 @@ async fn test_buffer_file_changes_on_disk(cx: &mut gpui::TestAppContext) { // marked as having a conflict. cx.executor().run_until_parked(); buffer.update(cx, |buffer, _| { + assert_eq!(buffer.text(), " ".to_string() + &new_contents); assert!(buffer.has_conflict()); }); } diff --git a/crates/zeta/Cargo.toml b/crates/zeta/Cargo.toml index f8f0bb4da3d4946b743d25c547ca6bc59eb1d099..515624962a6b94667f06b64c803b7ecdb9e22a13 100644 --- a/crates/zeta/Cargo.toml +++ b/crates/zeta/Cargo.toml @@ -45,7 +45,6 @@ release_channel.workspace = true serde.workspace = true serde_json.workspace = true settings.workspace = true -similar.workspace = true telemetry.workspace = true telemetry_events.workspace = true theme.workspace = true diff --git a/crates/zeta/src/zeta.rs b/crates/zeta/src/zeta.rs index f7f335469c5263b8e88f01134f74781b7a602a74..82fd0e9991d19fa7bcef859589d2834acc064397 100644 --- a/crates/zeta/src/zeta.rs +++ b/crates/zeta/src/zeta.rs @@ -29,8 +29,7 @@ use gpui::{ use http_client::{HttpClient, Method}; use input_excerpt::excerpt_for_cursor_position; use language::{ - Anchor, Buffer, BufferSnapshot, CharClassifier, CharKind, EditPreview, OffsetRangeExt, - ToOffset, ToPoint, + text_diff, Anchor, Buffer, BufferSnapshot, EditPreview, OffsetRangeExt, ToOffset, ToPoint, }; use language_models::LlmApiToken; use postage::watch; @@ -919,77 +918,18 @@ and then another offset: usize, snapshot: &BufferSnapshot, ) -> Vec<(Range, String)> { - fn tokenize(text: &str) -> Vec<&str> { - let classifier = CharClassifier::new(None).for_completion(true); - let mut chars = text.chars().peekable(); - let mut prev_ch = chars.peek().copied(); - let mut tokens = Vec::new(); - let mut start = 0; - let mut end = 0; - while let Some(ch) = chars.next() { - let prev_kind = prev_ch.map(|ch| classifier.kind(ch)); - let kind = classifier.kind(ch); - if 
Some(kind) != prev_kind || (kind == CharKind::Punctuation && Some(ch) != prev_ch) - { - tokens.push(&text[start..end]); - start = end; - } - end += ch.len_utf8(); - prev_ch = Some(ch); - } - tokens.push(&text[start..end]); - tokens - } - - let old_tokens = tokenize(&old_text); - let new_tokens = tokenize(new_text); - - let diff = similar::TextDiffConfig::default() - .algorithm(similar::Algorithm::Patience) - .diff_slices(&old_tokens, &new_tokens); - let mut edits: Vec<(Range, String)> = Vec::new(); - let mut old_start = offset; - for change in diff.iter_all_changes() { - let value = change.value(); - match change.tag() { - similar::ChangeTag::Equal => { - old_start += value.len(); - } - similar::ChangeTag::Delete => { - let old_end = old_start + value.len(); - if let Some((last_old_range, _)) = edits.last_mut() { - if last_old_range.end == old_start { - last_old_range.end = old_end; - } else { - edits.push((old_start..old_end, String::new())); - } - } else { - edits.push((old_start..old_end, String::new())); - } - old_start = old_end; - } - similar::ChangeTag::Insert => { - if let Some((last_old_range, last_new_text)) = edits.last_mut() { - if last_old_range.end == old_start { - last_new_text.push_str(value); - } else { - edits.push((old_start..old_start, value.into())); - } - } else { - edits.push((old_start..old_start, value.into())); - } - } - } - } - - edits + text_diff(&old_text, &new_text) .into_iter() .map(|(mut old_range, new_text)| { + old_range.start += offset; + old_range.end += offset; + let prefix_len = common_prefix( snapshot.chars_for_range(old_range.clone()), new_text.chars(), ); old_range.start += prefix_len; + let suffix_len = common_prefix( snapshot.reversed_chars_for_range(old_range.clone()), new_text[prefix_len..].chars().rev(), @@ -1248,10 +1188,7 @@ impl Event { writeln!(prompt, "User renamed {:?} to {:?}\n", old_path, new_path).unwrap(); } - let diff = - similar::TextDiff::from_lines(&old_snapshot.text(), &new_snapshot.text()) - 
.unified_diff() - .to_string(); + let diff = language::unified_diff(&old_snapshot.text(), &new_snapshot.text()); if !diff.is_empty() { write!( prompt,