From 8e291ec404786ead399b08260b98a64f0c418719 Mon Sep 17 00:00:00 2001 From: Ichimura Tomoo Date: Tue, 27 Jan 2026 14:27:26 +0900 Subject: [PATCH] encoding: Add "reopen with encoding" (#46553) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Add "Reopen with Encoding" feature (Local/Single user) ## Summary This PR adds a "Reopen with Encoding" feature that lets users manually specify an encoding and reload the active buffer, resolving garbled text caused by incorrect encoding detection. ## Changes 1. Added encoding picker logic to `encoding_selector` - Implemented a modal UI accessible via the command palette, shortcuts, or by clicking the encoding status in the status bar. - Allows users to select from a list of supported encodings (Shift JIS, EUC-JP, UTF-16LE, etc.). 2. Updated Buffer logic (crates/language) - Added a `reload_with_encoding` method to the Buffer struct; it and `reload` share a `reload_impl` helper that takes an optional forced encoding. - `reload_impl` decodes the file using the following logic: - **Forced non-Unicode encoding (e.g., Shift JIS):** Bypasses heuristics (like BOM checks) to force the specified encoding. - **Unicode (e.g., UTF-8) or no forced encoding:** Performs standard BOM detection. This ensures that the BOM is correctly handled/consumed when switching back to UTF-8. - Recorded the previous encoding and BOM state per reload transaction (`reload_with_encoding_txns`) so that undo/redo restores them. 3. UI / Keymap - Made the encoding status in the status bar (ActiveBufferEncoding) clickable. - Added default keybindings: - macOS: cmd-k n - Linux: ctrl-k n - Windows: ctrl-k n ## Limitations & Scope To ensure stability and keep the PR focused, the following scenarios are intentionally out of scope: 1. **Collaboration and Remote Connections** - Encoding changes are disabled when collaboration (is_shared) or SSH remote connections (is_via_remote_server) are active. - **Reason:** Synchronizing encoding state changes between host/guest or handling remote reloads involves complex synchronization logic. This PR focuses on local files only.
`Remote Connection (SSH/WSL)` |Via status bar|Via shortcut/command| |:---:|:---:| |remote_tooltip|remote_shortcut| `Collaboration Session` |Via status bar|Via shortcut/command| |:---:|:---:| |collab_tooltip|collab_pop| 2. **Dirty State** - The feature is disabled if the buffer has unsaved changes to prevent data loss during reload. |Via status bar|Via shortcut/command| |:---:|:---:| |local_dirty_tooltip|local_dirty_pop| 3. **Files Detected as Binary** - Files that the worktree detects as "binary" (e.g., UTF-16 files without BOM containing non-ASCII characters) are not opened in the editor, so this feature cannot be triggered. **Future Work**: Fixing this would require modifying crates/worktree heuristics or exposing a "Force Open as Text" action for InvalidItemView to trigger. Given the scope and impact, this is deferred to a future PR. ## Test Plan I verified the feature and BOM handling using the following scenarios: ### Preparation Used the following test files: - [**test_utf8.txt**](https://github.com/user-attachments/files/24548803/test_utf8.txt): English-only text file. No BOM. - [**test_utf8_bom.txt**](https://github.com/user-attachments/files/24548822/test_utf8_bom.txt): English-only text file. With BOM. - [**test_utf8_jp_bom.txt**](https://github.com/user-attachments/files/24548825/test_utf8_jp_bom.txt): UTF-8 with BOM file containing Japanese characters. - [**test_shiftjis_jp.txt**](https://github.com/user-attachments/files/24548827/test_shiftjis_jp.txt): Shift-JIS file containing Japanese characters (content designed to trigger misdetection, e.g., using only half-width katakana). Used an external editor (VS Code or Notepad) for verification. ### Case 1: English-only file behavior 1. Open an English-only UTF-8 file (test_utf8.txt). 2. Reopen as Shift JIS. 3. **Result:** - Text appearance remains unchanged (since ASCII is compatible). - Status bar updates to "Shift JIS". ### Case 2: Fixing Mojibake 1.
Open a Shift-JIS file (test_shiftjis_jp.txt) that causes detection failure. (Note: confirm that it opens with mojibake.) 2. Select Shift JIS from the status bar selector. 3. **Result:** - Mojibake is resolved, and Japanese text is displayed correctly. - Status bar updates to "Shift JIS". ### Case 3: Unicode file with BOM behavior 1. Open an English-only UTF-8 with BOM file (test_utf8_bom.txt). 2. Reopen as `Shift JIS`. 3. **Result:** - The BOM bytes are displayed as mojibake at the beginning of the file. - The rest of the English text is displayed normally (ASCII compatibility). - Status bar updates to "Shift JIS". ### Case 4: Non-ASCII Unicode file with BOM behavior 1. Open a UTF-8 with BOM file containing Japanese (test_utf8_jp_bom.txt). 2. Reopen as Shift JIS. 3. **Result:** - The BOM bytes at the start are displayed as mojibake. - The Japanese text body is displayed as mojibake (UTF-8 bytes interpreted as Shift JIS). - Status bar updates to "Shift JIS" (no BOM indicator). ### Case 5: Revert to Unicode 1. From the state in Case 4 (Shift JIS with mojibake), reopen as UTF-8. 2. **Result:** - The BOM mojibake at the start disappears (consumed). - The text returns to normal. - Status bar updates to "UTF-8 (BOM)". ### Case 6: External BOM removal (State sync) 1. Open a UTF-8 with BOM file in Zed (test_utf8_bom.txt). 2. Open the same file in an external editor and save it as UTF-8 (No BOM). 3. Refocus Zed. 4. **Result:** - Text appearance remains unchanged. - The (BOM) indicator disappears from the status bar. - Saving in Zed and checking externally confirms the BOM is gone. ### Case 7: External BOM addition 1. From the state in Case 6 (UTF-8 No BOM), save as UTF-8 with BOM in the external editor. 2. Refocus Zed. 3. **Result:** - The (BOM) indicator appears in the status bar. - Saving in Zed and checking externally confirms the BOM is present. ### Case 8: External Encoding Change (Auto-detect sync) 1. Open an English-only UTF-8 file in Zed (`test_utf8.txt`).
* *Status bar shows: "UTF-8".* 2. Open the same file in an external editor and save it as **UTF-16LE with BOM**. 3. Refocus Zed. 4. **Result:** * The text remains readable (no mojibake). * **Status bar automatically updates to "UTF-16LE (BOM)".** (Verifies that `buffer.encoding` is correctly updated during reload). Release Notes: - Added "Reopen with Encoding" feature (currently supported for local files). --------- Co-authored-by: Conrad Irwin --- Cargo.lock | 6 + assets/keymaps/default-linux.json | 1 + assets/keymaps/default-macos.json | 1 + assets/keymaps/default-windows.json | 1 + assets/keymaps/macos/atom.json | 1 + crates/encoding_selector/Cargo.toml | 5 + .../src/active_buffer_encoding.rs | 65 +++- .../src/encoding_selector.rs | 325 +++++++++++++++++- crates/language/src/buffer.rs | 101 +++++- crates/project/Cargo.toml | 1 + .../tests/integration/project_tests.rs | 65 ++++ crates/zed/src/main.rs | 1 + crates/zed/src/zed.rs | 1 + 13 files changed, 550 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0329e18e32a943736231256a1abb4f92b42e1735..aca65c0421f5554be408eb027066e53fcfc87b07 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5643,8 +5643,13 @@ version = "0.1.0" dependencies = [ "editor", "encoding_rs", + "fuzzy", "gpui", + "language", + "picker", + "project", "ui", + "util", "workspace", ] @@ -12661,6 +12666,7 @@ dependencies = [ "dap", "dap_adapters", "db", + "encoding_rs", "extension", "fancy-regex", "fs", diff --git a/assets/keymaps/default-linux.json b/assets/keymaps/default-linux.json index 920bd24da6f2c9431bc162deb5b2f2df97bc3a28..f612bbdd0831463ad5652375989871db8556b792 100644 --- a/assets/keymaps/default-linux.json +++ b/assets/keymaps/default-linux.json @@ -618,6 +618,7 @@ "ctrl-?": "agent::ToggleFocus", "alt-save": "workspace::SaveAll", "ctrl-alt-s": "workspace::SaveAll", + "ctrl-k n": "encoding_selector::Toggle", "ctrl-k m": "language_selector::Toggle", "ctrl-k ctrl-m": "toolchain::AddToolchain", "escape": 
"workspace::Unfollow", diff --git a/assets/keymaps/default-macos.json b/assets/keymaps/default-macos.json index 3c4e3e010f5dcdd078eec974e7cab75e99c92780..a3d8738c15ba7e42d9d1fdaf1597b859c6bcd442 100644 --- a/assets/keymaps/default-macos.json +++ b/assets/keymaps/default-macos.json @@ -679,6 +679,7 @@ "cmd-shift-d": "debug_panel::ToggleFocus", "cmd-?": "agent::ToggleFocus", "cmd-alt-s": "workspace::SaveAll", + "cmd-k n": "encoding_selector::Toggle", "cmd-k m": "language_selector::Toggle", "cmd-k cmd-m": "toolchain::AddToolchain", "escape": "workspace::Unfollow", diff --git a/assets/keymaps/default-windows.json b/assets/keymaps/default-windows.json index bfb6a5b7ddd7a6bc0995f060671ec87e2370c7ca..52227e82db1dcaf0bd5e5ba98a8f8ff3d5c00372 100644 --- a/assets/keymaps/default-windows.json +++ b/assets/keymaps/default-windows.json @@ -609,6 +609,7 @@ "ctrl-shift-d": "debug_panel::ToggleFocus", "ctrl-shift-/": "agent::ToggleFocus", "ctrl-k s": "workspace::SaveAll", + "ctrl-k n": "encoding_selector::Toggle", "ctrl-k m": "language_selector::Toggle", "ctrl-m ctrl-m": "toolchain::AddToolchain", "escape": "workspace::Unfollow", diff --git a/assets/keymaps/macos/atom.json b/assets/keymaps/macos/atom.json index bf049fd3cb3eca8fe8049fa4e0810f82b10a5bbc..d39571de2dee3759d4f77f6d047655f7848b755e 100644 --- a/assets/keymaps/macos/atom.json +++ b/assets/keymaps/macos/atom.json @@ -19,6 +19,7 @@ "context": "Editor && mode == full", "bindings": { "ctrl-shift-l": "language_selector::Toggle", + "ctrl-shift-n": "encoding_selector::Toggle", "cmd-|": "pane::RevealInProjectPanel", "cmd-b": "editor::GoToDefinition", "alt-cmd-b": "editor::GoToDefinitionSplit", diff --git a/crates/encoding_selector/Cargo.toml b/crates/encoding_selector/Cargo.toml index abafc845ebccaea39e5d5e8a3e73ff6711bf5189..4903ab56d6de0298dafd1b8e21609ecf8842fad1 100644 --- a/crates/encoding_selector/Cargo.toml +++ b/crates/encoding_selector/Cargo.toml @@ -15,6 +15,11 @@ doctest = false [dependencies] editor.workspace = true 
encoding_rs.workspace = true +fuzzy.workspace = true gpui.workspace = true +language.workspace = true +picker.workspace = true +project.workspace = true ui.workspace = true +util.workspace = true workspace.workspace = true diff --git a/crates/encoding_selector/src/active_buffer_encoding.rs b/crates/encoding_selector/src/active_buffer_encoding.rs index a4c8eb6548ea7c0de6663b9205182a1b4da4116b..417ff241b72300aa90496f896fcf6c3ed3a363c7 100644 --- a/crates/encoding_selector/src/active_buffer_encoding.rs +++ b/crates/encoding_selector/src/active_buffer_encoding.rs @@ -1,8 +1,12 @@ +use crate::{EncodingSelector, Toggle}; + use editor::Editor; use encoding_rs::{Encoding, UTF_8}; use gpui::{ - Context, Entity, IntoElement, ParentElement, Render, Styled, Subscription, Window, div, + Context, Entity, IntoElement, ParentElement, Render, Styled, Subscription, WeakEntity, Window, + div, }; +use project::Project; use ui::{Button, ButtonCommon, Clickable, LabelSize, Tooltip}; use workspace::{ StatusBarSettings, StatusItemView, Workspace, @@ -11,30 +15,43 @@ use workspace::{ pub struct ActiveBufferEncoding { active_encoding: Option<&'static Encoding>, - //workspace: WeakEntity, + workspace: WeakEntity, + project: Entity, _observe_active_editor: Option, has_bom: bool, + is_dirty: bool, + is_shared: bool, + is_via_remote_server: bool, } impl ActiveBufferEncoding { - pub fn new(_workspace: &Workspace) -> Self { + pub fn new(workspace: &Workspace) -> Self { Self { active_encoding: None, - //workspace: workspace.weak_handle(), + workspace: workspace.weak_handle(), + project: workspace.project().clone(), _observe_active_editor: None, has_bom: false, + is_dirty: false, + is_shared: false, + is_via_remote_server: false, } } fn update_encoding(&mut self, editor: Entity, _: &mut Window, cx: &mut Context) { self.active_encoding = None; + self.has_bom = false; + self.is_dirty = false; - let editor = editor.read(cx); - if let Some((_, buffer, _)) = editor.active_excerpt(cx) { - let buffer = 
buffer.read(cx); + let project = self.project.read(cx); + self.is_shared = project.is_shared(); + self.is_via_remote_server = project.is_via_remote_server(); + if let Some((_, buffer, _)) = editor.read(cx).active_excerpt(cx) { + let buffer = buffer.read(cx); self.active_encoding = Some(buffer.encoding()); self.has_bom = buffer.has_bom(); + self.is_dirty = buffer.is_dirty(); } cx.notify(); @@ -58,13 +75,36 @@ impl Render for ActiveBufferEncoding { text.push_str(" (BOM)"); } + let (disabled, tooltip_text) = if self.is_dirty { + (true, "Save file to change encoding") + } else if self.is_shared { + (true, "Cannot change encoding during collaboration") + } else if self.is_via_remote_server { + (true, "Cannot change encoding of remote server file") + } else { + (false, "Reopen with Encoding") + }; + div().child( Button::new("change-encoding", text) .label_size(LabelSize::Small) - .on_click(|_, _, _cx| { - // No-op - }) - .tooltip(Tooltip::text("Current Encoding")), + .on_click(cx.listener(move |this, _, window, cx| { + if disabled { + return; + } + if let Some(workspace) = this.workspace.upgrade() { + workspace.update(cx, |workspace, cx| { + EncodingSelector::toggle(workspace, window, cx) + }); + } + })) + .tooltip(move |_window, cx| { + if disabled { + Tooltip::text(tooltip_text)(_window, cx) + } else { + Tooltip::for_action(tooltip_text, &Toggle, cx) + } + }), ) } } @@ -83,6 +123,9 @@ impl StatusItemView for ActiveBufferEncoding { } else { self.active_encoding = None; self.has_bom = false; + self.is_dirty = false; + self.is_shared = false; + self.is_via_remote_server = false; self._observe_active_editor = None; } diff --git a/crates/encoding_selector/src/encoding_selector.rs b/crates/encoding_selector/src/encoding_selector.rs index bf4f59bbaa3a93aa45ac6a3473ef6b2253dca61b..3954bf29a30a0981c25bee3eb88829a7002881ad 100644 --- a/crates/encoding_selector/src/encoding_selector.rs +++ b/crates/encoding_selector/src/encoding_selector.rs @@ -1,4 +1,327 @@ mod 
active_buffer_encoding; pub use active_buffer_encoding::ActiveBufferEncoding; -pub fn init() {} +use editor::Editor; +use encoding_rs::Encoding; +use fuzzy::{StringMatch, StringMatchCandidate, match_strings}; +use gpui::{ + App, AppContext, Context, DismissEvent, Entity, EventEmitter, FocusHandle, Focusable, + InteractiveElement, ParentElement, Render, Styled, Task, WeakEntity, Window, actions, +}; +use language::Buffer; +use picker::{Picker, PickerDelegate}; +use std::sync::Arc; +use ui::{HighlightedLabel, ListItem, ListItemSpacing, Toggleable, v_flex}; +use util::ResultExt; +use workspace::{ModalView, Toast, Workspace, notifications::NotificationId}; + +actions!( + encoding_selector, + [ + /// Toggles the encoding selector modal. + Toggle + ] +); + +pub fn init(cx: &mut App) { + cx.observe_new(EncodingSelector::register).detach(); +} + +pub struct EncodingSelector { + picker: Entity>, +} + +impl EncodingSelector { + fn register( + workspace: &mut Workspace, + _window: Option<&mut Window>, + _: &mut Context, + ) { + workspace.register_action(move |workspace, _: &Toggle, window, cx| { + Self::toggle(workspace, window, cx); + }); + } + + pub fn toggle( + workspace: &mut Workspace, + window: &mut Window, + cx: &mut Context, + ) -> Option<()> { + let (_, buffer, _) = workspace + .active_item(cx)? + .act_as::(cx)? 
+ .read(cx) + .active_excerpt(cx)?; + + let buffer_handle = buffer.read(cx); + let project = workspace.project().read(cx); + + if buffer_handle.is_dirty() { + workspace.show_toast( + Toast::new( + NotificationId::unique::(), + "Save file to change encoding", + ), + cx, + ); + return Some(()); + } + if project.is_shared() { + workspace.show_toast( + Toast::new( + NotificationId::unique::(), + "Cannot change encoding during collaboration", + ), + cx, + ); + return Some(()); + } + if project.is_via_remote_server() { + workspace.show_toast( + Toast::new( + NotificationId::unique::(), + "Cannot change encoding of remote server file", + ), + cx, + ); + return Some(()); + } + + workspace.toggle_modal(window, cx, move |window, cx| { + EncodingSelector::new(buffer, window, cx) + }); + Some(()) + } + + fn new(buffer: Entity, window: &mut Window, cx: &mut Context) -> Self { + let delegate = EncodingSelectorDelegate::new(cx.entity().downgrade(), buffer); + let picker = cx.new(|cx| Picker::uniform_list(delegate, window, cx)); + Self { picker } + } +} + +impl Render for EncodingSelector { + fn render(&mut self, _window: &mut Window, _cx: &mut Context) -> impl gpui::IntoElement { + v_flex() + .key_context("EncodingSelector") + .w(gpui::rems(34.)) + .child(self.picker.clone()) + } +} + +impl Focusable for EncodingSelector { + fn focus_handle(&self, cx: &App) -> FocusHandle { + self.picker.focus_handle(cx) + } +} + +impl EventEmitter for EncodingSelector {} +impl ModalView for EncodingSelector {} + +pub struct EncodingSelectorDelegate { + encoding_selector: WeakEntity, + buffer: Entity, + encodings: Vec<&'static Encoding>, + match_candidates: Arc>, + matches: Vec, + selected_index: usize, +} + +impl EncodingSelectorDelegate { + fn new(encoding_selector: WeakEntity, buffer: Entity) -> Self { + let encodings = available_encodings(); + let match_candidates = encodings + .iter() + .enumerate() + .map(|(id, enc)| StringMatchCandidate::new(id, enc.name())) + .collect::>(); + Self { + 
encoding_selector, + buffer, + encodings, + match_candidates: Arc::new(match_candidates), + matches: vec![], + selected_index: 0, + } + } + + fn render_data_for_match(&self, mat: &StringMatch, cx: &App) -> String { + let candidate_encoding = self.encodings[mat.candidate_id]; + let current_encoding = self.buffer.read(cx).encoding(); + + if candidate_encoding.name() == current_encoding.name() { + format!("{} (current)", candidate_encoding.name()) + } else { + candidate_encoding.name().to_string() + } + } +} + +fn available_encodings() -> Vec<&'static Encoding> { + let mut encodings = vec![ + // Unicode + encoding_rs::UTF_8, + encoding_rs::UTF_16LE, + encoding_rs::UTF_16BE, + // Japanese + encoding_rs::SHIFT_JIS, + encoding_rs::EUC_JP, + encoding_rs::ISO_2022_JP, + // Chinese + encoding_rs::GBK, + encoding_rs::GB18030, + encoding_rs::BIG5, + // Korean + encoding_rs::EUC_KR, + // Windows / Single Byte Series + encoding_rs::WINDOWS_1252, // Western (ISO-8859-1 unified) + encoding_rs::WINDOWS_1250, // Central European + encoding_rs::WINDOWS_1251, // Cyrillic + encoding_rs::WINDOWS_1253, // Greek + encoding_rs::WINDOWS_1254, // Turkish (ISO-8859-9 unified) + encoding_rs::WINDOWS_1255, // Hebrew + encoding_rs::WINDOWS_1256, // Arabic + encoding_rs::WINDOWS_1257, // Baltic + encoding_rs::WINDOWS_1258, // Vietnamese + encoding_rs::WINDOWS_874, // Thai + // ISO-8859 Series (others) + encoding_rs::ISO_8859_2, + encoding_rs::ISO_8859_3, + encoding_rs::ISO_8859_4, + encoding_rs::ISO_8859_5, + encoding_rs::ISO_8859_6, + encoding_rs::ISO_8859_7, + encoding_rs::ISO_8859_8, + encoding_rs::ISO_8859_8_I, // Logical Hebrew + encoding_rs::ISO_8859_10, + encoding_rs::ISO_8859_13, + encoding_rs::ISO_8859_14, + encoding_rs::ISO_8859_15, + encoding_rs::ISO_8859_16, + // Cyrillic / Legacy Misc + encoding_rs::KOI8_R, + encoding_rs::KOI8_U, + encoding_rs::IBM866, + encoding_rs::MACINTOSH, + encoding_rs::X_MAC_CYRILLIC, + // NOTE: The following encodings are intentionally excluded from the 
list: + // + // 1. encoding_rs::REPLACEMENT + // Used internally for decoding errors. Not suitable for user selection. + // + // 2. encoding_rs::X_USER_DEFINED + // Used for binary data emulation (legacy web behavior). Not for general text editing. + ]; + + encodings.sort_by_key(|enc| enc.name()); + + encodings +} + +impl PickerDelegate for EncodingSelectorDelegate { + type ListItem = ListItem; + + fn placeholder_text(&self, _window: &mut Window, _cx: &mut App) -> Arc { + "Reopen with encoding...".into() + } + + fn match_count(&self) -> usize { + self.matches.len() + } + + fn selected_index(&self) -> usize { + self.selected_index + } + + fn set_selected_index( + &mut self, + ix: usize, + _window: &mut Window, + _: &mut Context>, + ) { + self.selected_index = ix; + } + + fn update_matches( + &mut self, + query: String, + window: &mut Window, + cx: &mut Context>, + ) -> Task<()> { + let background = cx.background_executor().clone(); + let candidates = self.match_candidates.clone(); + + cx.spawn_in(window, async move |this, cx| { + let matches = if query.is_empty() { + candidates + .iter() + .enumerate() + .map(|(index, candidate)| StringMatch { + candidate_id: index, + string: candidate.string.clone(), + positions: Vec::new(), + score: 0.0, + }) + .collect() + } else { + match_strings( + &candidates, + &query, + false, + true, + 100, + &Default::default(), + background, + ) + .await + }; + + this.update(cx, |this, cx| { + let delegate = &mut this.delegate; + delegate.matches = matches; + delegate.selected_index = delegate + .selected_index + .min(delegate.matches.len().saturating_sub(1)); + cx.notify(); + }) + .log_err(); + }) + } + + fn confirm(&mut self, _: bool, window: &mut Window, cx: &mut Context>) { + if let Some(mat) = self.matches.get(self.selected_index) { + let selected_encoding = self.encodings[mat.candidate_id]; + + self.buffer.update(cx, |buffer, cx| { + let _ = buffer.reload_with_encoding(selected_encoding, cx); + }); + } + self.dismissed(window, cx); 
+ } + + fn dismissed(&mut self, _: &mut Window, cx: &mut Context>) { + self.encoding_selector + .update(cx, |_, cx| cx.emit(DismissEvent)) + .log_err(); + } + + fn render_match( + &self, + ix: usize, + selected: bool, + _: &mut Window, + cx: &mut Context>, + ) -> Option { + let mat = &self.matches.get(ix)?; + + let label = self.render_data_for_match(mat, cx); + + Some( + ListItem::new(ix) + .inset(true) + .spacing(ListItemSpacing::Sparse) + .toggle_state(selected) + .child(HighlightedLabel::new(label, mat.positions.clone())), + ) + } +} diff --git a/crates/language/src/buffer.rs b/crates/language/src/buffer.rs index 418abf38f3d8ca4a61403a9a3b3831c2da36c106..0621a627e0b9b4c8e96e02d9937d75438c427d7d 100644 --- a/crates/language/src/buffer.rs +++ b/crates/language/src/buffer.rs @@ -139,6 +139,7 @@ pub struct Buffer { tree_sitter_data: Arc, encoding: &'static Encoding, has_bom: bool, + reload_with_encoding_txns: HashMap, } #[derive(Debug)] @@ -1147,6 +1148,7 @@ impl Buffer { _subscriptions: Vec::new(), encoding: encoding_rs::UTF_8, has_bom: false, + reload_with_encoding_txns: HashMap::default(), } } @@ -1535,31 +1537,86 @@ impl Buffer { /// Reloads the contents of the buffer from disk. pub fn reload(&mut self, cx: &Context) -> oneshot::Receiver> { + self.reload_impl(None, cx) + } + + /// Reloads the contents of the buffer from disk using the specified encoding. + /// + /// This bypasses automatic encoding detection heuristics (like BOM checks) for non-Unicode encodings, + /// allowing users to force a specific interpretation of the bytes. 
+ pub fn reload_with_encoding( + &mut self, + encoding: &'static Encoding, + cx: &Context, + ) -> oneshot::Receiver> { + self.reload_impl(Some(encoding), cx) + } + + fn reload_impl( + &mut self, + force_encoding: Option<&'static Encoding>, + cx: &Context, + ) -> oneshot::Receiver> { let (tx, rx) = futures::channel::oneshot::channel(); let prev_version = self.text.version(); + self.reload_task = Some(cx.spawn(async move |this, cx| { - let Some((new_mtime, load_bytes_task, encoding)) = this.update(cx, |this, cx| { - let file = this.file.as_ref()?.as_local()?; - Some(( - file.disk_state().mtime(), - file.load_bytes(cx), - this.encoding, - )) - })? + let Some((new_mtime, load_bytes_task, current_encoding)) = + this.update(cx, |this, cx| { + let file = this.file.as_ref()?.as_local()?; + Some(( + file.disk_state().mtime(), + file.load_bytes(cx), + this.encoding, + )) + })? else { return Ok(()); }; - let bytes = load_bytes_task.await?; - let (cow, _encoding_used, _has_errors) = encoding.decode(&bytes); - let new_text = cow.into_owned(); + let target_encoding = force_encoding.unwrap_or(current_encoding); + + let is_unicode = target_encoding == encoding_rs::UTF_8 + || target_encoding == encoding_rs::UTF_16LE + || target_encoding == encoding_rs::UTF_16BE; + + let (new_text, has_bom, encoding_used) = if force_encoding.is_some() && !is_unicode { + let bytes = load_bytes_task.await?; + let (cow, _had_errors) = target_encoding.decode_without_bom_handling(&bytes); + (cow.into_owned(), false, target_encoding) + } else { + let bytes = load_bytes_task.await?; + let (cow, used_enc, _had_errors) = target_encoding.decode(&bytes); + + let actual_has_bom = if used_enc == encoding_rs::UTF_8 { + bytes.starts_with(&[0xEF, 0xBB, 0xBF]) + } else if used_enc == encoding_rs::UTF_16LE { + bytes.starts_with(&[0xFF, 0xFE]) + } else if used_enc == encoding_rs::UTF_16BE { + bytes.starts_with(&[0xFE, 0xFF]) + } else { + false + }; + (cow.into_owned(), actual_has_bom, used_enc) + }; let diff = 
this.update(cx, |this, cx| this.diff(new_text, cx))?.await; this.update(cx, |this, cx| { if this.version() == diff.base_version { this.finalize_last_transaction(); + let old_encoding = this.encoding; + let old_has_bom = this.has_bom; this.apply_diff(diff, cx); - tx.send(this.finalize_last_transaction().cloned()).ok(); + this.encoding = encoding_used; + this.has_bom = has_bom; + let transaction = this.finalize_last_transaction().cloned(); + if let Some(ref txn) = transaction { + if old_encoding != encoding_used || old_has_bom != has_bom { + this.reload_with_encoding_txns + .insert(txn.id, (old_encoding, old_has_bom)); + } + } + tx.send(transaction).ok(); this.has_conflict = false; this.did_reload(this.version(), this.line_ending(), new_mtime, cx); } else { @@ -3044,6 +3101,7 @@ impl Buffer { if let Some((transaction_id, operation)) = self.text.undo() { self.send_operation(Operation::Buffer(operation), true, cx); self.did_edit(&old_version, was_dirty, cx); + self.restore_encoding_for_transaction(transaction_id, was_dirty); Some(transaction_id) } else { None @@ -3103,12 +3161,31 @@ impl Buffer { if let Some((transaction_id, operation)) = self.text.redo() { self.send_operation(Operation::Buffer(operation), true, cx); self.did_edit(&old_version, was_dirty, cx); + self.restore_encoding_for_transaction(transaction_id, was_dirty); Some(transaction_id) } else { None } } + fn restore_encoding_for_transaction(&mut self, transaction_id: TransactionId, was_dirty: bool) { + if let Some((old_encoding, old_has_bom)) = + self.reload_with_encoding_txns.get(&transaction_id) + { + let current_encoding = self.encoding; + let current_has_bom = self.has_bom; + self.encoding = *old_encoding; + self.has_bom = *old_has_bom; + if !was_dirty { + self.saved_version = self.version.clone(); + self.has_unsaved_edits + .set((self.saved_version.clone(), false)); + } + self.reload_with_encoding_txns + .insert(transaction_id, (current_encoding, current_has_bom)); + } + } + /// Manually undoes all 
changes until a given transaction in the buffer's redo history. pub fn redo_to_transaction( &mut self, diff --git a/crates/project/Cargo.toml b/crates/project/Cargo.toml index 5d8a36cca78be18a6836ee93ac9efc415039d80e..cbcd5481ee3c48655fc78e17d5cf65d2ec978a09 100644 --- a/crates/project/Cargo.toml +++ b/crates/project/Cargo.toml @@ -104,6 +104,7 @@ tracing.workspace = true [dev-dependencies] client = { workspace = true, features = ["test-support"] } +encoding_rs.workspace = true db = { workspace = true, features = ["test-support"] } collections = { workspace = true, features = ["test-support"] } context_server = { workspace = true, features = ["test-support"] } diff --git a/crates/project/tests/integration/project_tests.rs b/crates/project/tests/integration/project_tests.rs index f6ce89f7e675206a3452c9ba5471f3ccb371c28e..e24354e06b8c4098bf27a0f0745f9f857e1910ba 100644 --- a/crates/project/tests/integration/project_tests.rs +++ b/crates/project/tests/integration/project_tests.rs @@ -25,6 +25,7 @@ use buffer_diff::{ assert_hunks, }; use collections::{BTreeSet, HashMap, HashSet}; +use encoding_rs; use fs::FakeFs; use futures::{StreamExt, future}; use git::{ @@ -11113,6 +11114,70 @@ async fn search( .collect()) } +#[gpui::test] +async fn test_undo_encoding_change(cx: &mut gpui::TestAppContext) { + init_test(cx); + + let fs = FakeFs::new(cx.executor()); + + // Create a file with ASCII content "Hi" - this will be detected as UTF-8 + // When reinterpreted as UTF-16LE, the bytes 0x48 0x69 become a single character + let ascii_bytes: Vec = vec![0x48, 0x69]; + fs.insert_tree(path!("/dir"), json!({})).await; + fs.insert_file(path!("/dir/test.txt"), ascii_bytes).await; + + let project = Project::test(fs.clone(), [path!("/dir").as_ref()], cx).await; + + let buffer = project + .update(cx, |p, cx| p.open_local_buffer(path!("/dir/test.txt"), cx)) + .await + .unwrap(); + + let (initial_encoding, initial_text, initial_dirty) = buffer.read_with(cx, |buffer, _| { + (buffer.encoding(), 
buffer.text(), buffer.is_dirty()) + }); + assert_eq!(initial_encoding, encoding_rs::UTF_8); + assert_eq!(initial_text, "Hi"); + assert!(!initial_dirty); + + let reload_receiver = buffer.update(cx, |buffer, cx| { + buffer.reload_with_encoding(encoding_rs::UTF_16LE, cx) + }); + cx.executor().run_until_parked(); + + // Wait for reload to complete + let _ = reload_receiver.await; + + // Verify the encoding changed, text is different, and still not dirty (we reloaded from disk) + let (reloaded_encoding, reloaded_text, reloaded_dirty) = buffer.read_with(cx, |buffer, _| { + (buffer.encoding(), buffer.text(), buffer.is_dirty()) + }); + assert_eq!(reloaded_encoding, encoding_rs::UTF_16LE); + assert_eq!(reloaded_text, "楈"); + assert!(!reloaded_dirty); + + // Undo the reload + buffer.update(cx, |buffer, cx| { + buffer.undo(cx); + }); + + buffer.read_with(cx, |buffer, _| { + assert_eq!(buffer.encoding(), encoding_rs::UTF_8); + assert_eq!(buffer.text(), "Hi"); + assert!(!buffer.is_dirty()); + }); + + buffer.update(cx, |buffer, cx| { + buffer.redo(cx); + }); + + buffer.read_with(cx, |buffer, _| { + assert_eq!(buffer.encoding(), encoding_rs::UTF_16LE); + assert_ne!(buffer.text(), "Hi"); + assert!(!buffer.is_dirty()); + }); +} + pub fn init_test(cx: &mut gpui::TestAppContext) { zlog::init_test(); diff --git a/crates/zed/src/main.rs b/crates/zed/src/main.rs index a248fd11c71b92893f8b5849e14286bb5627d924..c65e4aa98584d22ddd12d92eed46e4a29d206ee3 100644 --- a/crates/zed/src/main.rs +++ b/crates/zed/src/main.rs @@ -662,6 +662,7 @@ fn main() { vim::init(cx); terminal_view::init(cx); journal::init(app_state.clone(), cx); + encoding_selector::init(cx); language_selector::init(cx); line_ending_selector::init(cx); toolchain_selector::init(cx); diff --git a/crates/zed/src/zed.rs b/crates/zed/src/zed.rs index 942b79f36bc658b274b49326d7c1dc930a3e546b..22bbca4f5c1962698fce01730873b3731d50fc88 100644 --- a/crates/zed/src/zed.rs +++ b/crates/zed/src/zed.rs @@ -4826,6 +4826,7 @@ mod tests { 
"diagnostics", "edit_prediction", "editor", + "encoding_selector", "feedback", "file_finder", "git",