- Update encoding detection to automatically switch to UTF-16LE/BE if a BOM is present

R Aadarsh created

- Change `encoding` in `Buffer` to `Arc<Mutex<&'static Encoding>>`

- Observe changes in the `encoding` field of `Buffer` and update the
status bar indicator

Change summary

crates/copilot/src/copilot.rs      |  8 ++++
crates/encodings/src/lib.rs        | 49 ++++++++++++++++++++++++++-----
crates/encodings/src/selectors.rs  |  7 ++-
crates/fs/src/encodings.rs         | 49 ++++++++++++++++++++++++++-----
crates/fs/src/fs.rs                | 24 ++++++++++++---
crates/git_ui/src/git_panel.rs     |  2 
crates/language/src/buffer.rs      | 36 ++++++++++++++++------
crates/project/src/buffer_store.rs | 37 +++++++++++------------
crates/worktree/src/worktree.rs    | 13 +++++++-
crates/zed/src/zed.rs              |  5 +-
10 files changed, 170 insertions(+), 60 deletions(-)

Detailed changes

crates/copilot/src/copilot.rs 🔗

@@ -1460,7 +1460,13 @@ mod tests {
             unimplemented!()
         }
 
-        fn load_with_encoding(&self, _: &App, _: &'static Encoding) -> Task<Result<String>> {
+        fn load_with_encoding(
+            &self,
+            _: &App,
+            _: &'static Encoding,
+            _: bool,
+            _: Arc<std::sync::Mutex<&'static Encoding>>,
+        ) -> Task<Result<String>> {
             unimplemented!()
         }
     }

crates/encodings/src/lib.rs 🔗

@@ -1,7 +1,9 @@
 //! A crate for handling file encodings in the text editor.
+
 use editor::{Editor, EditorSettings};
 use encoding_rs::Encoding;
 use gpui::{ClickEvent, Entity, Subscription, WeakEntity};
+use language::Buffer;
 use settings::Settings;
 use ui::{Button, ButtonCommon, Context, LabelSize, Render, Tooltip, Window, div};
 use ui::{Clickable, ParentElement};
@@ -13,7 +15,13 @@ use crate::selectors::save_or_reopen::EncodingSaveOrReopenSelector;
 pub struct EncodingIndicator {
     pub encoding: Option<&'static Encoding>,
     pub workspace: WeakEntity<Workspace>,
-    observe: Option<Subscription>, // Subscription to observe changes in the active editor
+
+    /// Subscription to observe changes in the active editor
+    observe_editor: Option<Subscription>,
+
+    /// Subscription to observe changes in the `encoding` field of the `Buffer` struct
+    observe_buffer_encoding: Option<Subscription>,
+
     show: bool, // Whether to show the indicator or not, based on whether an editor is active
 }
 
@@ -50,17 +58,20 @@ impl EncodingIndicator {
     pub fn new(
         encoding: Option<&'static Encoding>,
         workspace: WeakEntity<Workspace>,
-        observe: Option<Subscription>,
+        observe_editor: Option<Subscription>,
+        observe_buffer_encoding: Option<Subscription>,
     ) -> EncodingIndicator {
         EncodingIndicator {
             encoding,
             workspace,
-            observe,
+            observe_editor,
             show: true,
+            observe_buffer_encoding,
         }
     }
 
-    pub fn update(
+    /// Update the encoding when the active editor is switched.
+    pub fn update_when_editor_is_switched(
         &mut self,
         editor: Entity<Editor>,
         _: &mut Window,
@@ -68,12 +79,24 @@ impl EncodingIndicator {
     ) {
         let editor = editor.read(cx);
         if let Some((_, buffer, _)) = editor.active_excerpt(cx) {
-            let encoding = buffer.read(cx).encoding;
-            self.encoding = Some(encoding);
+            let encoding = buffer.read(cx).encoding.clone();
+            self.encoding = Some(&*encoding.lock().unwrap());
         }
 
         cx.notify();
     }
+
+    /// Update the encoding when the `encoding` field of the `Buffer` struct changes.
+    pub fn update_when_buffer_encoding_changes(
+        &mut self,
+        buffer: Entity<Buffer>,
+        _: &mut Window,
+        cx: &mut Context<EncodingIndicator>,
+    ) {
+        let encoding = buffer.read(cx).encoding.clone();
+        self.encoding = Some(&*encoding.lock().unwrap());
+        cx.notify();
+    }
 }
 
 impl StatusItemView for EncodingIndicator {
@@ -85,13 +108,21 @@ impl StatusItemView for EncodingIndicator {
     ) {
         match active_pane_item.and_then(|item| item.downcast::<Editor>()) {
             Some(editor) => {
-                self.observe = Some(cx.observe_in(&editor, window, Self::update));
-                self.update(editor, window, cx);
+                self.observe_editor =
+                    Some(cx.observe_in(&editor, window, Self::update_when_editor_is_switched));
+                if let Some((_, buffer, _)) = &editor.read(cx).active_excerpt(cx) {
+                    self.observe_buffer_encoding = Some(cx.observe_in(
+                        buffer,
+                        window,
+                        Self::update_when_buffer_encoding_changes,
+                    ));
+                }
+                self.update_when_editor_is_switched(editor, window, cx);
                 self.show = true;
             }
             None => {
                 self.encoding = None;
-                self.observe = None;
+                self.observe_editor = None;
                 self.show = false;
             }
         }

crates/encodings/src/selectors.rs 🔗

@@ -409,7 +409,6 @@ pub mod encoding {
                     )
                     .await
                 }
-
                 picker
                     .update(cx, |picker, cx| {
                         let delegate = &mut picker.delegate;
@@ -426,11 +425,13 @@ pub mod encoding {
         fn confirm(&mut self, _: bool, window: &mut Window, cx: &mut Context<Picker<Self>>) {
             if let Some(buffer) = self.buffer.upgrade() {
                 buffer.update(cx, |buffer, cx| {
-                    buffer.encoding =
+                    let buffer_encoding = buffer.encoding.clone();
+                    let buffer_encoding = &mut *buffer_encoding.lock().unwrap();
+                    *buffer_encoding =
                         encoding_from_name(self.matches[self.current_selection].string.as_str());
                     if self.action == Action::Reopen {
                         let executor = cx.background_executor().clone();
-                        executor.spawn(buffer.reload(cx)).detach();
+                        executor.spawn(buffer.reload(cx, true)).detach();
                     } else if self.action == Action::Save {
                         let executor = cx.background_executor().clone();
 

crates/fs/src/encodings.rs 🔗

@@ -1,5 +1,8 @@
 //! Encoding and decoding utilities using the `encoding_rs` crate.
-use std::fmt::Debug;
+use std::{
+    fmt::Debug,
+    sync::{Arc, Mutex},
+};
 
 use anyhow::Result;
 use encoding_rs::Encoding;
@@ -42,7 +45,34 @@ impl EncodingWrapper {
         self.0
     }
 
-    pub async fn decode(&self, input: Vec<u8>) -> Result<String> {
+    pub async fn decode(
+        &mut self,
+        input: Vec<u8>,
+        force: bool,
+        buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
+    ) -> Result<String> {
+        // Check if the input starts with a BOM for UTF-16 encodings only if not forced to
+        // use the encoding specified.
+        if !force {
+            if (input[0] == 0xFF) & (input[1] == 0xFE) {
+                self.0 = encoding_rs::UTF_16LE;
+
+                if let Some(v) = buffer_encoding {
+                    if let Ok(mut v) = (*v).lock() {
+                        *v = encoding_rs::UTF_16LE;
+                    }
+                }
+            } else if (input[0] == 0xFE) & (input[1] == 0xFF) {
+                self.0 = encoding_rs::UTF_16BE;
+
+                if let Some(v) = buffer_encoding {
+                    if let Ok(mut v) = (*v).lock() {
+                        *v = encoding_rs::UTF_16BE;
+                    }
+                }
+            }
+        }
+
         let (cow, _had_errors) = self.0.decode_with_bom_removal(&input);
 
         // `encoding_rs` handles invalid bytes by replacing them with replacement characters
@@ -53,8 +83,7 @@ impl EncodingWrapper {
 
     pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
         if self.0 == encoding_rs::UTF_16BE {
-            let mut data = Vec::<u8>::new();
-            data.reserve(input.len() * 2); // Reserve space for UTF-16BE bytes
+            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
 
             // Convert the input string to UTF-16BE bytes
             let utf16be_bytes: Vec<u8> =
@@ -63,8 +92,7 @@ impl EncodingWrapper {
             data.extend(utf16be_bytes);
             return Ok(data);
         } else if self.0 == encoding_rs::UTF_16LE {
-            let mut data = Vec::<u8>::new();
-            data.reserve(input.len() * 2); // Reserve space for UTF-16LE bytes
+            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
 
             // Convert the input string to UTF-16LE bytes
             let utf16le_bytes: Vec<u8> =
@@ -83,8 +111,13 @@ impl EncodingWrapper {
 }
 
 /// Convert a byte vector from a specified encoding to a UTF-8 string.
-pub async fn to_utf8(input: Vec<u8>, encoding: EncodingWrapper) -> Result<String> {
-    encoding.decode(input).await
+pub async fn to_utf8(
+    input: Vec<u8>,
+    mut encoding: EncodingWrapper,
+    force: bool,
+    buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
+) -> Result<String> {
+    encoding.decode(input, force, buffer_encoding).await
 }
 
 /// Convert a UTF-8 string to a byte vector in a specified encoding.

crates/fs/src/fs.rs 🔗

@@ -9,6 +9,7 @@ use anyhow::{Context as _, Result, anyhow};
 #[cfg(any(target_os = "linux", target_os = "freebsd"))]
 use ashpd::desktop::trash;
 use futures::stream::iter;
+use encoding_rs::Encoding;
 use gpui::App;
 use gpui::BackgroundExecutor;
 use gpui::Global;
@@ -124,8 +125,20 @@ pub trait Fs: Send + Sync {
         &self,
         path: PathBuf,
         encoding: EncodingWrapper,
+        force: bool, // if true, ignore BOM and use the specified encoding,
+
+        // The current encoding of the buffer. BOM (if it exists) is checked
+        // to find if encoding is UTF-16, and if so, the encoding is updated to UTF-16
+        // regardless of the value of `encoding`.
+        buffer_encoding: Arc<std::sync::Mutex<&'static Encoding>>,
     ) -> anyhow::Result<String> {
-        Ok(encodings::to_utf8(self.load_bytes(path.as_path()).await?, encoding).await?)
+        Ok(encodings::to_utf8(
+            self.load_bytes(path.as_path()).await?,
+            encoding,
+            force,
+            Some(buffer_encoding.clone()),
+        )
+        .await?)
     }
 
     async fn load_bytes(&self, path: &Path) -> Result<Vec<u8>>;
@@ -619,10 +632,11 @@ impl Fs for RealFs {
     async fn load(&self, path: &Path) -> Result<String> {
         let path = path.to_path_buf();
         let encoding = EncodingWrapper::new(encoding_rs::UTF_8);
-        let text =
-            smol::unblock(async || Ok(encodings::to_utf8(std::fs::read(path)?, encoding).await?))
-                .await
-                .await;
+        let text = smol::unblock(async || {
+            Ok(encodings::to_utf8(std::fs::read(path)?, encoding, false, None).await?)
+        })
+        .await
+        .await;
         text
     }
 

crates/git_ui/src/git_panel.rs 🔗

@@ -1056,7 +1056,7 @@ impl GitPanel {
                     .iter()
                     .filter_map(|buffer| {
                         buffer.as_ref().ok()?.update(cx, |buffer, cx| {
-                            buffer.is_dirty().then(|| buffer.reload(cx))
+                            buffer.is_dirty().then(|| buffer.reload(cx, false))
                         })
                     })
                     .collect()

crates/language/src/buffer.rs 🔗

@@ -127,7 +127,7 @@ pub struct Buffer {
     has_unsaved_edits: Cell<(clock::Global, bool)>,
     change_bits: Vec<rc::Weak<Cell<bool>>>,
     _subscriptions: Vec<gpui::Subscription>,
-    pub encoding: &'static Encoding,
+    pub encoding: Arc<std::sync::Mutex<&'static Encoding>>,
 }
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -420,7 +420,13 @@ pub trait LocalFile: File {
     fn load_bytes(&self, cx: &App) -> Task<Result<Vec<u8>>>;
 
     /// Loads the file contents from disk, decoding them with the given encoding.
-    fn load_with_encoding(&self, cx: &App, encoding: &'static Encoding) -> Task<Result<String>>;
+    fn load_with_encoding(
+        &self,
+        cx: &App,
+        encoding: &'static Encoding,
+        force: bool, // whether to force the encoding even if a BOM is present
+        buffer_encoding: Arc<std::sync::Mutex<&'static Encoding>>,
+    ) -> Task<Result<String>>;
 }
 
 /// The auto-indent behavior associated with an editing operation.
@@ -1011,7 +1017,7 @@ impl Buffer {
             has_conflict: false,
             change_bits: Default::default(),
             _subscriptions: Vec::new(),
-            encoding: encoding_rs::UTF_8,
+            encoding: Arc::new(std::sync::Mutex::new(encoding_rs::UTF_8)),
         }
     }
 
@@ -1345,17 +1351,21 @@ impl Buffer {
     }
 
     /// Reloads the contents of the buffer from disk.
-    pub fn reload(&mut self, cx: &Context<Self>) -> oneshot::Receiver<Option<Transaction>> {
+    pub fn reload(
+        &mut self,
+        cx: &Context<Self>,
+        force: bool, // whether to force the encoding even if a BOM is present
+    ) -> oneshot::Receiver<Option<Transaction>> {
         let (tx, rx) = futures::channel::oneshot::channel();
-        let encoding = self.encoding;
+        let encoding = self.encoding.clone();
+
         let prev_version = self.text.version();
         self.reload_task = Some(cx.spawn(async move |this, cx| {
             let Some((new_mtime, new_text)) = this.update(cx, |this, cx| {
                 let file = this.file.as_ref()?.as_local()?;
-                Some((
-                    file.disk_state().mtime(),
-                    file.load_with_encoding(cx, encoding),
-                ))
+                Some((file.disk_state().mtime(), {
+                    file.load_with_encoding(cx, &*encoding.lock().unwrap(), force, encoding.clone())
+                }))
             })?
             else {
                 return Ok(());
@@ -5237,7 +5247,13 @@ impl LocalFile for TestFile {
         unimplemented!()
     }
 
-    fn load_with_encoding(&self, _: &App, _: &'static Encoding) -> Task<Result<String>> {
+    fn load_with_encoding(
+        &self,
+        _: &App,
+        _: &'static Encoding,
+        _: bool, // whether to force the encoding even if a BOM is present
+        _: Arc<std::sync::Mutex<&'static Encoding>>,
+    ) -> Task<Result<String>> {
         unimplemented!()
     }
 }

crates/project/src/buffer_store.rs 🔗

@@ -387,7 +387,7 @@ impl LocalBufferStore {
         let version = buffer.version();
         let buffer_id = buffer.remote_id();
         let file = buffer.file().cloned();
-        let encoding = buffer.encoding;
+        let encoding = buffer.encoding.clone();
 
         if file
             .as_ref()
@@ -397,7 +397,13 @@ impl LocalBufferStore {
         }
 
         let save = worktree.update(cx, |worktree, cx| {
-            worktree.write_file(path.as_ref(), text, line_ending, cx, encoding)
+            worktree.write_file(
+                path.as_ref(),
+                text,
+                line_ending,
+                cx,
+                &*encoding.lock().unwrap(),
+            )
         });
 
         cx.spawn(async move |this, cx| {
@@ -629,22 +635,13 @@ impl LocalBufferStore {
     ) -> Task<Result<Entity<Buffer>>> {
         let load_file = worktree.update(cx, |worktree, cx| worktree.load_file(path.as_ref(), cx));
         cx.spawn(async move |this, cx| {
-            let path = path.clone();
-            let buffer = match load_file.await.with_context(|| {
-                format!("Could not open path: {}", path.display(PathStyle::local()))
-            }) {
-                Ok(loaded) => {
-                    let reservation = cx.reserve_entity::<Buffer>()?;
-                    let buffer_id = BufferId::from(reservation.entity_id().as_non_zero_u64());
-                    let executor = cx.background_executor().clone();
-                    let text_buffer = cx
-                        .background_spawn(async move {
-                            text::Buffer::new(ReplicaId::LOCAL, buffer_id, loaded.text, &executor)
-                        })
-                        .await;
-                    cx.insert_entity(reservation, |_| {
-                        Buffer::build(text_buffer, Some(loaded.file), Capability::ReadWrite)
-                    })?
+            let buffer = match load_buffer.await {
+                Ok(buffer) => {
+                    // Reload the buffer to trigger UTF-16 detection
+                    buffer
+                        .update(cx, |buffer, cx| buffer.reload(cx, false))?
+                        .await?;
+                    Ok(buffer)
                 }
                 Err(error) if is_not_found_error(&error) => cx.new(|cx| {
                     let buffer_id = BufferId::from(cx.entity_id().as_non_zero_u64());
@@ -723,7 +720,9 @@ impl LocalBufferStore {
         cx.spawn(async move |_, cx| {
             let mut project_transaction = ProjectTransaction::default();
             for buffer in buffers {
-                let transaction = buffer.update(cx, |buffer, cx| buffer.reload(cx))?.await?;
+                let transaction = buffer
+                    .update(cx, |buffer, cx| buffer.reload(cx, false))?
+                    .await?;
                 buffer.update(cx, |buffer, cx| {
                     if let Some(transaction) = transaction {
                         if !push_to_history {

crates/worktree/src/worktree.rs 🔗

@@ -3129,13 +3129,22 @@ impl language::LocalFile for File {
         cx.background_spawn(async move { fs.load_bytes(&abs_path).await })
     }
 
-    fn load_with_encoding(&self, cx: &App, encoding: &'static Encoding) -> Task<Result<String>> {
+    fn load_with_encoding(
+        &self,
+        cx: &App,
+        encoding: &'static Encoding,
+        force: bool, // whether to force the encoding even if there's a BOM
+        buffer_encoding: Arc<std::sync::Mutex<&'static Encoding>>,
+    ) -> Task<Result<String>> {
         let worktree = self.worktree.read(cx).as_local().unwrap();
         let path = worktree.absolutize(&self.path);
         let fs = worktree.fs.clone();
 
         let encoding = EncodingWrapper::new(encoding);
-        cx.background_spawn(async move { fs.load_with_encoding(path?, encoding).await })
+        cx.background_spawn(async move {
+            fs.load_with_encoding(path?, encoding, force, buffer_encoding)
+                .await
+        })
     }
 }
 

crates/zed/src/zed.rs 🔗

@@ -443,8 +443,9 @@ pub fn initialize_workspace(
             }
         });
 
-        let encoding_indicator =
-            cx.new(|_cx| encodings::EncodingIndicator::new(None, workspace.weak_handle(), None));
+        let encoding_indicator = cx.new(|_cx| {
+            encodings::EncodingIndicator::new(None, workspace.weak_handle(), None, None)
+        });
 
         let cursor_position =
             cx.new(|_| go_to_line::cursor_position::CursorPosition::new(workspace));