Add a new `load_with_encoding` function to handle files with various encodings.

R Aadarsh created

Modify `Buffer::reload` in `buffer.rs` to use this new function, allowing Zed
to open a file in any supported encoding by decoding its contents to UTF-8.
Bytes that are invalid in UTF-8 are replaced with the Unicode replacement
character � (U+FFFD).

Add comments and documentation.

Change summary

Cargo.lock                        |  2 +
crates/copilot/Cargo.toml         |  2 +
crates/copilot/src/copilot.rs     |  5 +++
crates/encodings/src/lib.rs       |  6 +++
crates/encodings/src/selectors.rs |  7 ++++
crates/fs/src/encodings.rs        | 47 +++++++++++++++++++++++---------
crates/fs/src/fs.rs               | 21 ++++++++++++--
crates/language/src/buffer.rs     | 22 +++++++++++++--
crates/worktree/Cargo.toml        |  2 +
crates/worktree/src/worktree.rs   | 19 ++++++++++++
10 files changed, 111 insertions(+), 22 deletions(-)

Detailed changes

Cargo.lock 🔗

@@ -3720,6 +3720,7 @@ dependencies = [
  "dirs 4.0.0",
  "edit_prediction",
  "editor",
+ "encoding",
  "fs",
  "futures 0.3.31",
  "gpui",
@@ -20848,6 +20849,7 @@ dependencies = [
  "async-lock 2.8.0",
  "clock",
  "collections",
+ "encoding",
  "fs",
  "futures 0.3.31",
  "fuzzy",

crates/copilot/Cargo.toml 🔗

@@ -53,6 +53,8 @@ ui.workspace = true
 util.workspace = true
 workspace.workspace = true
 itertools.workspace = true
+encoding = "0.2.33"
+
 
 [target.'cfg(windows)'.dependencies]
 async-std = { version = "1.12.0", features = ["unstable"] }

crates/copilot/src/copilot.rs 🔗

@@ -1241,6 +1241,7 @@ async fn get_copilot_lsp(fs: Arc<dyn Fs>, node_runtime: NodeRuntime) -> anyhow::
 #[cfg(test)]
 mod tests {
     use super::*;
+    use encoding::Encoding;
     use gpui::TestAppContext;
     use util::{path, paths::PathStyle, rel_path::rel_path};
 
@@ -1458,6 +1459,10 @@ mod tests {
         fn load_bytes(&self, _cx: &App) -> Task<Result<Vec<u8>>> {
             unimplemented!()
         }
+
+        fn load_with_encoding(&self, _: &App, _: &'static dyn Encoding) -> Task<Result<String>> {
+            unimplemented!()
+        }
     }
 }
 

crates/encodings/src/lib.rs 🔗

@@ -14,10 +14,11 @@ use workspace::{ItemHandle, StatusItemView, Workspace};
 
 use crate::selectors::save_or_reopen::{EncodingSaveOrReopenSelector, get_current_encoding};
 
+/// A status bar item that shows the current file encoding and allows changing it.
 pub struct EncodingIndicator {
     pub encoding: Option<&'static dyn Encoding>,
     pub workspace: WeakEntity<Workspace>,
-    observe: Option<Subscription>,
+    observe: Option<Subscription>, // Subscription to observe changes in the active editor
 }
 
 pub mod selectors;
@@ -93,6 +94,7 @@ impl StatusItemView for EncodingIndicator {
     }
 }
 
+/// Get a human-readable name for the given encoding.
 pub fn encoding_name(encoding: &'static dyn Encoding) -> String {
     let name = encoding.name();
 
@@ -140,6 +142,8 @@ pub fn encoding_name(encoding: &'static dyn Encoding) -> String {
     .to_string()
 }
 
+/// Get an encoding from its index in the predefined list.
+/// If the index is out of range, UTF-8 is returned as a default.
 pub fn encoding_from_index(index: usize) -> &'static dyn Encoding {
     match index {
         0 => UTF_8,

crates/encodings/src/selectors.rs 🔗

@@ -19,6 +19,8 @@ pub mod save_or_reopen {
 
     use crate::selectors::encoding::{Action, EncodingSelector, EncodingSelectorDelegate};
 
+    /// A modal view that allows the user to select between saving with a different encoding or
+    /// reopening with a different encoding.
     pub struct EncodingSaveOrReopenSelector {
         picker: Entity<Picker<EncodingSaveOrReopenDelegate>>,
         pub current_selection: usize,
@@ -43,6 +45,8 @@ pub mod save_or_reopen {
             }
         }
 
+        /// Toggle the modal view for selecting between saving with a different encoding or
+        /// reopening with a different encoding.
         pub fn toggle(workspace: &mut Workspace, window: &mut Window, cx: &mut Context<Workspace>) {
             let weak_workspace = workspace.weak_handle();
             workspace.toggle_modal(window, cx, |window, cx| {
@@ -100,6 +104,7 @@ pub mod save_or_reopen {
             (&self.actions[0].string, &self.actions[1].string)
         }
 
+        /// Handle the action selected by the user.
         pub fn post_selection(
             &self,
             cx: &mut Context<Picker<EncodingSaveOrReopenDelegate>>,
@@ -281,6 +286,7 @@ pub mod encoding {
 
     use crate::encoding_from_index;
 
+    /// A modal view that allows the user to select an encoding from a list of encodings.
     pub struct EncodingSelector {
         picker: Entity<Picker<EncodingSelectorDelegate>>,
         action: Action,
@@ -459,6 +465,7 @@ pub mod encoding {
         }
     }
 
+    /// The action to perform after selecting an encoding.
     pub enum Action {
         Save,
         Reopen,

crates/fs/src/encodings.rs 🔗

@@ -1,21 +1,40 @@
+use anyhow::{Error, Result};
+
 use encoding::Encoding;
 
-pub enum CharacterEncoding {
-    Utf8,
-    Iso8859_1,
-    Cp865,
-}
+/// A wrapper around `encoding::Encoding` to implement `Send` and `Sync`.
+/// Since the reference is static, it is safe to send it across threads.
+pub struct EncodingWrapper(&'static dyn Encoding);
+
+unsafe impl Send for EncodingWrapper {}
+unsafe impl Sync for EncodingWrapper {}
 
-pub fn to_utf8<'a>(input: Vec<u8>, encoding: &'a impl encoding::Encoding) -> String {
-    match encoding.decode(&input, encoding::DecoderTrap::Strict) {
-        Ok(v) => return v,
-        Err(_) => panic!(),
+impl EncodingWrapper {
+    pub fn new(encoding: &'static dyn Encoding) -> EncodingWrapper {
+        EncodingWrapper(encoding)
+    }
+
+    pub async fn decode(&self, input: Vec<u8>) -> Result<String> {
+        match self.0.decode(&input, encoding::DecoderTrap::Replace) {
+            Ok(v) => Ok(v),
+            Err(e) => Err(Error::msg(e.to_string())),
+        }
     }
-}
 
-pub fn to<'a>(input: String, target: &'a impl encoding::Encoding) -> Vec<u8> {
-    match target.encode(&input, encoding::EncoderTrap::Strict) {
-        Ok(v) => v,
-        Err(_) => panic!(),
+    pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
+        match self.0.encode(&input, encoding::EncoderTrap::Replace) {
+            Ok(v) => Ok(v),
+            Err(e) => Err(Error::msg(e.to_string())),
+        }
     }
 }
+
+/// Convert a byte vector from a specified encoding to a UTF-8 string.
+pub async fn to_utf8<'a>(input: Vec<u8>, encoding: EncodingWrapper) -> Result<String> {
+    Ok(encoding.decode(input).await?)
+}
+
+/// Convert a UTF-8 string to a byte vector in a specified encoding.
+pub async fn from_utf8<'a>(input: String, target: EncodingWrapper) -> Result<Vec<u8>> {
+    Ok(target.encode(input).await?)
+}

crates/fs/src/fs.rs 🔗

@@ -1,6 +1,7 @@
 #[cfg(target_os = "macos")]
 mod mac_watcher;
 
+pub mod encodings;
 #[cfg(not(target_os = "macos"))]
 pub mod fs_watcher;
 
@@ -60,6 +61,7 @@ use std::ffi::OsStr;
 
 #[cfg(any(test, feature = "test-support"))]
 pub use fake_git_repo::{LOAD_HEAD_TEXT_TASK, LOAD_INDEX_TEXT_TASK};
+use crate::encodings::EncodingWrapper;
 
 pub trait Watcher: Send + Sync {
     fn add(&self, path: &Path) -> Result<()>;
@@ -115,6 +117,16 @@ pub trait Fs: Send + Sync {
     async fn load(&self, path: &Path) -> Result<String> {
         Ok(String::from_utf8(self.load_bytes(path).await?)?)
     }
+
+    /// Load a file with the specified encoding, returning a UTF-8 string.
+    async fn load_with_encoding(
+        &self,
+        path: PathBuf,
+        encoding: EncodingWrapper,
+    ) -> anyhow::Result<String> {
+        Ok(encodings::to_utf8(self.load_bytes(path.as_path()).await?, encoding).await?)
+    }
+
     async fn load_bytes(&self, path: &Path) -> Result<Vec<u8>>;
     async fn atomic_write(&self, path: PathBuf, text: String) -> Result<()>;
     async fn save(&self, path: &Path, text: &Rope, line_ending: LineEnding) -> Result<()>;
@@ -599,9 +611,12 @@ impl Fs for RealFs {
 
     async fn load(&self, path: &Path) -> Result<String> {
         let path = path.to_path_buf();
-        self.executor
-            .spawn(async move { Ok(std::fs::read_to_string(path)?) })
-            .await
+        let encoding = EncodingWrapper::new(encoding::all::UTF_8);
+        let text =
+            smol::unblock(async || Ok(encodings::to_utf8(std::fs::read(path)?, encoding).await?))
+                .await
+                .await;
+        text
     }
 
     async fn load_bytes(&self, path: &Path) -> Result<Vec<u8>> {

crates/language/src/buffer.rs 🔗

@@ -21,7 +21,8 @@ use anyhow::{Context as _, Result};
 use clock::Lamport;
 pub use clock::ReplicaId;
 use collections::HashMap;
-use fs::MTime;
+use encoding::Encoding;
+use fs::{Fs, MTime, RealFs};
 use futures::channel::oneshot;
 use gpui::{
     App, AppContext as _, BackgroundExecutor, Context, Entity, EventEmitter, HighlightStyle,
@@ -417,6 +418,10 @@ pub trait LocalFile: File {
 
     /// Loads the file's contents from disk.
     fn load_bytes(&self, cx: &App) -> Task<Result<Vec<u8>>>;
+
+    /// Loads the file contents from disk, decoding them with the given encoding.
+    fn load_with_encoding(&self, cx: &App, encoding: &'static dyn Encoding)
+    -> Task<Result<String>>;
 }
 
 /// The auto-indent behavior associated with an editing operation.
@@ -1343,12 +1348,15 @@ impl Buffer {
     /// Reloads the contents of the buffer from disk.
     pub fn reload(&mut self, cx: &Context<Self>) -> oneshot::Receiver<Option<Transaction>> {
         let (tx, rx) = futures::channel::oneshot::channel();
+        let encoding = self.encoding.clone();
         let prev_version = self.text.version();
         self.reload_task = Some(cx.spawn(async move |this, cx| {
             let Some((new_mtime, new_text)) = this.update(cx, |this, cx| {
                 let file = this.file.as_ref()?.as_local()?;
-
-                Some((file.disk_state().mtime(), file.load(cx)))
+                Some((
+                    file.disk_state().mtime(),
+                    file.load_with_encoding(cx, encoding),
+                ))
             })?
             else {
                 return Ok(());
@@ -5229,6 +5237,14 @@ impl LocalFile for TestFile {
     fn load_bytes(&self, _cx: &App) -> Task<Result<Vec<u8>>> {
         unimplemented!()
     }
+
+    fn load_with_encoding(
+        &self,
+        cx: &App,
+        encoding: &'static dyn Encoding,
+    ) -> Task<Result<String>> {
+        unimplemented!()
+    }
 }
 
 pub(crate) fn contiguous_ranges(

crates/worktree/Cargo.toml 🔗

@@ -47,6 +47,8 @@ smol.workspace = true
 sum_tree.workspace = true
 text.workspace = true
 util.workspace = true
+encoding = "0.2.33"
+
 
 [dev-dependencies]
 clock = { workspace = true, features = ["test-support"] }

crates/worktree/src/worktree.rs 🔗

@@ -7,7 +7,11 @@ use ::ignore::gitignore::{Gitignore, GitignoreBuilder};
 use anyhow::{Context as _, Result, anyhow};
 use clock::ReplicaId;
 use collections::{HashMap, HashSet, VecDeque};
-use fs::{Fs, MTime, PathEvent, RemoveOptions, Watcher, copy_recursive, read_dir_items};
+use encoding::Encoding;
+use fs::{
+    Fs, MTime, PathEvent, RemoveOptions, Watcher, copy_recursive, encodings::EncodingWrapper,
+    read_dir_items,
+};
 use futures::{
     FutureExt as _, Stream, StreamExt,
     channel::{
@@ -3117,6 +3121,19 @@ impl language::LocalFile for File {
         let fs = worktree.fs.clone();
         cx.background_spawn(async move { fs.load_bytes(&abs_path).await })
     }
+
+    fn load_with_encoding(
+        &self,
+        cx: &App,
+        encoding: &'static dyn Encoding,
+    ) -> Task<Result<String>> {
+        let worktree = self.worktree.read(cx).as_local().unwrap();
+        let path = worktree.absolutize(&self.path);
+        let fs = worktree.fs.clone();
+
+        let encoding = EncodingWrapper::new(encoding);
+        cx.background_spawn(async move { fs.load_with_encoding(path?, encoding).await })
+    }
 }
 
 impl File {