Add UTF-16 LE/BE support and remove unused serde code

R Aadarsh created

- Add UTF-16 LE and UTF-16 BE to encoding lists and mappings

- Implement encode/decode for UTF-16LE/BE in `EncodingWrapper`

- Remove unused serde deserialisation code and tests

Change summary

crates/encodings/src/lib.rs       |  64 ++++++++-------
crates/encodings/src/selectors.rs |  64 +++++++--------
crates/fs/src/encodings.rs        | 133 ++++++++------------------------
3 files changed, 100 insertions(+), 161 deletions(-)

Detailed changes

crates/encodings/src/lib.rs 🔗

@@ -104,6 +104,8 @@ pub fn encoding_name(encoding: &'static Encoding) -> String {
 
     match name {
         "UTF-8" => "UTF-8",
+        "UTF-16LE" => "UTF-16 LE",
+        "UTF-16BE" => "UTF-16 BE",
         "windows-1252" => "Windows-1252",
         "windows-1251" => "Windows-1251",
         "windows-1250" => "Windows-1250",
@@ -143,35 +145,37 @@ pub fn encoding_name(encoding: &'static Encoding) -> String {
 pub fn encoding_from_index(index: usize) -> &'static Encoding {
     match index {
         0 => encoding_rs::UTF_8,
-        1 => encoding_rs::WINDOWS_1252,
-        2 => encoding_rs::WINDOWS_1251,
-        3 => encoding_rs::WINDOWS_1250,
-        4 => encoding_rs::ISO_8859_2,
-        5 => encoding_rs::ISO_8859_3,
-        6 => encoding_rs::ISO_8859_4,
-        7 => encoding_rs::ISO_8859_5,
-        8 => encoding_rs::ISO_8859_6,
-        9 => encoding_rs::ISO_8859_7,
-        10 => encoding_rs::ISO_8859_8,
-        11 => encoding_rs::ISO_8859_13,
-        12 => encoding_rs::ISO_8859_15,
-        13 => encoding_rs::KOI8_R,
-        14 => encoding_rs::KOI8_U,
-        15 => encoding_rs::MACINTOSH,
-        16 => encoding_rs::X_MAC_CYRILLIC,
-        17 => encoding_rs::WINDOWS_874,
-        18 => encoding_rs::WINDOWS_1253,
-        19 => encoding_rs::WINDOWS_1254,
-        20 => encoding_rs::WINDOWS_1255,
-        21 => encoding_rs::WINDOWS_1256,
-        22 => encoding_rs::WINDOWS_1257,
-        23 => encoding_rs::WINDOWS_1258,
-        24 => encoding_rs::EUC_KR,
-        25 => encoding_rs::EUC_JP,
-        26 => encoding_rs::ISO_2022_JP,
-        27 => encoding_rs::GBK,
-        28 => encoding_rs::GB18030,
-        29 => encoding_rs::BIG5,
+        1 => encoding_rs::UTF_16LE,
+        2 => encoding_rs::UTF_16BE,
+        3 => encoding_rs::WINDOWS_1252,
+        4 => encoding_rs::WINDOWS_1251,
+        5 => encoding_rs::WINDOWS_1250,
+        6 => encoding_rs::ISO_8859_2,
+        7 => encoding_rs::ISO_8859_3,
+        8 => encoding_rs::ISO_8859_4,
+        9 => encoding_rs::ISO_8859_5,
+        10 => encoding_rs::ISO_8859_6,
+        11 => encoding_rs::ISO_8859_7,
+        12 => encoding_rs::ISO_8859_8,
+        13 => encoding_rs::ISO_8859_13,
+        14 => encoding_rs::ISO_8859_15,
+        15 => encoding_rs::KOI8_R,
+        16 => encoding_rs::KOI8_U,
+        17 => encoding_rs::MACINTOSH,
+        18 => encoding_rs::X_MAC_CYRILLIC,
+        19 => encoding_rs::WINDOWS_874,
+        20 => encoding_rs::WINDOWS_1253,
+        21 => encoding_rs::WINDOWS_1254,
+        22 => encoding_rs::WINDOWS_1255,
+        23 => encoding_rs::WINDOWS_1256,
+        24 => encoding_rs::WINDOWS_1257,
+        25 => encoding_rs::WINDOWS_1258,
+        26 => encoding_rs::EUC_KR,
+        27 => encoding_rs::EUC_JP,
+        28 => encoding_rs::ISO_2022_JP,
+        29 => encoding_rs::GBK,
+        30 => encoding_rs::GB18030,
+        31 => encoding_rs::BIG5,
         _ => encoding_rs::UTF_8,
     }
 }
@@ -180,6 +184,8 @@ pub fn encoding_from_index(index: usize) -> &'static Encoding {
 pub fn encoding_from_name(name: &str) -> &'static Encoding {
     match name {
         "UTF-8" => encoding_rs::UTF_8,
+        "UTF-16 LE" => encoding_rs::UTF_16LE,
+        "UTF-16 BE" => encoding_rs::UTF_16BE,
         "Windows-1252" => encoding_rs::WINDOWS_1252,
         "Windows-1251" => encoding_rs::WINDOWS_1251,
         "Windows-1250" => encoding_rs::WINDOWS_1250,

crates/encodings/src/selectors.rs 🔗

@@ -271,10 +271,6 @@ pub mod save_or_reopen {
             )
         }
     }
-
-    pub fn get_current_encoding() -> &'static str {
-        "UTF-8"
-    }
 }
 
 /// This module contains the encoding selector for choosing an encoding to save or reopen a file with.
@@ -319,35 +315,37 @@ pub mod encoding {
                 current_selection: 0,
                 encodings: vec![
                     StringMatchCandidate::new(0, "UTF-8"),
-                    StringMatchCandidate::new(1, "Windows-1252"),
-                    StringMatchCandidate::new(2, "Windows-1251"),
-                    StringMatchCandidate::new(3, "Windows-1250"),
-                    StringMatchCandidate::new(4, "ISO 8859-2"),
-                    StringMatchCandidate::new(5, "ISO 8859-3"),
-                    StringMatchCandidate::new(6, "ISO 8859-4"),
-                    StringMatchCandidate::new(7, "ISO 8859-5"),
-                    StringMatchCandidate::new(8, "ISO 8859-6"),
-                    StringMatchCandidate::new(9, "ISO 8859-7"),
-                    StringMatchCandidate::new(10, "ISO 8859-8"),
-                    StringMatchCandidate::new(11, "ISO 8859-13"),
-                    StringMatchCandidate::new(12, "ISO 8859-15"),
-                    StringMatchCandidate::new(13, "KOI8-R"),
-                    StringMatchCandidate::new(14, "KOI8-U"),
-                    StringMatchCandidate::new(15, "MacRoman"),
-                    StringMatchCandidate::new(16, "Mac Cyrillic"),
-                    StringMatchCandidate::new(17, "Windows-874"),
-                    StringMatchCandidate::new(18, "Windows-1253"),
-                    StringMatchCandidate::new(19, "Windows-1254"),
-                    StringMatchCandidate::new(20, "Windows-1255"),
-                    StringMatchCandidate::new(21, "Windows-1256"),
-                    StringMatchCandidate::new(22, "Windows-1257"),
-                    StringMatchCandidate::new(23, "Windows-1258"),
-                    StringMatchCandidate::new(24, "Windows-949"),
-                    StringMatchCandidate::new(25, "EUC-JP"),
-                    StringMatchCandidate::new(26, "ISO 2022-JP"),
-                    StringMatchCandidate::new(27, "GBK"),
-                    StringMatchCandidate::new(28, "GB18030"),
-                    StringMatchCandidate::new(29, "Big5"),
+                    StringMatchCandidate::new(1, "UTF-16 LE"),
+                    StringMatchCandidate::new(2, "UTF-16 BE"),
+                    StringMatchCandidate::new(3, "Windows-1252"),
+                    StringMatchCandidate::new(4, "Windows-1251"),
+                    StringMatchCandidate::new(5, "Windows-1250"),
+                    StringMatchCandidate::new(6, "ISO 8859-2"),
+                    StringMatchCandidate::new(7, "ISO 8859-3"),
+                    StringMatchCandidate::new(8, "ISO 8859-4"),
+                    StringMatchCandidate::new(9, "ISO 8859-5"),
+                    StringMatchCandidate::new(10, "ISO 8859-6"),
+                    StringMatchCandidate::new(11, "ISO 8859-7"),
+                    StringMatchCandidate::new(12, "ISO 8859-8"),
+                    StringMatchCandidate::new(13, "ISO 8859-13"),
+                    StringMatchCandidate::new(14, "ISO 8859-15"),
+                    StringMatchCandidate::new(15, "KOI8-R"),
+                    StringMatchCandidate::new(16, "KOI8-U"),
+                    StringMatchCandidate::new(17, "MacRoman"),
+                    StringMatchCandidate::new(18, "Mac Cyrillic"),
+                    StringMatchCandidate::new(19, "Windows-874"),
+                    StringMatchCandidate::new(20, "Windows-1253"),
+                    StringMatchCandidate::new(21, "Windows-1254"),
+                    StringMatchCandidate::new(22, "Windows-1255"),
+                    StringMatchCandidate::new(23, "Windows-1256"),
+                    StringMatchCandidate::new(24, "Windows-1257"),
+                    StringMatchCandidate::new(25, "Windows-1258"),
+                    StringMatchCandidate::new(26, "Windows-949"),
+                    StringMatchCandidate::new(27, "EUC-JP"),
+                    StringMatchCandidate::new(28, "ISO 2022-JP"),
+                    StringMatchCandidate::new(29, "GBK"),
+                    StringMatchCandidate::new(30, "GB18030"),
+                    StringMatchCandidate::new(31, "Big5"),
                 ],
                 matches: Vec::new(),
                 selector,

crates/fs/src/encodings.rs 🔗

@@ -3,7 +3,6 @@ use std::fmt::Debug;
 
 use anyhow::Result;
 use encoding_rs::Encoding;
-use serde::{Deserialize, de::Visitor};
 
 /// A wrapper around `encoding_rs::Encoding` to implement `Send` and `Sync`.
 /// Since the reference is static, it is safe to send it across threads.
@@ -11,7 +10,7 @@ pub struct EncodingWrapper(&'static Encoding);
 
 impl Debug for EncodingWrapper {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_tuple("EncodingWrapper")
+        f.debug_tuple(&format!("EncodingWrapper{:?}", self.0))
             .field(&self.0.name())
             .finish()
     }
@@ -19,37 +18,6 @@ impl Debug for EncodingWrapper {
 
 pub struct EncodingWrapperVisitor;
 
-impl<'vi> Visitor<'vi> for EncodingWrapperVisitor {
-    type Value = EncodingWrapper;
-
-    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-        formatter.write_str("a valid encoding name")
-    }
-
-    fn visit_str<E: serde::de::Error>(self, encoding: &str) -> Result<EncodingWrapper, E> {
-        Ok(EncodingWrapper(
-            Encoding::for_label(encoding.as_bytes())
-                .ok_or_else(|| serde::de::Error::custom("Invalid Encoding"))?,
-        ))
-    }
-
-    fn visit_string<E: serde::de::Error>(self, encoding: String) -> Result<EncodingWrapper, E> {
-        Ok(EncodingWrapper(
-            Encoding::for_label(encoding.as_bytes())
-                .ok_or_else(|| serde::de::Error::custom("Invalid Encoding"))?,
-        ))
-    }
-}
-
-impl<'de> Deserialize<'de> for EncodingWrapper {
-    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        deserializer.deserialize_str(EncodingWrapperVisitor)
-    }
-}
-
 impl PartialEq for EncodingWrapper {
     fn eq(&self, other: &Self) -> bool {
         self.0.name() == other.0.name()
@@ -71,19 +39,44 @@ impl EncodingWrapper {
     }
 
     pub async fn decode(&self, input: Vec<u8>) -> Result<String> {
-        let (cow, _encoding_used, _had_errors) = self.0.decode(&input);
-        // encoding_rs handles invalid bytes by replacing them with replacement characters
+        let (cow, _had_errors) = self.0.decode_with_bom_removal(&input);
+        // `encoding_rs` handles invalid bytes by replacing them with replacement characters
         // in the output string, so we return the result even if there were errors.
-        // This preserves the original behavior where files with invalid bytes could still be opened.
+        // This preserves the original behaviour where files with invalid bytes could still be opened.
         Ok(cow.into_owned())
     }
 
     pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
-        let (cow, _encoding_used, _had_errors) = self.0.encode(&input);
-        // encoding_rs handles unencodable characters by replacing them with
-        // appropriate substitutes in the output, so we return the result even if there were errors.
-        // This maintains consistency with the decode behavior.
-        Ok(cow.into_owned())
+        if self.0 == encoding_rs::UTF_16BE {
+            let mut data: Vec<u8> = vec![];
+            let utf = input.encode_utf16().collect::<Vec<u16>>();
+
+            for i in utf {
+                let byte = i.to_be_bytes();
+                for b in byte {
+                    data.push(b);
+                }
+            }
+            return Ok(data);
+        } else if self.0 == encoding_rs::UTF_16LE {
+            let mut data: Vec<u8> = vec![];
+            let utf = input.encode_utf16().collect::<Vec<u16>>();
+
+            for i in utf {
+                let byte = i.to_le_bytes();
+                for b in byte {
+                    data.push(b);
+                }
+            }
+            return Ok(data);
+        } else {
+            let (cow, _encoding_used, _had_errors) = self.0.encode(&input);
+            println!("Encoding: {:?}", self);
+            // `encoding_rs` handles unencodable characters by replacing them with
+            // appropriate substitutes in the output, so we return the result even if there were errors.
+            // This maintains consistency with the decode behaviour.
+            Ok(cow.into_owned())
+        }
     }
 }
 
@@ -96,61 +89,3 @@ pub async fn to_utf8(input: Vec<u8>, encoding: EncodingWrapper) -> Result<String
 pub async fn from_utf8(input: String, target: EncodingWrapper) -> Result<Vec<u8>> {
     target.encode(input).await
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use gpui::BackgroundExecutor;
-
-    #[gpui::test]
-    async fn test_decode_with_invalid_bytes(_: BackgroundExecutor) {
-        // Test that files with invalid bytes can still be decoded
-        // This is a regression test for the issue where files couldn't be opened
-        // when they contained invalid bytes for the specified encoding
-
-        // Create some invalid UTF-8 bytes
-        let invalid_bytes = vec![0xFF, 0xFE, 0x00, 0x48]; // Invalid UTF-8 sequence
-
-        let encoding = EncodingWrapper::new(encoding_rs::UTF_8);
-        let result = encoding.decode(invalid_bytes).await;
-
-        // The decode should succeed, not fail
-        assert!(
-            result.is_ok(),
-            "Decode should succeed even with invalid bytes"
-        );
-
-        let decoded = result.unwrap();
-        // The result should contain replacement characters for invalid sequences
-        assert!(!decoded.is_empty(), "Decoded string should not be empty");
-
-        // Test with Windows-1252 and some bytes that might be invalid
-        let maybe_invalid_bytes = vec![0x81, 0x8D, 0x8F, 0x90, 0x9D]; // Some potentially problematic bytes
-        let encoding = EncodingWrapper::new(encoding_rs::WINDOWS_1252);
-        let result = encoding.decode(maybe_invalid_bytes).await;
-
-        // Should still succeed
-        assert!(
-            result.is_ok(),
-            "Decode should succeed with Windows-1252 even with potentially invalid bytes"
-        );
-    }
-
-    #[gpui::test]
-    async fn test_encode_with_unencodable_chars(_: BackgroundExecutor) {
-        // Test that strings with unencodable characters can still be encoded
-        let input = "Hello 世界 🌍".to_string(); // Contains Unicode that may not encode to all formats
-
-        let encoding = EncodingWrapper::new(encoding_rs::WINDOWS_1252);
-        let result = encoding.encode(input).await;
-
-        // The encode should succeed, not fail
-        assert!(
-            result.is_ok(),
-            "Encode should succeed even with unencodable characters"
-        );
-
-        let encoded = result.unwrap();
-        assert!(!encoded.is_empty(), "Encoded bytes should not be empty");
-    }
-}