diff --git a/crates/encodings/src/lib.rs b/crates/encodings/src/lib.rs index cda2fdcb4cf46a83af2ff61c093a8fe4b80b01d3..c7072e7552368e4983ca9d87c53ac4c565ffc760 100644 --- a/crates/encodings/src/lib.rs +++ b/crates/encodings/src/lib.rs @@ -104,6 +104,8 @@ pub fn encoding_name(encoding: &'static Encoding) -> String { match name { "UTF-8" => "UTF-8", + "UTF-16LE" => "UTF-16 LE", + "UTF-16BE" => "UTF-16 BE", "windows-1252" => "Windows-1252", "windows-1251" => "Windows-1251", "windows-1250" => "Windows-1250", @@ -143,35 +145,37 @@ pub fn encoding_name(encoding: &'static Encoding) -> String { pub fn encoding_from_index(index: usize) -> &'static Encoding { match index { 0 => encoding_rs::UTF_8, - 1 => encoding_rs::WINDOWS_1252, - 2 => encoding_rs::WINDOWS_1251, - 3 => encoding_rs::WINDOWS_1250, - 4 => encoding_rs::ISO_8859_2, - 5 => encoding_rs::ISO_8859_3, - 6 => encoding_rs::ISO_8859_4, - 7 => encoding_rs::ISO_8859_5, - 8 => encoding_rs::ISO_8859_6, - 9 => encoding_rs::ISO_8859_7, - 10 => encoding_rs::ISO_8859_8, - 11 => encoding_rs::ISO_8859_13, - 12 => encoding_rs::ISO_8859_15, - 13 => encoding_rs::KOI8_R, - 14 => encoding_rs::KOI8_U, - 15 => encoding_rs::MACINTOSH, - 16 => encoding_rs::X_MAC_CYRILLIC, - 17 => encoding_rs::WINDOWS_874, - 18 => encoding_rs::WINDOWS_1253, - 19 => encoding_rs::WINDOWS_1254, - 20 => encoding_rs::WINDOWS_1255, - 21 => encoding_rs::WINDOWS_1256, - 22 => encoding_rs::WINDOWS_1257, - 23 => encoding_rs::WINDOWS_1258, - 24 => encoding_rs::EUC_KR, - 25 => encoding_rs::EUC_JP, - 26 => encoding_rs::ISO_2022_JP, - 27 => encoding_rs::GBK, - 28 => encoding_rs::GB18030, - 29 => encoding_rs::BIG5, + 1 => encoding_rs::UTF_16LE, + 2 => encoding_rs::UTF_16BE, + 3 => encoding_rs::WINDOWS_1252, + 4 => encoding_rs::WINDOWS_1251, + 5 => encoding_rs::WINDOWS_1250, + 6 => encoding_rs::ISO_8859_2, + 7 => encoding_rs::ISO_8859_3, + 8 => encoding_rs::ISO_8859_4, + 9 => encoding_rs::ISO_8859_5, + 10 => encoding_rs::ISO_8859_6, + 11 => encoding_rs::ISO_8859_7, + 12 => encoding_rs::ISO_8859_8, + 13 => encoding_rs::ISO_8859_13, + 14 => encoding_rs::ISO_8859_15, + 15 => encoding_rs::KOI8_R, + 16 => encoding_rs::KOI8_U, + 17 => encoding_rs::MACINTOSH, + 18 => encoding_rs::X_MAC_CYRILLIC, + 19 => encoding_rs::WINDOWS_874, + 20 => encoding_rs::WINDOWS_1253, + 21 => encoding_rs::WINDOWS_1254, + 22 => encoding_rs::WINDOWS_1255, + 23 => encoding_rs::WINDOWS_1256, + 24 => encoding_rs::WINDOWS_1257, + 25 => encoding_rs::WINDOWS_1258, + 26 => encoding_rs::EUC_KR, + 27 => encoding_rs::EUC_JP, + 28 => encoding_rs::ISO_2022_JP, + 29 => encoding_rs::GBK, + 30 => encoding_rs::GB18030, + 31 => encoding_rs::BIG5, _ => encoding_rs::UTF_8, } } @@ -180,6 +184,8 @@ pub fn encoding_from_index(index: usize) -> &'static Encoding { pub fn encoding_from_name(name: &str) -> &'static Encoding { match name { "UTF-8" => encoding_rs::UTF_8, + "UTF-16 LE" => encoding_rs::UTF_16LE, + "UTF-16 BE" => encoding_rs::UTF_16BE, "Windows-1252" => encoding_rs::WINDOWS_1252, "Windows-1251" => encoding_rs::WINDOWS_1251, "Windows-1250" => encoding_rs::WINDOWS_1250, diff --git a/crates/encodings/src/selectors.rs b/crates/encodings/src/selectors.rs index 30e5dd1cd52beda36a74c77bb3839b2d49e7a1b5..75c1045059dcd26c7e1d8054e6cf5c4874c1cbfd 100644 --- a/crates/encodings/src/selectors.rs +++ b/crates/encodings/src/selectors.rs @@ -271,10 +271,6 @@ pub mod save_or_reopen { ) } } - - pub fn get_current_encoding() -> &'static str { - "UTF-8" - } } /// This module contains the encoding selector for choosing an encoding to save or reopen a file with. @@ -319,35 +315,37 @@ pub mod encoding { current_selection: 0, encodings: vec![ StringMatchCandidate::new(0, "UTF-8"), - StringMatchCandidate::new(1, "Windows-1252"), - StringMatchCandidate::new(2, "Windows-1251"), - StringMatchCandidate::new(3, "Windows-1250"), - StringMatchCandidate::new(4, "ISO 8859-2"), - StringMatchCandidate::new(5, "ISO 8859-3"), - StringMatchCandidate::new(6, "ISO 8859-4"), - StringMatchCandidate::new(7, "ISO 8859-5"), - StringMatchCandidate::new(8, "ISO 8859-6"), - StringMatchCandidate::new(9, "ISO 8859-7"), - StringMatchCandidate::new(10, "ISO 8859-8"), - StringMatchCandidate::new(11, "ISO 8859-13"), - StringMatchCandidate::new(12, "ISO 8859-15"), - StringMatchCandidate::new(13, "KOI8-R"), - StringMatchCandidate::new(14, "KOI8-U"), - StringMatchCandidate::new(15, "MacRoman"), - StringMatchCandidate::new(16, "Mac Cyrillic"), - StringMatchCandidate::new(17, "Windows-874"), - StringMatchCandidate::new(18, "Windows-1253"), - StringMatchCandidate::new(19, "Windows-1254"), - StringMatchCandidate::new(20, "Windows-1255"), - StringMatchCandidate::new(21, "Windows-1256"), - StringMatchCandidate::new(22, "Windows-1257"), - StringMatchCandidate::new(23, "Windows-1258"), - StringMatchCandidate::new(24, "Windows-949"), - StringMatchCandidate::new(25, "EUC-JP"), - StringMatchCandidate::new(26, "ISO 2022-JP"), - StringMatchCandidate::new(27, "GBK"), - StringMatchCandidate::new(28, "GB18030"), - StringMatchCandidate::new(29, "Big5"), + StringMatchCandidate::new(1, "UTF-16 LE"), + StringMatchCandidate::new(2, "UTF-16 BE"), + StringMatchCandidate::new(3, "Windows-1252"), + StringMatchCandidate::new(4, "Windows-1251"), + StringMatchCandidate::new(5, "Windows-1250"), + StringMatchCandidate::new(6, "ISO 8859-2"), + StringMatchCandidate::new(7, "ISO 8859-3"), + StringMatchCandidate::new(8, "ISO 8859-4"), + StringMatchCandidate::new(9, "ISO 8859-5"), + StringMatchCandidate::new(10, "ISO 8859-6"), + StringMatchCandidate::new(11, "ISO 8859-7"), + StringMatchCandidate::new(12, "ISO 8859-8"), + StringMatchCandidate::new(13, "ISO 8859-13"), + StringMatchCandidate::new(14, "ISO 8859-15"), + StringMatchCandidate::new(15, "KOI8-R"), + StringMatchCandidate::new(16, "KOI8-U"), + StringMatchCandidate::new(17, "MacRoman"), + StringMatchCandidate::new(18, "Mac Cyrillic"), + StringMatchCandidate::new(19, "Windows-874"), + StringMatchCandidate::new(20, "Windows-1253"), + StringMatchCandidate::new(21, "Windows-1254"), + StringMatchCandidate::new(22, "Windows-1255"), + StringMatchCandidate::new(23, "Windows-1256"), + StringMatchCandidate::new(24, "Windows-1257"), + StringMatchCandidate::new(25, "Windows-1258"), + StringMatchCandidate::new(26, "Windows-949"), + StringMatchCandidate::new(27, "EUC-JP"), + StringMatchCandidate::new(28, "ISO 2022-JP"), + StringMatchCandidate::new(29, "GBK"), + StringMatchCandidate::new(30, "GB18030"), + StringMatchCandidate::new(31, "Big5"), ], matches: Vec::new(), selector, diff --git a/crates/fs/src/encodings.rs b/crates/fs/src/encodings.rs index d9a5a93828a0eca6d9e8dd7318d6f63b00ee2b97..0a37c947c3e1e0be339888e68d49a9af64746b01 100644 --- a/crates/fs/src/encodings.rs +++ b/crates/fs/src/encodings.rs @@ -3,7 +3,6 @@ use std::fmt::Debug; use anyhow::Result; use encoding_rs::Encoding; -use serde::{Deserialize, de::Visitor}; /// A wrapper around `encoding_rs::Encoding` to implement `Send` and `Sync`. /// Since the reference is static, it is safe to send it across threads. @@ -11,7 +10,7 @@ pub struct EncodingWrapper(&'static Encoding); impl Debug for EncodingWrapper { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_tuple("EncodingWrapper") + f.debug_tuple(&format!("EncodingWrapper{:?}", self.0)) .field(&self.0.name()) .finish() } @@ -19,37 +18,6 @@ impl Debug for EncodingWrapper { pub struct EncodingWrapperVisitor; -impl<'vi> Visitor<'vi> for EncodingWrapperVisitor { - type Value = EncodingWrapper; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - formatter.write_str("a valid encoding name") - } - - fn visit_str(self, encoding: &str) -> Result { - Ok(EncodingWrapper( - Encoding::for_label(encoding.as_bytes()) - .ok_or_else(|| serde::de::Error::custom("Invalid Encoding"))?, - )) - } - - fn visit_string(self, encoding: String) -> Result { - Ok(EncodingWrapper( - Encoding::for_label(encoding.as_bytes()) - .ok_or_else(|| serde::de::Error::custom("Invalid Encoding"))?, - )) - } -} - -impl<'de> Deserialize<'de> for EncodingWrapper { - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - deserializer.deserialize_str(EncodingWrapperVisitor) - } -} - impl PartialEq for EncodingWrapper { fn eq(&self, other: &Self) -> bool { self.0.name() == other.0.name() @@ -71,19 +39,44 @@ impl EncodingWrapper { } pub async fn decode(&self, input: Vec) -> Result { - let (cow, _encoding_used, _had_errors) = self.0.decode(&input); - // encoding_rs handles invalid bytes by replacing them with replacement characters + let (cow, _had_errors) = self.0.decode_with_bom_removal(&input); + // `encoding_rs` handles invalid bytes by replacing them with replacement characters // in the output string, so we return the result even if there were errors. - // This preserves the original behavior where files with invalid bytes could still be opened. + // This preserves the original behaviour where files with invalid bytes could still be opened. Ok(cow.into_owned()) } pub async fn encode(&self, input: String) -> Result> { - let (cow, _encoding_used, _had_errors) = self.0.encode(&input); - // encoding_rs handles unencodable characters by replacing them with - // appropriate substitutes in the output, so we return the result even if there were errors. - // This maintains consistency with the decode behavior. - Ok(cow.into_owned()) + if self.0 == encoding_rs::UTF_16BE { + let mut data: Vec = vec![]; + let utf = input.encode_utf16().collect::>(); + + for i in utf { + let byte = i.to_be_bytes(); + for b in byte { + data.push(b); + } + } + return Ok(data); + } else if self.0 == encoding_rs::UTF_16LE { + let mut data: Vec = vec![]; + let utf = input.encode_utf16().collect::>(); + + for i in utf { + let byte = i.to_le_bytes(); + for b in byte { + data.push(b); + } + } + return Ok(data); + } else { + let (cow, _encoding_used, _had_errors) = self.0.encode(&input); + println!("Encoding: {:?}", self); + // `encoding_rs` handles unencodable characters by replacing them with + // appropriate substitutes in the output, so we return the result even if there were errors. + // This maintains consistency with the decode behaviour. + Ok(cow.into_owned()) + } } } @@ -96,61 +89,3 @@ pub async fn to_utf8(input: Vec, encoding: EncodingWrapper) -> Result Result> { target.encode(input).await } - -#[cfg(test)] -mod tests { - use super::*; - use gpui::BackgroundExecutor; - - #[gpui::test] - async fn test_decode_with_invalid_bytes(_: BackgroundExecutor) { - // Test that files with invalid bytes can still be decoded - // This is a regression test for the issue where files couldn't be opened - // when they contained invalid bytes for the specified encoding - - // Create some invalid UTF-8 bytes - let invalid_bytes = vec![0xFF, 0xFE, 0x00, 0x48]; // Invalid UTF-8 sequence - - let encoding = EncodingWrapper::new(encoding_rs::UTF_8); - let result = encoding.decode(invalid_bytes).await; - - // The decode should succeed, not fail - assert!( - result.is_ok(), - "Decode should succeed even with invalid bytes" - ); - - let decoded = result.unwrap(); - // The result should contain replacement characters for invalid sequences - assert!(!decoded.is_empty(), "Decoded string should not be empty"); - - // Test with Windows-1252 and some bytes that might be invalid - let maybe_invalid_bytes = vec![0x81, 0x8D, 0x8F, 0x90, 0x9D]; // Some potentially problematic bytes - let encoding = EncodingWrapper::new(encoding_rs::WINDOWS_1252); - let result = encoding.decode(maybe_invalid_bytes).await; - - // Should still succeed - assert!( - result.is_ok(), - "Decode should succeed with Windows-1252 even with potentially invalid bytes" - ); - } - - #[gpui::test] - async fn test_encode_with_unencodable_chars(_: BackgroundExecutor) { - // Test that strings with unencodable characters can still be encoded - let input = "Hello δΈ–η•Œ 🌍".to_string(); // Contains Unicode that may not encode to all formats - - let encoding = EncodingWrapper::new(encoding_rs::WINDOWS_1252); - let result = encoding.encode(input).await; - - // The encode should succeed, not fail - assert!( - result.is_ok(), - "Encode should succeed even with unencodable characters" - ); - - let encoded = result.unwrap(); - assert!(!encoded.is_empty(), "Encoded bytes should not be empty"); - } -}