@@ -104,6 +104,8 @@ pub fn encoding_name(encoding: &'static Encoding) -> String {
match name {
"UTF-8" => "UTF-8",
+ "UTF-16LE" => "UTF-16 LE",
+ "UTF-16BE" => "UTF-16 BE",
"windows-1252" => "Windows-1252",
"windows-1251" => "Windows-1251",
"windows-1250" => "Windows-1250",
@@ -143,35 +145,37 @@ pub fn encoding_name(encoding: &'static Encoding) -> String {
pub fn encoding_from_index(index: usize) -> &'static Encoding {
match index {
0 => encoding_rs::UTF_8,
- 1 => encoding_rs::WINDOWS_1252,
- 2 => encoding_rs::WINDOWS_1251,
- 3 => encoding_rs::WINDOWS_1250,
- 4 => encoding_rs::ISO_8859_2,
- 5 => encoding_rs::ISO_8859_3,
- 6 => encoding_rs::ISO_8859_4,
- 7 => encoding_rs::ISO_8859_5,
- 8 => encoding_rs::ISO_8859_6,
- 9 => encoding_rs::ISO_8859_7,
- 10 => encoding_rs::ISO_8859_8,
- 11 => encoding_rs::ISO_8859_13,
- 12 => encoding_rs::ISO_8859_15,
- 13 => encoding_rs::KOI8_R,
- 14 => encoding_rs::KOI8_U,
- 15 => encoding_rs::MACINTOSH,
- 16 => encoding_rs::X_MAC_CYRILLIC,
- 17 => encoding_rs::WINDOWS_874,
- 18 => encoding_rs::WINDOWS_1253,
- 19 => encoding_rs::WINDOWS_1254,
- 20 => encoding_rs::WINDOWS_1255,
- 21 => encoding_rs::WINDOWS_1256,
- 22 => encoding_rs::WINDOWS_1257,
- 23 => encoding_rs::WINDOWS_1258,
- 24 => encoding_rs::EUC_KR,
- 25 => encoding_rs::EUC_JP,
- 26 => encoding_rs::ISO_2022_JP,
- 27 => encoding_rs::GBK,
- 28 => encoding_rs::GB18030,
- 29 => encoding_rs::BIG5,
+ 1 => encoding_rs::UTF_16LE,
+ 2 => encoding_rs::UTF_16BE,
+ 3 => encoding_rs::WINDOWS_1252,
+ 4 => encoding_rs::WINDOWS_1251,
+ 5 => encoding_rs::WINDOWS_1250,
+ 6 => encoding_rs::ISO_8859_2,
+ 7 => encoding_rs::ISO_8859_3,
+ 8 => encoding_rs::ISO_8859_4,
+ 9 => encoding_rs::ISO_8859_5,
+ 10 => encoding_rs::ISO_8859_6,
+ 11 => encoding_rs::ISO_8859_7,
+ 12 => encoding_rs::ISO_8859_8,
+ 13 => encoding_rs::ISO_8859_13,
+ 14 => encoding_rs::ISO_8859_15,
+ 15 => encoding_rs::KOI8_R,
+ 16 => encoding_rs::KOI8_U,
+ 17 => encoding_rs::MACINTOSH,
+ 18 => encoding_rs::X_MAC_CYRILLIC,
+ 19 => encoding_rs::WINDOWS_874,
+ 20 => encoding_rs::WINDOWS_1253,
+ 21 => encoding_rs::WINDOWS_1254,
+ 22 => encoding_rs::WINDOWS_1255,
+ 23 => encoding_rs::WINDOWS_1256,
+ 24 => encoding_rs::WINDOWS_1257,
+ 25 => encoding_rs::WINDOWS_1258,
+ 26 => encoding_rs::EUC_KR,
+ 27 => encoding_rs::EUC_JP,
+ 28 => encoding_rs::ISO_2022_JP,
+ 29 => encoding_rs::GBK,
+ 30 => encoding_rs::GB18030,
+ 31 => encoding_rs::BIG5,
_ => encoding_rs::UTF_8,
}
}
@@ -180,6 +184,8 @@ pub fn encoding_from_index(index: usize) -> &'static Encoding {
pub fn encoding_from_name(name: &str) -> &'static Encoding {
match name {
"UTF-8" => encoding_rs::UTF_8,
+ "UTF-16 LE" => encoding_rs::UTF_16LE,
+ "UTF-16 BE" => encoding_rs::UTF_16BE,
"Windows-1252" => encoding_rs::WINDOWS_1252,
"Windows-1251" => encoding_rs::WINDOWS_1251,
"Windows-1250" => encoding_rs::WINDOWS_1250,
@@ -271,10 +271,6 @@ pub mod save_or_reopen {
)
}
}
-
- pub fn get_current_encoding() -> &'static str {
- "UTF-8"
- }
}
/// This module contains the encoding selector for choosing an encoding to save or reopen a file with.
@@ -319,35 +315,37 @@ pub mod encoding {
current_selection: 0,
encodings: vec![
StringMatchCandidate::new(0, "UTF-8"),
- StringMatchCandidate::new(1, "Windows-1252"),
- StringMatchCandidate::new(2, "Windows-1251"),
- StringMatchCandidate::new(3, "Windows-1250"),
- StringMatchCandidate::new(4, "ISO 8859-2"),
- StringMatchCandidate::new(5, "ISO 8859-3"),
- StringMatchCandidate::new(6, "ISO 8859-4"),
- StringMatchCandidate::new(7, "ISO 8859-5"),
- StringMatchCandidate::new(8, "ISO 8859-6"),
- StringMatchCandidate::new(9, "ISO 8859-7"),
- StringMatchCandidate::new(10, "ISO 8859-8"),
- StringMatchCandidate::new(11, "ISO 8859-13"),
- StringMatchCandidate::new(12, "ISO 8859-15"),
- StringMatchCandidate::new(13, "KOI8-R"),
- StringMatchCandidate::new(14, "KOI8-U"),
- StringMatchCandidate::new(15, "MacRoman"),
- StringMatchCandidate::new(16, "Mac Cyrillic"),
- StringMatchCandidate::new(17, "Windows-874"),
- StringMatchCandidate::new(18, "Windows-1253"),
- StringMatchCandidate::new(19, "Windows-1254"),
- StringMatchCandidate::new(20, "Windows-1255"),
- StringMatchCandidate::new(21, "Windows-1256"),
- StringMatchCandidate::new(22, "Windows-1257"),
- StringMatchCandidate::new(23, "Windows-1258"),
- StringMatchCandidate::new(24, "Windows-949"),
- StringMatchCandidate::new(25, "EUC-JP"),
- StringMatchCandidate::new(26, "ISO 2022-JP"),
- StringMatchCandidate::new(27, "GBK"),
- StringMatchCandidate::new(28, "GB18030"),
- StringMatchCandidate::new(29, "Big5"),
+ StringMatchCandidate::new(1, "UTF-16 LE"),
+ StringMatchCandidate::new(2, "UTF-16 BE"),
+ StringMatchCandidate::new(3, "Windows-1252"),
+ StringMatchCandidate::new(4, "Windows-1251"),
+ StringMatchCandidate::new(5, "Windows-1250"),
+ StringMatchCandidate::new(6, "ISO 8859-2"),
+ StringMatchCandidate::new(7, "ISO 8859-3"),
+ StringMatchCandidate::new(8, "ISO 8859-4"),
+ StringMatchCandidate::new(9, "ISO 8859-5"),
+ StringMatchCandidate::new(10, "ISO 8859-6"),
+ StringMatchCandidate::new(11, "ISO 8859-7"),
+ StringMatchCandidate::new(12, "ISO 8859-8"),
+ StringMatchCandidate::new(13, "ISO 8859-13"),
+ StringMatchCandidate::new(14, "ISO 8859-15"),
+ StringMatchCandidate::new(15, "KOI8-R"),
+ StringMatchCandidate::new(16, "KOI8-U"),
+ StringMatchCandidate::new(17, "MacRoman"),
+ StringMatchCandidate::new(18, "Mac Cyrillic"),
+ StringMatchCandidate::new(19, "Windows-874"),
+ StringMatchCandidate::new(20, "Windows-1253"),
+ StringMatchCandidate::new(21, "Windows-1254"),
+ StringMatchCandidate::new(22, "Windows-1255"),
+ StringMatchCandidate::new(23, "Windows-1256"),
+ StringMatchCandidate::new(24, "Windows-1257"),
+ StringMatchCandidate::new(25, "Windows-1258"),
+ StringMatchCandidate::new(26, "Windows-949"),
+ StringMatchCandidate::new(27, "EUC-JP"),
+ StringMatchCandidate::new(28, "ISO 2022-JP"),
+ StringMatchCandidate::new(29, "GBK"),
+ StringMatchCandidate::new(30, "GB18030"),
+ StringMatchCandidate::new(31, "Big5"),
],
matches: Vec::new(),
selector,
@@ -3,7 +3,6 @@ use std::fmt::Debug;
use anyhow::Result;
use encoding_rs::Encoding;
-use serde::{Deserialize, de::Visitor};
/// A wrapper around `encoding_rs::Encoding` to implement `Send` and `Sync`.
/// Since the reference is static, it is safe to send it across threads.
@@ -11,7 +10,7 @@ pub struct EncodingWrapper(&'static Encoding);
impl Debug for EncodingWrapper {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- f.debug_tuple("EncodingWrapper")
+ f.debug_tuple(&format!("EncodingWrapper{:?}", self.0))
.field(&self.0.name())
.finish()
}
@@ -19,37 +18,6 @@ impl Debug for EncodingWrapper {
pub struct EncodingWrapperVisitor;
-impl<'vi> Visitor<'vi> for EncodingWrapperVisitor {
- type Value = EncodingWrapper;
-
- fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
- formatter.write_str("a valid encoding name")
- }
-
- fn visit_str<E: serde::de::Error>(self, encoding: &str) -> Result<EncodingWrapper, E> {
- Ok(EncodingWrapper(
- Encoding::for_label(encoding.as_bytes())
- .ok_or_else(|| serde::de::Error::custom("Invalid Encoding"))?,
- ))
- }
-
- fn visit_string<E: serde::de::Error>(self, encoding: String) -> Result<EncodingWrapper, E> {
- Ok(EncodingWrapper(
- Encoding::for_label(encoding.as_bytes())
- .ok_or_else(|| serde::de::Error::custom("Invalid Encoding"))?,
- ))
- }
-}
-
-impl<'de> Deserialize<'de> for EncodingWrapper {
- fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
- where
- D: serde::Deserializer<'de>,
- {
- deserializer.deserialize_str(EncodingWrapperVisitor)
- }
-}
-
impl PartialEq for EncodingWrapper {
fn eq(&self, other: &Self) -> bool {
self.0.name() == other.0.name()
@@ -71,19 +39,36 @@ impl EncodingWrapper {
     }
 
+    /// Decodes `input` with the wrapped encoding, dropping a leading BOM for that encoding.
+    ///
+    /// Never fails: malformed sequences become replacement characters in the output.
     pub async fn decode(&self, input: Vec<u8>) -> Result<String> {
-        let (cow, _encoding_used, _had_errors) = self.0.decode(&input);
-        // encoding_rs handles invalid bytes by replacing them with replacement characters
+        let (cow, _had_errors) = self.0.decode_with_bom_removal(&input);
+        // `encoding_rs` handles invalid bytes by replacing them with replacement characters
         // in the output string, so we return the result even if there were errors.
-        // This preserves the original behavior where files with invalid bytes could still be opened.
+        // This preserves the original behaviour where files with invalid bytes could still be opened.
         Ok(cow.into_owned())
     }
 
+    /// Encodes `input` into the wrapped encoding.
+    ///
+    /// UTF-16 LE/BE are written as raw code units in the matching byte order; every other
+    /// encoding goes through `Encoding::encode`. Never fails.
     pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
+        // `Encoding::encode` cannot produce UTF-16 (its output encoding for the UTF-16
+        // encodings is UTF-8), so the UTF-16 code units are serialised by hand.
+        // NOTE(review): no BOM is written here although `decode` strips one — confirm intended.
+        if self.0 == encoding_rs::UTF_16BE {
+            return Ok(input.encode_utf16().flat_map(u16::to_be_bytes).collect());
+        }
+        if self.0 == encoding_rs::UTF_16LE {
+            return Ok(input.encode_utf16().flat_map(u16::to_le_bytes).collect());
+        }
+
         let (cow, _encoding_used, _had_errors) = self.0.encode(&input);
-        // encoding_rs handles unencodable characters by replacing them with
+        // `encoding_rs` handles unencodable characters by replacing them with
         // appropriate substitutes in the output, so we return the result even if there were errors.
-        // This maintains consistency with the decode behavior.
+        // This maintains consistency with the decode behaviour.
         Ok(cow.into_owned())
     }
 }
 
@@ -96,61 +81,3 @@ pub async fn to_utf8(input: Vec<u8>, encoding: EncodingWrapper) -> Result<String
pub async fn from_utf8(input: String, target: EncodingWrapper) -> Result<Vec<u8>> {
target.encode(input).await
}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use gpui::BackgroundExecutor;
-
- #[gpui::test]
- async fn test_decode_with_invalid_bytes(_: BackgroundExecutor) {
- // Test that files with invalid bytes can still be decoded
- // This is a regression test for the issue where files couldn't be opened
- // when they contained invalid bytes for the specified encoding
-
- // Create some invalid UTF-8 bytes
- let invalid_bytes = vec![0xFF, 0xFE, 0x00, 0x48]; // Invalid UTF-8 sequence
-
- let encoding = EncodingWrapper::new(encoding_rs::UTF_8);
- let result = encoding.decode(invalid_bytes).await;
-
- // The decode should succeed, not fail
- assert!(
- result.is_ok(),
- "Decode should succeed even with invalid bytes"
- );
-
- let decoded = result.unwrap();
- // The result should contain replacement characters for invalid sequences
- assert!(!decoded.is_empty(), "Decoded string should not be empty");
-
- // Test with Windows-1252 and some bytes that might be invalid
- let maybe_invalid_bytes = vec![0x81, 0x8D, 0x8F, 0x90, 0x9D]; // Some potentially problematic bytes
- let encoding = EncodingWrapper::new(encoding_rs::WINDOWS_1252);
- let result = encoding.decode(maybe_invalid_bytes).await;
-
- // Should still succeed
- assert!(
- result.is_ok(),
- "Decode should succeed with Windows-1252 even with potentially invalid bytes"
- );
- }
-
- #[gpui::test]
- async fn test_encode_with_unencodable_chars(_: BackgroundExecutor) {
- // Test that strings with unencodable characters can still be encoded
- let input = "Hello δΈη π".to_string(); // Contains Unicode that may not encode to all formats
-
- let encoding = EncodingWrapper::new(encoding_rs::WINDOWS_1252);
- let result = encoding.encode(input).await;
-
- // The encode should succeed, not fail
- assert!(
- result.is_ok(),
- "Encode should succeed even with unencodable characters"
- );
-
- let encoded = result.unwrap();
- assert!(!encoded.is_empty(), "Encoded bytes should not be empty");
- }
-}