Correct UTF-16 saving and add heuristic encoding detection (#45243)

Ichimura Tomoo created 2 months ago

This commit fixes an issue where saving UTF-16 files resulted in UTF-8
bytes due to `encoding_rs` default behavior. It also introduces a
heuristic to detect BOM-less UTF-16 and binary files.

Changes:
- Manually implement UTF-16LE/BE encoding during file save to avoid
implicit UTF-8 conversion.
- Add `analyze_byte_content` to guess UTF-16LE/BE or Binary based on
null byte distribution.
- Prevent loading binary files as text by returning an error when binary
content is detected.

Special thanks to @CrazyboyQCD for pointing out the `encoding_rs`
behavior and providing the fix, and to @ConradIrwin for the suggestion
on the detection heuristic.

Closes #14654

Release Notes:

- (nightly only) Fixed an issue where saving files with UTF-16 encoding
incorrectly wrote them as UTF-8. Also improved detection for binary
files and BOM-less UTF-16.

Change summary

crates/language/src/buffer.rs         |  18 +
crates/worktree/src/worktree.rs       | 140 ++++++++++--
crates/worktree/src/worktree_tests.rs | 306 +++++++++++++++++++---------
3 files changed, 331 insertions(+), 133 deletions(-)

Detailed changes

crates/language/src/buffer.rs 🔗

@@ -1490,19 +1490,23 @@ impl Buffer {
         let (tx, rx) = futures::channel::oneshot::channel();
         let prev_version = self.text.version();
         self.reload_task = Some(cx.spawn(async move |this, cx| {
-            let Some((new_mtime, new_text)) = this.update(cx, |this, cx| {
+            let Some((new_mtime, load_bytes_task, encoding)) = this.update(cx, |this, cx| {
                 let file = this.file.as_ref()?.as_local()?;
-
-                Some((file.disk_state().mtime(), file.load(cx)))
+                Some((
+                    file.disk_state().mtime(),
+                    file.load_bytes(cx),
+                    this.encoding,
+                ))
             })?
             else {
                 return Ok(());
             };
 
-            let new_text = new_text.await?;
-            let diff = this
-                .update(cx, |this, cx| this.diff(new_text.clone(), cx))?
-                .await;
+            let bytes = load_bytes_task.await?;
+            let (cow, _encoding_used, _has_errors) = encoding.decode(&bytes);
+            let new_text = cow.into_owned();
+
+            let diff = this.update(cx, |this, cx| this.diff(new_text, cx))?.await;
             this.update(cx, |this, cx| {
                 if this.version() == diff.base_version {
                     this.finalize_last_transaction();

crates/worktree/src/worktree.rs 🔗

@@ -1361,7 +1361,7 @@ impl LocalWorktree {
             }
 
             let content = fs.load_bytes(&abs_path).await?;
-            let (text, encoding, has_bom) = decode_byte(content);
+            let (text, encoding, has_bom) = decode_byte(content)?;
 
             let worktree = this.upgrade().context("worktree was dropped")?;
             let file = match entry.await? {
@@ -1489,25 +1489,12 @@ impl LocalWorktree {
             let fs = fs.clone();
             let abs_path = abs_path.clone();
             async move {
-                let bom_bytes = if has_bom {
-                    if encoding == encoding_rs::UTF_16LE {
-                        vec![0xFF, 0xFE]
-                    } else if encoding == encoding_rs::UTF_16BE {
-                        vec![0xFE, 0xFF]
-                    } else if encoding == encoding_rs::UTF_8 {
-                        vec![0xEF, 0xBB, 0xBF]
-                    } else {
-                        vec![]
-                    }
-                } else {
-                    vec![]
-                };
-
                 // For UTF-8, use the optimized `fs.save` which writes Rope chunks directly to disk
                 // without allocating a contiguous string.
                 if encoding == encoding_rs::UTF_8 && !has_bom {
                     return fs.save(&abs_path, &text, line_ending).await;
                 }
+
                 // For legacy encodings (e.g. Shift-JIS), we fall back to converting the entire Rope
                 // to a String/Bytes in memory before writing.
                 //
@@ -1520,13 +1507,45 @@ impl LocalWorktree {
                     LineEnding::Windows => text_string.replace('\n', "\r\n"),
                 };
 
-                let (cow, _, _) = encoding.encode(&normalized_text);
-                let bytes = if !bom_bytes.is_empty() {
-                    let mut bytes = bom_bytes;
-                    bytes.extend_from_slice(&cow);
-                    bytes.into()
+                // Create the byte vector manually for UTF-16 encodings because encoding_rs encodes to UTF-8 by default (per WHATWG standards),
+                //  which is not what we want for saving files.
+                let bytes = if encoding == encoding_rs::UTF_16BE {
+                    let mut data = Vec::with_capacity(normalized_text.len() * 2 + 2);
+                    if has_bom {
+                        data.extend_from_slice(&[0xFE, 0xFF]); // BOM
+                    }
+                    let utf16be_bytes =
+                        normalized_text.encode_utf16().flat_map(|u| u.to_be_bytes());
+                    data.extend(utf16be_bytes);
+                    data.into()
+                } else if encoding == encoding_rs::UTF_16LE {
+                    let mut data = Vec::with_capacity(normalized_text.len() * 2 + 2);
+                    if has_bom {
+                        data.extend_from_slice(&[0xFF, 0xFE]); // BOM
+                    }
+                    let utf16le_bytes =
+                        normalized_text.encode_utf16().flat_map(|u| u.to_le_bytes());
+                    data.extend(utf16le_bytes);
+                    data.into()
                 } else {
-                    cow
+                    // For other encodings (Shift-JIS, UTF-8 with BOM, etc.), delegate to encoding_rs.
+                    let bom_bytes = if has_bom {
+                        if encoding == encoding_rs::UTF_8 {
+                            vec![0xEF, 0xBB, 0xBF]
+                        } else {
+                            vec![]
+                        }
+                    } else {
+                        vec![]
+                    };
+                    let (cow, _, _) = encoding.encode(&normalized_text);
+                    if !bom_bytes.is_empty() {
+                        let mut bytes = bom_bytes;
+                        bytes.extend_from_slice(&cow);
+                        bytes.into()
+                    } else {
+                        cow
+                    }
                 };
 
                 fs.write(&abs_path, &bytes).await
@@ -5842,11 +5861,28 @@ impl fs::Watcher for NullWatcher {
     }
 }
 
-fn decode_byte(bytes: Vec<u8>) -> (String, &'static Encoding, bool) {
+fn decode_byte(bytes: Vec<u8>) -> anyhow::Result<(String, &'static Encoding, bool)> {
     // check BOM
     if let Some((encoding, _bom_len)) = Encoding::for_bom(&bytes) {
         let (cow, _) = encoding.decode_with_bom_removal(&bytes);
-        return (cow.into_owned(), encoding, true);
+        return Ok((cow.into_owned(), encoding, true));
+    }
+
+    match analyze_byte_content(&bytes) {
+        ByteContent::Utf16Le => {
+            let encoding = encoding_rs::UTF_16LE;
+            let (cow, _, _) = encoding.decode(&bytes);
+            return Ok((cow.into_owned(), encoding, false));
+        }
+        ByteContent::Utf16Be => {
+            let encoding = encoding_rs::UTF_16BE;
+            let (cow, _, _) = encoding.decode(&bytes);
+            return Ok((cow.into_owned(), encoding, false));
+        }
+        ByteContent::Binary => {
+            anyhow::bail!("Binary files are not supported");
+        }
+        ByteContent::Unknown => {}
     }
 
     fn detect_encoding(bytes: Vec<u8>) -> (String, &'static Encoding) {
@@ -5867,14 +5903,66 @@ fn decode_byte(bytes: Vec<u8>) -> (String, &'static Encoding, bool) {
             // displaying raw escape sequences instead of the correct characters.
             if text.contains('\x1b') {
                 let (s, enc) = detect_encoding(text.into_bytes());
-                (s, enc, false)
+                Ok((s, enc, false))
             } else {
-                (text, encoding_rs::UTF_8, false)
+                Ok((text, encoding_rs::UTF_8, false))
             }
         }
         Err(e) => {
             let (s, enc) = detect_encoding(e.into_bytes());
-            (s, enc, false)
+            Ok((s, enc, false))
         }
     }
 }
+
+#[derive(PartialEq)]
+enum ByteContent {
+    Utf16Le,
+    Utf16Be,
+    Binary,
+    Unknown,
+}
+// Heuristic check using null byte distribution.
+// NOTE: This relies on the presence of ASCII characters (which become `0x00` in UTF-16).
+// Files consisting purely of non-ASCII characters (like Japanese) may not be detected here
+// and will result in `Unknown`.
+fn analyze_byte_content(bytes: &[u8]) -> ByteContent {
+    if bytes.len() < 2 {
+        return ByteContent::Unknown;
+    }
+
+    let check_len = bytes.len().min(1024);
+    let sample = &bytes[..check_len];
+
+    if !sample.contains(&0) {
+        return ByteContent::Unknown;
+    }
+
+    let mut even_nulls = 0;
+    let mut odd_nulls = 0;
+
+    for (i, &byte) in sample.iter().enumerate() {
+        if byte == 0 {
+            if i % 2 == 0 {
+                even_nulls += 1;
+            } else {
+                odd_nulls += 1;
+            }
+        }
+    }
+
+    let total_nulls = even_nulls + odd_nulls;
+    if total_nulls < check_len / 10 {
+        return ByteContent::Unknown;
+    }
+
+    if even_nulls > odd_nulls * 4 {
+        return ByteContent::Utf16Be;
+    }
+
+    if odd_nulls > even_nulls * 4 {
+        return ByteContent::Utf16Le;
+    }
+
+    ByteContent::Binary
+}

crates/worktree/src/worktree_tests.rs 🔗

@@ -1,5 +1,5 @@
 use crate::{Entry, EntryKind, Event, PathChange, Worktree, WorktreeModelHandle};
-use anyhow::{Context as _, Result};
+use anyhow::Result;
 use encoding_rs;
 use fs::{FakeFs, Fs, RealFs, RemoveOptions};
 use git::{DOT_GIT, GITIGNORE, REPO_EXCLUDE};
@@ -2568,71 +2568,87 @@ fn init_test(cx: &mut gpui::TestAppContext) {
 #[gpui::test]
 async fn test_load_file_encoding(cx: &mut TestAppContext) {
     init_test(cx);
-    let test_cases: Vec<(&str, &[u8], &str)> = vec![
-        ("utf8.txt", "こんにちは".as_bytes(), "こんにちは"), // "こんにちは" is Japanese "Hello"
-        (
-            "sjis.txt",
-            &[0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd],
-            "こんにちは",
-        ),
-        (
-            "eucjp.txt",
-            &[0xa4, 0xb3, 0xa4, 0xf3, 0xa4, 0xcb, 0xa4, 0xc1, 0xa4, 0xcf],
-            "こんにちは",
-        ),
-        (
-            "iso2022jp.txt",
-            &[
+
+    struct TestCase {
+        name: &'static str,
+        bytes: Vec<u8>,
+        expected_text: &'static str,
+    }
+
+    // --- Success Cases ---
+    let success_cases = vec![
+        TestCase {
+            name: "utf8.txt",
+            bytes: "こんにちは".as_bytes().to_vec(),
+            expected_text: "こんにちは",
+        },
+        TestCase {
+            name: "sjis.txt",
+            bytes: vec![0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd],
+            expected_text: "こんにちは",
+        },
+        TestCase {
+            name: "eucjp.txt",
+            bytes: vec![0xa4, 0xb3, 0xa4, 0xf3, 0xa4, 0xcb, 0xa4, 0xc1, 0xa4, 0xcf],
+            expected_text: "こんにちは",
+        },
+        TestCase {
+            name: "iso2022jp.txt",
+            bytes: vec![
                 0x1b, 0x24, 0x42, 0x24, 0x33, 0x24, 0x73, 0x24, 0x4b, 0x24, 0x41, 0x24, 0x4f, 0x1b,
                 0x28, 0x42,
             ],
-            "こんにちは",
-        ),
-        // Western Europe (Windows-1252)
-        // "Café" -> 0xE9 is 'é' in Windows-1252 (it is typically 0xC3 0xA9 in UTF-8)
-        ("win1252.txt", &[0x43, 0x61, 0x66, 0xe9], "Café"),
-        // Chinese Simplified (GBK)
-        // Note: We use a slightly longer string here because short byte sequences can be ambiguous
-        // in multi-byte encodings. Providing more context helps the heuristic detector guess correctly.
-        // Text: "今天天气不错" (Today's weather is not bad / nice)
-        // Bytes:
-        //   今: BD F1
-        //   天: CC EC
-        //   天: CC EC
-        //   气: C6 F8
-        //   不: B2 BB
-        //   错: B4 ED
-        (
-            "gbk.txt",
-            &[
+            expected_text: "こんにちは",
+        },
+        TestCase {
+            name: "win1252.txt",
+            bytes: vec![0x43, 0x61, 0x66, 0xe9],
+            expected_text: "Café",
+        },
+        TestCase {
+            name: "gbk.txt",
+            bytes: vec![
                 0xbd, 0xf1, 0xcc, 0xec, 0xcc, 0xec, 0xc6, 0xf8, 0xb2, 0xbb, 0xb4, 0xed,
             ],
-            "今天天气不错",
-        ),
-        (
-            "utf16le_bom.txt",
-            &[
+            expected_text: "今天天气不错",
+        },
+        // UTF-16LE with BOM
+        TestCase {
+            name: "utf16le_bom.txt",
+            bytes: vec![
                 0xFF, 0xFE, // BOM
-                0x53, 0x30, // こ
-                0x93, 0x30, // ん
-                0x6B, 0x30, // に
-                0x61, 0x30, // ち
-                0x6F, 0x30, // は
+                0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30,
             ],
-            "こんにちは",
-        ),
-        (
-            "utf8_bom.txt",
-            &[
-                0xEF, 0xBB, 0xBF, // UTF-8 BOM
-                0xE3, 0x81, 0x93, // こ
-                0xE3, 0x82, 0x93, // ん
-                0xE3, 0x81, 0xAB, // に
-                0xE3, 0x81, 0xA1, // ち
-                0xE3, 0x81, 0xAF, // は
+            expected_text: "こんにちは",
+        },
+        // UTF-16BE with BOM
+        TestCase {
+            name: "utf16be_bom.txt",
+            bytes: vec![
+                0xFE, 0xFF, // BOM
+                0x30, 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F,
             ],
-            "こんにちは",
-        ),
+            expected_text: "こんにちは",
+        },
+        // UTF-16LE without BOM (ASCII only)
+        // This relies on the "null byte heuristic" we implemented.
+        // "ABC" -> 41 00 42 00 43 00
+        TestCase {
+            name: "utf16le_ascii_no_bom.txt",
+            bytes: vec![0x41, 0x00, 0x42, 0x00, 0x43, 0x00],
+            expected_text: "ABC",
+        },
+    ];
+
+    // --- Failure Cases ---
+    let failure_cases = vec![
+        // Binary File (Should be detected by heuristic and return Error)
+        // Contains random bytes and mixed nulls that don't match UTF-16 patterns
+        TestCase {
+            name: "binary.bin",
+            bytes: vec![0x00, 0xFF, 0x12, 0x00, 0x99, 0x88, 0x77, 0x66, 0x00],
+            expected_text: "", // Not used
+        },
     ];
 
     let root_path = if cfg!(windows) {
@@ -2642,15 +2658,11 @@ async fn test_load_file_encoding(cx: &mut TestAppContext) {
     };
 
     let fs = FakeFs::new(cx.background_executor.clone());
+    fs.create_dir(root_path).await.unwrap();
 
-    let mut files_json = serde_json::Map::new();
-    for (name, _, _) in &test_cases {
-        files_json.insert(name.to_string(), serde_json::Value::String("".to_string()));
-    }
-
-    for (name, bytes, _) in &test_cases {
-        let path = root_path.join(name);
-        fs.write(&path, bytes).await.unwrap();
+    for case in success_cases.iter().chain(failure_cases.iter()) {
+        let path = root_path.join(case.name);
+        fs.write(&path, &case.bytes).await.unwrap();
     }
 
     let tree = Worktree::local(
@@ -2667,34 +2679,54 @@ async fn test_load_file_encoding(cx: &mut TestAppContext) {
     cx.read(|cx| tree.read(cx).as_local().unwrap().scan_complete())
         .await;
 
-    for (name, _, expected) in test_cases {
-        let loaded = tree
-            .update(cx, |tree, cx| tree.load_file(rel_path(name), cx))
-            .await
-            .with_context(|| format!("Failed to load {}", name))
-            .unwrap();
+    let rel_path = |name: &str| {
+        RelPath::new(&Path::new(name), PathStyle::local())
+            .unwrap()
+            .into_arc()
+    };
 
+    // Run Success Tests
+    for case in success_cases {
+        let loaded = tree
+            .update(cx, |tree, cx| tree.load_file(&rel_path(case.name), cx))
+            .await;
+        if let Err(e) = &loaded {
+            panic!("Failed to load success case '{}': {:?}", case.name, e);
+        }
+        let loaded = loaded.unwrap();
         assert_eq!(
-            loaded.text, expected,
+            loaded.text, case.expected_text,
             "Encoding mismatch for file: {}",
-            name
+            case.name
         );
     }
+
+    // Run Failure Tests
+    for case in failure_cases {
+        let loaded = tree
+            .update(cx, |tree, cx| tree.load_file(&rel_path(case.name), cx))
+            .await;
+        assert!(
+            loaded.is_err(),
+            "Failure case '{}' unexpectedly succeeded! It should have been detected as binary.",
+            case.name
+        );
+        let err_msg = loaded.unwrap_err().to_string();
+        println!("Got expected error for {}: {}", case.name, err_msg);
+    }
 }
 
 #[gpui::test]
 async fn test_write_file_encoding(cx: &mut gpui::TestAppContext) {
     init_test(cx);
     let fs = FakeFs::new(cx.executor());
+
     let root_path = if cfg!(windows) {
         Path::new("C:\\root")
     } else {
         Path::new("/root")
     };
     fs.create_dir(root_path).await.unwrap();
-    let file_path = root_path.join("test.txt");
-
-    fs.insert_file(&file_path, "initial".into()).await;
 
     let worktree = Worktree::local(
         root_path,
@@ -2707,33 +2739,107 @@ async fn test_write_file_encoding(cx: &mut gpui::TestAppContext) {
     .await
     .unwrap();
 
-    let path: Arc<Path> = Path::new("test.txt").into();
-    let rel_path = RelPath::new(&path, PathStyle::local()).unwrap().into_arc();
+    // Define test case structure
+    struct TestCase {
+        name: &'static str,
+        text: &'static str,
+        encoding: &'static encoding_rs::Encoding,
+        has_bom: bool,
+        expected_bytes: Vec<u8>,
+    }
 
-    let text = text::Rope::from("こんにちは");
+    let cases = vec![
+        // Shift_JIS with Japanese
+        TestCase {
+            name: "Shift_JIS with Japanese",
+            text: "こんにちは",
+            encoding: encoding_rs::SHIFT_JIS,
+            has_bom: false,
+            expected_bytes: vec![0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd],
+        },
+        // UTF-8 No BOM
+        TestCase {
+            name: "UTF-8 No BOM",
+            text: "AB",
+            encoding: encoding_rs::UTF_8,
+            has_bom: false,
+            expected_bytes: vec![0x41, 0x42],
+        },
+        // UTF-8 with BOM
+        TestCase {
+            name: "UTF-8 with BOM",
+            text: "AB",
+            encoding: encoding_rs::UTF_8,
+            has_bom: true,
+            expected_bytes: vec![0xEF, 0xBB, 0xBF, 0x41, 0x42],
+        },
+        // UTF-16LE No BOM with Japanese
+        // NOTE: This passes thanks to the manual encoding fix implemented in `write_file`.
+        TestCase {
+            name: "UTF-16LE No BOM with Japanese",
+            text: "こんにちは",
+            encoding: encoding_rs::UTF_16LE,
+            has_bom: false,
+            expected_bytes: vec![0x53, 0x30, 0x93, 0x30, 0x6b, 0x30, 0x61, 0x30, 0x6f, 0x30],
+        },
+        // UTF-16LE with BOM
+        TestCase {
+            name: "UTF-16LE with BOM",
+            text: "A",
+            encoding: encoding_rs::UTF_16LE,
+            has_bom: true,
+            expected_bytes: vec![0xFF, 0xFE, 0x41, 0x00],
+        },
+        // UTF-16BE No BOM with Japanese
+        // NOTE: This passes thanks to the manual encoding fix.
+        TestCase {
+            name: "UTF-16BE No BOM with Japanese",
+            text: "こんにちは",
+            encoding: encoding_rs::UTF_16BE,
+            has_bom: false,
+            expected_bytes: vec![0x30, 0x53, 0x30, 0x93, 0x30, 0x6b, 0x30, 0x61, 0x30, 0x6f],
+        },
+        // UTF-16BE with BOM
+        TestCase {
+            name: "UTF-16BE with BOM",
+            text: "A",
+            encoding: encoding_rs::UTF_16BE,
+            has_bom: true,
+            expected_bytes: vec![0xFE, 0xFF, 0x00, 0x41],
+        },
+    ];
 
-    let task = worktree.update(cx, |wt, cx| {
-        wt.write_file(
-            rel_path,
-            text,
-            text::LineEnding::Unix,
-            encoding_rs::SHIFT_JIS,
-            false,
-            cx,
-        )
-    });
+    for (i, case) in cases.into_iter().enumerate() {
+        let file_name = format!("test_{}.txt", i);
+        let path: Arc<Path> = Path::new(&file_name).into();
+        let file_path = root_path.join(&file_name);
 
-    task.await.unwrap();
+        fs.insert_file(&file_path, "".into()).await;
 
-    let bytes = fs.load_bytes(&file_path).await.unwrap();
+        let rel_path = RelPath::new(&path, PathStyle::local()).unwrap().into_arc();
+        let text = text::Rope::from(case.text);
 
-    let expected_bytes = vec![
-        0x82, 0xb1, // こ
-        0x82, 0xf1, // ん
-        0x82, 0xc9, // に
-        0x82, 0xbf, // ち
-        0x82, 0xcd, // は
-    ];
+        let task = worktree.update(cx, |wt, cx| {
+            wt.write_file(
+                rel_path,
+                text,
+                text::LineEnding::Unix,
+                case.encoding,
+                case.has_bom,
+                cx,
+            )
+        });
+
+        if let Err(e) = task.await {
+            panic!("Unexpected error in case '{}': {:?}", case.name, e);
+        }
+
+        let bytes = fs.load_bytes(&file_path).await.unwrap();
 
-    assert_eq!(bytes, expected_bytes, "Should be saved as Shift-JIS");
+        assert_eq!(
+            bytes, case.expected_bytes,
+            "case '{}' mismatch. Expected {:?}, but got {:?}",
+            case.name, case.expected_bytes, bytes
+        );
+    }
 }