@@ -1361,7 +1361,7 @@ impl LocalWorktree {
}
let content = fs.load_bytes(&abs_path).await?;
- let (text, encoding, has_bom) = decode_byte(content);
+ let (text, encoding, has_bom) = decode_byte(content)?;
let worktree = this.upgrade().context("worktree was dropped")?;
let file = match entry.await? {
@@ -1489,25 +1489,12 @@ impl LocalWorktree {
let fs = fs.clone();
let abs_path = abs_path.clone();
async move {
- let bom_bytes = if has_bom {
- if encoding == encoding_rs::UTF_16LE {
- vec![0xFF, 0xFE]
- } else if encoding == encoding_rs::UTF_16BE {
- vec![0xFE, 0xFF]
- } else if encoding == encoding_rs::UTF_8 {
- vec![0xEF, 0xBB, 0xBF]
- } else {
- vec![]
- }
- } else {
- vec![]
- };
-
// For UTF-8, use the optimized `fs.save` which writes Rope chunks directly to disk
// without allocating a contiguous string.
if encoding == encoding_rs::UTF_8 && !has_bom {
return fs.save(&abs_path, &text, line_ending).await;
}
+
// For legacy encodings (e.g. Shift-JIS), we fall back to converting the entire Rope
// to a String/Bytes in memory before writing.
//
@@ -1520,13 +1507,45 @@ impl LocalWorktree {
LineEnding::Windows => text_string.replace('\n', "\r\n"),
};
- let (cow, _, _) = encoding.encode(&normalized_text);
- let bytes = if !bom_bytes.is_empty() {
- let mut bytes = bom_bytes;
- bytes.extend_from_slice(&cow);
- bytes.into()
+ // Build the byte vector manually for the UTF-16 encodings: the WHATWG Encoding
+ // Standard defines no UTF-16 encoder, so encoding_rs's `encode` falls back to
+ // UTF-8 output, which is not what we want when saving files.
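+ // A quick illustration of that fallback (assuming encoding_rs's documented
+ // behaviour): `encoding_rs::UTF_16LE.encode("A")` returns the single UTF-8 byte
+ // 0x41 rather than the pair 0x41 0x00, so we expand code units ourselves below.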
+ let bytes = if encoding == encoding_rs::UTF_16BE {
+ let mut data = Vec::with_capacity(normalized_text.len() * 2 + 2);
+ if has_bom {
+ data.extend_from_slice(&[0xFE, 0xFF]); // BOM
+ }
+ let utf16be_bytes =
+ normalized_text.encode_utf16().flat_map(|u| u.to_be_bytes());
+ data.extend(utf16be_bytes);
+ data.into()
+ } else if encoding == encoding_rs::UTF_16LE {
+ let mut data = Vec::with_capacity(normalized_text.len() * 2 + 2);
+ if has_bom {
+ data.extend_from_slice(&[0xFF, 0xFE]); // BOM
+ }
+ let utf16le_bytes =
+ normalized_text.encode_utf16().flat_map(|u| u.to_le_bytes());
+ data.extend(utf16le_bytes);
+ data.into()
} else {
- cow
+ // For other encodings (Shift-JIS, UTF-8 with BOM, etc.), delegate to encoding_rs.
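+ // Note: for encodings that cannot represent every character (e.g. Shift-JIS),
+ // `encode` substitutes numeric character references for unmappable characters
+ // instead of returning an error.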
+ let bom_bytes = if has_bom {
+ if encoding == encoding_rs::UTF_8 {
+ vec![0xEF, 0xBB, 0xBF]
+ } else {
+ vec![]
+ }
+ } else {
+ vec![]
+ };
+ let (cow, _, _) = encoding.encode(&normalized_text);
+ if !bom_bytes.is_empty() {
+ let mut bytes = bom_bytes;
+ bytes.extend_from_slice(&cow);
+ bytes.into()
+ } else {
+ cow
+ }
};
fs.write(&abs_path, &bytes).await
@@ -5842,11 +5861,28 @@ impl fs::Watcher for NullWatcher {
}
}
-fn decode_byte(bytes: Vec<u8>) -> (String, &'static Encoding, bool) {
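+/// Decodes raw file bytes into a UTF-8 `String`, returning the text, the detected
+/// encoding, and whether the input started with a BOM. Content that looks like a
+/// binary file yields an error.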
+fn decode_byte(bytes: Vec<u8>) -> anyhow::Result<(String, &'static Encoding, bool)> {
// check BOM
if let Some((encoding, _bom_len)) = Encoding::for_bom(&bytes) {
let (cow, _) = encoding.decode_with_bom_removal(&bytes);
- return (cow.into_owned(), encoding, true);
+ return Ok((cow.into_owned(), encoding, true));
+ }
+
+ match analyze_byte_content(&bytes) {
+ ByteContent::Utf16Le => {
+ let encoding = encoding_rs::UTF_16LE;
+ let (cow, _, _) = encoding.decode(&bytes);
+ return Ok((cow.into_owned(), encoding, false));
+ }
+ ByteContent::Utf16Be => {
+ let encoding = encoding_rs::UTF_16BE;
+ let (cow, _, _) = encoding.decode(&bytes);
+ return Ok((cow.into_owned(), encoding, false));
+ }
+ ByteContent::Binary => {
+ anyhow::bail!("Binary files are not supported");
+ }
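+ // No BOM and no clear UTF-16 or binary signal: fall through to the
+ // UTF-8-first decoding path below.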
+ ByteContent::Unknown => {}
}
fn detect_encoding(bytes: Vec<u8>) -> (String, &'static Encoding) {
@@ -5867,14 +5903,66 @@ fn decode_byte(bytes: Vec<u8>) -> (String, &'static Encoding, bool) {
// displaying raw escape sequences instead of the correct characters.
if text.contains('\x1b') {
let (s, enc) = detect_encoding(text.into_bytes());
- (s, enc, false)
+ Ok((s, enc, false))
} else {
- (text, encoding_rs::UTF_8, false)
+ Ok((text, encoding_rs::UTF_8, false))
}
}
Err(e) => {
let (s, enc) = detect_encoding(e.into_bytes());
- (s, enc, false)
+ Ok((s, enc, false))
}
}
}
+
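+/// Rough classification of a byte buffer that carries no BOM.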
+#[derive(PartialEq)]
+enum ByteContent {
+ Utf16Le,
+ Utf16Be,
+ Binary,
+ Unknown,
+}
+// Heuristic check based on the distribution of null bytes.
+// NOTE: This relies on the presence of ASCII characters, whose UTF-16 code units
+// carry a `0x00` byte. Files consisting purely of non-ASCII characters (such as
+// Japanese text) may not be detected here and will fall through as `Unknown`.
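+// For example, with ASCII-heavy input the pattern is clear:
+//   "ABC" as UTF-16LE = 41 00 42 00 43 00 -> nulls only at odd indices  -> Utf16Le
+//   "ABC" as UTF-16BE = 00 41 00 42 00 43 -> nulls only at even indices -> Utf16Be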
+fn analyze_byte_content(bytes: &[u8]) -> ByteContent {
+ if bytes.len() < 2 {
+ return ByteContent::Unknown;
+ }
+
+ let check_len = bytes.len().min(1024);
+ let sample = &bytes[..check_len];
+
+ if !sample.contains(&0) {
+ return ByteContent::Unknown;
+ }
+
+ let mut even_nulls = 0;
+ let mut odd_nulls = 0;
+
+ for (i, &byte) in sample.iter().enumerate() {
+ if byte == 0 {
+ if i % 2 == 0 {
+ even_nulls += 1;
+ } else {
+ odd_nulls += 1;
+ }
+ }
+ }
+
+ let total_nulls = even_nulls + odd_nulls;
+ if total_nulls < check_len / 10 {
+ return ByteContent::Unknown;
+ }
+
+ if even_nulls > odd_nulls * 4 {
+ return ByteContent::Utf16Be;
+ }
+
+ if odd_nulls > even_nulls * 4 {
+ return ByteContent::Utf16Le;
+ }
+
+ ByteContent::Binary
+}
@@ -1,5 +1,5 @@
use crate::{Entry, EntryKind, Event, PathChange, Worktree, WorktreeModelHandle};
-use anyhow::{Context as _, Result};
+use anyhow::Result;
use encoding_rs;
use fs::{FakeFs, Fs, RealFs, RemoveOptions};
use git::{DOT_GIT, GITIGNORE, REPO_EXCLUDE};
@@ -2568,71 +2568,87 @@ fn init_test(cx: &mut gpui::TestAppContext) {
#[gpui::test]
async fn test_load_file_encoding(cx: &mut TestAppContext) {
init_test(cx);
- let test_cases: Vec<(&str, &[u8], &str)> = vec![
- ("utf8.txt", "γγγ«γ‘γ―".as_bytes(), "γγγ«γ‘γ―"), // "γγγ«γ‘γ―" is Japanese "Hello"
- (
- "sjis.txt",
- &[0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd],
- "γγγ«γ‘γ―",
- ),
- (
- "eucjp.txt",
- &[0xa4, 0xb3, 0xa4, 0xf3, 0xa4, 0xcb, 0xa4, 0xc1, 0xa4, 0xcf],
- "γγγ«γ‘γ―",
- ),
- (
- "iso2022jp.txt",
- &[
+
+ struct TestCase {
+ name: &'static str,
+ bytes: Vec<u8>,
+ expected_text: &'static str,
+ }
+
+ // --- Success Cases ---
+ let success_cases = vec![
+ TestCase {
+ name: "utf8.txt",
+ bytes: "γγγ«γ‘γ―".as_bytes().to_vec(),
+ expected_text: "γγγ«γ‘γ―",
+ },
+ TestCase {
+ name: "sjis.txt",
+ bytes: vec![0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd],
+ expected_text: "こんにちは",
+ },
+ TestCase {
+ name: "eucjp.txt",
+ bytes: vec![0xa4, 0xb3, 0xa4, 0xf3, 0xa4, 0xcb, 0xa4, 0xc1, 0xa4, 0xcf],
+ expected_text: "こんにちは",
+ },
+ TestCase {
+ name: "iso2022jp.txt",
+ bytes: vec![
0x1b, 0x24, 0x42, 0x24, 0x33, 0x24, 0x73, 0x24, 0x4b, 0x24, 0x41, 0x24, 0x4f, 0x1b,
0x28, 0x42,
],
- "γγγ«γ‘γ―",
- ),
- // Western Europe (Windows-1252)
- // "CafΓ©" -> 0xE9 is 'Γ©' in Windows-1252 (it is typically 0xC3 0xA9 in UTF-8)
- ("win1252.txt", &[0x43, 0x61, 0x66, 0xe9], "CafΓ©"),
- // Chinese Simplified (GBK)
- // Note: We use a slightly longer string here because short byte sequences can be ambiguous
- // in multi-byte encodings. Providing more context helps the heuristic detector guess correctly.
- // Text: "今天天气不错" (Today's weather is not bad / nice)
- // Bytes:
- // 今: BD F1
- // 天: CC EC
- // 天: CC EC
- // 气: C6 F8
- // 不: B2 BB
- // 错: B4 ED
- (
- "gbk.txt",
- &[
+ expected_text: "こんにちは",
+ },
+ TestCase {
+ name: "win1252.txt",
+ bytes: vec![0x43, 0x61, 0x66, 0xe9],
+ expected_text: "Café",
+ },
+ TestCase {
+ name: "gbk.txt",
+ bytes: vec![
0xbd, 0xf1, 0xcc, 0xec, 0xcc, 0xec, 0xc6, 0xf8, 0xb2, 0xbb, 0xb4, 0xed,
],
- "δ»ε€©ε€©ζ°δΈι",
- ),
- (
- "utf16le_bom.txt",
- &[
+ expected_text: "今天天气不错",
+ },
+ // UTF-16LE with BOM
+ TestCase {
+ name: "utf16le_bom.txt",
+ bytes: vec![
0xFF, 0xFE, // BOM
- 0x53, 0x30, // こ
- 0x93, 0x30, // ん
- 0x6B, 0x30, // に
- 0x61, 0x30, // ち
- 0x6F, 0x30, // は
+ 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30,
],
- "γγγ«γ‘γ―",
- ),
- (
- "utf8_bom.txt",
- &[
- 0xEF, 0xBB, 0xBF, // UTF-8 BOM
- 0xE3, 0x81, 0x93, // こ
- 0xE3, 0x82, 0x93, // ん
- 0xE3, 0x81, 0xAB, // に
- 0xE3, 0x81, 0xA1, // ち
- 0xE3, 0x81, 0xAF, // は
+ expected_text: "こんにちは",
+ },
+ // UTF-16BE with BOM
+ TestCase {
+ name: "utf16be_bom.txt",
+ bytes: vec![
+ 0xFE, 0xFF, // BOM
+ 0x30, 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F,
],
- "γγγ«γ‘γ―",
- ),
+ expected_text: "γγγ«γ‘γ―",
+ },
+ // UTF-16LE without BOM (ASCII only)
+ // This relies on the "null byte heuristic" we implemented.
+ // "ABC" -> 41 00 42 00 43 00
+ TestCase {
+ name: "utf16le_ascii_no_bom.txt",
+ bytes: vec![0x41, 0x00, 0x42, 0x00, 0x43, 0x00],
+ expected_text: "ABC",
+ },
+ ];
+
+ // --- Failure Cases ---
+ let failure_cases = vec![
+ // Binary File (Should be detected by heuristic and return Error)
+ // Contains random bytes and mixed nulls that don't match UTF-16 patterns
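+ // (nulls sit at indices 0, 3 and 8, so neither the even nor the odd pattern
+ // dominates and `analyze_byte_content` should classify the buffer as Binary)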
+ TestCase {
+ name: "binary.bin",
+ bytes: vec![0x00, 0xFF, 0x12, 0x00, 0x99, 0x88, 0x77, 0x66, 0x00],
+ expected_text: "", // Not used
+ },
];
let root_path = if cfg!(windows) {
@@ -2642,15 +2658,11 @@ async fn test_load_file_encoding(cx: &mut TestAppContext) {
};
let fs = FakeFs::new(cx.background_executor.clone());
+ fs.create_dir(root_path).await.unwrap();
- let mut files_json = serde_json::Map::new();
- for (name, _, _) in &test_cases {
- files_json.insert(name.to_string(), serde_json::Value::String("".to_string()));
- }
-
- for (name, bytes, _) in &test_cases {
- let path = root_path.join(name);
- fs.write(&path, bytes).await.unwrap();
+ for case in success_cases.iter().chain(failure_cases.iter()) {
+ let path = root_path.join(case.name);
+ fs.write(&path, &case.bytes).await.unwrap();
}
let tree = Worktree::local(
@@ -2667,34 +2679,54 @@ async fn test_load_file_encoding(cx: &mut TestAppContext) {
cx.read(|cx| tree.read(cx).as_local().unwrap().scan_complete())
.await;
- for (name, _, expected) in test_cases {
- let loaded = tree
- .update(cx, |tree, cx| tree.load_file(rel_path(name), cx))
- .await
- .with_context(|| format!("Failed to load {}", name))
- .unwrap();
+ let rel_path = |name: &str| {
+ RelPath::new(&Path::new(name), PathStyle::local())
+ .unwrap()
+ .into_arc()
+ };
+ // Run Success Tests
+ for case in success_cases {
+ let loaded = tree
+ .update(cx, |tree, cx| tree.load_file(&rel_path(case.name), cx))
+ .await;
+ if let Err(e) = &loaded {
+ panic!("Failed to load success case '{}': {:?}", case.name, e);
+ }
+ let loaded = loaded.unwrap();
assert_eq!(
- loaded.text, expected,
+ loaded.text, case.expected_text,
"Encoding mismatch for file: {}",
- name
+ case.name
);
}
+
+ // Run Failure Tests
+ for case in failure_cases {
+ let loaded = tree
+ .update(cx, |tree, cx| tree.load_file(&rel_path(case.name), cx))
+ .await;
+ assert!(
+ loaded.is_err(),
+ "Failure case '{}' unexpectedly succeeded! It should have been detected as binary.",
+ case.name
+ );
+ let err_msg = loaded.unwrap_err().to_string();
+ println!("Got expected error for {}: {}", case.name, err_msg);
+ }
}
#[gpui::test]
async fn test_write_file_encoding(cx: &mut gpui::TestAppContext) {
init_test(cx);
let fs = FakeFs::new(cx.executor());
+
let root_path = if cfg!(windows) {
Path::new("C:\\root")
} else {
Path::new("/root")
};
fs.create_dir(root_path).await.unwrap();
- let file_path = root_path.join("test.txt");
-
- fs.insert_file(&file_path, "initial".into()).await;
let worktree = Worktree::local(
root_path,
@@ -2707,33 +2739,107 @@ async fn test_write_file_encoding(cx: &mut gpui::TestAppContext) {
.await
.unwrap();
- let path: Arc<Path> = Path::new("test.txt").into();
- let rel_path = RelPath::new(&path, PathStyle::local()).unwrap().into_arc();
+ // Define test case structure
+ struct TestCase {
+ name: &'static str,
+ text: &'static str,
+ encoding: &'static encoding_rs::Encoding,
+ has_bom: bool,
+ expected_bytes: Vec<u8>,
+ }
- let text = text::Rope::from("こんにちは");
+ let cases = vec![
+ // Shift_JIS with Japanese
+ TestCase {
+ name: "Shift_JIS with Japanese",
+ text: "γγγ«γ‘γ―",
+ encoding: encoding_rs::SHIFT_JIS,
+ has_bom: false,
+ expected_bytes: vec![0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd],
+ },
+ // UTF-8 No BOM
+ TestCase {
+ name: "UTF-8 No BOM",
+ text: "AB",
+ encoding: encoding_rs::UTF_8,
+ has_bom: false,
+ expected_bytes: vec![0x41, 0x42],
+ },
+ // UTF-8 with BOM
+ TestCase {
+ name: "UTF-8 with BOM",
+ text: "AB",
+ encoding: encoding_rs::UTF_8,
+ has_bom: true,
+ expected_bytes: vec![0xEF, 0xBB, 0xBF, 0x41, 0x42],
+ },
+ // UTF-16LE No BOM with Japanese
+ // NOTE: This passes thanks to the manual encoding fix implemented in `write_file`.
+ TestCase {
+ name: "UTF-16LE No BOM with Japanese",
+ text: "γγγ«γ‘γ―",
+ encoding: encoding_rs::UTF_16LE,
+ has_bom: false,
+ expected_bytes: vec![0x53, 0x30, 0x93, 0x30, 0x6b, 0x30, 0x61, 0x30, 0x6f, 0x30],
+ },
+ // UTF-16LE with BOM
+ TestCase {
+ name: "UTF-16LE with BOM",
+ text: "A",
+ encoding: encoding_rs::UTF_16LE,
+ has_bom: true,
+ expected_bytes: vec![0xFF, 0xFE, 0x41, 0x00],
+ },
+ // UTF-16BE No BOM with Japanese
+ // NOTE: This passes thanks to the manual encoding fix.
+ TestCase {
+ name: "UTF-16BE No BOM with Japanese",
+ text: "γγγ«γ‘γ―",
+ encoding: encoding_rs::UTF_16BE,
+ has_bom: false,
+ expected_bytes: vec![0x30, 0x53, 0x30, 0x93, 0x30, 0x6b, 0x30, 0x61, 0x30, 0x6f],
+ },
+ // UTF-16BE with BOM
+ TestCase {
+ name: "UTF-16BE with BOM",
+ text: "A",
+ encoding: encoding_rs::UTF_16BE,
+ has_bom: true,
+ expected_bytes: vec![0xFE, 0xFF, 0x00, 0x41],
+ },
+ ];
- let task = worktree.update(cx, |wt, cx| {
- wt.write_file(
- rel_path,
- text,
- text::LineEnding::Unix,
- encoding_rs::SHIFT_JIS,
- false,
- cx,
- )
- });
+ for (i, case) in cases.into_iter().enumerate() {
+ let file_name = format!("test_{}.txt", i);
+ let path: Arc<Path> = Path::new(&file_name).into();
+ let file_path = root_path.join(&file_name);
- task.await.unwrap();
+ fs.insert_file(&file_path, "".into()).await;
- let bytes = fs.load_bytes(&file_path).await.unwrap();
+ let rel_path = RelPath::new(&path, PathStyle::local()).unwrap().into_arc();
+ let text = text::Rope::from(case.text);
- let expected_bytes = vec![
- 0x82, 0xb1, // こ
- 0x82, 0xf1, // ん
- 0x82, 0xc9, // に
- 0x82, 0xbf, // ち
- 0x82, 0xcd, // は
- ];
+ let task = worktree.update(cx, |wt, cx| {
+ wt.write_file(
+ rel_path,
+ text,
+ text::LineEnding::Unix,
+ case.encoding,
+ case.has_bom,
+ cx,
+ )
+ });
+
+ if let Err(e) = task.await {
+ panic!("Unexpected error in case '{}': {:?}", case.name, e);
+ }
+
+ let bytes = fs.load_bytes(&file_path).await.unwrap();
- assert_eq!(bytes, expected_bytes, "Should be saved as Shift-JIS");
+ assert_eq!(
+ bytes, case.expected_bytes,
+ "case '{}' mismatch. Expected {:?}, but got {:?}",
+ case.name, case.expected_bytes, bytes
+ );
+ }
}