diff --git a/crates/language/src/buffer.rs b/crates/language/src/buffer.rs index 99e0c8d4ebdad709eea0e9ab6dbdf9d889d54ec5..5f46340b41a876443f1d12724450d2d8b30f9b33 100644 --- a/crates/language/src/buffer.rs +++ b/crates/language/src/buffer.rs @@ -1490,19 +1490,23 @@ impl Buffer { let (tx, rx) = futures::channel::oneshot::channel(); let prev_version = self.text.version(); self.reload_task = Some(cx.spawn(async move |this, cx| { - let Some((new_mtime, new_text)) = this.update(cx, |this, cx| { + let Some((new_mtime, load_bytes_task, encoding)) = this.update(cx, |this, cx| { let file = this.file.as_ref()?.as_local()?; - - Some((file.disk_state().mtime(), file.load(cx))) + Some(( + file.disk_state().mtime(), + file.load_bytes(cx), + this.encoding, + )) })? else { return Ok(()); }; - let new_text = new_text.await?; - let diff = this - .update(cx, |this, cx| this.diff(new_text.clone(), cx))? - .await; + let bytes = load_bytes_task.await?; + let (cow, _encoding_used, _has_errors) = encoding.decode(&bytes); + let new_text = cow.into_owned(); + + let diff = this.update(cx, |this, cx| this.diff(new_text, cx))?.await; this.update(cx, |this, cx| { if this.version() == diff.base_version { this.finalize_last_transaction(); diff --git a/crates/worktree/src/worktree.rs b/crates/worktree/src/worktree.rs index 7145bccd514fbb5d6093efda765a826162c91260..f5f632e65d71b683d1a491b1fc9e9a612f5c24a5 100644 --- a/crates/worktree/src/worktree.rs +++ b/crates/worktree/src/worktree.rs @@ -1361,7 +1361,7 @@ impl LocalWorktree { } let content = fs.load_bytes(&abs_path).await?; - let (text, encoding, has_bom) = decode_byte(content); + let (text, encoding, has_bom) = decode_byte(content)?; let worktree = this.upgrade().context("worktree was dropped")?; let file = match entry.await? { @@ -1489,25 +1489,12 @@ impl LocalWorktree { let fs = fs.clone(); let abs_path = abs_path.clone(); async move { - let bom_bytes = if has_bom { - if encoding == encoding_rs::UTF_16LE { - vec![0xFF, 0xFE] - } else if encoding == encoding_rs::UTF_16BE { - vec![0xFE, 0xFF] - } else if encoding == encoding_rs::UTF_8 { - vec![0xEF, 0xBB, 0xBF] - } else { - vec![] - } - } else { - vec![] - }; - // For UTF-8, use the optimized `fs.save` which writes Rope chunks directly to disk // without allocating a contiguous string. if encoding == encoding_rs::UTF_8 && !has_bom { return fs.save(&abs_path, &text, line_ending).await; } + // For legacy encodings (e.g. Shift-JIS), we fall back to converting the entire Rope // to a String/Bytes in memory before writing. // @@ -1520,13 +1507,45 @@ impl LocalWorktree { LineEnding::Windows => text_string.replace('\n', "\r\n"), }; - let (cow, _, _) = encoding.encode(&normalized_text); - let bytes = if !bom_bytes.is_empty() { - let mut bytes = bom_bytes; - bytes.extend_from_slice(&cow); - bytes.into() + // Create the byte vector manually for UTF-16 encodings because encoding_rs encodes to UTF-8 by default (per WHATWG standards), + // which is not what we want for saving files. 
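
To make the comment above concrete: encoding_rs follows the WHATWG Encoding Standard, under which the UTF-16LE/BE encoders emit UTF-8, so `encoding.encode(...)` can never produce on-disk UTF-16, and the added branch below builds the byte stream by hand via `encode_utf16()`. A minimal standalone sketch of that conversion; the helper name and parameters are illustrative and not part of this patch:

```rust
/// Sketch only: build UTF-16 bytes by hand, because encoding_rs's UTF-16
/// encoders emit UTF-8 per the WHATWG Encoding Standard.
/// `big_endian` / `with_bom` mirror the `encoding` and `has_bom` values used
/// by the surrounding save path.
fn encode_utf16_bytes(text: &str, big_endian: bool, with_bom: bool) -> Vec<u8> {
    // Capacity is an estimate: `len()` is the UTF-8 byte count.
    let mut data = Vec::with_capacity(text.len() * 2 + 2);
    if with_bom {
        let bom: [u8; 2] = if big_endian { [0xFE, 0xFF] } else { [0xFF, 0xFE] };
        data.extend_from_slice(&bom);
    }
    for unit in text.encode_utf16() {
        let bytes = if big_endian { unit.to_be_bytes() } else { unit.to_le_bytes() };
        data.extend_from_slice(&bytes);
    }
    data
}

fn main() {
    // "こんにちは" as UTF-16LE without BOM: 53 30 93 30 6B 30 61 30 6F 30
    assert_eq!(
        encode_utf16_bytes("こんにちは", false, false),
        [0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30]
    );
}
```
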
+ let bytes = if encoding == encoding_rs::UTF_16BE { + let mut data = Vec::with_capacity(normalized_text.len() * 2 + 2); + if has_bom { + data.extend_from_slice(&[0xFE, 0xFF]); // BOM + } + let utf16be_bytes = + normalized_text.encode_utf16().flat_map(|u| u.to_be_bytes()); + data.extend(utf16be_bytes); + data.into() + } else if encoding == encoding_rs::UTF_16LE { + let mut data = Vec::with_capacity(normalized_text.len() * 2 + 2); + if has_bom { + data.extend_from_slice(&[0xFF, 0xFE]); // BOM + } + let utf16le_bytes = + normalized_text.encode_utf16().flat_map(|u| u.to_le_bytes()); + data.extend(utf16le_bytes); + data.into() } else { - cow + // For other encodings (Shift-JIS, UTF-8 with BOM, etc.), delegate to encoding_rs. + let bom_bytes = if has_bom { + if encoding == encoding_rs::UTF_8 { + vec![0xEF, 0xBB, 0xBF] + } else { + vec![] + } + } else { + vec![] + }; + let (cow, _, _) = encoding.encode(&normalized_text); + if !bom_bytes.is_empty() { + let mut bytes = bom_bytes; + bytes.extend_from_slice(&cow); + bytes.into() + } else { + cow + } }; fs.write(&abs_path, &bytes).await @@ -5842,11 +5861,28 @@ impl fs::Watcher for NullWatcher { } } -fn decode_byte(bytes: Vec) -> (String, &'static Encoding, bool) { +fn decode_byte(bytes: Vec) -> anyhow::Result<(String, &'static Encoding, bool)> { // check BOM if let Some((encoding, _bom_len)) = Encoding::for_bom(&bytes) { let (cow, _) = encoding.decode_with_bom_removal(&bytes); - return (cow.into_owned(), encoding, true); + return Ok((cow.into_owned(), encoding, true)); + } + + match analyze_byte_content(&bytes) { + ByteContent::Utf16Le => { + let encoding = encoding_rs::UTF_16LE; + let (cow, _, _) = encoding.decode(&bytes); + return Ok((cow.into_owned(), encoding, false)); + } + ByteContent::Utf16Be => { + let encoding = encoding_rs::UTF_16BE; + let (cow, _, _) = encoding.decode(&bytes); + return Ok((cow.into_owned(), encoding, false)); + } + ByteContent::Binary => { + anyhow::bail!("Binary files are not supported"); + } + ByteContent::Unknown => {} } fn detect_encoding(bytes: Vec) -> (String, &'static Encoding) { @@ -5867,14 +5903,66 @@ fn decode_byte(bytes: Vec) -> (String, &'static Encoding, bool) { // displaying raw escape sequences instead of the correct characters. if text.contains('\x1b') { let (s, enc) = detect_encoding(text.into_bytes()); - (s, enc, false) + Ok((s, enc, false)) } else { - (text, encoding_rs::UTF_8, false) + Ok((text, encoding_rs::UTF_8, false)) } } Err(e) => { let (s, enc) = detect_encoding(e.into_bytes()); - (s, enc, false) + Ok((s, enc, false)) } } } + +#[derive(PartialEq)] +enum ByteContent { + Utf16Le, + Utf16Be, + Binary, + Unknown, +} +// Heuristic check using null byte distribution. +// NOTE: This relies on the presence of ASCII characters (which become `0x00` in UTF-16). +// Files consisting purely of non-ASCII characters (like Japanese) may not be detected here +// and will result in `Unknown`. 
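
For reference, a few inputs and the classifications the null-byte heuristic defined directly below should yield; this is a test-style sketch for illustration only, not part of the diff:

```rust
#[test]
fn analyze_byte_content_examples() {
    // Nulls at odd indices ("A\0B\0C\0") look like UTF-16LE-encoded ASCII.
    assert!(matches!(analyze_byte_content(b"A\0B\0C\0"), ByteContent::Utf16Le));
    // Nulls at even indices ("\0A\0B\0C") look like UTF-16BE-encoded ASCII.
    assert!(matches!(analyze_byte_content(b"\0A\0B\0C"), ByteContent::Utf16Be));
    // Plenty of nulls without a consistent even/odd pattern reads as binary.
    assert!(matches!(
        analyze_byte_content(&[0x00, 0xFF, 0x12, 0x00, 0x99, 0x88, 0x77, 0x66, 0x00]),
        ByteContent::Binary
    ));
    // No nulls at all: the heuristic stays out of the way.
    assert!(matches!(analyze_byte_content(b"plain utf-8"), ByteContent::Unknown));
}
```
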
+fn analyze_byte_content(bytes: &[u8]) -> ByteContent { + if bytes.len() < 2 { + return ByteContent::Unknown; + } + + let check_len = bytes.len().min(1024); + let sample = &bytes[..check_len]; + + if !sample.contains(&0) { + return ByteContent::Unknown; + } + + let mut even_nulls = 0; + let mut odd_nulls = 0; + + for (i, &byte) in sample.iter().enumerate() { + if byte == 0 { + if i % 2 == 0 { + even_nulls += 1; + } else { + odd_nulls += 1; + } + } + } + + let total_nulls = even_nulls + odd_nulls; + if total_nulls < check_len / 10 { + return ByteContent::Unknown; + } + + if even_nulls > odd_nulls * 4 { + return ByteContent::Utf16Be; + } + + if odd_nulls > even_nulls * 4 { + return ByteContent::Utf16Le; + } + + ByteContent::Binary +} diff --git a/crates/worktree/src/worktree_tests.rs b/crates/worktree/src/worktree_tests.rs index 094a6d52ea4168752578eab06cea511a57e65c10..45d39710c6ea825aded4d29f447124ee4c2ecb33 100644 --- a/crates/worktree/src/worktree_tests.rs +++ b/crates/worktree/src/worktree_tests.rs @@ -1,5 +1,5 @@ use crate::{Entry, EntryKind, Event, PathChange, Worktree, WorktreeModelHandle}; -use anyhow::{Context as _, Result}; +use anyhow::Result; use encoding_rs; use fs::{FakeFs, Fs, RealFs, RemoveOptions}; use git::{DOT_GIT, GITIGNORE, REPO_EXCLUDE}; @@ -2568,71 +2568,87 @@ fn init_test(cx: &mut gpui::TestAppContext) { #[gpui::test] async fn test_load_file_encoding(cx: &mut TestAppContext) { init_test(cx); - let test_cases: Vec<(&str, &[u8], &str)> = vec![ - ("utf8.txt", "こんにちは".as_bytes(), "こんにちは"), // "こんにちは" is Japanese "Hello" - ( - "sjis.txt", - &[0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd], - "こんにちは", - ), - ( - "eucjp.txt", - &[0xa4, 0xb3, 0xa4, 0xf3, 0xa4, 0xcb, 0xa4, 0xc1, 0xa4, 0xcf], - "こんにちは", - ), - ( - "iso2022jp.txt", - &[ + + struct TestCase { + name: &'static str, + bytes: Vec, + expected_text: &'static str, + } + + // --- Success Cases --- + let success_cases = vec![ + TestCase { + name: "utf8.txt", + bytes: "こんにちは".as_bytes().to_vec(), + expected_text: "こんにちは", + }, + TestCase { + name: "sjis.txt", + bytes: vec![0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd], + expected_text: "こんにちは", + }, + TestCase { + name: "eucjp.txt", + bytes: vec![0xa4, 0xb3, 0xa4, 0xf3, 0xa4, 0xcb, 0xa4, 0xc1, 0xa4, 0xcf], + expected_text: "こんにちは", + }, + TestCase { + name: "iso2022jp.txt", + bytes: vec![ 0x1b, 0x24, 0x42, 0x24, 0x33, 0x24, 0x73, 0x24, 0x4b, 0x24, 0x41, 0x24, 0x4f, 0x1b, 0x28, 0x42, ], - "こんにちは", - ), - // Western Europe (Windows-1252) - // "Café" -> 0xE9 is 'é' in Windows-1252 (it is typically 0xC3 0xA9 in UTF-8) - ("win1252.txt", &[0x43, 0x61, 0x66, 0xe9], "Café"), - // Chinese Simplified (GBK) - // Note: We use a slightly longer string here because short byte sequences can be ambiguous - // in multi-byte encodings. Providing more context helps the heuristic detector guess correctly. 
- // Text: "今天天气不错" (Today's weather is not bad / nice) - // Bytes: - // 今: BD F1 - // 天: CC EC - // 天: CC EC - // 气: C6 F8 - // 不: B2 BB - // 错: B4 ED - ( - "gbk.txt", - &[ + expected_text: "こんにちは", + }, + TestCase { + name: "win1252.txt", + bytes: vec![0x43, 0x61, 0x66, 0xe9], + expected_text: "Café", + }, + TestCase { + name: "gbk.txt", + bytes: vec![ 0xbd, 0xf1, 0xcc, 0xec, 0xcc, 0xec, 0xc6, 0xf8, 0xb2, 0xbb, 0xb4, 0xed, ], - "今天天气不错", - ), - ( - "utf16le_bom.txt", - &[ + expected_text: "今天天气不错", + }, + // UTF-16LE with BOM + TestCase { + name: "utf16le_bom.txt", + bytes: vec![ 0xFF, 0xFE, // BOM - 0x53, 0x30, // こ - 0x93, 0x30, // ん - 0x6B, 0x30, // に - 0x61, 0x30, // ち - 0x6F, 0x30, // は + 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30, ], - "こんにちは", - ), - ( - "utf8_bom.txt", - &[ - 0xEF, 0xBB, 0xBF, // UTF-8 BOM - 0xE3, 0x81, 0x93, // こ - 0xE3, 0x82, 0x93, // ん - 0xE3, 0x81, 0xAB, // に - 0xE3, 0x81, 0xA1, // ち - 0xE3, 0x81, 0xAF, // は + expected_text: "こんにちは", + }, + // UTF-16BE with BOM + TestCase { + name: "utf16be_bom.txt", + bytes: vec![ + 0xFE, 0xFF, // BOM + 0x30, 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, ], - "こんにちは", - ), + expected_text: "こんにちは", + }, + // UTF-16LE without BOM (ASCII only) + // This relies on the "null byte heuristic" we implemented. + // "ABC" -> 41 00 42 00 43 00 + TestCase { + name: "utf16le_ascii_no_bom.txt", + bytes: vec![0x41, 0x00, 0x42, 0x00, 0x43, 0x00], + expected_text: "ABC", + }, + ]; + + // --- Failure Cases --- + let failure_cases = vec![ + // Binary File (Should be detected by heuristic and return Error) + // Contains random bytes and mixed nulls that don't match UTF-16 patterns + TestCase { + name: "binary.bin", + bytes: vec![0x00, 0xFF, 0x12, 0x00, 0x99, 0x88, 0x77, 0x66, 0x00], + expected_text: "", // Not used + }, ]; let root_path = if cfg!(windows) { @@ -2642,15 +2658,11 @@ async fn test_load_file_encoding(cx: &mut TestAppContext) { }; let fs = FakeFs::new(cx.background_executor.clone()); + fs.create_dir(root_path).await.unwrap(); - let mut files_json = serde_json::Map::new(); - for (name, _, _) in &test_cases { - files_json.insert(name.to_string(), serde_json::Value::String("".to_string())); - } - - for (name, bytes, _) in &test_cases { - let path = root_path.join(name); - fs.write(&path, bytes).await.unwrap(); + for case in success_cases.iter().chain(failure_cases.iter()) { + let path = root_path.join(case.name); + fs.write(&path, &case.bytes).await.unwrap(); } let tree = Worktree::local( @@ -2667,34 +2679,54 @@ async fn test_load_file_encoding(cx: &mut TestAppContext) { cx.read(|cx| tree.read(cx).as_local().unwrap().scan_complete()) .await; - for (name, _, expected) in test_cases { - let loaded = tree - .update(cx, |tree, cx| tree.load_file(rel_path(name), cx)) - .await - .with_context(|| format!("Failed to load {}", name)) - .unwrap(); + let rel_path = |name: &str| { + RelPath::new(&Path::new(name), PathStyle::local()) + .unwrap() + .into_arc() + }; + // Run Success Tests + for case in success_cases { + let loaded = tree + .update(cx, |tree, cx| tree.load_file(&rel_path(case.name), cx)) + .await; + if let Err(e) = &loaded { + panic!("Failed to load success case '{}': {:?}", case.name, e); + } + let loaded = loaded.unwrap(); assert_eq!( - loaded.text, expected, + loaded.text, case.expected_text, "Encoding mismatch for file: {}", - name + case.name ); } + + // Run Failure Tests + for case in failure_cases { + let loaded = tree + .update(cx, |tree, cx| tree.load_file(&rel_path(case.name), cx)) + .await; + 
assert!( + loaded.is_err(), + "Failure case '{}' unexpectedly succeeded! It should have been detected as binary.", + case.name + ); + let err_msg = loaded.unwrap_err().to_string(); + println!("Got expected error for {}: {}", case.name, err_msg); + } } #[gpui::test] async fn test_write_file_encoding(cx: &mut gpui::TestAppContext) { init_test(cx); let fs = FakeFs::new(cx.executor()); + let root_path = if cfg!(windows) { Path::new("C:\\root") } else { Path::new("/root") }; fs.create_dir(root_path).await.unwrap(); - let file_path = root_path.join("test.txt"); - - fs.insert_file(&file_path, "initial".into()).await; let worktree = Worktree::local( root_path, @@ -2707,33 +2739,107 @@ async fn test_write_file_encoding(cx: &mut gpui::TestAppContext) { .await .unwrap(); - let path: Arc = Path::new("test.txt").into(); - let rel_path = RelPath::new(&path, PathStyle::local()).unwrap().into_arc(); + // Define test case structure + struct TestCase { + name: &'static str, + text: &'static str, + encoding: &'static encoding_rs::Encoding, + has_bom: bool, + expected_bytes: Vec, + } - let text = text::Rope::from("こんにちは"); + let cases = vec![ + // Shift_JIS with Japanese + TestCase { + name: "Shift_JIS with Japanese", + text: "こんにちは", + encoding: encoding_rs::SHIFT_JIS, + has_bom: false, + expected_bytes: vec![0x82, 0xb1, 0x82, 0xf1, 0x82, 0xc9, 0x82, 0xbf, 0x82, 0xcd], + }, + // UTF-8 No BOM + TestCase { + name: "UTF-8 No BOM", + text: "AB", + encoding: encoding_rs::UTF_8, + has_bom: false, + expected_bytes: vec![0x41, 0x42], + }, + // UTF-8 with BOM + TestCase { + name: "UTF-8 with BOM", + text: "AB", + encoding: encoding_rs::UTF_8, + has_bom: true, + expected_bytes: vec![0xEF, 0xBB, 0xBF, 0x41, 0x42], + }, + // UTF-16LE No BOM with Japanese + // NOTE: This passes thanks to the manual encoding fix implemented in `write_file`. + TestCase { + name: "UTF-16LE No BOM with Japanese", + text: "こんにちは", + encoding: encoding_rs::UTF_16LE, + has_bom: false, + expected_bytes: vec![0x53, 0x30, 0x93, 0x30, 0x6b, 0x30, 0x61, 0x30, 0x6f, 0x30], + }, + // UTF-16LE with BOM + TestCase { + name: "UTF-16LE with BOM", + text: "A", + encoding: encoding_rs::UTF_16LE, + has_bom: true, + expected_bytes: vec![0xFF, 0xFE, 0x41, 0x00], + }, + // UTF-16BE No BOM with Japanese + // NOTE: This passes thanks to the manual encoding fix. 
+ TestCase { + name: "UTF-16BE No BOM with Japanese", + text: "こんにちは", + encoding: encoding_rs::UTF_16BE, + has_bom: false, + expected_bytes: vec![0x30, 0x53, 0x30, 0x93, 0x30, 0x6b, 0x30, 0x61, 0x30, 0x6f], + }, + // UTF-16BE with BOM + TestCase { + name: "UTF-16BE with BOM", + text: "A", + encoding: encoding_rs::UTF_16BE, + has_bom: true, + expected_bytes: vec![0xFE, 0xFF, 0x00, 0x41], + }, + ]; - let task = worktree.update(cx, |wt, cx| { - wt.write_file( - rel_path, - text, - text::LineEnding::Unix, - encoding_rs::SHIFT_JIS, - false, - cx, - ) - }); + for (i, case) in cases.into_iter().enumerate() { + let file_name = format!("test_{}.txt", i); + let path: Arc = Path::new(&file_name).into(); + let file_path = root_path.join(&file_name); - task.await.unwrap(); + fs.insert_file(&file_path, "".into()).await; - let bytes = fs.load_bytes(&file_path).await.unwrap(); + let rel_path = RelPath::new(&path, PathStyle::local()).unwrap().into_arc(); + let text = text::Rope::from(case.text); - let expected_bytes = vec![ - 0x82, 0xb1, // こ - 0x82, 0xf1, // ん - 0x82, 0xc9, // に - 0x82, 0xbf, // ち - 0x82, 0xcd, // は - ]; + let task = worktree.update(cx, |wt, cx| { + wt.write_file( + rel_path, + text, + text::LineEnding::Unix, + case.encoding, + case.has_bom, + cx, + ) + }); + + if let Err(e) = task.await { + panic!("Unexpected error in case '{}': {:?}", case.name, e); + } + + let bytes = fs.load_bytes(&file_path).await.unwrap(); - assert_eq!(bytes, expected_bytes, "Should be saved as Shift-JIS"); + assert_eq!( + bytes, case.expected_bytes, + "case '{}' mismatch. Expected {:?}, but got {:?}", + case.name, case.expected_bytes, bytes + ); + } }
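
Taken together, the load and save paths are meant to round-trip: bytes read with a UTF-16 BOM decode via `decode_with_bom_removal`, and saving with the same encoding and `has_bom == true` should reproduce the original file. A standalone sketch of that invariant using encoding_rs directly, mirroring the `utf16le_bom.txt` fixture above (not part of the patch):

```rust
use encoding_rs::Encoding;

fn main() {
    // Bytes of the `utf16le_bom.txt` fixture: a BOM followed by "こんにちは".
    let on_disk: Vec<u8> = vec![
        0xFF, 0xFE, // BOM
        0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30,
    ];

    // Load side: BOM sniffing picks UTF-16LE and decoding strips the BOM.
    let (encoding, _bom_len) = Encoding::for_bom(&on_disk).expect("fixture has a BOM");
    assert_eq!(encoding, encoding_rs::UTF_16LE);
    let (text, _had_errors) = encoding.decode_with_bom_removal(&on_disk);
    assert_eq!(text, "こんにちは");

    // Save side: re-encode by hand (encoding_rs would emit UTF-8 here) and
    // prepend the BOM, as the worktree save path now does for UTF-16.
    let mut roundtrip = vec![0xFF, 0xFE];
    roundtrip.extend(text.encode_utf16().flat_map(|u| u.to_le_bytes()));
    assert_eq!(roundtrip, on_disk);
}
```
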