From 27376e051e6390fec6c9d9721fbab1aa33cd2a02 Mon Sep 17 00:00:00 2001
From: R Aadarsh
Date: Fri, 5 Sep 2025 18:11:15 +0530
Subject: [PATCH] Write BOM once for UTF-16 files

---
 crates/fs/src/encodings.rs | 40 ++++++++++++++++++++------------------
 crates/fs/src/fs.rs        | 15 +++++++++++++-
 2 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/crates/fs/src/encodings.rs b/crates/fs/src/encodings.rs
index 0a37c947c3e1e0be339888e68d49a9af64746b01..db75dcadad807874d7700a705d894ec31a24247a 100644
--- a/crates/fs/src/encodings.rs
+++ b/crates/fs/src/encodings.rs
@@ -38,8 +38,13 @@ impl EncodingWrapper {
         EncodingWrapper(encoding)
     }

+    pub fn get_encoding(&self) -> &'static Encoding {
+        self.0
+    }
+
     pub async fn decode(&self, input: Vec<u8>) -> Result<String> {
         let (cow, _had_errors) = self.0.decode_with_bom_removal(&input);
+
         // `encoding_rs` handles invalid bytes by replacing them with replacement characters
         // in the output string, so we return the result even if there were errors.
         // This preserves the original behaviour where files with invalid bytes could still be opened.
@@ -48,30 +53,27 @@ impl EncodingWrapper {

     pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
         if self.0 == encoding_rs::UTF_16BE {
-            let mut data: Vec<u8> = vec![];
-            let utf = input.encode_utf16().collect::<Vec<u16>>();
-
-            for i in utf {
-                let byte = i.to_be_bytes();
-                for b in byte {
-                    data.push(b);
-                }
-            }
+            let mut data = Vec::<u8>::new();
+            data.reserve(input.len() * 2); // Reserve space for UTF-16BE bytes
+
+            // Convert the input string to UTF-16BE bytes
+            let utf16be_bytes: Vec<u8> =
+                input.encode_utf16().flat_map(|u| u.to_be_bytes()).collect();
+
+            data.extend(utf16be_bytes);
             return Ok(data);
         } else if self.0 == encoding_rs::UTF_16LE {
-            let mut data: Vec<u8> = vec![];
-            let utf = input.encode_utf16().collect::<Vec<u16>>();
-
-            for i in utf {
-                let byte = i.to_le_bytes();
-                for b in byte {
-                    data.push(b);
-                }
-            }
+            let mut data = Vec::<u8>::new();
+            data.reserve(input.len() * 2); // Reserve space for UTF-16LE bytes
+
+            // Convert the input string to UTF-16LE bytes
+            let utf16le_bytes: Vec<u8> =
+                input.encode_utf16().flat_map(|u| u.to_le_bytes()).collect();
+
+            data.extend(utf16le_bytes);
             return Ok(data);
         } else {
             let (cow, _encoding_used, _had_errors) = self.0.encode(&input);
-            println!("Encoding: {:?}", self);
             // `encoding_rs` handles unencodable characters by replacing them with
             // appropriate substitutes in the output, so we return the result even if there were errors.
             // This maintains consistency with the decode behaviour.
diff --git a/crates/fs/src/fs.rs b/crates/fs/src/fs.rs
index fc010c8a2e04be5022a8402639e62aed7e7d6f7a..0857b049d07235736df2b1c708588134d7e978a8 100644
--- a/crates/fs/src/fs.rs
+++ b/crates/fs/src/fs.rs
@@ -694,11 +694,24 @@ impl Fs for RealFs {
         }
         let file = smol::fs::File::create(path).await?;
         let mut writer = smol::io::BufWriter::with_capacity(buffer_size, file);
+
+        // BOM for UTF-16 is written at the start of the file here because
+        // if BOM is written in the `encode` function of `fs::encodings`, it would be written
+        // for every chunk, resulting in multiple BOMs in the file.
+        if encoding.get_encoding() == encoding_rs::UTF_16BE {
+            // Write BOM for UTF-16BE
+            writer.write_all(&[0xFE, 0xFF]).await?;
+        } else if encoding.get_encoding() == encoding_rs::UTF_16LE {
+            // Write BOM for UTF-16LE
+            writer.write_all(&[0xFF, 0xFE]).await?;
+        }
+
         for chunk in chunks(text, line_ending) {
             writer
                 .write_all(&from_utf8(chunk.to_string(), encoding.clone()).await?)
-                .await?;
+                .await?
         }
+
         writer.flush().await?;
         Ok(())
     }