Write BOM once for UTF-16 files

R Aadarsh created

Change summary

crates/fs/src/encodings.rs | 40 +++++++++++++++++++++-------------------
crates/fs/src/fs.rs        | 15 ++++++++++++++-
2 files changed, 35 insertions(+), 20 deletions(-)

Detailed changes

crates/fs/src/encodings.rs 🔗

@@ -38,8 +38,13 @@ impl EncodingWrapper {
         EncodingWrapper(encoding)
     }
 
+    pub fn get_encoding(&self) -> &'static Encoding {
+        self.0
+    }
+
     pub async fn decode(&self, input: Vec<u8>) -> Result<String> {
         let (cow, _had_errors) = self.0.decode_with_bom_removal(&input);
+
         // `encoding_rs` handles invalid bytes by replacing them with replacement characters
         // in the output string, so we return the result even if there were errors.
         // This preserves the original behaviour where files with invalid bytes could still be opened.
@@ -48,30 +53,27 @@ impl EncodingWrapper {
 
     pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
         if self.0 == encoding_rs::UTF_16BE {
-            let mut data: Vec<u8> = vec![];
-            let utf = input.encode_utf16().collect::<Vec<u16>>();
-
-            for i in utf {
-                let byte = i.to_be_bytes();
-                for b in byte {
-                    data.push(b);
-                }
-            }
+            let mut data = Vec::<u8>::new();
+            data.reserve(input.len() * 2); // Reserve space for UTF-16BE bytes
+
+            // Convert the input string to UTF-16BE bytes
+            let utf16be_bytes: Vec<u8> =
+                input.encode_utf16().flat_map(|u| u.to_be_bytes()).collect();
+
+            data.extend(utf16be_bytes);
             return Ok(data);
         } else if self.0 == encoding_rs::UTF_16LE {
-            let mut data: Vec<u8> = vec![];
-            let utf = input.encode_utf16().collect::<Vec<u16>>();
-
-            for i in utf {
-                let byte = i.to_le_bytes();
-                for b in byte {
-                    data.push(b);
-                }
-            }
+            let mut data = Vec::<u8>::new();
+            data.reserve(input.len() * 2); // Reserve space for UTF-16LE bytes
+
+            // Convert the input string to UTF-16LE bytes
+            let utf16le_bytes: Vec<u8> =
+                input.encode_utf16().flat_map(|u| u.to_le_bytes()).collect();
+
+            data.extend(utf16le_bytes);
             return Ok(data);
         } else {
             let (cow, _encoding_used, _had_errors) = self.0.encode(&input);
-            println!("Encoding: {:?}", self);
             // `encoding_rs` handles unencodable characters by replacing them with
             // appropriate substitutes in the output, so we return the result even if there were errors.
             // This maintains consistency with the decode behaviour.

crates/fs/src/fs.rs 🔗

@@ -694,11 +694,24 @@ impl Fs for RealFs {
         }
         let file = smol::fs::File::create(path).await?;
         let mut writer = smol::io::BufWriter::with_capacity(buffer_size, file);
+
+        // Write the UTF-16 BOM once, here at the start of the file. If it were emitted
+        // by the `encode` function of `fs::encodings`, it would be repeated for every
+        // chunk, resulting in multiple BOMs in the file.
+        if encoding.get_encoding() == encoding_rs::UTF_16BE {
+            // Write BOM for UTF-16BE
+            writer.write_all(&[0xFE, 0xFF]).await?;
+        } else if encoding.get_encoding() == encoding_rs::UTF_16LE {
+            // Write BOM for UTF-16LE
+            writer.write_all(&[0xFF, 0xFE]).await?;
+        }
+
         for chunk in chunks(text, line_ending) {
             writer
                 .write_all(&from_utf8(chunk.to_string(), encoding.clone()).await?)
-                .await?;
+                .await?
         }
+
         writer.flush().await?;
         Ok(())
     }