encodings.rs

  1//! Encoding and decoding utilities using the `encoding_rs` crate.
  2use std::{
  3    fmt::Debug,
  4    sync::{Arc, Mutex},
  5};
  6
  7use anyhow::Result;
  8use encoding_rs::Encoding;
  9
/// A thin newtype around a `&'static encoding_rs::Encoding`.
///
/// The wrapper exists so that `Debug`, `PartialEq`, `Clone`, `Send` and `Sync` can be
/// implemented for it in this file. Since the wrapped reference is `'static`, it is
/// safe to send it across threads.
pub struct EncodingWrapper(&'static Encoding);
 13
 14impl Debug for EncodingWrapper {
 15    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 16        f.debug_tuple(&format!("EncodingWrapper{:?}", self.0))
 17            .field(&self.0.name())
 18            .finish()
 19    }
 20}
 21
/// Unit struct with no fields; nothing in this part of the file constructs or uses it.
// NOTE(review): the name suggests a (de)serialization visitor for `EncodingWrapper` —
// confirm against whatever code implements traits for this type.
pub struct EncodingWrapperVisitor;
 23
 24impl PartialEq for EncodingWrapper {
 25    fn eq(&self, other: &Self) -> bool {
 26        self.0.name() == other.0.name()
 27    }
 28}
 29
// SAFETY: `EncodingWrapper` holds nothing but a `&'static Encoding`, a shared reference
// that never dangles; the code in this file only calls `&self` methods through it.
// NOTE(review): a `&'static T` is already `Send + Sync` whenever `T: Sync`, so these
// impls may be redundant if `encoding_rs::Encoding` is `Sync` — confirm, then remove.
unsafe impl Send for EncodingWrapper {}
unsafe impl Sync for EncodingWrapper {}
 32
 33impl Clone for EncodingWrapper {
 34    fn clone(&self) -> Self {
 35        EncodingWrapper(self.0)
 36    }
 37}
 38
 39impl EncodingWrapper {
 40    pub fn new(encoding: &'static Encoding) -> EncodingWrapper {
 41        EncodingWrapper(encoding)
 42    }
 43
 44    pub fn get_encoding(&self) -> &'static Encoding {
 45        self.0
 46    }
 47
 48    pub async fn decode(
 49        &mut self,
 50        input: Vec<u8>,
 51        force: bool,
 52        buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
 53    ) -> Result<String> {
 54        // Check if the input starts with a BOM for UTF-16 encodings only if not forced to
 55        // use the encoding specified.
 56        if !force {
 57            if input.len() >= 2 {
 58                if (input[0] == 0xFF) & (input[1] == 0xFE) {
 59                    self.0 = encoding_rs::UTF_16LE;
 60
 61                    if let Some(v) = buffer_encoding {
 62                        if let Ok(mut v) = (*v).lock() {
 63                            *v = encoding_rs::UTF_16LE;
 64                        }
 65                    }
 66                } else if (input.len() >= 2) & (input[0] == 0xFE) & (input[1] == 0xFF) {
 67                    self.0 = encoding_rs::UTF_16BE;
 68
 69                    if let Some(v) = buffer_encoding {
 70                        if let Ok(mut v) = (*v).lock() {
 71                            *v = encoding_rs::UTF_16BE;
 72                        }
 73                    }
 74                }
 75            }
 76        }
 77
 78        let (cow, _had_errors) = self.0.decode_with_bom_removal(&input);
 79
 80        // `encoding_rs` handles invalid bytes by replacing them with replacement characters
 81        // in the output string, so we return the result even if there were errors.
 82        // This preserves the original behaviour where files with invalid bytes could still be opened.
 83        Ok(cow.into_owned())
 84    }
 85
 86    pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
 87        if self.0 == encoding_rs::UTF_16BE {
 88            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
 89
 90            // Convert the input string to UTF-16BE bytes
 91            let utf16be_bytes: Vec<u8> =
 92                input.encode_utf16().flat_map(|u| u.to_be_bytes()).collect();
 93
 94            data.extend(utf16be_bytes);
 95            return Ok(data);
 96        } else if self.0 == encoding_rs::UTF_16LE {
 97            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
 98
 99            // Convert the input string to UTF-16LE bytes
100            let utf16le_bytes: Vec<u8> =
101                input.encode_utf16().flat_map(|u| u.to_le_bytes()).collect();
102
103            data.extend(utf16le_bytes);
104            return Ok(data);
105        } else {
106            let (cow, _encoding_used, _had_errors) = self.0.encode(&input);
107            // `encoding_rs` handles unencodable characters by replacing them with
108            // appropriate substitutes in the output, so we return the result even if there were errors.
109            // This maintains consistency with the decode behaviour.
110            Ok(cow.into_owned())
111        }
112    }
113}
114
115/// Convert a byte vector from a specified encoding to a UTF-8 string.
116pub async fn to_utf8(
117    input: Vec<u8>,
118    mut encoding: EncodingWrapper,
119    force: bool,
120    buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
121) -> Result<String> {
122    encoding.decode(input, force, buffer_encoding).await
123}
124
125/// Convert a UTF-8 string to a byte vector in a specified encoding.
126pub async fn from_utf8(input: String, target: EncodingWrapper) -> Result<Vec<u8>> {
127    target.encode(input).await
128}