encodings.rs

  1//! Encoding and decoding utilities using the `encoding_rs` crate.
  2use std::{
  3    fmt::Debug,
  4    sync::{Arc, Mutex},
  5};
  6
  7use anyhow::Result;
  8use encoding_rs::Encoding;
  9
/// A newtype around a `&'static encoding_rs::Encoding`.
///
/// The wrapper exists so that this file can attach its own trait
/// implementations (`Debug`, `Default`, `PartialEq`, `Clone`, `Send`, `Sync`)
/// and the `decode`/`encode` helpers to an encoding handle.
/// Since the reference is static, it is safe to send it across threads.
pub struct EncodingWrapper(&'static Encoding);
 13
 14impl Debug for EncodingWrapper {
 15    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 16        f.debug_tuple(&format!("EncodingWrapper{:?}", self.0))
 17            .field(&self.0.name())
 18            .finish()
 19    }
 20}
 21
 22impl Default for EncodingWrapper {
 23    fn default() -> Self {
 24        EncodingWrapper(encoding_rs::UTF_8)
 25    }
 26}
 27
/// Unit marker type; no implementations for it appear in this part of the file.
///
/// NOTE(review): the name suggests a deserialization visitor for
/// `EncodingWrapper` — confirm against the code that uses it.
pub struct EncodingWrapperVisitor;
 29
 30impl PartialEq for EncodingWrapper {
 31    fn eq(&self, other: &Self) -> bool {
 32        self.0.name() == other.0.name()
 33    }
 34}
 35
// SAFETY: the wrapper's only field is a shared `&'static Encoding` reference,
// and nothing in this file mutates the pointee through it.
// NOTE(review): `encoding_rs` exposes its encodings as `static` items, which
// suggests `Encoding` is already `Sync` and these manual impls may be
// redundant — confirm before removing.
unsafe impl Send for EncodingWrapper {}
unsafe impl Sync for EncodingWrapper {}
 38
 39impl Clone for EncodingWrapper {
 40    fn clone(&self) -> Self {
 41        EncodingWrapper(self.0)
 42    }
 43}
 44
 45impl EncodingWrapper {
 46    pub fn new(encoding: &'static Encoding) -> EncodingWrapper {
 47        EncodingWrapper(encoding)
 48    }
 49
 50    pub fn get_encoding(&self) -> &'static Encoding {
 51        self.0
 52    }
 53
 54    pub async fn decode(
 55        &mut self,
 56        input: Vec<u8>,
 57        force: bool,
 58        buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
 59    ) -> Result<String> {
 60        // Check if the input starts with a BOM for UTF-16 encodings only if not forced to
 61        // use the encoding specified.
 62        if !force {
 63            if let Some(encoding) = match input.get(..2) {
 64                Some([0xFF, 0xFE]) => Some(encoding_rs::UTF_16LE),
 65                Some([0xFE, 0xFF]) => Some(encoding_rs::UTF_16BE),
 66                _ => None,
 67            } {
 68                self.0 = encoding;
 69
 70                if let Some(v) = buffer_encoding {
 71                    if let Ok(mut v) = (*v).lock() {
 72                        *v = encoding;
 73                    }
 74                }
 75            }
 76        }
 77
 78        let (cow, _had_errors) = self.0.decode_with_bom_removal(&input);
 79
 80        if !_had_errors {
 81            Ok(cow.to_string())
 82        } else {
 83            // If there were decoding errors, return an error.
 84            Err(anyhow::anyhow!(
 85                "The file contains invalid bytes for the specified encoding: {}. This usually menas that the file is not a regular text file, or is encoded in a different encoding. Continuing to open it may result in data loss if saved.",
 86                self.0.name()
 87            ))
 88        }
 89    }
 90
 91    pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
 92        if self.0 == encoding_rs::UTF_16BE {
 93            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
 94
 95            // Convert the input string to UTF-16BE bytes
 96            let utf16be_bytes = input.encode_utf16().flat_map(|u| u.to_be_bytes());
 97
 98            data.extend(utf16be_bytes);
 99            return Ok(data);
100        } else if self.0 == encoding_rs::UTF_16LE {
101            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
102
103            // Convert the input string to UTF-16LE bytes
104            let utf16le_bytes = input.encode_utf16().flat_map(|u| u.to_le_bytes());
105
106            data.extend(utf16le_bytes);
107            return Ok(data);
108        } else {
109            let (cow, _encoding_used, _had_errors) = self.0.encode(&input);
110            // `encoding_rs` handles unencodable characters by replacing them with
111            // appropriate substitutes in the output, so we return the result even if there were errors.
112            // This maintains consistency with the decode behaviour.
113            Ok(cow.into_owned())
114        }
115    }
116}
117
118/// Convert a byte vector from a specified encoding to a UTF-8 string.
119pub async fn to_utf8(
120    input: Vec<u8>,
121    mut encoding: EncodingWrapper,
122    force: bool,
123    buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
124) -> Result<String> {
125    encoding.decode(input, force, buffer_encoding).await
126}
127
128/// Convert a UTF-8 string to a byte vector in a specified encoding.
129pub async fn from_utf8(input: String, target: EncodingWrapper) -> Result<Vec<u8>> {
130    target.encode(input).await
131}