encodings.rs

  1//! Encoding and decoding utilities using the `encoding_rs` crate.
  2use std::{
  3    fmt::Debug,
  4    sync::{Arc, Mutex},
  5};
  6
  7use std::sync::atomic::AtomicBool;
  8
  9use anyhow::Result;
 10use encoding_rs::Encoding;
 11
/// A newtype around a `&'static` [`Encoding`] reference.
///
/// The wrapper exists so that `Send` and `Sync` can be implemented for it
/// (see the `unsafe impl`s in this file). The stated rationale is that the
/// wrapped reference is `'static`.
///
/// NOTE(review): a `&'static T` is only `Send`/`Sync` when `T: Sync`, so the
/// `'static` lifetime alone is not the whole argument — confirm that
/// `Encoding` is `Sync`.
///
/// The inner reference is public, so callers may read or replace it directly.
#[derive(Copy)]
pub struct EncodingWrapper(pub &'static Encoding);
 16
 17impl Debug for EncodingWrapper {
 18    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 19        f.debug_tuple(&format!("EncodingWrapper{:?}", self.0))
 20            .field(&self.0.name())
 21            .finish()
 22    }
 23}
 24
 25impl Default for EncodingWrapper {
 26    fn default() -> Self {
 27        EncodingWrapper(encoding_rs::UTF_8)
 28    }
 29}
 30
 31impl PartialEq for EncodingWrapper {
 32    fn eq(&self, other: &Self) -> bool {
 33        self.0.name() == other.0.name()
 34    }
 35}
 36
// SAFETY: the wrapper's only field is a shared `&'static Encoding` reference,
// so moving or sharing the wrapper across threads only shares read access to
// the referenced `Encoding`.
// NOTE(review): this presumes `Encoding` has no interior mutability — confirm
// against `encoding_rs`. If `Encoding: Sync` already holds, the compiler
// derives both auto traits for this type and these manual impls are redundant.
unsafe impl Send for EncodingWrapper {}
unsafe impl Sync for EncodingWrapper {}
 39
 40impl Clone for EncodingWrapper {
 41    fn clone(&self) -> Self {
 42        EncodingWrapper(self.0)
 43    }
 44}
 45
 46impl EncodingWrapper {
 47    pub fn new(encoding: &'static Encoding) -> EncodingWrapper {
 48        EncodingWrapper(encoding)
 49    }
 50
 51    pub fn get_encoding(&self) -> &'static Encoding {
 52        self.0
 53    }
 54
 55    pub async fn decode(
 56        &mut self,
 57        input: Vec<u8>,
 58        force: bool,
 59        detect_utf16: bool,
 60        buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
 61    ) -> Result<String> {
 62        // Check if the input starts with a BOM for UTF-16 encodings only if detect_utf16 is true.
 63        if detect_utf16 {
 64            if let Some(encoding) = match input.get(..2) {
 65                Some([0xFF, 0xFE]) => Some(encoding_rs::UTF_16LE),
 66                Some([0xFE, 0xFF]) => Some(encoding_rs::UTF_16BE),
 67                _ => None,
 68            } {
 69                self.0 = encoding;
 70
 71                if let Some(v) = buffer_encoding
 72                    && let Ok(mut v) = v.lock()
 73                {
 74                    *v = encoding;
 75                }
 76            }
 77        }
 78
 79        let (cow, had_errors) = self.0.decode_with_bom_removal(&input);
 80
 81        if force {
 82            return Ok(cow.to_string());
 83        }
 84
 85        if !had_errors {
 86            Ok(cow.to_string())
 87        } else {
 88            Err(anyhow::anyhow!(
 89                "The file contains invalid bytes for the specified encoding: {}.\nThis usually means that the file is not a regular text file, or is encoded in a different encoding.\nContinuing to open it may result in data loss if saved.",
 90                self.0.name()
 91            ))
 92        }
 93    }
 94
 95    pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
 96        if self.0 == encoding_rs::UTF_16BE {
 97            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
 98
 99            // Convert the input string to UTF-16BE bytes
100            let utf16be_bytes = input.encode_utf16().flat_map(|u| u.to_be_bytes());
101
102            data.extend(utf16be_bytes);
103            return Ok(data);
104        } else if self.0 == encoding_rs::UTF_16LE {
105            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
106
107            // Convert the input string to UTF-16LE bytes
108            let utf16le_bytes = input.encode_utf16().flat_map(|u| u.to_le_bytes());
109
110            data.extend(utf16le_bytes);
111            return Ok(data);
112        } else {
113            let (cow, _encoding_used, _had_errors) = self.0.encode(&input);
114
115            Ok(cow.into_owned())
116        }
117    }
118}
119
120/// Convert a byte vector from a specified encoding to a UTF-8 string.
121pub async fn to_utf8(
122    input: Vec<u8>,
123    mut encoding: EncodingWrapper,
124    force: bool,
125    detect_utf16: bool,
126    buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
127) -> Result<String> {
128    encoding
129        .decode(input, force, detect_utf16, buffer_encoding)
130        .await
131}
132
133/// Convert a UTF-8 string to a byte vector in a specified encoding.
134pub async fn from_utf8(input: String, target: EncodingWrapper) -> Result<Vec<u8>> {
135    target.encode(input).await
136}
137
/// Shared, mutable settings describing how text should be decoded.
///
/// NOTE(review): the field names mirror the `force` and `detect_utf16`
/// parameters of `EncodingWrapper::decode`; presumably they feed those
/// parameters — confirm against the callers.
pub struct EncodingOptions {
    /// The selected encoding, behind a shared mutex. Defaults to
    /// `EncodingWrapper::default()`.
    pub encoding: Arc<Mutex<EncodingWrapper>>,
    /// Atomic `force` flag. Defaults to `false`.
    pub force: AtomicBool,
    /// Atomic `detect_utf16` flag. Defaults to `true`.
    pub detect_utf16: AtomicBool,
}
143
144impl Default for EncodingOptions {
145    fn default() -> Self {
146        EncodingOptions {
147            encoding: Arc::new(Mutex::new(EncodingWrapper::default())),
148            force: AtomicBool::new(false),
149            detect_utf16: AtomicBool::new(true),
150        }
151    }
152}