encodings.rs

  1//! Encoding and decoding utilities using the `encoding_rs` crate.
  2use std::{
  3    fmt::Debug,
  4    sync::{Arc, Mutex},
  5};
  6
  7use std::sync::atomic::AtomicBool;
  8
  9use anyhow::Result;
 10use encoding_rs::Encoding;
 11
 12/// A wrapper around `encoding_rs::Encoding` to implement `Send` and `Sync`.
 13/// Since the reference is static, it is safe to send it across threads.
 14pub struct EncodingWrapper(pub &'static Encoding);
 15
 16impl Debug for EncodingWrapper {
 17    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 18        f.debug_tuple(&format!("EncodingWrapper{:?}", self.0))
 19            .field(&self.0.name())
 20            .finish()
 21    }
 22}
 23
 24impl Default for EncodingWrapper {
 25    fn default() -> Self {
 26        EncodingWrapper(encoding_rs::UTF_8)
 27    }
 28}
 29
 30impl PartialEq for EncodingWrapper {
 31    fn eq(&self, other: &Self) -> bool {
 32        self.0.name() == other.0.name()
 33    }
 34}
 35
 36unsafe impl Send for EncodingWrapper {}
 37unsafe impl Sync for EncodingWrapper {}
 38
 39impl Clone for EncodingWrapper {
 40    fn clone(&self) -> Self {
 41        EncodingWrapper(self.0)
 42    }
 43}
 44
 45impl EncodingWrapper {
 46    pub fn new(encoding: &'static Encoding) -> EncodingWrapper {
 47        EncodingWrapper(encoding)
 48    }
 49
 50    pub fn get_encoding(&self) -> &'static Encoding {
 51        self.0
 52    }
 53
 54    pub async fn decode(
 55        &mut self,
 56        input: Vec<u8>,
 57        force: bool,
 58        detect_utf16: bool,
 59        buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
 60    ) -> Result<String> {
 61        // Check if the input starts with a BOM for UTF-16 encodings only if detect_utf16 is true.
 62        println!("{}", force);
 63        println!("{}", detect_utf16);
 64        if detect_utf16 {
 65            if let Some(encoding) = match input.get(..2) {
 66                Some([0xFF, 0xFE]) => Some(encoding_rs::UTF_16LE),
 67                Some([0xFE, 0xFF]) => Some(encoding_rs::UTF_16BE),
 68                _ => None,
 69            } {
 70                self.0 = encoding;
 71
 72                if let Some(v) = buffer_encoding
 73                    && let Ok(mut v) = v.lock()
 74                {
 75                    *v = encoding;
 76                }
 77            }
 78        }
 79
 80        let (cow, had_errors) = self.0.decode_with_bom_removal(&input);
 81
 82        if force {
 83            return Ok(cow.to_string());
 84        }
 85
 86        if !had_errors {
 87            Ok(cow.to_string())
 88        } else {
 89            Err(anyhow::anyhow!(
 90                "The file contains invalid bytes for the specified encoding: {}.\nThis usually means that the file is not a regular text file, or is encoded in a different encoding.\nContinuing to open it may result in data loss if saved.",
 91                self.0.name()
 92            ))
 93        }
 94    }
 95
 96    pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
 97        if self.0 == encoding_rs::UTF_16BE {
 98            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
 99
100            // Convert the input string to UTF-16BE bytes
101            let utf16be_bytes = input.encode_utf16().flat_map(|u| u.to_be_bytes());
102
103            data.extend(utf16be_bytes);
104            return Ok(data);
105        } else if self.0 == encoding_rs::UTF_16LE {
106            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
107
108            // Convert the input string to UTF-16LE bytes
109            let utf16le_bytes = input.encode_utf16().flat_map(|u| u.to_le_bytes());
110
111            data.extend(utf16le_bytes);
112            return Ok(data);
113        } else {
114            let (cow, _encoding_used, _had_errors) = self.0.encode(&input);
115
116            Ok(cow.into_owned())
117        }
118    }
119}
120
121/// Convert a byte vector from a specified encoding to a UTF-8 string.
122pub async fn to_utf8(
123    input: Vec<u8>,
124    mut encoding: EncodingWrapper,
125    force: bool,
126    detect_utf16: bool,
127    buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
128) -> Result<String> {
129    encoding
130        .decode(input, force, detect_utf16, buffer_encoding)
131        .await
132}
133
134/// Convert a UTF-8 string to a byte vector in a specified encoding.
135pub async fn from_utf8(input: String, target: EncodingWrapper) -> Result<Vec<u8>> {
136    target.encode(input).await
137}
138
139pub struct EncodingOptions {
140    pub encoding: Arc<Mutex<EncodingWrapper>>,
141    pub force: AtomicBool,
142    pub detect_utf16: AtomicBool,
143}
144
145impl Default for EncodingOptions {
146    fn default() -> Self {
147        EncodingOptions {
148            encoding: Arc::new(Mutex::new(EncodingWrapper::default())),
149            force: AtomicBool::new(false),
150            detect_utf16: AtomicBool::new(true),
151        }
152    }
153}