lib.rs

  1use encoding_rs;
  2use std::{
  3    fmt::Debug,
  4    sync::{Arc, Mutex, atomic::AtomicBool},
  5};
  6
  7pub use encoding_rs::{
  8    BIG5, EUC_JP, EUC_KR, GB18030, GBK, IBM866, ISO_2022_JP, ISO_8859_2, ISO_8859_3, ISO_8859_4,
  9    ISO_8859_5, ISO_8859_6, ISO_8859_7, ISO_8859_8, ISO_8859_8_I, ISO_8859_10, ISO_8859_13,
 10    ISO_8859_14, ISO_8859_15, ISO_8859_16, KOI8_R, KOI8_U, MACINTOSH, SHIFT_JIS, UTF_8, UTF_16BE,
 11    UTF_16LE, WINDOWS_874, WINDOWS_1250, WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254,
 12    WINDOWS_1255, WINDOWS_1256, WINDOWS_1257, WINDOWS_1258, X_MAC_CYRILLIC,
 13};
 14
 15pub struct Encoding(Mutex<&'static encoding_rs::Encoding>);
 16
 17impl Debug for Encoding {
 18    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 19        f.debug_tuple(&format!("Encoding{:?}", self.0))
 20            .field(&self.get().name())
 21            .finish()
 22    }
 23}
 24
 25impl Default for Encoding {
 26    fn default() -> Self {
 27        Encoding(Mutex::new(UTF_8))
 28    }
 29}
 30
 31unsafe impl Send for Encoding {}
 32unsafe impl Sync for Encoding {}
 33
 34impl Encoding {
 35    pub fn new(encoding: &'static encoding_rs::Encoding) -> Self {
 36        Self(Mutex::new(encoding))
 37    }
 38
 39    pub fn set(&self, encoding: &'static encoding_rs::Encoding) {
 40        *self.0.lock().unwrap() = encoding;
 41    }
 42
 43    pub fn get(&self) -> &'static encoding_rs::Encoding {
 44        *self.0.lock().unwrap()
 45    }
 46
 47    pub async fn decode(
 48        &self,
 49        input: Vec<u8>,
 50        force: bool,
 51        detect_utf16: bool,
 52        buffer_encoding: Option<Arc<Encoding>>,
 53    ) -> anyhow::Result<String> {
 54        // Check if the input starts with a BOM for UTF-16 encodings only if detect_utf16 is true.
 55        if detect_utf16 {
 56            if let Some(encoding) = match input.get(..2) {
 57                Some([0xFF, 0xFE]) => Some(UTF_16LE),
 58                Some([0xFE, 0xFF]) => Some(UTF_16BE),
 59                _ => None,
 60            } {
 61                self.set(encoding);
 62
 63                if let Some(v) = buffer_encoding {
 64                    v.set(encoding)
 65                }
 66            }
 67        }
 68
 69        let (cow, had_errors) = self.get().decode_with_bom_removal(&input);
 70
 71        if force {
 72            return Ok(cow.to_string());
 73        }
 74
 75        if !had_errors {
 76            Ok(cow.to_string())
 77        } else {
 78            Err(anyhow::anyhow!(
 79                "The file contains invalid bytes for the specified encoding: {}.\nThis usually means that the file is not a regular text file, or is encoded in a different encoding.\nContinuing to open it may result in data loss if saved.",
 80                self.get().name()
 81            ))
 82        }
 83    }
 84
 85    pub async fn encode(&self, input: String) -> anyhow::Result<Vec<u8>> {
 86        if self.get() == UTF_16BE {
 87            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
 88
 89            // Convert the input string to UTF-16BE bytes
 90            let utf16be_bytes = input.encode_utf16().flat_map(|u| u.to_be_bytes());
 91
 92            data.extend(utf16be_bytes);
 93            return Ok(data);
 94        } else if self.get() == UTF_16LE {
 95            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
 96
 97            // Convert the input string to UTF-16LE bytes
 98            let utf16le_bytes = input.encode_utf16().flat_map(|u| u.to_le_bytes());
 99
100            data.extend(utf16le_bytes);
101            return Ok(data);
102        } else {
103            let (cow, _encoding_used, _had_errors) = self.get().encode(&input);
104
105            Ok(cow.into_owned())
106        }
107    }
108
109    pub fn reset(&self) {
110        self.set(UTF_8);
111    }
112}
113
114/// Convert a byte vector from a specified encoding to a UTF-8 string.
115pub async fn to_utf8(
116    input: Vec<u8>,
117    encoding: Encoding,
118    force: bool,
119    detect_utf16: bool,
120    buffer_encoding: Option<Arc<Encoding>>,
121) -> anyhow::Result<String> {
122    encoding
123        .decode(input, force, detect_utf16, buffer_encoding)
124        .await
125}
126
127/// Convert a UTF-8 string to a byte vector in a specified encoding.
128pub async fn from_utf8(input: String, target: Encoding) -> anyhow::Result<Vec<u8>> {
129    target.encode(input).await
130}
131
132pub struct EncodingOptions {
133    pub encoding: Arc<Mutex<Encoding>>,
134    pub force: AtomicBool,
135    pub detect_utf16: AtomicBool,
136}
137
138impl EncodingOptions {
139    pub fn reset(&mut self) {
140        self.encoding.lock().unwrap().reset();
141        *self.force.get_mut() = false;
142        *self.detect_utf16.get_mut() = true;
143    }
144}
145
146impl Default for EncodingOptions {
147    fn default() -> Self {
148        EncodingOptions {
149            encoding: Arc::new(Mutex::new(Encoding::default())),
150            force: AtomicBool::new(false),
151            detect_utf16: AtomicBool::new(true),
152        }
153    }
154}