lib.rs

  1use encoding_rs;
  2use std::{
  3    fmt::Debug,
  4    sync::{Arc, Mutex, atomic::AtomicBool},
  5};
  6
  7pub use encoding_rs::{
  8    BIG5, EUC_JP, EUC_KR, GB18030, GBK, IBM866, ISO_2022_JP, ISO_8859_2, ISO_8859_3, ISO_8859_4,
  9    ISO_8859_5, ISO_8859_6, ISO_8859_7, ISO_8859_8, ISO_8859_8_I, ISO_8859_10, ISO_8859_13,
 10    ISO_8859_14, ISO_8859_15, ISO_8859_16, KOI8_R, KOI8_U, MACINTOSH, SHIFT_JIS, UTF_8, UTF_16BE,
 11    UTF_16LE, WINDOWS_874, WINDOWS_1250, WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254,
 12    WINDOWS_1255, WINDOWS_1256, WINDOWS_1257, WINDOWS_1258, X_MAC_CYRILLIC,
 13};
 14
 15pub struct Encoding(Mutex<&'static encoding_rs::Encoding>);
 16
 17impl Debug for Encoding {
 18    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 19        f.debug_tuple(&format!("Encoding{:?}", self.0))
 20            .field(&self.get().name())
 21            .finish()
 22    }
 23}
 24
 25impl Clone for Encoding {
 26    fn clone(&self) -> Self {
 27        Encoding(Mutex::new(self.get()))
 28    }
 29}
 30
 31impl Default for Encoding {
 32    fn default() -> Self {
 33        Encoding(Mutex::new(UTF_8))
 34    }
 35}
 36
 37impl From<&'static encoding_rs::Encoding> for Encoding {
 38    fn from(encoding: &'static encoding_rs::Encoding) -> Self {
 39        Encoding::new(encoding)
 40    }
 41}
 42
 43unsafe impl Send for Encoding {}
 44unsafe impl Sync for Encoding {}
 45
 46impl Encoding {
 47    pub fn new(encoding: &'static encoding_rs::Encoding) -> Self {
 48        Self(Mutex::new(encoding))
 49    }
 50
 51    pub fn set(&self, encoding: &'static encoding_rs::Encoding) {
 52        *self.0.lock().unwrap() = encoding;
 53    }
 54
 55    pub fn get(&self) -> &'static encoding_rs::Encoding {
 56        *self.0.lock().unwrap()
 57    }
 58
 59    pub async fn decode(
 60        &self,
 61        input: Vec<u8>,
 62        force: bool,
 63        detect_utf16: bool,
 64        buffer_encoding: Option<Arc<Encoding>>,
 65    ) -> anyhow::Result<String> {
 66        // Check if the input starts with a BOM for UTF-16 encodings only if detect_utf16 is true.
 67        if detect_utf16 {
 68            if let Some(encoding) = match input.get(..2) {
 69                Some([0xFF, 0xFE]) => Some(UTF_16LE),
 70                Some([0xFE, 0xFF]) => Some(UTF_16BE),
 71                _ => None,
 72            } {
 73                self.set(encoding);
 74
 75                if let Some(v) = buffer_encoding {
 76                    v.set(encoding)
 77                }
 78            }
 79        }
 80
 81        let (cow, had_errors) = self.get().decode_with_bom_removal(&input);
 82
 83        if force {
 84            return Ok(cow.to_string());
 85        }
 86
 87        if !had_errors {
 88            Ok(cow.to_string())
 89        } else {
 90            Err(anyhow::anyhow!(
 91                "The file contains invalid bytes for the specified encoding: {}.\nThis usually means that the file is not a regular text file, or is encoded in a different encoding.\nContinuing to open it may result in data loss if saved.",
 92                self.get().name()
 93            ))
 94        }
 95    }
 96
 97    pub async fn encode(&self, input: String) -> anyhow::Result<Vec<u8>> {
 98        if self.get() == UTF_16BE {
 99            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
100
101            // Convert the input string to UTF-16BE bytes
102            let utf16be_bytes = input.encode_utf16().flat_map(|u| u.to_be_bytes());
103
104            data.extend(utf16be_bytes);
105            return Ok(data);
106        } else if self.get() == UTF_16LE {
107            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
108
109            // Convert the input string to UTF-16LE bytes
110            let utf16le_bytes = input.encode_utf16().flat_map(|u| u.to_le_bytes());
111
112            data.extend(utf16le_bytes);
113            return Ok(data);
114        } else {
115            let (cow, _encoding_used, _had_errors) = self.get().encode(&input);
116
117            Ok(cow.into_owned())
118        }
119    }
120
121    pub fn reset(&self) {
122        self.set(UTF_8);
123    }
124}
125
126/// Convert a byte vector from a specified encoding to a UTF-8 string.
127pub async fn to_utf8(
128    input: Vec<u8>,
129    options: &EncodingOptions,
130    buffer_encoding: Option<Arc<Encoding>>,
131) -> anyhow::Result<String> {
132    options
133        .encoding
134        .decode(
135            input,
136            options.force.load(std::sync::atomic::Ordering::Acquire),
137            options
138                .detect_utf16
139                .load(std::sync::atomic::Ordering::Acquire),
140            buffer_encoding,
141        )
142        .await
143}
144
145/// Convert a UTF-8 string to a byte vector in a specified encoding.
146pub async fn from_utf8(input: String, target: Encoding) -> anyhow::Result<Vec<u8>> {
147    target.encode(input).await
148}
149
150pub struct EncodingOptions {
151    pub encoding: Arc<Encoding>,
152    pub force: AtomicBool,
153    pub detect_utf16: AtomicBool,
154}
155
156impl EncodingOptions {
157    pub fn reset(&self) {
158        self.encoding.reset();
159
160        self.force
161            .store(false, std::sync::atomic::Ordering::Release);
162
163        self.detect_utf16
164            .store(true, std::sync::atomic::Ordering::Release);
165    }
166}
167
168impl Default for EncodingOptions {
169    fn default() -> Self {
170        EncodingOptions {
171            encoding: Arc::new(Encoding::default()),
172            force: AtomicBool::new(false),
173            detect_utf16: AtomicBool::new(true),
174        }
175    }
176}
177
178impl Clone for EncodingOptions {
179    fn clone(&self) -> Self {
180        EncodingOptions {
181            encoding: Arc::new(self.encoding.get().into()),
182            force: AtomicBool::new(self.force.load(std::sync::atomic::Ordering::Acquire)),
183            detect_utf16: AtomicBool::new(
184                self.detect_utf16.load(std::sync::atomic::Ordering::Acquire),
185            ),
186        }
187    }
188}