encodings.rs

  1use encoding_rs;
  2use std::{borrow::Cow, fmt::Debug};
  3
  4pub use encoding_rs::{
  5    BIG5, EUC_JP, EUC_KR, GB18030, GBK, IBM866, ISO_2022_JP, ISO_8859_2, ISO_8859_3, ISO_8859_4,
  6    ISO_8859_5, ISO_8859_6, ISO_8859_7, ISO_8859_8, ISO_8859_8_I, ISO_8859_10, ISO_8859_13,
  7    ISO_8859_14, ISO_8859_15, ISO_8859_16, KOI8_R, KOI8_U, MACINTOSH, SHIFT_JIS, UTF_8, UTF_16BE,
  8    UTF_16LE, WINDOWS_874, WINDOWS_1250, WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254,
  9    WINDOWS_1255, WINDOWS_1256, WINDOWS_1257, WINDOWS_1258, X_MAC_CYRILLIC,
 10};
 11
/// A text encoding together with a flag describing byte-order-mark handling.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Encoding {
    /// The `encoding_rs` encoding used to convert between bytes and text.
    pub encoding: &'static encoding_rs::Encoding,
    /// Whether a byte order mark is expected: when set, `decode` strips a
    /// leading U+FEFF and `bom` reports the marker bytes for UTF-8 and UTF-16.
    pub with_bom: bool,
}
 17
 18impl Default for Encoding {
 19    fn default() -> Self {
 20        Encoding {
 21            encoding: UTF_8,
 22            with_bom: false,
 23        }
 24    }
 25}
 26
 27impl Encoding {
 28    pub fn decode(&self, input: Vec<u8>) -> anyhow::Result<String> {
 29        if self.encoding == UTF_8 && !self.with_bom {
 30            return Ok(String::from_utf8(input)?);
 31        }
 32        let Some(result) = self
 33            .encoding
 34            .decode_without_bom_handling_and_without_replacement(&input)
 35        else {
 36            return Err(anyhow::anyhow!(
 37                "input is not valid {}",
 38                self.encoding.name()
 39            ));
 40        };
 41
 42        if self.with_bom && result.starts_with("\u{FEFF}") {
 43            Ok(result[3..].to_string())
 44        } else {
 45            Ok(result.into_owned())
 46        }
 47    }
 48
 49    pub fn bom(&self) -> Option<&'static [u8]> {
 50        if !self.with_bom {
 51            return None;
 52        }
 53        if self.encoding == UTF_8 {
 54            Some(&[0xEF, 0xBB, 0xBF])
 55        } else if self.encoding == UTF_16BE {
 56            Some(&[0xFE, 0xFF])
 57        } else if self.encoding == UTF_16LE {
 58            Some(&[0xFF, 0xFE])
 59        } else {
 60            None
 61        }
 62    }
 63
 64    pub fn encode_chunk<'a>(&self, input: &'a str) -> anyhow::Result<Cow<'a, [u8]>> {
 65        if self.encoding == UTF_8 {
 66            Ok(Cow::Borrowed(input.as_bytes()))
 67        } else if self.encoding == UTF_16BE {
 68            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
 69
 70            // Convert the input string to UTF-16BE bytes
 71            let utf16be_bytes = input.encode_utf16().flat_map(|u| u.to_be_bytes());
 72
 73            data.extend(utf16be_bytes);
 74            Ok(Cow::Owned(data))
 75        } else if self.encoding == UTF_16LE {
 76            let mut data = Vec::<u8>::with_capacity(input.len() * 2);
 77
 78            // Convert the input string to UTF-16LE bytes
 79            let utf16le_bytes = input.encode_utf16().flat_map(|u| u.to_le_bytes());
 80
 81            data.extend(utf16le_bytes);
 82            Ok(Cow::Owned(data))
 83        } else {
 84            // todo: should we error on invalid content when encoding?
 85            let (cow, _encoding_used, _had_errors) = self.encoding.encode(&input);
 86
 87            Ok(cow)
 88        }
 89    }
 90
 91    pub fn name(&self) -> &'static str {
 92        let name = self.encoding.name();
 93
 94        match name {
 95            "UTF-8" => "UTF-8",
 96            "UTF-16LE" => "UTF-16 LE",
 97            "UTF-16BE" => "UTF-16 BE",
 98            "windows-1252" => "Windows-1252",
 99            "windows-1251" => "Windows-1251",
100            "windows-1250" => "Windows-1250",
101            "ISO-8859-2" => "ISO 8859-2",
102            "ISO-8859-3" => "ISO 8859-3",
103            "ISO-8859-4" => "ISO 8859-4",
104            "ISO-8859-5" => "ISO 8859-5",
105            "ISO-8859-6" => "ISO 8859-6",
106            "ISO-8859-7" => "ISO 8859-7",
107            "ISO-8859-8" => "ISO 8859-8",
108            "ISO-8859-13" => "ISO 8859-13",
109            "ISO-8859-15" => "ISO 8859-15",
110            "KOI8-R" => "KOI8-R",
111            "KOI8-U" => "KOI8-U",
112            "macintosh" => "MacRoman",
113            "x-mac-cyrillic" => "Mac Cyrillic",
114            "windows-874" => "Windows-874",
115            "windows-1253" => "Windows-1253",
116            "windows-1254" => "Windows-1254",
117            "windows-1255" => "Windows-1255",
118            "windows-1256" => "Windows-1256",
119            "windows-1257" => "Windows-1257",
120            "windows-1258" => "Windows-1258",
121            "EUC-KR" => "Windows-949",
122            "EUC-JP" => "EUC-JP",
123            "ISO-2022-JP" => "ISO 2022-JP",
124            "GBK" => "GBK",
125            "gb18030" => "GB18030",
126            "Big5" => "Big5",
127            _ => name,
128        }
129    }
130
131    pub fn from_name(name: &str) -> Self {
132        let encoding = match name {
133            "UTF-8" => encoding_rs::UTF_8,
134            "UTF-16 LE" => encoding_rs::UTF_16LE,
135            "UTF-16 BE" => encoding_rs::UTF_16BE,
136            "Windows-1252" => encoding_rs::WINDOWS_1252,
137            "Windows-1251" => encoding_rs::WINDOWS_1251,
138            "Windows-1250" => encoding_rs::WINDOWS_1250,
139            "ISO 8859-2" => encoding_rs::ISO_8859_2,
140            "ISO 8859-3" => encoding_rs::ISO_8859_3,
141            "ISO 8859-4" => encoding_rs::ISO_8859_4,
142            "ISO 8859-5" => encoding_rs::ISO_8859_5,
143            "ISO 8859-6" => encoding_rs::ISO_8859_6,
144            "ISO 8859-7" => encoding_rs::ISO_8859_7,
145            "ISO 8859-8" => encoding_rs::ISO_8859_8,
146            "ISO 8859-13" => encoding_rs::ISO_8859_13,
147            "ISO 8859-15" => encoding_rs::ISO_8859_15,
148            "KOI8-R" => encoding_rs::KOI8_R,
149            "KOI8-U" => encoding_rs::KOI8_U,
150            "MacRoman" => encoding_rs::MACINTOSH,
151            "Mac Cyrillic" => encoding_rs::X_MAC_CYRILLIC,
152            "Windows-874" => encoding_rs::WINDOWS_874,
153            "Windows-1253" => encoding_rs::WINDOWS_1253,
154            "Windows-1254" => encoding_rs::WINDOWS_1254,
155            "Windows-1255" => encoding_rs::WINDOWS_1255,
156            "Windows-1256" => encoding_rs::WINDOWS_1256,
157            "Windows-1257" => encoding_rs::WINDOWS_1257,
158            "Windows-1258" => encoding_rs::WINDOWS_1258,
159            "Windows-949" => encoding_rs::EUC_KR,
160            "EUC-JP" => encoding_rs::EUC_JP,
161            "ISO 2022-JP" => encoding_rs::ISO_2022_JP,
162            "GBK" => encoding_rs::GBK,
163            "GB18030" => encoding_rs::GB18030,
164            "Big5" => encoding_rs::BIG5,
165            _ => encoding_rs::UTF_8, // Default to UTF-8 for unknown names
166        };
167
168        Encoding {
169            encoding,
170            with_bom: false,
171        }
172    }
173}
174
/// Settings that control how `process` turns raw bytes into text.
#[derive(Default, Clone)]
pub struct EncodingOptions {
    /// Encoding used when auto-detection is disabled or finds no byte order mark.
    pub expected: Encoding,
    /// When true, `process` first checks for a byte order mark and, if one is
    /// found, uses the encoding it indicates instead of `expected`.
    pub auto_detect: bool,
}
180
181impl EncodingOptions {
182    pub fn process(&self, bytes: Vec<u8>) -> anyhow::Result<(Encoding, String)> {
183        let encoding = if self.auto_detect
184            && let Some(encoding) = Self::detect(&bytes)
185        {
186            encoding
187        } else {
188            self.expected
189        };
190
191        Ok((encoding, encoding.decode(bytes)?))
192    }
193
194    fn detect(bytes: &[u8]) -> Option<Encoding> {
195        if bytes.starts_with(&[0xFE, 0xFF]) {
196            Some(Encoding {
197                encoding: UTF_8,
198                with_bom: true,
199            })
200        } else if bytes.starts_with(&[0xFF, 0xFE]) {
201            Some(Encoding {
202                encoding: UTF_16LE,
203                with_bom: true,
204            })
205        } else if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
206            Some(Encoding {
207                encoding: UTF_8,
208                with_bom: true,
209            })
210        } else {
211            None
212        }
213    }
214}