1use encoding_rs;
2use std::{borrow::Cow, fmt::Debug};
3
4pub use encoding_rs::{
5 BIG5, EUC_JP, EUC_KR, GB18030, GBK, IBM866, ISO_2022_JP, ISO_8859_2, ISO_8859_3, ISO_8859_4,
6 ISO_8859_5, ISO_8859_6, ISO_8859_7, ISO_8859_8, ISO_8859_8_I, ISO_8859_10, ISO_8859_13,
7 ISO_8859_14, ISO_8859_15, ISO_8859_16, KOI8_R, KOI8_U, MACINTOSH, SHIFT_JIS, UTF_8, UTF_16BE,
8 UTF_16LE, WINDOWS_874, WINDOWS_1250, WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254,
9 WINDOWS_1255, WINDOWS_1256, WINDOWS_1257, WINDOWS_1258, X_MAC_CYRILLIC,
10};
11
12#[derive(Debug, Clone, Copy, PartialEq, Eq)]
13pub struct Encoding {
14 pub encoding: &'static encoding_rs::Encoding,
15 pub with_bom: bool,
16}
17
18impl Default for Encoding {
19 fn default() -> Self {
20 Encoding {
21 encoding: UTF_8,
22 with_bom: false,
23 }
24 }
25}
26
27impl Encoding {
28 pub fn decode(&self, input: Vec<u8>) -> anyhow::Result<String> {
29 if self.encoding == UTF_8 && !self.with_bom {
30 return Ok(String::from_utf8(input)?);
31 }
32 let Some(result) = self
33 .encoding
34 .decode_without_bom_handling_and_without_replacement(&input)
35 else {
36 return Err(anyhow::anyhow!(
37 "input is not valid {}",
38 self.encoding.name()
39 ));
40 };
41
42 if self.with_bom && result.starts_with("\u{FEFF}") {
43 Ok(result[3..].to_string())
44 } else {
45 Ok(result.into_owned())
46 }
47 }
48
49 pub fn bom(&self) -> Option<&'static [u8]> {
50 if !self.with_bom {
51 return None;
52 }
53 if self.encoding == UTF_8 {
54 Some(&[0xEF, 0xBB, 0xBF])
55 } else if self.encoding == UTF_16BE {
56 Some(&[0xFE, 0xFF])
57 } else if self.encoding == UTF_16LE {
58 Some(&[0xFF, 0xFE])
59 } else {
60 None
61 }
62 }
63
64 pub fn encode_chunk<'a>(&self, input: &'a str) -> anyhow::Result<Cow<'a, [u8]>> {
65 if self.encoding == UTF_8 {
66 Ok(Cow::Borrowed(input.as_bytes()))
67 } else if self.encoding == UTF_16BE {
68 let mut data = Vec::<u8>::with_capacity(input.len() * 2);
69
70 // Convert the input string to UTF-16BE bytes
71 let utf16be_bytes = input.encode_utf16().flat_map(|u| u.to_be_bytes());
72
73 data.extend(utf16be_bytes);
74 Ok(Cow::Owned(data))
75 } else if self.encoding == UTF_16LE {
76 let mut data = Vec::<u8>::with_capacity(input.len() * 2);
77
78 // Convert the input string to UTF-16LE bytes
79 let utf16le_bytes = input.encode_utf16().flat_map(|u| u.to_le_bytes());
80
81 data.extend(utf16le_bytes);
82 Ok(Cow::Owned(data))
83 } else {
84 // todo: should we error on invalid content when encoding?
85 let (cow, _encoding_used, _had_errors) = self.encoding.encode(&input);
86
87 Ok(cow)
88 }
89 }
90
91 pub fn name(&self) -> &'static str {
92 let name = self.encoding.name();
93
94 match name {
95 "UTF-8" => "UTF-8",
96 "UTF-16LE" => "UTF-16 LE",
97 "UTF-16BE" => "UTF-16 BE",
98 "windows-1252" => "Windows-1252",
99 "windows-1251" => "Windows-1251",
100 "windows-1250" => "Windows-1250",
101 "ISO-8859-2" => "ISO 8859-2",
102 "ISO-8859-3" => "ISO 8859-3",
103 "ISO-8859-4" => "ISO 8859-4",
104 "ISO-8859-5" => "ISO 8859-5",
105 "ISO-8859-6" => "ISO 8859-6",
106 "ISO-8859-7" => "ISO 8859-7",
107 "ISO-8859-8" => "ISO 8859-8",
108 "ISO-8859-13" => "ISO 8859-13",
109 "ISO-8859-15" => "ISO 8859-15",
110 "KOI8-R" => "KOI8-R",
111 "KOI8-U" => "KOI8-U",
112 "macintosh" => "MacRoman",
113 "x-mac-cyrillic" => "Mac Cyrillic",
114 "windows-874" => "Windows-874",
115 "windows-1253" => "Windows-1253",
116 "windows-1254" => "Windows-1254",
117 "windows-1255" => "Windows-1255",
118 "windows-1256" => "Windows-1256",
119 "windows-1257" => "Windows-1257",
120 "windows-1258" => "Windows-1258",
121 "EUC-KR" => "Windows-949",
122 "EUC-JP" => "EUC-JP",
123 "ISO-2022-JP" => "ISO 2022-JP",
124 "GBK" => "GBK",
125 "gb18030" => "GB18030",
126 "Big5" => "Big5",
127 _ => name,
128 }
129 }
130
131 pub fn from_name(name: &str) -> Self {
132 let encoding = match name {
133 "UTF-8" => encoding_rs::UTF_8,
134 "UTF-16 LE" => encoding_rs::UTF_16LE,
135 "UTF-16 BE" => encoding_rs::UTF_16BE,
136 "Windows-1252" => encoding_rs::WINDOWS_1252,
137 "Windows-1251" => encoding_rs::WINDOWS_1251,
138 "Windows-1250" => encoding_rs::WINDOWS_1250,
139 "ISO 8859-2" => encoding_rs::ISO_8859_2,
140 "ISO 8859-3" => encoding_rs::ISO_8859_3,
141 "ISO 8859-4" => encoding_rs::ISO_8859_4,
142 "ISO 8859-5" => encoding_rs::ISO_8859_5,
143 "ISO 8859-6" => encoding_rs::ISO_8859_6,
144 "ISO 8859-7" => encoding_rs::ISO_8859_7,
145 "ISO 8859-8" => encoding_rs::ISO_8859_8,
146 "ISO 8859-13" => encoding_rs::ISO_8859_13,
147 "ISO 8859-15" => encoding_rs::ISO_8859_15,
148 "KOI8-R" => encoding_rs::KOI8_R,
149 "KOI8-U" => encoding_rs::KOI8_U,
150 "MacRoman" => encoding_rs::MACINTOSH,
151 "Mac Cyrillic" => encoding_rs::X_MAC_CYRILLIC,
152 "Windows-874" => encoding_rs::WINDOWS_874,
153 "Windows-1253" => encoding_rs::WINDOWS_1253,
154 "Windows-1254" => encoding_rs::WINDOWS_1254,
155 "Windows-1255" => encoding_rs::WINDOWS_1255,
156 "Windows-1256" => encoding_rs::WINDOWS_1256,
157 "Windows-1257" => encoding_rs::WINDOWS_1257,
158 "Windows-1258" => encoding_rs::WINDOWS_1258,
159 "Windows-949" => encoding_rs::EUC_KR,
160 "EUC-JP" => encoding_rs::EUC_JP,
161 "ISO 2022-JP" => encoding_rs::ISO_2022_JP,
162 "GBK" => encoding_rs::GBK,
163 "GB18030" => encoding_rs::GB18030,
164 "Big5" => encoding_rs::BIG5,
165 _ => encoding_rs::UTF_8, // Default to UTF-8 for unknown names
166 };
167
168 Encoding {
169 encoding,
170 with_bom: false,
171 }
172 }
173}
174
175#[derive(Default, Clone)]
176pub struct EncodingOptions {
177 pub expected: Encoding,
178 pub auto_detect: bool,
179}
180
181impl EncodingOptions {
182 pub fn process(&self, bytes: Vec<u8>) -> anyhow::Result<(Encoding, String)> {
183 let encoding = if self.auto_detect
184 && let Some(encoding) = Self::detect(&bytes)
185 {
186 encoding
187 } else {
188 self.expected
189 };
190
191 Ok((encoding, encoding.decode(bytes)?))
192 }
193
194 fn detect(bytes: &[u8]) -> Option<Encoding> {
195 if bytes.starts_with(&[0xFE, 0xFF]) {
196 Some(Encoding {
197 encoding: UTF_8,
198 with_bom: true,
199 })
200 } else if bytes.starts_with(&[0xFF, 0xFE]) {
201 Some(Encoding {
202 encoding: UTF_16LE,
203 with_bom: true,
204 })
205 } else if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
206 Some(Encoding {
207 encoding: UTF_8,
208 with_bom: true,
209 })
210 } else {
211 None
212 }
213 }
214}