1use encoding_rs;
2use std::{
3 fmt::Debug,
4 sync::{Arc, Mutex, atomic::AtomicBool},
5};
6
7pub use encoding_rs::{
8 BIG5, EUC_JP, EUC_KR, GB18030, GBK, IBM866, ISO_2022_JP, ISO_8859_2, ISO_8859_3, ISO_8859_4,
9 ISO_8859_5, ISO_8859_6, ISO_8859_7, ISO_8859_8, ISO_8859_8_I, ISO_8859_10, ISO_8859_13,
10 ISO_8859_14, ISO_8859_15, ISO_8859_16, KOI8_R, KOI8_U, MACINTOSH, SHIFT_JIS, UTF_8, UTF_16BE,
11 UTF_16LE, WINDOWS_874, WINDOWS_1250, WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254,
12 WINDOWS_1255, WINDOWS_1256, WINDOWS_1257, WINDOWS_1258, X_MAC_CYRILLIC,
13};
14
15pub struct Encoding(Mutex<&'static encoding_rs::Encoding>);
16
17impl Debug for Encoding {
18 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
19 f.debug_tuple(&format!("Encoding{:?}", self.0))
20 .field(&self.get().name())
21 .finish()
22 }
23}
24
25impl Clone for Encoding {
26 fn clone(&self) -> Self {
27 Encoding(Mutex::new(self.get()))
28 }
29}
30
31impl Default for Encoding {
32 fn default() -> Self {
33 Encoding(Mutex::new(UTF_8))
34 }
35}
36
37impl From<&'static encoding_rs::Encoding> for Encoding {
38 fn from(encoding: &'static encoding_rs::Encoding) -> Self {
39 Encoding::new(encoding)
40 }
41}
42
43unsafe impl Send for Encoding {}
44unsafe impl Sync for Encoding {}
45
46impl Encoding {
47 pub fn new(encoding: &'static encoding_rs::Encoding) -> Self {
48 Self(Mutex::new(encoding))
49 }
50
51 pub fn set(&self, encoding: &'static encoding_rs::Encoding) {
52 *self.0.lock().unwrap() = encoding;
53 }
54
55 pub fn get(&self) -> &'static encoding_rs::Encoding {
56 *self.0.lock().unwrap()
57 }
58
59 pub async fn decode(
60 &self,
61 input: Vec<u8>,
62 force: bool,
63 detect_utf16: bool,
64 buffer_encoding: Option<Arc<Encoding>>,
65 ) -> anyhow::Result<String> {
66 // Check if the input starts with a BOM for UTF-16 encodings only if detect_utf16 is true.
67 if detect_utf16 {
68 if let Some(encoding) = match input.get(..2) {
69 Some([0xFF, 0xFE]) => Some(UTF_16LE),
70 Some([0xFE, 0xFF]) => Some(UTF_16BE),
71 _ => None,
72 } {
73 self.set(encoding);
74
75 if let Some(v) = buffer_encoding {
76 v.set(encoding)
77 }
78 }
79 }
80
81 let (cow, had_errors) = self.get().decode_with_bom_removal(&input);
82
83 if force {
84 return Ok(cow.to_string());
85 }
86
87 if !had_errors {
88 Ok(cow.to_string())
89 } else {
90 Err(anyhow::anyhow!(
91 "The file contains invalid bytes for the specified encoding: {}.\nThis usually means that the file is not a regular text file, or is encoded in a different encoding.\nContinuing to open it may result in data loss if saved.",
92 self.get().name()
93 ))
94 }
95 }
96
97 pub async fn encode(&self, input: String) -> anyhow::Result<Vec<u8>> {
98 if self.get() == UTF_16BE {
99 let mut data = Vec::<u8>::with_capacity(input.len() * 2);
100
101 // Convert the input string to UTF-16BE bytes
102 let utf16be_bytes = input.encode_utf16().flat_map(|u| u.to_be_bytes());
103
104 data.extend(utf16be_bytes);
105 return Ok(data);
106 } else if self.get() == UTF_16LE {
107 let mut data = Vec::<u8>::with_capacity(input.len() * 2);
108
109 // Convert the input string to UTF-16LE bytes
110 let utf16le_bytes = input.encode_utf16().flat_map(|u| u.to_le_bytes());
111
112 data.extend(utf16le_bytes);
113 return Ok(data);
114 } else {
115 let (cow, _encoding_used, _had_errors) = self.get().encode(&input);
116
117 Ok(cow.into_owned())
118 }
119 }
120
121 pub fn reset(&self) {
122 self.set(UTF_8);
123 }
124}
125
126/// Convert a byte vector from a specified encoding to a UTF-8 string.
127pub async fn to_utf8(
128 input: Vec<u8>,
129 options: &EncodingOptions,
130 buffer_encoding: Option<Arc<Encoding>>,
131) -> anyhow::Result<String> {
132 options
133 .encoding
134 .decode(
135 input,
136 options.force.load(std::sync::atomic::Ordering::Acquire),
137 options
138 .detect_utf16
139 .load(std::sync::atomic::Ordering::Acquire),
140 buffer_encoding,
141 )
142 .await
143}
144
145/// Convert a UTF-8 string to a byte vector in a specified encoding.
146pub async fn from_utf8(input: String, target: Encoding) -> anyhow::Result<Vec<u8>> {
147 target.encode(input).await
148}
149
150pub struct EncodingOptions {
151 pub encoding: Arc<Encoding>,
152 pub force: AtomicBool,
153 pub detect_utf16: AtomicBool,
154}
155
156impl EncodingOptions {
157 pub fn reset(&self) {
158 self.encoding.reset();
159
160 self.force
161 .store(false, std::sync::atomic::Ordering::Release);
162
163 self.detect_utf16
164 .store(true, std::sync::atomic::Ordering::Release);
165 }
166}
167
168impl Default for EncodingOptions {
169 fn default() -> Self {
170 EncodingOptions {
171 encoding: Arc::new(Encoding::default()),
172 force: AtomicBool::new(false),
173 detect_utf16: AtomicBool::new(true),
174 }
175 }
176}
177
178impl Clone for EncodingOptions {
179 fn clone(&self) -> Self {
180 EncodingOptions {
181 encoding: Arc::new(self.encoding.get().into()),
182 force: AtomicBool::new(self.force.load(std::sync::atomic::Ordering::Acquire)),
183 detect_utf16: AtomicBool::new(
184 self.detect_utf16.load(std::sync::atomic::Ordering::Acquire),
185 ),
186 }
187 }
188}