1//! Encoding and decoding utilities using the `encoding_rs` crate.
2use std::{
3 fmt::Debug,
4 sync::{Arc, Mutex},
5};
6
7use anyhow::Result;
8use encoding_rs::Encoding;
9
/// A newtype around a `&'static encoding_rs::Encoding`.
///
/// The wrapper exists so that this module can attach its own trait
/// implementations (`Debug`, `PartialEq`, `Clone`, `Send`, `Sync`) and the
/// `decode`/`encode` helpers to an encoding handle. The wrapped reference has
/// the `'static` lifetime, so the value it points to outlives every thread
/// that may receive a copy of the wrapper.
pub struct EncodingWrapper(&'static Encoding);
13
14impl Debug for EncodingWrapper {
15 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
16 f.debug_tuple(&format!("EncodingWrapper{:?}", self.0))
17 .field(&self.0.name())
18 .finish()
19 }
20}
21
/// Field-less marker type associated with [`EncodingWrapper`].
///
/// NOTE(review): no trait implementations for this type are visible in this
/// part of the file; the name suggests a (de)serialization visitor for
/// `EncodingWrapper` — confirm against the code that uses it.
pub struct EncodingWrapperVisitor;
23
24impl PartialEq for EncodingWrapper {
25 fn eq(&self, other: &Self) -> bool {
26 self.0.name() == other.0.name()
27 }
28}
29
// SAFETY: the wrapper's only field is a shared `&'static Encoding`; it owns no
// data and nothing in this file mutates through that reference, so moving or
// sharing the wrapper across threads only shares a read-only pointer to data
// that lives for the whole program.
// NOTE(review): `encoding_rs::Encoding` looks to be `Sync` already (the crate
// exposes its encodings as `static` items), which would make both impls
// redundant — confirm and consider removing the `unsafe`.
unsafe impl Send for EncodingWrapper {}
unsafe impl Sync for EncodingWrapper {}
32
33impl Clone for EncodingWrapper {
34 fn clone(&self) -> Self {
35 EncodingWrapper(self.0)
36 }
37}
38
39impl EncodingWrapper {
40 pub fn new(encoding: &'static Encoding) -> EncodingWrapper {
41 EncodingWrapper(encoding)
42 }
43
44 pub fn get_encoding(&self) -> &'static Encoding {
45 self.0
46 }
47
48 pub async fn decode(
49 &mut self,
50 input: Vec<u8>,
51 force: bool,
52 buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
53 ) -> Result<String> {
54 // Check if the input starts with a BOM for UTF-16 encodings only if not forced to
55 // use the encoding specified.
56 if !force {
57 if input.len() >= 2 {
58 if (input[0] == 0xFF) & (input[1] == 0xFE) {
59 self.0 = encoding_rs::UTF_16LE;
60
61 if let Some(v) = buffer_encoding {
62 if let Ok(mut v) = (*v).lock() {
63 *v = encoding_rs::UTF_16LE;
64 }
65 }
66 } else if (input.len() >= 2) & (input[0] == 0xFE) & (input[1] == 0xFF) {
67 self.0 = encoding_rs::UTF_16BE;
68
69 if let Some(v) = buffer_encoding {
70 if let Ok(mut v) = (*v).lock() {
71 *v = encoding_rs::UTF_16BE;
72 }
73 }
74 }
75 }
76 }
77
78 let (cow, _had_errors) = self.0.decode_with_bom_removal(&input);
79
80 // `encoding_rs` handles invalid bytes by replacing them with replacement characters
81 // in the output string, so we return the result even if there were errors.
82 // This preserves the original behaviour where files with invalid bytes could still be opened.
83 Ok(cow.into_owned())
84 }
85
86 pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
87 if self.0 == encoding_rs::UTF_16BE {
88 let mut data = Vec::<u8>::with_capacity(input.len() * 2);
89
90 // Convert the input string to UTF-16BE bytes
91 let utf16be_bytes: Vec<u8> =
92 input.encode_utf16().flat_map(|u| u.to_be_bytes()).collect();
93
94 data.extend(utf16be_bytes);
95 return Ok(data);
96 } else if self.0 == encoding_rs::UTF_16LE {
97 let mut data = Vec::<u8>::with_capacity(input.len() * 2);
98
99 // Convert the input string to UTF-16LE bytes
100 let utf16le_bytes: Vec<u8> =
101 input.encode_utf16().flat_map(|u| u.to_le_bytes()).collect();
102
103 data.extend(utf16le_bytes);
104 return Ok(data);
105 } else {
106 let (cow, _encoding_used, _had_errors) = self.0.encode(&input);
107 // `encoding_rs` handles unencodable characters by replacing them with
108 // appropriate substitutes in the output, so we return the result even if there were errors.
109 // This maintains consistency with the decode behaviour.
110 Ok(cow.into_owned())
111 }
112 }
113}
114
115/// Convert a byte vector from a specified encoding to a UTF-8 string.
116pub async fn to_utf8(
117 input: Vec<u8>,
118 mut encoding: EncodingWrapper,
119 force: bool,
120 buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
121) -> Result<String> {
122 encoding.decode(input, force, buffer_encoding).await
123}
124
125/// Convert a UTF-8 string to a byte vector in a specified encoding.
126pub async fn from_utf8(input: String, target: EncodingWrapper) -> Result<Vec<u8>> {
127 target.encode(input).await
128}