1//! Encoding and decoding utilities using the `encoding_rs` crate.
2use std::{
3 fmt::Debug,
4 sync::{Arc, Mutex},
5};
6
7use anyhow::Result;
8use encoding_rs::Encoding;
9
/// A newtype around a `&'static encoding_rs::Encoding`.
///
/// The wrapper exists so this module can attach its own trait implementations
/// (`Debug`, `Default`, `PartialEq`, `Clone`, `Send`, `Sync`) and the
/// `decode`/`encode` helpers to an encoding handle. The wrapped reference has a
/// `'static` lifetime, so the value is cheap to copy around and never dangles.
///
/// NOTE(review): `&'static Encoding` is presumably already `Send + Sync` on its
/// own, in which case the manual `unsafe impl`s in this file are redundant —
/// confirm against the `encoding_rs` documentation.
pub struct EncodingWrapper(pub &'static Encoding);
13
14impl Debug for EncodingWrapper {
15 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
16 f.debug_tuple(&format!("EncodingWrapper{:?}", self.0))
17 .field(&self.0.name())
18 .finish()
19 }
20}
21
22impl Default for EncodingWrapper {
23 fn default() -> Self {
24 EncodingWrapper(encoding_rs::UTF_8)
25 }
26}
27
/// Unit marker type with no fields.
///
/// NOTE(review): nothing in the visible part of this file implements a trait
/// for it or references it; presumably it is (or was) a visitor used when
/// deserializing an `EncodingWrapper` — confirm, or remove if unused.
pub struct EncodingWrapperVisitor;
29
30impl PartialEq for EncodingWrapper {
31 fn eq(&self, other: &Self) -> bool {
32 self.0.name() == other.0.name()
33 }
34}
35
// SAFETY: `EncodingWrapper` holds nothing but a `&'static Encoding`, i.e. a
// shared reference with `'static` lifetime, and the code visible in this file
// only ever reads through that reference.
// NOTE(review): if `Encoding` is itself `Sync`, the compiler derives both
// traits automatically and these manual `unsafe` impls are redundant — confirm
// and consider removing them.
unsafe impl Send for EncodingWrapper {}
unsafe impl Sync for EncodingWrapper {}
38
39impl Clone for EncodingWrapper {
40 fn clone(&self) -> Self {
41 EncodingWrapper(self.0)
42 }
43}
44
45impl EncodingWrapper {
46 pub fn new(encoding: &'static Encoding) -> EncodingWrapper {
47 EncodingWrapper(encoding)
48 }
49
50 pub fn get_encoding(&self) -> &'static Encoding {
51 self.0
52 }
53
54 pub async fn decode(
55 &mut self,
56 input: Vec<u8>,
57 force: bool,
58 buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
59 ) -> Result<String> {
60 // Check if the input starts with a BOM for UTF-16 encodings only if not forced to
61 // use the encoding specified.
62 if !force {
63 if let Some(encoding) = match input.get(..2) {
64 Some([0xFF, 0xFE]) => Some(encoding_rs::UTF_16LE),
65 Some([0xFE, 0xFF]) => Some(encoding_rs::UTF_16BE),
66 _ => None,
67 } {
68 self.0 = encoding;
69
70 if let Some(v) = buffer_encoding {
71 if let Ok(mut v) = (*v).lock() {
72 *v = encoding;
73 }
74 }
75 }
76 }
77
78 let (cow, _had_errors) = self.0.decode_with_bom_removal(&input);
79
80 if !_had_errors {
81 Ok(cow.to_string())
82 } else {
83 // If there were decoding errors, return an error.
84 Err(anyhow::anyhow!(
85 "The file contains invalid bytes for the specified encoding: {}. This usually menas that the file is not a regular text file, or is encoded in a different encoding. Continuing to open it may result in data loss if saved.",
86 self.0.name()
87 ))
88 }
89 }
90
91 pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
92 if self.0 == encoding_rs::UTF_16BE {
93 let mut data = Vec::<u8>::with_capacity(input.len() * 2);
94
95 // Convert the input string to UTF-16BE bytes
96 let utf16be_bytes = input.encode_utf16().flat_map(|u| u.to_be_bytes());
97
98 data.extend(utf16be_bytes);
99 return Ok(data);
100 } else if self.0 == encoding_rs::UTF_16LE {
101 let mut data = Vec::<u8>::with_capacity(input.len() * 2);
102
103 // Convert the input string to UTF-16LE bytes
104 let utf16le_bytes = input.encode_utf16().flat_map(|u| u.to_le_bytes());
105
106 data.extend(utf16le_bytes);
107 return Ok(data);
108 } else {
109 let (cow, _encoding_used, _had_errors) = self.0.encode(&input);
110 // `encoding_rs` handles unencodable characters by replacing them with
111 // appropriate substitutes in the output, so we return the result even if there were errors.
112 // This maintains consistency with the decode behaviour.
113 Ok(cow.into_owned())
114 }
115 }
116}
117
118/// Convert a byte vector from a specified encoding to a UTF-8 string.
119pub async fn to_utf8(
120 input: Vec<u8>,
121 mut encoding: EncodingWrapper,
122 force: bool,
123 buffer_encoding: Option<Arc<Mutex<&'static Encoding>>>,
124) -> Result<String> {
125 encoding.decode(input, force, buffer_encoding).await
126}
127
128/// Convert a UTF-8 string to a byte vector in a specified encoding.
129pub async fn from_utf8(input: String, target: EncodingWrapper) -> Result<Vec<u8>> {
130 target.encode(input).await
131}