encodings.rs

 1//! Encoding and decoding utilities using the `encoding_rs` crate.
 2use std::fmt::Debug;
 3
 4use anyhow::Result;
 5use encoding_rs::Encoding;
 6
/// A thin newtype around a `&'static encoding_rs::Encoding`.
///
/// The wrapper gives this module a local type on which to hang its own trait
/// implementations and the `decode`/`encode` helpers. It stores nothing but a
/// `'static` reference, so a value can be shared with or moved to other
/// threads without tying it to any shorter-lived data.
pub struct EncodingWrapper(&'static Encoding);
10
11impl Debug for EncodingWrapper {
12    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
13        f.debug_tuple(&format!("EncodingWrapper{:?}", self.0))
14            .field(&self.0.name())
15            .finish()
16    }
17}
18
/// Unit marker type with no fields.
///
/// NOTE(review): no trait implementations for this type are visible in this
/// part of the file. The name suggests a serde-style visitor used when
/// deserializing an `EncodingWrapper` — confirm against the call sites.
pub struct EncodingWrapperVisitor;
20
21impl PartialEq for EncodingWrapper {
22    fn eq(&self, other: &Self) -> bool {
23        self.0.name() == other.0.name()
24    }
25}
26
27unsafe impl Send for EncodingWrapper {}
28unsafe impl Sync for EncodingWrapper {}
29
30impl Clone for EncodingWrapper {
31    fn clone(&self) -> Self {
32        EncodingWrapper(self.0)
33    }
34}
35
36impl EncodingWrapper {
37    pub fn new(encoding: &'static Encoding) -> EncodingWrapper {
38        EncodingWrapper(encoding)
39    }
40
41    pub fn get_encoding(&self) -> &'static Encoding {
42        self.0
43    }
44
45    pub async fn decode(&self, input: Vec<u8>) -> Result<String> {
46        let (cow, _had_errors) = self.0.decode_with_bom_removal(&input);
47
48        // `encoding_rs` handles invalid bytes by replacing them with replacement characters
49        // in the output string, so we return the result even if there were errors.
50        // This preserves the original behaviour where files with invalid bytes could still be opened.
51        Ok(cow.into_owned())
52    }
53
54    pub async fn encode(&self, input: String) -> Result<Vec<u8>> {
55        if self.0 == encoding_rs::UTF_16BE {
56            let mut data = Vec::<u8>::new();
57            data.reserve(input.len() * 2); // Reserve space for UTF-16BE bytes
58
59            // Convert the input string to UTF-16BE bytes
60            let utf16be_bytes: Vec<u8> =
61                input.encode_utf16().flat_map(|u| u.to_be_bytes()).collect();
62
63            data.extend(utf16be_bytes);
64            return Ok(data);
65        } else if self.0 == encoding_rs::UTF_16LE {
66            let mut data = Vec::<u8>::new();
67            data.reserve(input.len() * 2); // Reserve space for UTF-16LE bytes
68
69            // Convert the input string to UTF-16LE bytes
70            let utf16le_bytes: Vec<u8> =
71                input.encode_utf16().flat_map(|u| u.to_le_bytes()).collect();
72
73            data.extend(utf16le_bytes);
74            return Ok(data);
75        } else {
76            let (cow, _encoding_used, _had_errors) = self.0.encode(&input);
77            // `encoding_rs` handles unencodable characters by replacing them with
78            // appropriate substitutes in the output, so we return the result even if there were errors.
79            // This maintains consistency with the decode behaviour.
80            Ok(cow.into_owned())
81        }
82    }
83}
84
85/// Convert a byte vector from a specified encoding to a UTF-8 string.
86pub async fn to_utf8(input: Vec<u8>, encoding: EncodingWrapper) -> Result<String> {
87    encoding.decode(input).await
88}
89
90/// Convert a UTF-8 string to a byte vector in a specified encoding.
91pub async fn from_utf8(input: String, target: EncodingWrapper) -> Result<Vec<u8>> {
92    target.encode(input).await
93}