/// Maximum number of leading bytes that [`analyze_byte_content`] samples
/// when classifying a buffer (1 KiB).
pub const FILE_ANALYSIS_BYTES: usize = 1 << 10;
2
/// Outcome of the byte-level heuristic performed by `analyze_byte_content`.
///
/// The type is a plain tag, so it is `Copy` and fully comparable (`Eq`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ByteContent {
    /// NUL bytes cluster at odd offsets and the sample decodes as plausible
    /// little-endian UTF-16 text.
    Utf16Le,
    /// NUL bytes cluster at even offsets and the sample decodes as plausible
    /// big-endian UTF-16 text.
    Utf16Be,
    /// The sample starts with a known binary signature, has many NUL bytes
    /// that do not look like UTF-16, or has too many non-text-like bytes.
    Binary,
    /// The heuristic reached no verdict: the input was too short, contained
    /// no NUL bytes, or looked mostly text-like.
    Unknown,
}
10
11// Heuristic check using null byte distribution plus a generic text-likeness
12// heuristic. This prefers UTF-16 when many bytes are NUL and otherwise
13// distinguishes between text-like and binary-like content.
14pub fn analyze_byte_content(bytes: &[u8]) -> ByteContent {
15 if bytes.len() < 2 {
16 return ByteContent::Unknown;
17 }
18
19 if is_known_binary_header(bytes) {
20 return ByteContent::Binary;
21 }
22
23 let limit = bytes.len().min(FILE_ANALYSIS_BYTES);
24 let mut even_null_count = 0usize;
25 let mut odd_null_count = 0usize;
26 let mut non_text_like_count = 0usize;
27
28 for (i, &byte) in bytes[..limit].iter().enumerate() {
29 if byte == 0 {
30 if i % 2 == 0 {
31 even_null_count += 1;
32 } else {
33 odd_null_count += 1;
34 }
35 non_text_like_count += 1;
36 continue;
37 }
38
39 let is_text_like = match byte {
40 b'\t' | b'\n' | b'\r' | 0x0C => true,
41 0x20..=0x7E => true,
42 // Treat bytes that are likely part of UTF-8 or single-byte encodings as text-like.
43 0x80..=0xBF | 0xC2..=0xF4 => true,
44 _ => false,
45 };
46
47 if !is_text_like {
48 non_text_like_count += 1;
49 }
50 }
51
52 let total_null_count = even_null_count + odd_null_count;
53
54 // If there are no NUL bytes at all, this is overwhelmingly likely to be text.
55 if total_null_count == 0 {
56 return ByteContent::Unknown;
57 }
58
59 let has_significant_nulls = total_null_count >= limit / 16;
60 let nulls_skew_to_even = even_null_count > odd_null_count * 4;
61 let nulls_skew_to_odd = odd_null_count > even_null_count * 4;
62
63 if has_significant_nulls {
64 let sample = &bytes[..limit];
65
66 // UTF-16BE ASCII: [0x00, char] — nulls at even positions (high byte first)
67 // UTF-16LE ASCII: [char, 0x00] — nulls at odd positions (low byte first)
68
69 if nulls_skew_to_even && is_plausible_utf16_text(sample, false) {
70 return ByteContent::Utf16Be;
71 }
72
73 if nulls_skew_to_odd && is_plausible_utf16_text(sample, true) {
74 return ByteContent::Utf16Le;
75 }
76
77 return ByteContent::Binary;
78 }
79
80 if non_text_like_count * 100 < limit * 8 {
81 ByteContent::Unknown
82 } else {
83 ByteContent::Binary
84 }
85}
86
/// Returns `true` when `bytes` begins with one of a fixed set of signatures
/// belonging to well-known binary file formats.
fn is_known_binary_header(bytes: &[u8]) -> bool {
    // Leading signatures, checked in order; any single match is enough.
    const MAGIC_PREFIXES: &[&[u8]] = &[
        b"%PDF-",              // PDF
        b"PK\x03\x04",         // ZIP local header
        b"PK\x05\x06",         // ZIP end of central directory
        b"PK\x07\x08",         // ZIP spanning/splitting
        b"\x89PNG\r\n\x1a\n",  // PNG
        b"\xFF\xD8\xFF",       // JPEG
        b"GIF87a",             // GIF87a
        b"GIF89a",             // GIF89a
        b"IWAD",               // Doom IWAD archive
        b"PWAD",               // Doom PWAD archive
        b"RIFF",               // WAV, AVI, WebP
        b"OggS",               // OGG (Vorbis, Opus, FLAC)
        b"fLaC",               // FLAC
        b"ID3",                // MP3 with ID3v2 tag
        b"\xFF\xFB",           // MP3 frame sync (MPEG1 Layer3)
        b"\xFF\xFA",           // MP3 frame sync (MPEG1 Layer3)
        b"\xFF\xF3",           // MP3 frame sync (MPEG2 Layer3)
        b"\xFF\xF2",           // MP3 frame sync (MPEG2 Layer3)
    ];

    MAGIC_PREFIXES
        .iter()
        .any(|&magic| bytes.starts_with(magic))
}
107
108// Null byte skew alone is not enough to identify UTF-16 -- binary formats with
109// small 16-bit values (like PCM audio) produce the same pattern. Decode the
110// bytes as UTF-16 and reject if too many code units land in control character
111// ranges or form unpaired surrogates, which real text almost never contains.
112fn is_plausible_utf16_text(bytes: &[u8], little_endian: bool) -> bool {
113 let mut suspicious_count = 0usize;
114 let mut total = 0usize;
115
116 let mut i = 0;
117 while let Some(code_unit) = read_u16(bytes, i, little_endian) {
118 total += 1;
119
120 match code_unit {
121 0x0009 | 0x000A | 0x000C | 0x000D => {}
122 // C0/C1 control characters and non-characters
123 0x0000..=0x001F | 0x007F..=0x009F | 0xFFFE | 0xFFFF => suspicious_count += 1,
124 0xD800..=0xDBFF => {
125 let next_offset = i + 2;
126 let has_low_surrogate = read_u16(bytes, next_offset, little_endian)
127 .is_some_and(|next| (0xDC00..=0xDFFF).contains(&next));
128 if has_low_surrogate {
129 total += 1;
130 i += 2;
131 } else {
132 suspicious_count += 1;
133 }
134 }
135 // Lone low surrogate without a preceding high surrogate
136 0xDC00..=0xDFFF => suspicious_count += 1,
137 _ => {}
138 }
139
140 i += 2;
141 }
142
143 if total == 0 {
144 return false;
145 }
146
147 // Real UTF-16 text has near-zero control characters; binary data with
148 // small 16-bit values typically exceeds 5%. 2% provides a safe margin.
149 suspicious_count * 100 < total * 2
150}
151
/// Reads the 16-bit value stored in the two bytes starting at `offset`,
/// interpreting them in the requested byte order.
///
/// Returns `None` when fewer than two bytes are available at `offset`.
fn read_u16(bytes: &[u8], offset: usize, little_endian: bool) -> Option<u16> {
    match bytes.get(offset..)? {
        [first, second, ..] => {
            let raw = [*first, *second];
            Some(if little_endian {
                u16::from_le_bytes(raw)
            } else {
                u16::from_be_bytes(raw)
            })
        }
        // Zero or one byte left: not enough for a full code unit.
        _ => None,
    }
}