// file_content.rs

  1pub const FILE_ANALYSIS_BYTES: usize = 1024;
  2
  3#[derive(Debug, PartialEq)]
  4pub enum ByteContent {
  5    Utf16Le,
  6    Utf16Be,
  7    Binary,
  8    Unknown,
  9}
 10
 11// Heuristic check using null byte distribution plus a generic text-likeness
 12// heuristic. This prefers UTF-16 when many bytes are NUL and otherwise
 13// distinguishes between text-like and binary-like content.
 14pub fn analyze_byte_content(bytes: &[u8]) -> ByteContent {
 15    if bytes.len() < 2 {
 16        return ByteContent::Unknown;
 17    }
 18
 19    if is_known_binary_header(bytes) {
 20        return ByteContent::Binary;
 21    }
 22
 23    let limit = bytes.len().min(FILE_ANALYSIS_BYTES);
 24    let mut even_null_count = 0usize;
 25    let mut odd_null_count = 0usize;
 26    let mut non_text_like_count = 0usize;
 27
 28    for (i, &byte) in bytes[..limit].iter().enumerate() {
 29        if byte == 0 {
 30            if i % 2 == 0 {
 31                even_null_count += 1;
 32            } else {
 33                odd_null_count += 1;
 34            }
 35            non_text_like_count += 1;
 36            continue;
 37        }
 38
 39        let is_text_like = match byte {
 40            b'\t' | b'\n' | b'\r' | 0x0C => true,
 41            0x20..=0x7E => true,
 42            // Treat bytes that are likely part of UTF-8 or single-byte encodings as text-like.
 43            0x80..=0xBF | 0xC2..=0xF4 => true,
 44            _ => false,
 45        };
 46
 47        if !is_text_like {
 48            non_text_like_count += 1;
 49        }
 50    }
 51
 52    let total_null_count = even_null_count + odd_null_count;
 53
 54    // If there are no NUL bytes at all, this is overwhelmingly likely to be text.
 55    if total_null_count == 0 {
 56        return ByteContent::Unknown;
 57    }
 58
 59    let has_significant_nulls = total_null_count >= limit / 16;
 60    let nulls_skew_to_even = even_null_count > odd_null_count * 4;
 61    let nulls_skew_to_odd = odd_null_count > even_null_count * 4;
 62
 63    if has_significant_nulls {
 64        let sample = &bytes[..limit];
 65
 66        // UTF-16BE ASCII: [0x00, char] — nulls at even positions (high byte first)
 67        // UTF-16LE ASCII: [char, 0x00] — nulls at odd positions (low byte first)
 68
 69        if nulls_skew_to_even && is_plausible_utf16_text(sample, false) {
 70            return ByteContent::Utf16Be;
 71        }
 72
 73        if nulls_skew_to_odd && is_plausible_utf16_text(sample, true) {
 74            return ByteContent::Utf16Le;
 75        }
 76
 77        return ByteContent::Binary;
 78    }
 79
 80    if non_text_like_count * 100 < limit * 8 {
 81        ByteContent::Unknown
 82    } else {
 83        ByteContent::Binary
 84    }
 85}
 86
 87fn is_known_binary_header(bytes: &[u8]) -> bool {
 88    bytes.starts_with(b"%PDF-") // PDF
 89        || bytes.starts_with(b"PK\x03\x04") // ZIP local header
 90        || bytes.starts_with(b"PK\x05\x06") // ZIP end of central directory
 91        || bytes.starts_with(b"PK\x07\x08") // ZIP spanning/splitting
 92        || bytes.starts_with(b"\x89PNG\r\n\x1a\n") // PNG
 93        || bytes.starts_with(b"\xFF\xD8\xFF") // JPEG
 94        || bytes.starts_with(b"GIF87a") // GIF87a
 95        || bytes.starts_with(b"GIF89a") // GIF89a
 96        || bytes.starts_with(b"IWAD") // Doom IWAD archive
 97        || bytes.starts_with(b"PWAD") // Doom PWAD archive
 98        || bytes.starts_with(b"RIFF") // WAV, AVI, WebP
 99        || bytes.starts_with(b"OggS") // OGG (Vorbis, Opus, FLAC)
100        || bytes.starts_with(b"fLaC") // FLAC
101        || bytes.starts_with(b"ID3") // MP3 with ID3v2 tag
102        || bytes.starts_with(b"\xFF\xFB") // MP3 frame sync (MPEG1 Layer3)
103        || bytes.starts_with(b"\xFF\xFA") // MP3 frame sync (MPEG1 Layer3)
104        || bytes.starts_with(b"\xFF\xF3") // MP3 frame sync (MPEG2 Layer3)
105        || bytes.starts_with(b"\xFF\xF2") // MP3 frame sync (MPEG2 Layer3)
106}
107
108// Null byte skew alone is not enough to identify UTF-16 -- binary formats with
109// small 16-bit values (like PCM audio) produce the same pattern. Decode the
110// bytes as UTF-16 and reject if too many code units land in control character
111// ranges or form unpaired surrogates, which real text almost never contains.
112fn is_plausible_utf16_text(bytes: &[u8], little_endian: bool) -> bool {
113    let mut suspicious_count = 0usize;
114    let mut total = 0usize;
115
116    let mut i = 0;
117    while let Some(code_unit) = read_u16(bytes, i, little_endian) {
118        total += 1;
119
120        match code_unit {
121            0x0009 | 0x000A | 0x000C | 0x000D => {}
122            // C0/C1 control characters and non-characters
123            0x0000..=0x001F | 0x007F..=0x009F | 0xFFFE | 0xFFFF => suspicious_count += 1,
124            0xD800..=0xDBFF => {
125                let next_offset = i + 2;
126                let has_low_surrogate = read_u16(bytes, next_offset, little_endian)
127                    .is_some_and(|next| (0xDC00..=0xDFFF).contains(&next));
128                if has_low_surrogate {
129                    total += 1;
130                    i += 2;
131                } else {
132                    suspicious_count += 1;
133                }
134            }
135            // Lone low surrogate without a preceding high surrogate
136            0xDC00..=0xDFFF => suspicious_count += 1,
137            _ => {}
138        }
139
140        i += 2;
141    }
142
143    if total == 0 {
144        return false;
145    }
146
147    // Real UTF-16 text has near-zero control characters; binary data with
148    // small 16-bit values typically exceeds 5%. 2% provides a safe margin.
149    suspicious_count * 100 < total * 2
150}
151
/// Reads the 16-bit value stored at `bytes[offset]` and `bytes[offset + 1]`.
///
/// The two bytes are interpreted as little-endian when `little_endian` is
/// `true` and as big-endian otherwise. Returns `None` when fewer than two
/// bytes are available at `offset`.
fn read_u16(bytes: &[u8], offset: usize, little_endian: bool) -> Option<u16> {
    // `get(offset..)` is `None` when `offset` lies past the end of the slice;
    // the slice pattern then requires at least two bytes in the remaining tail.
    let pair = match bytes.get(offset..)? {
        [first, second, ..] => [*first, *second],
        _ => return None,
    };

    Some(if little_endian {
        u16::from_le_bytes(pair)
    } else {
        u16::from_be_bytes(pair)
    })
}