worktree: Fix binary files misdetected as UTF-16 (#50890)

crates/worktree/src/worktree.rs 🔗

@@ -6061,7 +6061,7 @@ fn decode_byte_full(
     }
 }
 
-#[derive(PartialEq)]
+#[derive(Debug, PartialEq)]
 enum ByteContent {
     Utf16Le,
     Utf16Be,
@@ -6117,13 +6117,24 @@ fn analyze_byte_content(bytes: &[u8]) -> ByteContent {
         return ByteContent::Unknown;
     }
 
-    if total_null_count >= limit / 16 {
-        if even_null_count > odd_null_count * 4 {
+    let has_significant_nulls = total_null_count >= limit / 16;
+    let nulls_skew_to_even = even_null_count > odd_null_count * 4;
+    let nulls_skew_to_odd = odd_null_count > even_null_count * 4;
+
+    if has_significant_nulls {
+        let sample = &bytes[..limit];
+
+        // UTF-16BE ASCII: [0x00, char] — nulls at even positions (high byte first)
+        // UTF-16LE ASCII: [char, 0x00] — nulls at odd positions (low byte first)
+
+        if nulls_skew_to_even && is_plausible_utf16_text(sample, false) {
             return ByteContent::Utf16Be;
         }
-        if odd_null_count > even_null_count * 4 {
+
+        if nulls_skew_to_odd && is_plausible_utf16_text(sample, true) {
             return ByteContent::Utf16Le;
         }
+
         return ByteContent::Binary;
     }
 
@@ -6145,4 +6156,208 @@ fn is_known_binary_header(bytes: &[u8]) -> bool {
         || bytes.starts_with(b"GIF89a") // GIF89a
         || bytes.starts_with(b"IWAD") // Doom IWAD archive
         || bytes.starts_with(b"PWAD") // Doom PWAD archive
+        || bytes.starts_with(b"RIFF") // WAV, AVI, WebP
+        || bytes.starts_with(b"OggS") // OGG (Vorbis, Opus, FLAC)
+        || bytes.starts_with(b"fLaC") // FLAC
+        || bytes.starts_with(b"ID3") // MP3 with ID3v2 tag
+        || bytes.starts_with(b"\xFF\xFB") // MP3 frame sync (MPEG1 Layer3)
+        || bytes.starts_with(b"\xFF\xFA") // MP3 frame sync (MPEG1 Layer3)
+        || bytes.starts_with(b"\xFF\xF3") // MP3 frame sync (MPEG2 Layer3)
+        || bytes.starts_with(b"\xFF\xF2") // MP3 frame sync (MPEG2 Layer3)
+}
+
+// Null byte skew alone is not enough to identify UTF-16 -- binary formats with
+// small 16-bit values (like PCM audio) produce the same pattern. Decode the
+// bytes as UTF-16 and reject if too many code units land in control character
+// ranges or form unpaired surrogates, which real text almost never contains.
+fn is_plausible_utf16_text(bytes: &[u8], little_endian: bool) -> bool {
+    let mut suspicious_count = 0usize;
+    let mut total = 0usize;
+
+    let mut i = 0;
+    while let Some(code_unit) = read_u16(bytes, i, little_endian) {
+        total += 1;
+
+        match code_unit {
+            0x0009 | 0x000A | 0x000C | 0x000D => {}
+            // C0/C1 control characters and non-characters
+            0x0000..=0x001F | 0x007F..=0x009F | 0xFFFE | 0xFFFF => suspicious_count += 1,
+            0xD800..=0xDBFF => {
+                let next_offset = i + 2;
+                let has_low_surrogate = read_u16(bytes, next_offset, little_endian)
+                    .is_some_and(|next| (0xDC00..=0xDFFF).contains(&next));
+                if has_low_surrogate {
+                    total += 1;
+                    i += 2;
+                } else {
+                    suspicious_count += 1;
+                }
+            }
+            // Lone low surrogate without a preceding high surrogate
+            0xDC00..=0xDFFF => suspicious_count += 1,
+            _ => {}
+        }
+
+        i += 2;
+    }
+
+    if total == 0 {
+        return false;
+    }
+
+    // Real UTF-16 text has near-zero control characters; binary data with
+    // small 16-bit values typically exceeds 5%. 2% provides a safe margin.
+    suspicious_count * 100 < total * 2
+}
+
+fn read_u16(bytes: &[u8], offset: usize, little_endian: bool) -> Option<u16> {
+    let pair = [*bytes.get(offset)?, *bytes.get(offset + 1)?];
+    if little_endian {
+        return Some(u16::from_le_bytes(pair));
+    }
+    Some(u16::from_be_bytes(pair))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// reproduction of issue #50785
+    fn build_pcm16_wav_bytes() -> Vec<u8> {
+        let header: Vec<u8> = vec![
+            /*  RIFF header  */
+            0x52, 0x49, 0x46, 0x46, // "RIFF"
+            0xc6, 0xcf, 0x00, 0x00, // file size: 8
+            0x57, 0x41, 0x56, 0x45, // "WAVE"
+            /*  fmt chunk  */
+            0x66, 0x6d, 0x74, 0x20, // "fmt "
+            0x10, 0x00, 0x00, 0x00, // chunk size: 16
+            0x01, 0x00, // format: PCM (1)
+            0x01, 0x00, // channels: 1 (mono)
+            0x80, 0x3e, 0x00, 0x00, // sample rate: 16000
+            0x00, 0x7d, 0x00, 0x00, // byte rate: 32000
+            0x02, 0x00, // block align: 2
+            0x10, 0x00, // bits per sample: 16
+            /*  LIST chunk  */
+            0x4c, 0x49, 0x53, 0x54, // "LIST"
+            0x1a, 0x00, 0x00, 0x00, // chunk size: 26
+            0x49, 0x4e, 0x46, 0x4f, // "INFO"
+            0x49, 0x53, 0x46, 0x54, // "ISFT"
+            0x0d, 0x00, 0x00, 0x00, // sub-chunk size: 13
+            0x4c, 0x61, 0x76, 0x66, 0x36, 0x32, 0x2e, 0x33, // "Lavf62.3"
+            0x2e, 0x31, 0x30, 0x30, 0x00, // ".100\0"
+            /* padding byte for word alignment */
+            0x00, // data chunk header
+            0x64, 0x61, 0x74, 0x61, // "data"
+            0x80, 0xcf, 0x00, 0x00, // chunk size
+        ];
+
+        let mut bytes = header;
+
+        // fill remaining space up to `FILE_ANALYSIS_BYTES` with synthetic PCM
+        let audio_bytes_needed = FILE_ANALYSIS_BYTES - bytes.len();
+        for i in 0..(audio_bytes_needed / 2) {
+            let sample = (i & 0xFF) as u8;
+            bytes.push(sample); // low byte: varies
+            bytes.push(0x00); // high byte: zero for small values
+        }
+
+        bytes
+    }
+
+    #[test]
+    fn test_pcm16_wav_detected_as_binary() {
+        let wav_bytes = build_pcm16_wav_bytes();
+        assert_eq!(wav_bytes.len(), FILE_ANALYSIS_BYTES);
+
+        let result = analyze_byte_content(&wav_bytes);
+        assert_eq!(
+            result,
+            ByteContent::Binary,
+            "PCM 16-bit WAV should be detected as Binary via RIFF header"
+        );
+    }
+
+    #[test]
+    fn test_le16_binary_not_misdetected_as_utf16le() {
+        let mut bytes = b"FAKE".to_vec();
+        while bytes.len() < FILE_ANALYSIS_BYTES {
+            let sample = (bytes.len() & 0xFF) as u8;
+            bytes.push(sample);
+            bytes.push(0x00);
+        }
+        bytes.truncate(FILE_ANALYSIS_BYTES);
+
+        let result = analyze_byte_content(&bytes);
+        assert_eq!(
+            result,
+            ByteContent::Binary,
+            "LE 16-bit binary with control characters should be detected as Binary"
+        );
+    }
+
+    #[test]
+    fn test_be16_binary_not_misdetected_as_utf16be() {
+        let mut bytes = b"FAKE".to_vec();
+        while bytes.len() < FILE_ANALYSIS_BYTES {
+            bytes.push(0x00);
+            let sample = (bytes.len() & 0xFF) as u8;
+            bytes.push(sample);
+        }
+        bytes.truncate(FILE_ANALYSIS_BYTES);
+
+        let result = analyze_byte_content(&bytes);
+        assert_eq!(
+            result,
+            ByteContent::Binary,
+            "BE 16-bit binary with control characters should be detected as Binary"
+        );
+    }
+
+    #[test]
+    fn test_utf16le_text_detected_as_utf16le() {
+        let text = "Hello, world! This is a UTF-16 test string. ";
+        let mut bytes = Vec::new();
+        while bytes.len() < FILE_ANALYSIS_BYTES {
+            bytes.extend(text.encode_utf16().flat_map(|u| u.to_le_bytes()));
+        }
+        bytes.truncate(FILE_ANALYSIS_BYTES);
+
+        assert_eq!(analyze_byte_content(&bytes), ByteContent::Utf16Le);
+    }
+
+    #[test]
+    fn test_utf16be_text_detected_as_utf16be() {
+        let text = "Hello, world! This is a UTF-16 test string. ";
+        let mut bytes = Vec::new();
+        while bytes.len() < FILE_ANALYSIS_BYTES {
+            bytes.extend(text.encode_utf16().flat_map(|u| u.to_be_bytes()));
+        }
+        bytes.truncate(FILE_ANALYSIS_BYTES);
+
+        assert_eq!(analyze_byte_content(&bytes), ByteContent::Utf16Be);
+    }
+
+    #[test]
+    fn test_known_binary_headers() {
+        let cases: &[(&[u8], &str)] = &[
+            (b"RIFF\x00\x00\x00\x00WAVE", "WAV"),
+            (b"RIFF\x00\x00\x00\x00AVI ", "AVI"),
+            (b"OggS\x00\x02", "OGG"),
+            (b"fLaC\x00\x00", "FLAC"),
+            (b"ID3\x03\x00", "MP3 ID3v2"),
+            (b"\xFF\xFB\x90\x00", "MP3 MPEG1 Layer3"),
+            (b"\xFF\xF3\x90\x00", "MP3 MPEG2 Layer3"),
+        ];
+
+        for (header, label) in cases {
+            let mut bytes = header.to_vec();
+            bytes.resize(FILE_ANALYSIS_BYTES, 0x41); // pad with 'A'
+            assert_eq!(
+                analyze_byte_content(&bytes),
+                ByteContent::Binary,
+                "{label} should be detected as Binary"
+            );
+        }
+    }
 }
worktree: Fix binary files misdetected as UTF-16 (#50890)

Change summary

Detailed changes

crates/worktree/src/worktree.rs 🔗