diff --git a/crates/worktree/src/worktree.rs b/crates/worktree/src/worktree.rs
index 44ba4e752cff778b7918b9a29935d0f0e1ebb614..46457982c91b2fe4a0cc5b05548ebc0b00a9f787 100644
--- a/crates/worktree/src/worktree.rs
+++ b/crates/worktree/src/worktree.rs
@@ -6061,7 +6061,7 @@ fn decode_byte_full(
     }
 }

-#[derive(PartialEq)]
+#[derive(Debug, PartialEq)]
 enum ByteContent {
     Utf16Le,
     Utf16Be,
@@ -6117,13 +6117,24 @@ fn analyze_byte_content(bytes: &[u8]) -> ByteContent {
         return ByteContent::Unknown;
     }

-    if total_null_count >= limit / 16 {
-        if even_null_count > odd_null_count * 4 {
+    let has_significant_nulls = total_null_count >= limit / 16;
+    let nulls_skew_to_even = even_null_count > odd_null_count * 4;
+    let nulls_skew_to_odd = odd_null_count > even_null_count * 4;
+
+    if has_significant_nulls {
+        let sample = &bytes[..limit];
+
+        // UTF-16BE ASCII: [0x00, char] — nulls at even positions (high byte first)
+        // UTF-16LE ASCII: [char, 0x00] — nulls at odd positions (low byte first)
+
+        if nulls_skew_to_even && is_plausible_utf16_text(sample, false) {
             return ByteContent::Utf16Be;
         }
-        if odd_null_count > even_null_count * 4 {
+
+        if nulls_skew_to_odd && is_plausible_utf16_text(sample, true) {
             return ByteContent::Utf16Le;
         }
+
         return ByteContent::Binary;
     }

@@ -6145,4 +6156,208 @@ fn is_known_binary_header(bytes: &[u8]) -> bool {
         || bytes.starts_with(b"GIF89a") // GIF89a
         || bytes.starts_with(b"IWAD") // Doom IWAD archive
         || bytes.starts_with(b"PWAD") // Doom PWAD archive
+        || bytes.starts_with(b"RIFF") // WAV, AVI, WebP
+        || bytes.starts_with(b"OggS") // OGG (Vorbis, Opus, FLAC)
+        || bytes.starts_with(b"fLaC") // FLAC
+        || bytes.starts_with(b"ID3") // MP3 with ID3v2 tag
+        || bytes.starts_with(b"\xFF\xFB") // MP3 frame sync (MPEG1 Layer3)
+        || bytes.starts_with(b"\xFF\xFA") // MP3 frame sync (MPEG1 Layer3)
+        || bytes.starts_with(b"\xFF\xF3") // MP3 frame sync (MPEG2 Layer3)
+        || bytes.starts_with(b"\xFF\xF2") // MP3 frame sync (MPEG2 Layer3)
+}
+
+// Null byte skew alone is not enough to identify UTF-16 -- binary formats with
+// small 16-bit values (like PCM audio) produce the same pattern. Decode the
+// bytes as UTF-16 and reject if too many code units land in control character
+// ranges or form unpaired surrogates, which real text almost never contains.
+fn is_plausible_utf16_text(bytes: &[u8], little_endian: bool) -> bool {
+    let mut suspicious_count = 0usize;
+    let mut total = 0usize;
+
+    let mut i = 0;
+    while let Some(code_unit) = read_u16(bytes, i, little_endian) {
+        total += 1;
+
+        match code_unit {
+            0x0009 | 0x000A | 0x000C | 0x000D => {}
+            // C0/C1 control characters and non-characters
+            0x0000..=0x001F | 0x007F..=0x009F | 0xFFFE | 0xFFFF => suspicious_count += 1,
+            0xD800..=0xDBFF => {
+                let next_offset = i + 2;
+                let has_low_surrogate = read_u16(bytes, next_offset, little_endian)
+                    .is_some_and(|next| (0xDC00..=0xDFFF).contains(&next));
+                if has_low_surrogate {
+                    total += 1;
+                    i += 2;
+                } else {
+                    suspicious_count += 1;
+                }
+            }
+            // Lone low surrogate without a preceding high surrogate
+            0xDC00..=0xDFFF => suspicious_count += 1,
+            _ => {}
+        }
+
+        i += 2;
+    }
+
+    if total == 0 {
+        return false;
+    }
+
+    // Real UTF-16 text has near-zero control characters; binary data with
+    // small 16-bit values typically exceeds 5%. 2% provides a safe margin.
+    suspicious_count * 100 < total * 2
+}
+
+fn read_u16(bytes: &[u8], offset: usize, little_endian: bool) -> Option<u16> {
+    let pair = [*bytes.get(offset)?, *bytes.get(offset + 1)?];
+    if little_endian {
+        return Some(u16::from_le_bytes(pair));
+    }
+    Some(u16::from_be_bytes(pair))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// reproduction of issue #50785
+    fn build_pcm16_wav_bytes() -> Vec<u8> {
+        let header: Vec<u8> = vec![
+            /* RIFF header */
+            0x52, 0x49, 0x46, 0x46, // "RIFF"
+            0xc6, 0xcf, 0x00, 0x00, // RIFF chunk size: file size - 8 (53190)
+            0x57, 0x41, 0x56, 0x45, // "WAVE"
+            /* fmt chunk */
+            0x66, 0x6d, 0x74, 0x20, // "fmt "
+            0x10, 0x00, 0x00, 0x00, // chunk size: 16
+            0x01, 0x00, // format: PCM (1)
+            0x01, 0x00, // channels: 1 (mono)
+            0x80, 0x3e, 0x00, 0x00, // sample rate: 16000
+            0x00, 0x7d, 0x00, 0x00, // byte rate: 32000
+            0x02, 0x00, // block align: 2
+            0x10, 0x00, // bits per sample: 16
+            /* LIST chunk */
+            0x4c, 0x49, 0x53, 0x54, // "LIST"
+            0x1a, 0x00, 0x00, 0x00, // chunk size: 26
+            0x49, 0x4e, 0x46, 0x4f, // "INFO"
+            0x49, 0x53, 0x46, 0x54, // "ISFT"
+            0x0d, 0x00, 0x00, 0x00, // sub-chunk size: 13
+            0x4c, 0x61, 0x76, 0x66, 0x36, 0x32, 0x2e, 0x33, // "Lavf62.3"
+            0x2e, 0x31, 0x30, 0x30, 0x00, // ".100\0"
+            /* padding byte for word alignment */
+            0x00, // pad byte; the data chunk header follows
+            0x64, 0x61, 0x74, 0x61, // "data"
+            0x80, 0xcf, 0x00, 0x00, // chunk size
+        ];
+
+        let mut bytes = header;
+
+        // fill remaining space up to `FILE_ANALYSIS_BYTES` with synthetic PCM
+        let audio_bytes_needed = FILE_ANALYSIS_BYTES - bytes.len();
+        for i in 0..(audio_bytes_needed / 2) {
+            let sample = (i & 0xFF) as u8;
+            bytes.push(sample); // low byte: varies
+            bytes.push(0x00); // high byte: zero for small values
+        }
+
+        bytes
+    }
+
+    #[test]
+    fn test_pcm16_wav_detected_as_binary() {
+        let wav_bytes = build_pcm16_wav_bytes();
+        assert_eq!(wav_bytes.len(), FILE_ANALYSIS_BYTES);
+
+        let result = analyze_byte_content(&wav_bytes);
+        assert_eq!(
+            result,
+            ByteContent::Binary,
+            "PCM 16-bit WAV should be detected as Binary via RIFF header"
+        );
+    }
+
+    #[test]
+    fn test_le16_binary_not_misdetected_as_utf16le() {
+        let mut bytes = b"FAKE".to_vec();
+        while bytes.len() < FILE_ANALYSIS_BYTES {
+            let sample = (bytes.len() & 0xFF) as u8;
+            bytes.push(sample);
+            bytes.push(0x00);
+        }
+        bytes.truncate(FILE_ANALYSIS_BYTES);
+
+        let result = analyze_byte_content(&bytes);
+        assert_eq!(
+            result,
+            ByteContent::Binary,
+            "LE 16-bit binary with control characters should be detected as Binary"
+        );
+    }
+
+    #[test]
+    fn test_be16_binary_not_misdetected_as_utf16be() {
+        let mut bytes = b"FAKE".to_vec();
+        while bytes.len() < FILE_ANALYSIS_BYTES {
+            bytes.push(0x00);
+            let sample = (bytes.len() & 0xFF) as u8;
+            bytes.push(sample);
+        }
+        bytes.truncate(FILE_ANALYSIS_BYTES);
+
+        let result = analyze_byte_content(&bytes);
+        assert_eq!(
+            result,
+            ByteContent::Binary,
+            "BE 16-bit binary with control characters should be detected as Binary"
+        );
+    }
+
+    #[test]
+    fn test_utf16le_text_detected_as_utf16le() {
+        let text = "Hello, world! This is a UTF-16 test string. ";
+        let mut bytes = Vec::new();
+        while bytes.len() < FILE_ANALYSIS_BYTES {
+            bytes.extend(text.encode_utf16().flat_map(|u| u.to_le_bytes()));
+        }
+        bytes.truncate(FILE_ANALYSIS_BYTES);
+
+        assert_eq!(analyze_byte_content(&bytes), ByteContent::Utf16Le);
+    }
+
+    #[test]
+    fn test_utf16be_text_detected_as_utf16be() {
+        let text = "Hello, world! This is a UTF-16 test string. ";
+        let mut bytes = Vec::new();
+        while bytes.len() < FILE_ANALYSIS_BYTES {
+            bytes.extend(text.encode_utf16().flat_map(|u| u.to_be_bytes()));
+        }
+        bytes.truncate(FILE_ANALYSIS_BYTES);
+
+        assert_eq!(analyze_byte_content(&bytes), ByteContent::Utf16Be);
+    }
+
+    #[test]
+    fn test_known_binary_headers() {
+        let cases: &[(&[u8], &str)] = &[
+            (b"RIFF\x00\x00\x00\x00WAVE", "WAV"),
+            (b"RIFF\x00\x00\x00\x00AVI ", "AVI"),
+            (b"OggS\x00\x02", "OGG"),
+            (b"fLaC\x00\x00", "FLAC"),
+            (b"ID3\x03\x00", "MP3 ID3v2"),
+            (b"\xFF\xFB\x90\x00", "MP3 MPEG1 Layer3"),
+            (b"\xFF\xF3\x90\x00", "MP3 MPEG2 Layer3"),
+        ];
+
+        for (header, label) in cases {
+            let mut bytes = header.to_vec();
+            bytes.resize(FILE_ANALYSIS_BYTES, 0x41); // pad with 'A'
+            assert_eq!(
+                analyze_byte_content(&bytes),
+                ByteContent::Binary,
+                "{label} should be detected as Binary"
+            );
+        }
+    }
 }