diff --git a/crates/rope/src/chunk.rs b/crates/rope/src/chunk.rs index 6b1f290852bb7918d562a9c9110433ffeaf2fed9..91e61a517144f5b2408902173a8c34ccb865e9d5 100644 --- a/crates/rope/src/chunk.rs +++ b/crates/rope/src/chunk.rs @@ -10,9 +10,19 @@ pub(crate) const MAX_BASE: usize = MIN_BASE * 2; #[derive(Clone, Debug, Default)] pub struct Chunk { + /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the + /// text. chars: u128, + /// The number of set bits is the number of UTF-16 code units it would take to represent the + /// text. + /// + /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would + /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more + /// than two UTF-16 code units.) chars_utf16: u128, + /// If bit[i] is set, then the character at index i is an ascii newline. newlines: u128, + /// If bit[i] is set, then the character at index i is an ascii tab. pub tabs: u128, pub text: ArrayString, } @@ -144,6 +154,12 @@ impl<'a> ChunkSlice<'a> { let mask = if range.end == MAX_BASE { u128::MAX } else { + debug_assert!( + self.is_char_boundary(range.end), + "Invalid range end {} in {:?}", + range.end, + self + ); (1u128 << range.end) - 1 }; if range.start == MAX_BASE { @@ -155,6 +171,12 @@ impl<'a> ChunkSlice<'a> { text: "", } } else { + debug_assert!( + self.is_char_boundary(range.start), + "Invalid range start {} in {:?}", + range.start, + self + ); Self { chars: (self.chars & mask) >> range.start, chars_utf16: (self.chars_utf16 & mask) >> range.start, @@ -617,19 +639,17 @@ mod tests { #[gpui::test(iterations = 100)] fn test_random_chunks(mut rng: StdRng) { - let chunk_len = rng.random_range(0..=MAX_BASE); - let text = RandomCharIter::new(&mut rng) - .take(chunk_len) - .collect::(); - let mut ix = chunk_len; - while !text.is_char_boundary(ix) { - ix -= 1; - } - let text = &text[..ix]; - + let text = random_string_with_utf8_len(&mut rng, MAX_BASE); log::info!("Chunk: {:?}", text); - let chunk = Chunk::new(text); - verify_chunk(chunk.as_slice(), text); + let chunk = Chunk::new(&text); + verify_chunk(chunk.as_slice(), &text); + + // Verify Chunk::chars() bitmap + let expected_chars = char_offsets(&text) + .into_iter() + .inspect(|i| assert!(*i < 128)) + .fold(0u128, |acc, i| acc | (1 << i)); + assert_eq!(chunk.chars(), expected_chars); for _ in 0..10 { let mut start = rng.random_range(0..=chunk.text.len()); @@ -648,6 +668,20 @@ mod tests { } } + #[gpui::test(iterations = 100)] + fn test_split_chunk_slice(mut rng: StdRng) { + let text = &random_string_with_utf8_len(&mut rng, MAX_BASE); + let chunk = Chunk::new(text); + let offset = char_offsets_with_end(text) + .into_iter() + .choose(&mut rng) + .unwrap(); + let (a, b) = chunk.as_slice().split_at(offset); + let (a_str, b_str) = text.split_at(offset); + verify_chunk(a, a_str); + verify_chunk(b, b_str); + } + #[gpui::test(iterations = 1000)] fn test_nth_set_bit_random(mut rng: StdRng) { let set_count = rng.random_range(0..=128); @@ -670,6 +704,51 @@ mod tests { } } + /// Returns a (biased) random string whose UTF-8 length is no more than `len`. + fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String { + let mut str = String::new(); + let mut chars = RandomCharIter::new(rng); + loop { + let ch = chars.next().unwrap(); + if str.len() + ch.len_utf8() > len { + break; + } + str.push(ch); + } + str + } + + #[gpui::test(iterations = 1000)] + fn test_append_random_strings(mut rng: StdRng) { + let len1 = rng.random_range(0..=MAX_BASE); + let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1); + let str1 = random_string_with_utf8_len(&mut rng, len1); + let str2 = random_string_with_utf8_len(&mut rng, len2); + let mut chunk1 = Chunk::new(&str1); + let chunk2 = Chunk::new(&str2); + let char_offsets = char_offsets_with_end(&str2); + let start_index = rng.random_range(0..char_offsets.len()); + let start_offset = char_offsets[start_index]; + let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())]; + chunk1.append(chunk2.slice(start_offset..end_offset)); + verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset])); + } + + /// Return the byte offsets for each character in a string. + /// + /// These are valid offsets to split the string. + fn char_offsets(text: &str) -> Vec { + text.char_indices().map(|(i, _c)| i).collect() + } + + /// Return the byte offsets for each character in a string, plus the offset + /// past the end of the string. + fn char_offsets_with_end(text: &str) -> Vec { + let mut v = char_offsets(text); + v.push(text.len()); + v + } + fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) { let mut offset = 0; let mut offset_utf16 = OffsetUtf16(0);