Merge pull request #62 from zed-industries/multibyte-characters

Antonio Scandurra created

Get randomized tests passing in the presence of multibyte chars

Change summary

zed/src/editor/buffer/mod.rs           |  34 +++-
zed/src/editor/buffer/rope.rs          | 170 +++++++++++++++++++++------
zed/src/editor/display_map/fold_map.rs |   7 
zed/src/util.rs                        |  33 +++++
4 files changed, 187 insertions(+), 57 deletions(-)

Detailed changes

zed/src/editor/buffer/mod.rs 🔗

@@ -600,8 +600,7 @@ impl Buffer {
     }
 
     pub fn text_summary_for_range(&self, range: Range<usize>) -> TextSummary {
-        // TODO: Use a dedicated ::summarize method in Rope.
-        self.visible_text.slice(range).summary()
+        self.visible_text.cursor(range.start).summary(range.end)
     }
 
     pub fn len(&self) -> usize {
@@ -2509,12 +2508,12 @@ mod tests {
                 for _i in 0..10 {
                     let (old_ranges, new_text, _) = buffer.randomly_mutate(rng, None);
                     for old_range in old_ranges.iter().rev() {
-                        reference_string = [
-                            &reference_string[0..old_range.start],
-                            new_text.as_str(),
-                            &reference_string[old_range.end..],
-                        ]
-                        .concat();
+                        reference_string = reference_string
+                            .chars()
+                            .take(old_range.start)
+                            .chain(new_text.chars())
+                            .chain(reference_string.chars().skip(old_range.end))
+                            .collect();
                     }
                     assert_eq!(buffer.text(), reference_string);
 
@@ -2549,7 +2548,12 @@ mod tests {
                         let range_sum = buffer.text_summary_for_range(start..end);
                         assert_eq!(range_sum.rightmost_point.column, *longest_column);
                         assert!(longest_rows.contains(&range_sum.rightmost_point.row));
-                        let range_text = &buffer.text()[start..end];
+                        let range_text = buffer
+                            .text()
+                            .chars()
+                            .skip(start)
+                            .take(end - start)
+                            .collect::<String>();
                         assert_eq!(range_sum.chars, range_text.chars().count());
                         assert_eq!(range_sum.bytes, range_text.len());
                     }
@@ -3458,9 +3462,17 @@ mod tests {
 
     fn line_lengths_in_range(buffer: &Buffer, range: Range<usize>) -> BTreeMap<u32, HashSet<u32>> {
         let mut lengths = BTreeMap::new();
-        for (row, line) in buffer.text()[range].lines().enumerate() {
+        for (row, line) in buffer
+            .text()
+            .chars()
+            .skip(range.start)
+            .take(range.len())
+            .collect::<String>()
+            .lines()
+            .enumerate()
+        {
             lengths
-                .entry(line.len() as u32)
+                .entry(line.chars().count() as u32)
                 .or_insert(HashSet::default())
                 .insert(row as u32);
         }

zed/src/editor/buffer/rope.rs 🔗

@@ -1,12 +1,13 @@
 use super::Point;
 use crate::sum_tree::{self, SeekBias, SumTree};
+use crate::util::byte_range_for_char_range;
 use anyhow::{anyhow, Result};
 use arrayvec::ArrayString;
 use smallvec::SmallVec;
-use std::{cmp, ops::Range, str};
+use std::{cmp, iter::Skip, str};
 
 #[cfg(test)]
-const CHUNK_BASE: usize = 2;
+const CHUNK_BASE: usize = 6;
 
 #[cfg(not(test))]
 const CHUNK_BASE: usize = 16;
@@ -25,8 +26,12 @@ impl Rope {
         let mut chunks = rope.chunks.cursor::<(), ()>();
         chunks.next();
         if let Some(chunk) = chunks.item() {
-            self.push(&chunk.0);
-            chunks.next();
+            if self.chunks.last().map_or(false, |c| c.0.len() < CHUNK_BASE)
+                || chunk.0.len() < CHUNK_BASE
+            {
+                self.push(&chunk.0);
+                chunks.next();
+            }
         }
 
         self.chunks.push_tree(chunks.suffix(&()), &());
@@ -58,12 +63,7 @@ impl Rope {
                         let mut text = ArrayString::<[_; 4 * CHUNK_BASE]>::new();
                         text.push_str(&last_chunk.0);
                         text.push_str(&first_new_chunk_ref.0);
-
-                        let mut midpoint = text.len() / 2;
-                        while !text.is_char_boundary(midpoint) {
-                            midpoint += 1;
-                        }
-                        let (left, right) = text.split_at(midpoint);
+                        let (left, right) = text.split_at(find_split_ix(&text));
                         last_chunk.0.clear();
                         last_chunk.0.push_str(left);
                         first_new_chunk_ref.0.clear();
@@ -83,19 +83,16 @@ impl Rope {
         #[cfg(test)]
         {
             // Ensure all chunks except maybe the last one are not underflowing.
+            // Allow some wiggle room for multibyte characters at chunk boundaries.
             let mut chunks = self.chunks.cursor::<(), ()>().peekable();
             while let Some(chunk) = chunks.next() {
                 if chunks.peek().is_some() {
-                    assert!(chunk.0.len() >= CHUNK_BASE);
+                    assert!(chunk.0.len() + 3 >= CHUNK_BASE);
                 }
             }
         }
     }
 
-    pub fn slice(&self, range: Range<usize>) -> Rope {
-        self.cursor(range.start).slice(range.end)
-    }
-
     pub fn summary(&self) -> TextSummary {
         self.chunks.summary()
     }
@@ -139,12 +136,14 @@ impl Rope {
     }
 
     pub fn to_offset(&self, point: Point) -> Result<usize> {
-        // TODO: Verify the point actually exists.
         if point <= self.summary().lines {
             let mut cursor = self.chunks.cursor::<Point, TextSummary>();
             cursor.seek(&point, SeekBias::Left, &());
             let overshoot = point - cursor.start().lines;
-            Ok(cursor.start().chars + cursor.item().map_or(0, |chunk| chunk.to_offset(overshoot)))
+            Ok(cursor.start().chars
+                + cursor
+                    .item()
+                    .map_or(Ok(0), |chunk| chunk.to_offset(overshoot))?)
         } else {
             Err(anyhow!("offset out of bounds"))
         }
@@ -190,7 +189,8 @@ impl<'a> Cursor<'a> {
         if let Some(start_chunk) = self.chunks.item() {
             let start_ix = self.offset - self.chunks.start();
             let end_ix = cmp::min(end_offset, self.chunks.end()) - self.chunks.start();
-            slice.push(&start_chunk.0[start_ix..end_ix]);
+            let byte_range = byte_range_for_char_range(start_chunk.0, start_ix..end_ix);
+            slice.push(&start_chunk.0[byte_range]);
         }
 
         if end_offset > self.chunks.end() {
@@ -199,7 +199,9 @@ impl<'a> Cursor<'a> {
                 chunks: self.chunks.slice(&end_offset, SeekBias::Right, &()),
             });
             if let Some(end_chunk) = self.chunks.item() {
-                slice.push(&end_chunk.0[..end_offset - self.chunks.start()]);
+                let end_ix = end_offset - self.chunks.start();
+                let byte_range = byte_range_for_char_range(end_chunk.0, 0..end_ix);
+                slice.push(&end_chunk.0[byte_range]);
             }
         }
 
@@ -207,6 +209,30 @@ impl<'a> Cursor<'a> {
         slice
     }
 
+    pub fn summary(&mut self, end_offset: usize) -> TextSummary {
+        debug_assert!(end_offset >= self.offset);
+
+        let mut summary = TextSummary::default();
+        if let Some(start_chunk) = self.chunks.item() {
+            let start_ix = self.offset - self.chunks.start();
+            let end_ix = cmp::min(end_offset, self.chunks.end()) - self.chunks.start();
+            let byte_range = byte_range_for_char_range(start_chunk.0, start_ix..end_ix);
+            summary = TextSummary::from(&start_chunk.0[byte_range]);
+        }
+
+        if end_offset > self.chunks.end() {
+            self.chunks.next();
+            summary += &self.chunks.summary(&end_offset, SeekBias::Right, &());
+            if let Some(end_chunk) = self.chunks.item() {
+                let end_ix = end_offset - self.chunks.start();
+                let byte_range = byte_range_for_char_range(end_chunk.0, 0..end_ix);
+                summary += TextSummary::from(&end_chunk.0[byte_range]);
+            }
+        }
+
+        summary
+    }
+
     pub fn suffix(mut self) -> Rope {
         self.slice(self.rope.chunks.extent())
     }
@@ -239,7 +265,7 @@ impl Chunk {
         point
     }
 
-    fn to_offset(&self, target: Point) -> usize {
+    fn to_offset(&self, target: Point) -> Result<usize> {
         let mut offset = 0;
         let mut point = Point::new(0, 0);
         for ch in self.0.chars() {
@@ -255,7 +281,12 @@ impl Chunk {
             }
             offset += 1;
         }
-        offset
+
+        if point == target {
+            Ok(offset)
+        } else {
+            Err(anyhow!("point out of bounds"))
+        }
     }
 }
 
@@ -263,12 +294,27 @@ impl sum_tree::Item for Chunk {
     type Summary = TextSummary;
 
     fn summary(&self) -> Self::Summary {
+        TextSummary::from(self.0.as_str())
+    }
+}
+
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct TextSummary {
+    pub chars: usize,
+    pub bytes: usize,
+    pub lines: Point,
+    pub first_line_len: u32,
+    pub rightmost_point: Point,
+}
+
+impl<'a> From<&'a str> for TextSummary {
+    fn from(text: &'a str) -> Self {
         let mut chars = 0;
         let mut bytes = 0;
         let mut lines = Point::new(0, 0);
         let mut first_line_len = 0;
         let mut rightmost_point = Point::new(0, 0);
-        for c in self.0.chars() {
+        for c in text.chars() {
             chars += 1;
             bytes += c.len_utf8();
             if c == '\n' {
@@ -295,15 +341,6 @@ impl sum_tree::Item for Chunk {
     }
 }
 
-#[derive(Clone, Debug, Default, Eq, PartialEq)]
-pub struct TextSummary {
-    pub chars: usize,
-    pub bytes: usize,
-    pub lines: Point,
-    pub first_line_len: u32,
-    pub rightmost_point: Point,
-}
-
 impl sum_tree::Summary for TextSummary {
     type Context = ();
 
@@ -358,19 +395,19 @@ impl<'a> sum_tree::Dimension<'a, TextSummary> for Point {
 
 pub struct Chars<'a> {
     cursor: sum_tree::Cursor<'a, Chunk, usize, usize>,
-    chars: str::Chars<'a>,
+    chars: Skip<str::Chars<'a>>,
 }
 
 impl<'a> Chars<'a> {
     pub fn new(rope: &'a Rope, start: usize) -> Self {
         let mut cursor = rope.chunks.cursor::<usize, usize>();
-        cursor.slice(&start, SeekBias::Left, &());
+        cursor.seek(&start, SeekBias::Left, &());
         let chars = if let Some(chunk) = cursor.item() {
             let ix = start - cursor.start();
             cursor.next();
-            chunk.0[ix..].chars()
+            chunk.0.chars().skip(ix)
         } else {
-            "".chars()
+            "".chars().skip(0)
         };
 
         Self { cursor, chars }
@@ -384,7 +421,7 @@ impl<'a> Iterator for Chars<'a> {
         if let Some(ch) = self.chars.next() {
             Some(ch)
         } else if let Some(chunk) = self.cursor.item() {
-            self.chars = chunk.0.chars();
+            self.chars = chunk.0.chars().skip(0);
             self.cursor.next();
             Some(self.chars.next().unwrap())
         } else {
@@ -393,6 +430,25 @@ impl<'a> Iterator for Chars<'a> {
     }
 }
 
+fn find_split_ix(text: &str) -> usize {
+    let mut ix = text.len() / 2;
+    while !text.is_char_boundary(ix) {
+        if ix < 2 * CHUNK_BASE {
+            ix += 1;
+        } else {
+            ix = (text.len() / 2) - 1;
+            break;
+        }
+    }
+    while !text.is_char_boundary(ix) {
+        ix -= 1;
+    }
+
+    debug_assert!(ix <= 2 * CHUNK_BASE);
+    debug_assert!(text.len() - ix <= 2 * CHUNK_BASE);
+    ix
+}
+
 #[cfg(test)]
 mod tests {
     use crate::util::RandomCharIter;
@@ -401,6 +457,14 @@ mod tests {
     use rand::prelude::*;
     use std::env;
 
+    #[test]
+    fn test_all_4_byte_chars() {
+        let mut rope = Rope::new();
+        let text = "🏀".repeat(256);
+        rope.push(&text);
+        assert_eq!(rope.text(), text);
+    }
+
     #[test]
     fn test_random() {
         let iterations = env::var("ITERATIONS")
@@ -422,9 +486,9 @@ mod tests {
             let mut expected = String::new();
             let mut actual = Rope::new();
             for _ in 0..operations {
-                let end_ix = rng.gen_range(0..=expected.len());
+                let end_ix = rng.gen_range(0..=expected.chars().count());
                 let start_ix = rng.gen_range(0..=end_ix);
-                let len = rng.gen_range(0..=20);
+                let len = rng.gen_range(0..=64);
                 let new_text: String = RandomCharIter::new(&mut rng).take(len).collect();
 
                 let mut new_actual = Rope::new();
@@ -436,16 +500,20 @@ mod tests {
                 actual = new_actual;
 
                 let mut new_expected = String::new();
-                new_expected.push_str(&expected[..start_ix]);
+                new_expected.extend(expected.chars().take(start_ix));
                 new_expected.push_str(&new_text);
-                new_expected.push_str(&expected[end_ix..]);
+                new_expected.extend(expected.chars().skip(end_ix));
                 expected = new_expected;
 
                 assert_eq!(actual.text(), expected);
+                log::info!("text: {:?}", expected);
 
                 for _ in 0..5 {
-                    let ix = rng.gen_range(0..=expected.len());
-                    assert_eq!(actual.chars_at(ix).collect::<String>(), expected[ix..]);
+                    let ix = rng.gen_range(0..=expected.chars().count());
+                    assert_eq!(
+                        actual.chars_at(ix).collect::<String>(),
+                        expected.chars().skip(ix).collect::<String>()
+                    );
                 }
 
                 let mut point = Point::new(0, 0);
@@ -454,6 +522,10 @@ mod tests {
                     assert_eq!(actual.to_point(offset).unwrap(), point);
                     assert_eq!(actual.to_offset(point).unwrap(), offset);
                     if ch == '\n' {
+                        assert!(actual
+                            .to_offset(Point::new(point.row, point.column + 1))
+                            .is_err());
+
                         point.row += 1;
                         point.column = 0
                     } else {
@@ -461,6 +533,20 @@ mod tests {
                     }
                     offset += 1;
                 }
+                assert_eq!(actual.to_point(offset).unwrap(), point);
+                assert!(actual.to_point(offset + 1).is_err());
+                assert_eq!(actual.to_offset(point).unwrap(), offset);
+                assert!(actual.to_offset(Point::new(point.row + 1, 0)).is_err());
+
+                for _ in 0..5 {
+                    let end_ix = rng.gen_range(0..=expected.chars().count());
+                    let start_ix = rng.gen_range(0..=end_ix);
+                    let byte_range = byte_range_for_char_range(&expected, start_ix..end_ix);
+                    assert_eq!(
+                        actual.cursor(start_ix).summary(end_ix),
+                        TextSummary::from(&expected[byte_range])
+                    );
+                }
             }
         }
     }

zed/src/editor/display_map/fold_map.rs 🔗

@@ -830,7 +830,7 @@ mod tests {
     #[gpui::test]
     fn test_random_folds(app: &mut gpui::MutableAppContext) {
         use crate::editor::ToPoint;
-        use crate::util::RandomCharIter;
+        use crate::util::{byte_range_for_char_range, RandomCharIter};
         use rand::prelude::*;
         use std::env;
 
@@ -905,7 +905,10 @@ mod tests {
                     expected_buffer_rows.extend((fold_end.row + 1..=next_row).rev());
                     next_row = fold_start.row;
 
-                    expected_text.replace_range(fold_range.start..fold_range.end, "…");
+                    expected_text.replace_range(
+                        byte_range_for_char_range(&expected_text, fold_range.start..fold_range.end),
+                        "…",
+                    );
                 }
                 expected_buffer_rows.extend((0..=next_row).rev());
                 expected_buffer_rows.reverse();

zed/src/util.rs 🔗

@@ -1,5 +1,20 @@
 use rand::prelude::*;
-use std::cmp::Ordering;
+use std::{cmp::Ordering, ops::Range};
+
+pub fn byte_range_for_char_range(text: impl AsRef<str>, char_range: Range<usize>) -> Range<usize> {
+    let text = text.as_ref();
+    let mut result = text.len()..text.len();
+    for (i, (offset, _)) in text.char_indices().enumerate() {
+        if i == char_range.start {
+            result.start = offset;
+        }
+        if i == char_range.end {
+            result.end = offset;
+            break;
+        }
+    }
+    result
+}
 
 pub fn post_inc(value: &mut usize) -> usize {
     let prev = *value;
@@ -44,7 +59,21 @@ impl<T: Rng> Iterator for RandomCharIter<T> {
     fn next(&mut self) -> Option<Self::Item> {
         if self.0.gen_bool(1.0 / 5.0) {
             Some('\n')
-        } else {
+        }
+        // two-byte greek letters
+        else if self.0.gen_bool(1.0 / 8.0) {
+            Some(std::char::from_u32(self.0.gen_range(('α' as u32)..('ω' as u32 + 1))).unwrap())
+        }
+        // three-byte characters
+        else if self.0.gen_bool(1.0 / 10.0) {
+            ['✋', '✅', '❌', '❎', '⭐'].choose(&mut self.0).cloned()
+        }
+        // four-byte characters
+        else if self.0.gen_bool(1.0 / 12.0) {
+            ['🍐', '🏀', '🍗', '🎉'].choose(&mut self.0).cloned()
+        }
+        // ascii letters
+        else {
             Some(self.0.gen_range(b'a'..b'z' + 1).into())
         }
     }