text: Assert text anchor offset validity on construction (#38441)

Lukas Wirth and Mikayla Maki created

Attempt to aid debugging some utf8 indexing issues

Release Notes:

- N/A

Co-authored-by: Mikayla Maki <mikayla@zed.dev>

Change summary

crates/editor/src/highlight_matching_bracket.rs |   4 
crates/language/src/buffer.rs                   |   6 
crates/multi_buffer/src/multi_buffer.rs         |   3 
crates/rope/src/rope.rs                         | 167 +++++++++++++++++++
crates/text/src/text.rs                         |  21 ++
5 files changed, 195 insertions(+), 6 deletions(-)

Detailed changes

crates/editor/src/highlight_matching_bracket.rs šŸ”—

@@ -29,7 +29,9 @@ pub fn refresh_matching_bracket_highlights(
     if (editor.cursor_shape == CursorShape::Block || editor.cursor_shape == CursorShape::Hollow)
         && head < snapshot.buffer_snapshot.len()
     {
-        tail += 1;
+        if let Some(tail_ch) = snapshot.buffer_snapshot.chars_at(tail).next() {
+            tail += tail_ch.len_utf8();
+        }
     }
 
     if let Some((opening_range, closing_range)) = snapshot

crates/language/src/buffer.rs šŸ”—

@@ -4121,8 +4121,7 @@ impl BufferSnapshot {
         range: Range<T>,
     ) -> impl Iterator<Item = BracketMatch> + '_ {
         // Find bracket pairs that *inclusively* contain the given range.
-        let range = range.start.to_offset(self).saturating_sub(1)
-            ..self.len().min(range.end.to_offset(self) + 1);
+        let range = range.start.to_previous_offset(self)..range.end.to_next_offset(self);
         self.all_bracket_ranges(range)
             .filter(|pair| !pair.newline_only)
     }
@@ -4131,8 +4130,7 @@ impl BufferSnapshot {
         &self,
         range: Range<T>,
     ) -> impl Iterator<Item = (Range<usize>, DebuggerTextObject)> + '_ {
-        let range = range.start.to_offset(self).saturating_sub(1)
-            ..self.len().min(range.end.to_offset(self) + 1);
+        let range = range.start.to_previous_offset(self)..range.end.to_next_offset(self);
 
         let mut matches = self.syntax.matches_with_options(
             range.clone(),

crates/multi_buffer/src/multi_buffer.rs šŸ”—

@@ -5259,7 +5259,8 @@ impl MultiBufferSnapshot {
             } else {
                 Anchor::max()
             };
-            // TODO this is a hack, remove it
+
+            // TODO this is a hack, because all APIs should be able to handle ExcerptId::min and max.
             if let Some((excerpt_id, _, _)) = self.as_singleton() {
                 anchor.excerpt_id = *excerpt_id;
             }

crates/rope/src/rope.rs šŸ”—

@@ -30,6 +30,76 @@ impl Rope {
         Self::default()
     }
 
+    pub fn is_char_boundary(&self, offset: usize) -> bool {
+        if self.chunks.is_empty() {
+            return offset == 0;
+        }
+        let mut cursor = self.chunks.cursor::<usize>(&());
+        cursor.seek(&offset, Bias::Left);
+        let chunk_offset = offset - cursor.start();
+        cursor
+            .item()
+            .map(|chunk| chunk.text.is_char_boundary(chunk_offset))
+            .unwrap_or(false)
+    }
+
+    pub fn floor_char_boundary(&self, index: usize) -> usize {
+        if index >= self.len() {
+            self.len()
+        } else {
+            #[inline]
+            pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
+                // This is bit magic equivalent to: b < 128 || b >= 192
+                (u8 as i8) >= -0x40
+            }
+
+            let mut cursor = self.chunks.cursor::<usize>(&());
+            cursor.seek(&index, Bias::Left);
+            let chunk_offset = index - cursor.start();
+            let lower_idx = cursor.item().map(|chunk| {
+                let lower_bound = chunk_offset.saturating_sub(3);
+                chunk
+                    .text
+                    .as_bytes()
+                    .get(lower_bound..=chunk_offset)
+                    .map(|it| {
+                        let new_idx = it
+                            .iter()
+                            .rposition(|&b| is_utf8_char_boundary(b))
+                            .unwrap_or(0);
+                        lower_bound + new_idx
+                    })
+                    .unwrap_or(chunk.text.len())
+            });
+            lower_idx.map_or_else(|| self.len(), |idx| cursor.start() + idx)
+        }
+    }
+
+    pub fn ceil_char_boundary(&self, index: usize) -> usize {
+        if index > self.len() {
+            self.len()
+        } else {
+            #[inline]
+            pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
+                // This is bit magic equivalent to: b < 128 || b >= 192
+                (u8 as i8) >= -0x40
+            }
+
+            let mut cursor = self.chunks.cursor::<usize>(&());
+            cursor.seek(&index, Bias::Left);
+            let chunk_offset = index - cursor.start();
+            let upper_idx = cursor.item().map(|chunk| {
+                let upper_bound = Ord::min(chunk_offset + 4, chunk.text.len());
+                chunk.text.as_bytes()[chunk_offset..upper_bound]
+                    .iter()
+                    .position(|&b| is_utf8_char_boundary(b))
+                    .map_or(upper_bound, |pos| pos + chunk_offset)
+            });
+
+            upper_idx.map_or_else(|| self.len(), |idx| cursor.start() + idx)
+        }
+    }
+
     pub fn append(&mut self, rope: Rope) {
         if let Some(chunk) = rope.chunks.first()
             && (self
@@ -2069,6 +2139,103 @@ mod tests {
         assert!(!rope.reversed_chunks_in_range(0..0).equals_str("foo"));
     }
 
+    #[test]
+    fn test_is_char_boundary() {
+        let fixture = "地";
+        let rope = Rope::from("地");
+        for b in 0..=fixture.len() {
+            assert_eq!(rope.is_char_boundary(b), fixture.is_char_boundary(b));
+        }
+        let fixture = "";
+        let rope = Rope::from("");
+        for b in 0..=fixture.len() {
+            assert_eq!(rope.is_char_boundary(b), fixture.is_char_boundary(b));
+        }
+        let fixture = "šŸ”“šŸŸ šŸŸ”šŸŸ¢šŸ”µšŸŸ£āš«ļøāšŖļøšŸŸ¤\nšŸ³ļøā€āš§ļøšŸšŸ³ļøā€šŸŒˆšŸ“ā€ā˜ ļøā›³ļøšŸ“¬šŸ“­šŸ“šŸ³ļøšŸš©";
+        let rope = Rope::from("šŸ”“šŸŸ šŸŸ”šŸŸ¢šŸ”µšŸŸ£āš«ļøāšŖļøšŸŸ¤\nšŸ³ļøā€āš§ļøšŸšŸ³ļøā€šŸŒˆšŸ“ā€ā˜ ļøā›³ļøšŸ“¬šŸ“­šŸ“šŸ³ļøšŸš©");
+        for b in 0..=fixture.len() {
+            assert_eq!(rope.is_char_boundary(b), fixture.is_char_boundary(b));
+        }
+    }
+
+    #[test]
+    fn test_floor_char_boundary() {
+        // polyfill of str::floor_char_boundary
+        fn floor_char_boundary(str: &str, index: usize) -> usize {
+            if index >= str.len() {
+                str.len()
+            } else {
+                let lower_bound = index.saturating_sub(3);
+                let new_index = str.as_bytes()[lower_bound..=index]
+                    .iter()
+                    .rposition(|b| (*b as i8) >= -0x40);
+
+                lower_bound + new_index.unwrap()
+            }
+        }
+
+        let fixture = "地";
+        let rope = Rope::from("地");
+        for b in 0..=fixture.len() {
+            assert_eq!(
+                rope.floor_char_boundary(b),
+                floor_char_boundary(&fixture, b)
+            );
+        }
+
+        let fixture = "";
+        let rope = Rope::from("");
+        for b in 0..=fixture.len() {
+            assert_eq!(
+                rope.floor_char_boundary(b),
+                floor_char_boundary(&fixture, b)
+            );
+        }
+
+        let fixture = "šŸ”“šŸŸ šŸŸ”šŸŸ¢šŸ”µšŸŸ£āš«ļøāšŖļøšŸŸ¤\nšŸ³ļøā€āš§ļøšŸšŸ³ļøā€šŸŒˆšŸ“ā€ā˜ ļøā›³ļøšŸ“¬šŸ“­šŸ“šŸ³ļøšŸš©";
+        let rope = Rope::from("šŸ”“šŸŸ šŸŸ”šŸŸ¢šŸ”µšŸŸ£āš«ļøāšŖļøšŸŸ¤\nšŸ³ļøā€āš§ļøšŸšŸ³ļøā€šŸŒˆšŸ“ā€ā˜ ļøā›³ļøšŸ“¬šŸ“­šŸ“šŸ³ļøšŸš©");
+        for b in 0..=fixture.len() {
+            assert_eq!(
+                rope.floor_char_boundary(b),
+                floor_char_boundary(&fixture, b)
+            );
+        }
+    }
+
+    #[test]
+    fn test_ceil_char_boundary() {
+        // polyfill of str::ceil_char_boundary
+        fn ceil_char_boundary(str: &str, index: usize) -> usize {
+            if index > str.len() {
+                str.len()
+            } else {
+                let upper_bound = Ord::min(index + 4, str.len());
+                str.as_bytes()[index..upper_bound]
+                    .iter()
+                    .position(|b| (*b as i8) >= -0x40)
+                    .map_or(upper_bound, |pos| pos + index)
+            }
+        }
+
+        let fixture = "地";
+        let rope = Rope::from("地");
+        for b in 0..=fixture.len() {
+            assert_eq!(rope.ceil_char_boundary(b), ceil_char_boundary(&fixture, b));
+        }
+
+        let fixture = "";
+        let rope = Rope::from("");
+        for b in 0..=fixture.len() {
+            assert_eq!(rope.ceil_char_boundary(b), ceil_char_boundary(&fixture, b));
+        }
+
+        let fixture = "šŸ”“šŸŸ šŸŸ”šŸŸ¢šŸ”µšŸŸ£āš«ļøāšŖļøšŸŸ¤\nšŸ³ļøā€āš§ļøšŸšŸ³ļøā€šŸŒˆšŸ“ā€ā˜ ļøā›³ļøšŸ“¬šŸ“­šŸ“šŸ³ļøšŸš©";
+        let rope = Rope::from("šŸ”“šŸŸ šŸŸ”šŸŸ¢šŸ”µšŸŸ£āš«ļøāšŖļøšŸŸ¤\nšŸ³ļøā€āš§ļøšŸšŸ³ļøā€šŸŒˆšŸ“ā€ā˜ ļøā›³ļøšŸ“¬šŸ“­šŸ“šŸ³ļøšŸš©");
+        for b in 0..=fixture.len() {
+            assert_eq!(rope.ceil_char_boundary(b), ceil_char_boundary(&fixture, b));
+        }
+    }
+
     fn clip_offset(text: &str, mut offset: usize, bias: Bias) -> usize {
         while !text.is_char_boundary(offset) {
             match bias {

crates/text/src/text.rs šŸ”—

@@ -2400,6 +2400,17 @@ impl BufferSnapshot {
         } else if bias == Bias::Right && offset == self.len() {
             Anchor::MAX
         } else {
+            if !self.visible_text.is_char_boundary(offset) {
+                // find the character
+                let char_start = self.visible_text.floor_char_boundary(offset);
+                // `char_start` must be less than len and a char boundary
+                let ch = self.visible_text.chars_at(char_start).next().unwrap();
+                let char_range = char_start..char_start + ch.len_utf8();
+                panic!(
+                    "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
+                    offset, ch, char_range,
+                );
+            }
             let mut fragment_cursor = self.fragments.cursor::<usize>(&None);
             fragment_cursor.seek(&offset, bias);
             let fragment = fragment_cursor.item().unwrap();
@@ -3065,6 +3076,16 @@ impl operation_queue::Operation for Operation {
 
 pub trait ToOffset {
     fn to_offset(&self, snapshot: &BufferSnapshot) -> usize;
+    fn to_next_offset(&self, snapshot: &BufferSnapshot) -> usize {
+        snapshot
+            .visible_text
+            .ceil_char_boundary(self.to_offset(snapshot) + 1)
+    }
+    fn to_previous_offset(&self, snapshot: &BufferSnapshot) -> usize {
+        snapshot
+            .visible_text
+            .floor_char_boundary(self.to_offset(snapshot).saturating_sub(1))
+    }
 }
 
 impl ToOffset for Point {