Fix moving to next word boundary with multi-byte characters

Antonio Scandurra created

Previously, for a given point, we would create a char iterator at
the start of the row and the skip `column` characters. This is
however incorrect because display points are expressed in bytes,
and so we could park the anchor midway through a multi-byte character.

This commit fixes the issue by switching `DisplayMap::chars_at` to
take a point instead and skipping characters correctly when a point with
a non-zero column is provided.

Change summary

zed/src/editor/display_map.rs | 19 ++++++++--
zed/src/editor/movement.rs    | 62 +++++++++++++++++++++++++++++++++++-
2 files changed, 74 insertions(+), 7 deletions(-)

Detailed changes

zed/src/editor/display_map.rs 🔗

@@ -159,14 +159,23 @@ impl DisplayMapSnapshot {
             .highlighted_chunks_for_rows(display_rows)
     }
 
-    pub fn chars_at<'a>(&'a self, display_row: u32) -> impl Iterator<Item = char> + 'a {
-        self.chunks_at(display_row).flat_map(str::chars)
+    pub fn chars_at<'a>(&'a self, point: DisplayPoint) -> impl Iterator<Item = char> + 'a {
+        let mut column = 0;
+        let mut chars = self.chunks_at(point.row()).flat_map(str::chars);
+        while column < point.column() {
+            if let Some(c) = chars.next() {
+                column += c.len_utf8() as u32;
+            } else {
+                break;
+            }
+        }
+        chars
     }
 
     pub fn column_to_chars(&self, display_row: u32, target: u32) -> u32 {
         let mut count = 0;
         let mut column = 0;
-        for c in self.chars_at(display_row) {
+        for c in self.chars_at(DisplayPoint::new(display_row, 0)) {
             if column >= target {
                 break;
             }
@@ -179,7 +188,7 @@ impl DisplayMapSnapshot {
     pub fn column_from_chars(&self, display_row: u32, char_count: u32) -> u32 {
         let mut count = 0;
         let mut column = 0;
-        for c in self.chars_at(display_row) {
+        for c in self.chars_at(DisplayPoint::new(display_row, 0)) {
             if c == '\n' || count >= char_count {
                 break;
             }
@@ -237,7 +246,7 @@ impl DisplayMapSnapshot {
     pub fn line_indent(&self, display_row: u32) -> (u32, bool) {
         let mut indent = 0;
         let mut is_blank = true;
-        for c in self.chars_at(display_row) {
+        for c in self.chars_at(DisplayPoint::new(display_row, 0)) {
             if c == ' ' {
                 indent += 1;
             } else {

zed/src/editor/movement.rs 🔗

@@ -120,7 +120,7 @@ pub fn prev_word_boundary(map: &DisplayMapSnapshot, point: DisplayPoint) -> Resu
         let mut boundary = DisplayPoint::new(point.row(), 0);
         let mut column = 0;
         let mut prev_c = None;
-        for c in map.chars_at(point.row()) {
+        for c in map.chars_at(DisplayPoint::new(point.row(), 0)) {
             if column >= point.column() {
                 break;
             }
@@ -141,7 +141,7 @@ pub fn next_word_boundary(
     mut point: DisplayPoint,
 ) -> Result<DisplayPoint> {
     let mut prev_c = None;
-    for c in map.chars_at(point.row()).skip(point.column() as usize) {
+    for c in map.chars_at(point) {
         if prev_c.is_some() && (c == '\n' || char_kind(prev_c.unwrap()) != char_kind(c)) {
             break;
         }
@@ -176,3 +176,61 @@ fn char_kind(c: char) -> CharKind {
         CharKind::Punctuation
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{
+        editor::{display_map::DisplayMap, Buffer},
+        test::build_app_state,
+    };
+
+    #[gpui::test]
+    fn test_prev_next_word_boundary_multibyte(cx: &mut gpui::MutableAppContext) {
+        let settings = build_app_state(cx).settings.borrow().clone();
+        let buffer = cx.add_model(|cx| Buffer::new(0, "a bcΔ defγ", cx));
+        let display_map = cx.add_model(|cx| DisplayMap::new(buffer, settings, None, cx));
+        let snapshot = display_map.update(cx, |map, cx| map.snapshot(cx));
+        assert_eq!(
+            prev_word_boundary(&snapshot, DisplayPoint::new(0, 12)).unwrap(),
+            DisplayPoint::new(0, 7)
+        );
+        assert_eq!(
+            prev_word_boundary(&snapshot, DisplayPoint::new(0, 7)).unwrap(),
+            DisplayPoint::new(0, 6)
+        );
+        assert_eq!(
+            prev_word_boundary(&snapshot, DisplayPoint::new(0, 6)).unwrap(),
+            DisplayPoint::new(0, 2)
+        );
+        assert_eq!(
+            prev_word_boundary(&snapshot, DisplayPoint::new(0, 2)).unwrap(),
+            DisplayPoint::new(0, 1)
+        );
+        assert_eq!(
+            prev_word_boundary(&snapshot, DisplayPoint::new(0, 1)).unwrap(),
+            DisplayPoint::new(0, 0)
+        );
+
+        assert_eq!(
+            next_word_boundary(&snapshot, DisplayPoint::new(0, 0)).unwrap(),
+            DisplayPoint::new(0, 1)
+        );
+        assert_eq!(
+            next_word_boundary(&snapshot, DisplayPoint::new(0, 1)).unwrap(),
+            DisplayPoint::new(0, 2)
+        );
+        assert_eq!(
+            next_word_boundary(&snapshot, DisplayPoint::new(0, 2)).unwrap(),
+            DisplayPoint::new(0, 6)
+        );
+        assert_eq!(
+            next_word_boundary(&snapshot, DisplayPoint::new(0, 6)).unwrap(),
+            DisplayPoint::new(0, 7)
+        );
+        assert_eq!(
+            next_word_boundary(&snapshot, DisplayPoint::new(0, 7)).unwrap(),
+            DisplayPoint::new(0, 12)
+        );
+    }
+}