Replace {floor/ceil}_char_boundary polyfills with std (#42599)

Lukas Wirth created

Release Notes:

- N/A *or* Added/Fixed/Improved ...

Change summary

crates/editor/src/display_map/inlay_map.rs | 33 --------
crates/rope/src/chunk.rs                   | 28 -------
crates/rope/src/rope.rs                    | 85 ++---------------------
crates/util/src/util.rs                    |  6 +
4 files changed, 18 insertions(+), 134 deletions(-)

Detailed changes

crates/editor/src/display_map/inlay_map.rs 🔗

@@ -248,10 +248,8 @@ impl<'a> Iterator for InlayChunks<'a> {
                 // Determine split index handling edge cases
                 let split_index = if desired_bytes >= chunk.text.len() {
                     chunk.text.len()
-                } else if chunk.text.is_char_boundary(desired_bytes) {
-                    desired_bytes
                 } else {
-                    find_next_utf8_boundary(chunk.text, desired_bytes)
+                    chunk.text.ceil_char_boundary(desired_bytes)
                 };
 
                 let (prefix, suffix) = chunk.text.split_at(split_index);
@@ -373,10 +371,8 @@ impl<'a> Iterator for InlayChunks<'a> {
                         .next()
                         .map(|c| c.len_utf8())
                         .unwrap_or(1)
-                } else if inlay_chunk.is_char_boundary(next_inlay_highlight_endpoint) {
-                    next_inlay_highlight_endpoint
                 } else {
-                    find_next_utf8_boundary(inlay_chunk, next_inlay_highlight_endpoint)
+                    inlay_chunk.ceil_char_boundary(next_inlay_highlight_endpoint)
                 };
 
                 let (chunk, remainder) = inlay_chunk.split_at(split_index);
@@ -1146,31 +1142,6 @@ fn push_isomorphic(sum_tree: &mut SumTree<Transform>, summary: TextSummary) {
     }
 }
 
-/// Given a byte index that is NOT a UTF-8 boundary, find the next one.
-/// Assumes: 0 < byte_index < text.len() and !text.is_char_boundary(byte_index)
-#[inline(always)]
-fn find_next_utf8_boundary(text: &str, byte_index: usize) -> usize {
-    let bytes = text.as_bytes();
-    let mut idx = byte_index + 1;
-
-    // Scan forward until we find a boundary
-    while idx < text.len() {
-        if is_utf8_char_boundary(bytes[idx]) {
-            return idx;
-        }
-        idx += 1;
-    }
-
-    // Hit the end, return the full length
-    text.len()
-}
-
-// Private helper function taken from Rust's core::num module (which is both Apache2 and MIT licensed)
-const fn is_utf8_char_boundary(byte: u8) -> bool {
-    // This is bit magic equivalent to: b < 128 || b >= 192
-    (byte as i8) >= -0x40
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;

crates/rope/src/chunk.rs 🔗

@@ -110,18 +110,12 @@ impl Chunk {
     }
 
     pub fn floor_char_boundary(&self, index: usize) -> usize {
-        #[inline]
-        pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
-            // This is bit magic equivalent to: b < 128 || b >= 192
-            (u8 as i8) >= -0x40
-        }
-
         if index >= self.text.len() {
             self.text.len()
         } else {
             let mut i = index;
             while i > 0 {
-                if is_utf8_char_boundary(self.text.as_bytes()[i]) {
+                if util::is_utf8_char_boundary(self.text.as_bytes()[i]) {
                     break;
                 }
                 i -= 1;
@@ -423,25 +417,7 @@ impl<'a> ChunkSlice<'a> {
     }
 
     pub fn floor_char_boundary(&self, index: usize) -> usize {
-        #[inline]
-        pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
-            // This is bit magic equivalent to: b < 128 || b >= 192
-            (u8 as i8) >= -0x40
-        }
-
-        if index >= self.text.len() {
-            self.text.len()
-        } else {
-            let mut i = index;
-            while i > 0 {
-                if is_utf8_char_boundary(self.text.as_bytes()[i]) {
-                    break;
-                }
-                i -= 1;
-            }
-
-            i
-        }
+        self.text.floor_char_boundary(index)
     }
 
     #[inline(always)]

crates/rope/src/rope.rs 🔗

@@ -74,29 +74,9 @@ impl Rope {
         if index >= self.len() {
             self.len()
         } else {
-            #[inline]
-            pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
-                // This is bit magic equivalent to: b < 128 || b >= 192
-                (u8 as i8) >= -0x40
-            }
-
             let (start, _, item) = self.chunks.find::<usize, _>((), &index, Bias::Left);
             let chunk_offset = index - start;
-            let lower_idx = item.map(|chunk| {
-                let lower_bound = chunk_offset.saturating_sub(3);
-                chunk
-                    .text
-                    .as_bytes()
-                    .get(lower_bound..=chunk_offset)
-                    .map(|it| {
-                        let new_idx = it
-                            .iter()
-                            .rposition(|&b| is_utf8_char_boundary(b))
-                            .unwrap_or(0);
-                        lower_bound + new_idx
-                    })
-                    .unwrap_or(chunk.text.len())
-            });
+            let lower_idx = item.map(|chunk| chunk.text.floor_char_boundary(chunk_offset));
             lower_idx.map_or_else(|| self.len(), |idx| start + idx)
         }
     }
@@ -105,22 +85,9 @@ impl Rope {
         if index > self.len() {
             self.len()
         } else {
-            #[inline]
-            pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
-                // This is bit magic equivalent to: b < 128 || b >= 192
-                (u8 as i8) >= -0x40
-            }
-
             let (start, _, item) = self.chunks.find::<usize, _>((), &index, Bias::Left);
             let chunk_offset = index - start;
-            let upper_idx = item.map(|chunk| {
-                let upper_bound = Ord::min(chunk_offset + 4, chunk.text.len());
-                chunk.text.as_bytes()[chunk_offset..upper_bound]
-                    .iter()
-                    .position(|&b| is_utf8_char_boundary(b))
-                    .map_or(upper_bound, |pos| pos + chunk_offset)
-            });
-
+            let upper_idx = item.map(|chunk| chunk.text.ceil_char_boundary(chunk_offset));
             upper_idx.map_or_else(|| self.len(), |idx| start + idx)
         }
     }
@@ -2186,79 +2153,43 @@ mod tests {
 
     #[test]
     fn test_floor_char_boundary() {
-        // polyfill of str::floor_char_boundary
-        fn floor_char_boundary(str: &str, index: usize) -> usize {
-            if index >= str.len() {
-                str.len()
-            } else {
-                let lower_bound = index.saturating_sub(3);
-                let new_index = str.as_bytes()[lower_bound..=index]
-                    .iter()
-                    .rposition(|b| (*b as i8) >= -0x40);
-
-                lower_bound + new_index.unwrap()
-            }
-        }
-
         let fixture = "地";
         let rope = Rope::from("地");
         for b in 0..=fixture.len() {
-            assert_eq!(
-                rope.floor_char_boundary(b),
-                floor_char_boundary(&fixture, b)
-            );
+            assert_eq!(rope.floor_char_boundary(b), fixture.floor_char_boundary(b));
         }
 
         let fixture = "";
         let rope = Rope::from("");
         for b in 0..=fixture.len() {
-            assert_eq!(
-                rope.floor_char_boundary(b),
-                floor_char_boundary(&fixture, b)
-            );
+            assert_eq!(rope.floor_char_boundary(b), fixture.floor_char_boundary(b));
         }
 
         let fixture = "🔴🟠🟡🟢🔵🟣⚫️⚪️🟤\n🏳️‍⚧️🏁🏳️‍🌈🏴‍☠️⛳️📬📭🏴🏳️🚩";
         let rope = Rope::from("🔴🟠🟡🟢🔵🟣⚫️⚪️🟤\n🏳️‍⚧️🏁🏳️‍🌈🏴‍☠️⛳️📬📭🏴🏳️🚩");
         for b in 0..=fixture.len() {
-            assert_eq!(
-                rope.floor_char_boundary(b),
-                floor_char_boundary(&fixture, b)
-            );
+            assert_eq!(rope.floor_char_boundary(b), fixture.floor_char_boundary(b));
         }
     }
 
     #[test]
     fn test_ceil_char_boundary() {
-        // polyfill of str::ceil_char_boundary
-        fn ceil_char_boundary(str: &str, index: usize) -> usize {
-            if index > str.len() {
-                str.len()
-            } else {
-                let upper_bound = Ord::min(index + 4, str.len());
-                str.as_bytes()[index..upper_bound]
-                    .iter()
-                    .position(|b| (*b as i8) >= -0x40)
-                    .map_or(upper_bound, |pos| pos + index)
-            }
-        }
-
         let fixture = "地";
         let rope = Rope::from("地");
         for b in 0..=fixture.len() {
-            assert_eq!(rope.ceil_char_boundary(b), ceil_char_boundary(&fixture, b));
+            assert_eq!(rope.ceil_char_boundary(b), fixture.ceil_char_boundary(b));
         }
 
         let fixture = "";
         let rope = Rope::from("");
         for b in 0..=fixture.len() {
-            assert_eq!(rope.ceil_char_boundary(b), ceil_char_boundary(&fixture, b));
+            assert_eq!(rope.ceil_char_boundary(b), fixture.ceil_char_boundary(b));
         }
 
         let fixture = "🔴🟠🟡🟢🔵🟣⚫️⚪️🟤\n🏳️‍⚧️🏁🏳️‍🌈🏴‍☠️⛳️📬📭🏴🏳️🚩";
         let rope = Rope::from("🔴🟠🟡🟢🔵🟣⚫️⚪️🟤\n🏳️‍⚧️🏁🏳️‍🌈🏴‍☠️⛳️📬📭🏴🏳️🚩");
         for b in 0..=fixture.len() {
-            assert_eq!(rope.ceil_char_boundary(b), ceil_char_boundary(&fixture, b));
+            assert_eq!(rope.ceil_char_boundary(b), fixture.ceil_char_boundary(b));
         }
     }
 

crates/util/src/util.rs 🔗

@@ -51,6 +51,12 @@ macro_rules! debug_panic {
     };
 }
 
+#[inline]
+pub const fn is_utf8_char_boundary(u8: u8) -> bool {
+    // This is bit magic equivalent to: b < 128 || b >= 192
+    (u8 as i8) >= -0x40
+}
+
 pub fn truncate(s: &str, max_chars: usize) -> &str {
     match s.char_indices().nth(max_chars) {
         None => s,