rope: Micro optimize the creation of masks (#41132)

Adam Richardson created

Using Compiler Explorer, I saw that the compiler wasn't clever enough to
optimize away the branches in the masking code. I thought the compiler
would have a better chance if we always computed the mask instead of
branching on `newlines == 0`, which [turned out to
be the case](https://godbolt.org/z/PM594Pz18).

Running the benchmarks, the biggest benefit I saw was:
```
push/65536              time:   [2.9067 ms 2.9243 ms 2.9417 ms]
                        thrpt:  [21.246 MiB/s 21.373 MiB/s 21.502 MiB/s]
                 change:
                        time:   [-8.3452% -7.2617% -6.2009%] (p = 0.00 < 0.05)
                        thrpt:  [+6.6108% +7.8303% +9.1050%]
                        Performance has improved.
```
But I did also see some regressions:
```
slice/4096              time:   [66.195 µs 66.815 µs 67.448 µs]
                        thrpt:  [57.915 MiB/s 58.464 MiB/s 59.012 MiB/s]
                 change:
                        time:   [+3.7131% +5.1698% +6.6971%] (p = 0.00 < 0.05)
                        thrpt:  [-6.2768% -4.9157% -3.5802%]
                        Performance has regressed.
```

Release Notes:

- N/A

Change summary

crates/rope/src/chunk.rs | 35 +++++++++++++++--------------------
1 file changed, 15 insertions(+), 20 deletions(-)

Detailed changes

crates/rope/src/chunk.rs 🔗

@@ -32,6 +32,16 @@ pub struct Chunk {
     pub text: ArrayString<MAX_BASE>,
 }
 
+#[inline(always)]
+const fn saturating_shl_mask(offset: u32) -> Bitmap {
+    (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
+}
+
+#[inline(always)]
+const fn saturating_shr_mask(offset: u32) -> Bitmap {
+    !Bitmap::MAX.unbounded_shr(offset)
+}
+
 impl Chunk {
     pub const MASK_BITS: usize = Bitmap::BITS as usize;
 
@@ -291,34 +301,19 @@ impl<'a> ChunkSlice<'a> {
     /// Get number of chars in first line
     #[inline(always)]
     pub fn first_line_chars(&self) -> u32 {
-        if self.newlines == 0 {
-            self.chars.count_ones()
-        } else {
-            let mask = ((1 as Bitmap) << self.newlines.trailing_zeros()) - 1;
-            (self.chars & mask).count_ones()
-        }
+        (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
     }
 
     /// Get number of chars in last line
     #[inline(always)]
     pub fn last_line_chars(&self) -> u32 {
-        if self.newlines == 0 {
-            self.chars.count_ones()
-        } else {
-            let mask = !(Bitmap::MAX >> self.newlines.leading_zeros());
-            (self.chars & mask).count_ones()
-        }
+        (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
     }
 
     /// Get number of UTF-16 code units in last line
     #[inline(always)]
     pub fn last_line_len_utf16(&self) -> u32 {
-        if self.newlines == 0 {
-            self.chars_utf16.count_ones()
-        } else {
-            let mask = !(Bitmap::MAX >> self.newlines.leading_zeros());
-            (self.chars_utf16 & mask).count_ones()
-        }
+        (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
     }
 
     /// Get the longest row in the chunk and its length in characters.
@@ -492,8 +487,8 @@ impl<'a> ChunkSlice<'a> {
 
     #[inline(always)]
     pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
-        let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
-        let row = (self.newlines & mask).count_ones();
+        let mask = saturating_shl_mask(offset as u32);
+        let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
         let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
         let column = if newline_ix as usize == MAX_BASE {
             0