From b92b28314f53a08fa05374e0c7dd35de88a25dd0 Mon Sep 17 00:00:00 2001 From: Lukas Wirth Date: Thu, 13 Nov 2025 09:11:18 +0100 Subject: [PATCH] Replace {floor/ceil}_char_boundary polyfills with std (#42599) Release Notes: - N/A --- crates/editor/src/display_map/inlay_map.rs | 33 +-------- crates/rope/src/chunk.rs | 28 +------ crates/rope/src/rope.rs | 85 ++-------------------- crates/util/src/util.rs | 6 ++ 4 files changed, 18 insertions(+), 134 deletions(-) diff --git a/crates/editor/src/display_map/inlay_map.rs b/crates/editor/src/display_map/inlay_map.rs index 697cf1f68ceac4f6a777a3ec401658394f646af5..f3f3a3eee8ea6d1f95261ae4d313afb6f4d497e3 100644 --- a/crates/editor/src/display_map/inlay_map.rs +++ b/crates/editor/src/display_map/inlay_map.rs @@ -248,10 +248,8 @@ impl<'a> Iterator for InlayChunks<'a> { // Determine split index handling edge cases let split_index = if desired_bytes >= chunk.text.len() { chunk.text.len() - } else if chunk.text.is_char_boundary(desired_bytes) { - desired_bytes } else { - find_next_utf8_boundary(chunk.text, desired_bytes) + chunk.text.ceil_char_boundary(desired_bytes) }; let (prefix, suffix) = chunk.text.split_at(split_index); @@ -373,10 +371,8 @@ impl<'a> Iterator for InlayChunks<'a> { .next() .map(|c| c.len_utf8()) .unwrap_or(1) - } else if inlay_chunk.is_char_boundary(next_inlay_highlight_endpoint) { - next_inlay_highlight_endpoint } else { - find_next_utf8_boundary(inlay_chunk, next_inlay_highlight_endpoint) + inlay_chunk.ceil_char_boundary(next_inlay_highlight_endpoint) }; let (chunk, remainder) = inlay_chunk.split_at(split_index); @@ -1146,31 +1142,6 @@ fn push_isomorphic(sum_tree: &mut SumTree, summary: TextSummary) { } } -/// Given a byte index that is NOT a UTF-8 boundary, find the next one. 
-/// Assumes: 0 < byte_index < text.len() and !text.is_char_boundary(byte_index) -#[inline(always)] -fn find_next_utf8_boundary(text: &str, byte_index: usize) -> usize { - let bytes = text.as_bytes(); - let mut idx = byte_index + 1; - - // Scan forward until we find a boundary - while idx < text.len() { - if is_utf8_char_boundary(bytes[idx]) { - return idx; - } - idx += 1; - } - - // Hit the end, return the full length - text.len() -} - -// Private helper function taken from Rust's core::num module (which is both Apache2 and MIT licensed) -const fn is_utf8_char_boundary(byte: u8) -> bool { - // This is bit magic equivalent to: b < 128 || b >= 192 - (byte as i8) >= -0x40 -} - #[cfg(test)] mod tests { use super::*; diff --git a/crates/rope/src/chunk.rs b/crates/rope/src/chunk.rs index 4c1e4cd68560f15274722ff1d8249205300c4e68..7ada5c2052481408bc5af56740f8e35916623f14 100644 --- a/crates/rope/src/chunk.rs +++ b/crates/rope/src/chunk.rs @@ -110,18 +110,12 @@ impl Chunk { } pub fn floor_char_boundary(&self, index: usize) -> usize { - #[inline] - pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool { - // This is bit magic equivalent to: b < 128 || b >= 192 - (u8 as i8) >= -0x40 - } - if index >= self.text.len() { self.text.len() } else { let mut i = index; while i > 0 { - if is_utf8_char_boundary(self.text.as_bytes()[i]) { + if util::is_utf8_char_boundary(self.text.as_bytes()[i]) { break; } i -= 1; @@ -423,25 +417,7 @@ impl<'a> ChunkSlice<'a> { } pub fn floor_char_boundary(&self, index: usize) -> usize { - #[inline] - pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool { - // This is bit magic equivalent to: b < 128 || b >= 192 - (u8 as i8) >= -0x40 - } - - if index >= self.text.len() { - self.text.len() - } else { - let mut i = index; - while i > 0 { - if is_utf8_char_boundary(self.text.as_bytes()[i]) { - break; - } - i -= 1; - } - - i - } + self.text.floor_char_boundary(index) } #[inline(always)] diff --git a/crates/rope/src/rope.rs b/crates/rope/src/rope.rs 
index 394e6ef0ca589d19ffcf7cf07a92bcd15c8e4a18..a5699554a32b552e395001ded24512e10d645d4b 100644 --- a/crates/rope/src/rope.rs +++ b/crates/rope/src/rope.rs @@ -74,29 +74,9 @@ impl Rope { if index >= self.len() { self.len() } else { - #[inline] - pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool { - // This is bit magic equivalent to: b < 128 || b >= 192 - (u8 as i8) >= -0x40 - } - let (start, _, item) = self.chunks.find::((), &index, Bias::Left); let chunk_offset = index - start; - let lower_idx = item.map(|chunk| { - let lower_bound = chunk_offset.saturating_sub(3); - chunk - .text - .as_bytes() - .get(lower_bound..=chunk_offset) - .map(|it| { - let new_idx = it - .iter() - .rposition(|&b| is_utf8_char_boundary(b)) - .unwrap_or(0); - lower_bound + new_idx - }) - .unwrap_or(chunk.text.len()) - }); + let lower_idx = item.map(|chunk| chunk.text.floor_char_boundary(chunk_offset)); lower_idx.map_or_else(|| self.len(), |idx| start + idx) } } @@ -105,22 +85,9 @@ impl Rope { if index > self.len() { self.len() } else { - #[inline] - pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool { - // This is bit magic equivalent to: b < 128 || b >= 192 - (u8 as i8) >= -0x40 - } - let (start, _, item) = self.chunks.find::((), &index, Bias::Left); let chunk_offset = index - start; - let upper_idx = item.map(|chunk| { - let upper_bound = Ord::min(chunk_offset + 4, chunk.text.len()); - chunk.text.as_bytes()[chunk_offset..upper_bound] - .iter() - .position(|&b| is_utf8_char_boundary(b)) - .map_or(upper_bound, |pos| pos + chunk_offset) - }); - + let upper_idx = item.map(|chunk| chunk.text.ceil_char_boundary(chunk_offset)); upper_idx.map_or_else(|| self.len(), |idx| start + idx) } } @@ -2186,79 +2153,43 @@ mod tests { #[test] fn test_floor_char_boundary() { - // polyfill of str::floor_char_boundary - fn floor_char_boundary(str: &str, index: usize) -> usize { - if index >= str.len() { - str.len() - } else { - let lower_bound = index.saturating_sub(3); - let new_index = 
str.as_bytes()[lower_bound..=index] - .iter() - .rposition(|b| (*b as i8) >= -0x40); - - lower_bound + new_index.unwrap() - } - } - let fixture = "地"; let rope = Rope::from("地"); for b in 0..=fixture.len() { - assert_eq!( - rope.floor_char_boundary(b), - floor_char_boundary(&fixture, b) - ); + assert_eq!(rope.floor_char_boundary(b), fixture.floor_char_boundary(b)); } let fixture = ""; let rope = Rope::from(""); for b in 0..=fixture.len() { - assert_eq!( - rope.floor_char_boundary(b), - floor_char_boundary(&fixture, b) - ); + assert_eq!(rope.floor_char_boundary(b), fixture.floor_char_boundary(b)); } let fixture = "šŸ”“šŸŸ šŸŸ”šŸŸ¢šŸ”µšŸŸ£āš«ļøāšŖļøšŸŸ¤\nšŸ³ļøā€āš§ļøšŸšŸ³ļøā€šŸŒˆšŸ“ā€ā˜ ļøā›³ļøšŸ“¬šŸ“­šŸ“šŸ³ļøšŸš©"; let rope = Rope::from("šŸ”“šŸŸ šŸŸ”šŸŸ¢šŸ”µšŸŸ£āš«ļøāšŖļøšŸŸ¤\nšŸ³ļøā€āš§ļøšŸšŸ³ļøā€šŸŒˆšŸ“ā€ā˜ ļøā›³ļøšŸ“¬šŸ“­šŸ“šŸ³ļøšŸš©"); for b in 0..=fixture.len() { - assert_eq!( - rope.floor_char_boundary(b), - floor_char_boundary(&fixture, b) - ); + assert_eq!(rope.floor_char_boundary(b), fixture.floor_char_boundary(b)); } } #[test] fn test_ceil_char_boundary() { - // polyfill of str::ceil_char_boundary - fn ceil_char_boundary(str: &str, index: usize) -> usize { - if index > str.len() { - str.len() - } else { - let upper_bound = Ord::min(index + 4, str.len()); - str.as_bytes()[index..upper_bound] - .iter() - .position(|b| (*b as i8) >= -0x40) - .map_or(upper_bound, |pos| pos + index) - } - } - let fixture = "地"; let rope = Rope::from("地"); for b in 0..=fixture.len() { - assert_eq!(rope.ceil_char_boundary(b), ceil_char_boundary(&fixture, b)); + assert_eq!(rope.ceil_char_boundary(b), fixture.ceil_char_boundary(b)); } let fixture = ""; let rope = Rope::from(""); for b in 0..=fixture.len() { - assert_eq!(rope.ceil_char_boundary(b), ceil_char_boundary(&fixture, b)); + assert_eq!(rope.ceil_char_boundary(b), fixture.ceil_char_boundary(b)); } let fixture = "šŸ”“šŸŸ 
šŸŸ”šŸŸ¢šŸ”µšŸŸ£āš«ļøāšŖļøšŸŸ¤\nšŸ³ļøā€āš§ļøšŸšŸ³ļøā€šŸŒˆšŸ“ā€ā˜ ļøā›³ļøšŸ“¬šŸ“­šŸ“šŸ³ļøšŸš©"; let rope = Rope::from("šŸ”“šŸŸ šŸŸ”šŸŸ¢šŸ”µšŸŸ£āš«ļøāšŖļøšŸŸ¤\nšŸ³ļøā€āš§ļøšŸšŸ³ļøā€šŸŒˆšŸ“ā€ā˜ ļøā›³ļøšŸ“¬šŸ“­šŸ“šŸ³ļøšŸš©"); for b in 0..=fixture.len() { - assert_eq!(rope.ceil_char_boundary(b), ceil_char_boundary(&fixture, b)); + assert_eq!(rope.ceil_char_boundary(b), fixture.ceil_char_boundary(b)); } } diff --git a/crates/util/src/util.rs b/crates/util/src/util.rs index 211b972e69deb9edf5c045a8fc2d52f5b8115bb2..169da43b5282456ab4b056149bcaf3dbda5b4534 100644 --- a/crates/util/src/util.rs +++ b/crates/util/src/util.rs @@ -51,6 +51,12 @@ macro_rules! debug_panic { }; } +#[inline] +pub const fn is_utf8_char_boundary(u8: u8) -> bool { + // This is bit magic equivalent to: b < 128 || b >= 192 + (u8 as i8) >= -0x40 +} + pub fn truncate(s: &str, max_chars: usize) -> &str { match s.char_indices().nth(max_chars) { None => s,