use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped}; use arrayvec::ArrayString; use std::{cmp, ops::Range}; use sum_tree::Bias; use unicode_segmentation::GraphemeCursor; use util::debug_panic; pub(crate) const MIN_BASE: usize = if cfg!(test) { 6 } else { 64 }; pub(crate) const MAX_BASE: usize = MIN_BASE * 2; #[derive(Clone, Debug, Default)] pub struct Chunk { chars: u128, chars_utf16: u128, newlines: u128, tabs: u128, pub text: ArrayString, } impl Chunk { #[inline(always)] pub fn new(text: &str) -> Self { let mut this = Chunk::default(); this.push_str(text); this } #[inline(always)] pub fn push_str(&mut self, text: &str) { for (char_ix, c) in text.char_indices() { let ix = self.text.len() + char_ix; self.chars |= 1 << ix; self.chars_utf16 |= 1 << ix; self.chars_utf16 |= (c.len_utf16() as u128) << ix; self.newlines |= ((c == '\n') as u128) << ix; self.tabs |= ((c == '\t') as u128) << ix; } self.text.push_str(text); } #[inline(always)] pub fn append(&mut self, slice: ChunkSlice) { if slice.is_empty() { return; }; let base_ix = self.text.len(); self.chars |= slice.chars << base_ix; self.chars_utf16 |= slice.chars_utf16 << base_ix; self.newlines |= slice.newlines << base_ix; self.tabs |= slice.tabs << base_ix; self.text.push_str(&slice.text); } #[inline(always)] pub fn as_slice(&self) -> ChunkSlice { ChunkSlice { chars: self.chars, chars_utf16: self.chars_utf16, newlines: self.newlines, tabs: self.tabs, text: &self.text, } } #[inline(always)] pub fn slice(&self, range: Range) -> ChunkSlice { self.as_slice().slice(range) } } #[derive(Clone, Copy, Debug)] pub struct ChunkSlice<'a> { chars: u128, chars_utf16: u128, newlines: u128, tabs: u128, text: &'a str, } impl<'a> Into for ChunkSlice<'a> { fn into(self) -> Chunk { Chunk { chars: self.chars, chars_utf16: self.chars_utf16, newlines: self.newlines, tabs: self.tabs, text: self.text.try_into().unwrap(), } } } impl<'a> ChunkSlice<'a> { #[inline(always)] pub fn is_empty(self) -> bool { self.text.is_empty() } #[inline(always)] pub fn is_char_boundary(self, offset: usize) -> bool { self.text.is_char_boundary(offset) } #[inline(always)] pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) { if mid == MAX_BASE { let left = self; let right = ChunkSlice { chars: 0, chars_utf16: 0, newlines: 0, tabs: 0, text: "", }; (left, right) } else { let mask = (1u128 << mid) - 1; let (left_text, right_text) = self.text.split_at(mid); let left = ChunkSlice { chars: self.chars & mask, chars_utf16: self.chars_utf16 & mask, newlines: self.newlines & mask, tabs: self.tabs & mask, text: left_text, }; let right = ChunkSlice { chars: self.chars >> mid, chars_utf16: self.chars_utf16 >> mid, newlines: self.newlines >> mid, tabs: self.tabs >> mid, text: right_text, }; (left, right) } } #[inline(always)] pub fn slice(self, range: Range) -> Self { let mask = if range.end == MAX_BASE { u128::MAX } else { (1u128 << range.end) - 1 }; if range.start == MAX_BASE { Self { chars: 0, chars_utf16: 0, newlines: 0, tabs: 0, text: "", } } else { Self { chars: (self.chars & mask) >> range.start, chars_utf16: (self.chars_utf16 & mask) >> range.start, newlines: (self.newlines & mask) >> range.start, tabs: (self.tabs & mask) >> range.start, text: &self.text[range], } } } #[inline(always)] pub fn text_summary(&self) -> TextSummary { let (longest_row, longest_row_chars) = self.longest_row(); TextSummary { len: self.len(), len_utf16: self.len_utf16(), lines: self.lines(), first_line_chars: self.first_line_chars(), last_line_chars: self.last_line_chars(), last_line_len_utf16: self.last_line_len_utf16(), longest_row, longest_row_chars, } } /// Get length in bytes #[inline(always)] pub fn len(&self) -> usize { self.text.len() } /// Get length in UTF-16 code units #[inline(always)] pub fn len_utf16(&self) -> OffsetUtf16 { OffsetUtf16(self.chars_utf16.count_ones() as usize) } /// Get point representing number of lines and length of last line #[inline(always)] pub fn lines(&self) -> Point { let row = self.newlines.count_ones(); let column = self.newlines.leading_zeros() - (u128::BITS - self.text.len() as u32); Point::new(row, column) } /// Get number of chars in first line #[inline(always)] pub fn first_line_chars(&self) -> u32 { if self.newlines == 0 { self.chars.count_ones() } else { let mask = (1u128 << self.newlines.trailing_zeros()) - 1; (self.chars & mask).count_ones() } } /// Get number of chars in last line #[inline(always)] pub fn last_line_chars(&self) -> u32 { if self.newlines == 0 { self.chars.count_ones() } else { let mask = !(u128::MAX >> self.newlines.leading_zeros()); (self.chars & mask).count_ones() } } /// Get number of UTF-16 code units in last line #[inline(always)] pub fn last_line_len_utf16(&self) -> u32 { if self.newlines == 0 { self.chars_utf16.count_ones() } else { let mask = !(u128::MAX >> self.newlines.leading_zeros()); (self.chars_utf16 & mask).count_ones() } } /// Get the longest row in the chunk and its length in characters. #[inline(always)] pub fn longest_row(&self) -> (u32, u32) { let mut chars = self.chars; let mut newlines = self.newlines; let mut row = 0; let mut longest_row = 0; let mut longest_row_chars = 0; while newlines > 0 { let newline_ix = newlines.trailing_zeros(); let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8; if row_chars > longest_row_chars { longest_row = row; longest_row_chars = row_chars; } newlines >>= newline_ix; newlines >>= 1; chars >>= newline_ix; chars >>= 1; row += 1; } let row_chars = chars.count_ones() as u8; if row_chars > longest_row_chars { (row, row_chars as u32) } else { (longest_row, longest_row_chars as u32) } } #[inline(always)] pub fn offset_to_point(&self, offset: usize) -> Point { let mask = if offset == MAX_BASE { u128::MAX } else { (1u128 << offset) - 1 }; let row = (self.newlines & mask).count_ones(); let newline_ix = u128::BITS - (self.newlines & mask).leading_zeros(); let column = (offset - newline_ix as usize) as u32; Point::new(row, column) } #[inline(always)] pub fn point_to_offset(&self, point: Point) -> usize { if point.row > self.lines().row { debug_panic!( "point {:?} extends beyond rows for string {:?}", point, self.text ); return self.len(); } let row_offset_range = self.offset_range_for_row(point.row); if point.column > row_offset_range.len() as u32 { debug_panic!( "point {:?} extends beyond row for string {:?}", point, self.text ); row_offset_range.end } else { row_offset_range.start + point.column as usize } } #[inline(always)] pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 { let mask = if offset == MAX_BASE { u128::MAX } else { (1u128 << offset) - 1 }; OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize) } #[inline(always)] pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize { if target.0 == 0 { 0 } else { let ix = nth_set_bit(self.chars_utf16, target.0) + 1; if ix == MAX_BASE { MAX_BASE } else { let utf8_additional_len = cmp::min( (self.chars_utf16 >> ix).trailing_zeros() as usize, self.text.len() - ix, ); ix + utf8_additional_len } } } #[inline(always)] pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 { let mask = if offset == MAX_BASE { u128::MAX } else { (1u128 << offset) - 1 }; let row = (self.newlines & mask).count_ones(); let newline_ix = u128::BITS - (self.newlines & mask).leading_zeros(); let column = if newline_ix as usize == MAX_BASE { 0 } else { ((self.chars_utf16 & mask) >> newline_ix).count_ones() }; PointUtf16::new(row, column) } #[inline(always)] pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 { self.offset_to_point_utf16(self.point_to_offset(point)) } #[inline(always)] pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize { let lines = self.lines(); if point.row > lines.row { if !clip { debug_panic!( "point {:?} is beyond this chunk's extent {:?}", point, self.text ); } return self.len(); } let row_offset_range = self.offset_range_for_row(point.row); let line = self.slice(row_offset_range.clone()); if point.column > line.last_line_len_utf16() { if !clip { debug_panic!( "point {:?} is beyond the end of the line in chunk {:?}", point, self.text ); } return line.len(); } let mut offset = row_offset_range.start; if point.column > 0 { offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize)); if !self.text.is_char_boundary(offset) { offset -= 1; while !self.text.is_char_boundary(offset) { offset -= 1; } if !clip { debug_panic!( "point {:?} is within character in chunk {:?}", point, self.text, ); } } } offset } #[inline(always)] pub fn unclipped_point_utf16_to_point(&self, point: Unclipped) -> Point { let max_point = self.lines(); if point.0.row > max_point.row { return max_point; } let row_offset_range = self.offset_range_for_row(point.0.row); let line = self.slice(row_offset_range.clone()); if point.0.column == 0 { Point::new(point.0.row, 0) } else if point.0.column >= line.len_utf16().0 as u32 { Point::new(point.0.row, line.len() as u32) } else { let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize)); while !line.text.is_char_boundary(column) { column -= 1; } Point::new(point.0.row, column as u32) } } #[inline(always)] pub fn clip_point(&self, point: Point, bias: Bias) -> Point { let max_point = self.lines(); if point.row > max_point.row { return max_point; } let line = self.slice(self.offset_range_for_row(point.row)); if point.column == 0 { point } else if point.column >= line.len() as u32 { Point::new(point.row, line.len() as u32) } else { let mut column = point.column as usize; let bytes = line.text.as_bytes(); if bytes[column - 1] < 128 && bytes[column] < 128 { return Point::new(point.row, column as u32); } let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true); loop { if line.is_char_boundary(column) && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false) { break; } match bias { Bias::Left => column -= 1, Bias::Right => column += 1, } grapheme_cursor.set_cursor(column); } Point::new(point.row, column as u32) } } #[inline(always)] pub fn clip_point_utf16(&self, point: Unclipped, bias: Bias) -> PointUtf16 { let max_point = self.lines(); if point.0.row > max_point.row { PointUtf16::new(max_point.row, self.last_line_len_utf16()) } else { let line = self.slice(self.offset_range_for_row(point.0.row)); let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias); PointUtf16::new(point.0.row, column.0 as u32) } } #[inline(always)] pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 { if target == OffsetUtf16::default() { OffsetUtf16::default() } else if target >= self.len_utf16() { self.len_utf16() } else { let mut offset = self.offset_utf16_to_offset(target); while !self.text.is_char_boundary(offset) { if bias == Bias::Left { offset -= 1; } else { offset += 1; } } self.offset_to_offset_utf16(offset) } } #[inline(always)] fn offset_range_for_row(&self, row: u32) -> Range { let row_start = if row > 0 { nth_set_bit(self.newlines, row as usize) + 1 } else { 0 }; let row_len = if row_start == MAX_BASE { 0 } else { cmp::min( (self.newlines >> row_start).trailing_zeros(), (self.text.len() - row_start) as u32, ) }; row_start..row_start + row_len as usize } #[inline(always)] pub fn tabs(&self) -> Tabs { Tabs { tabs: self.tabs, chars: self.chars, } } } pub struct Tabs { tabs: u128, chars: u128, } #[derive(Debug, PartialEq, Eq)] pub struct TabPosition { pub byte_offset: usize, pub char_offset: usize, } impl Iterator for Tabs { type Item = TabPosition; fn next(&mut self) -> Option { if self.tabs == 0 { return None; } let tab_offset = self.tabs.trailing_zeros() as usize; let chars_mask = (1 << tab_offset) - 1; let char_offset = (self.chars & chars_mask).count_ones() as usize; // Since tabs are 1 byte the tab offset is the same as the byte offset let position = TabPosition { byte_offset: tab_offset, char_offset: char_offset, }; // Remove the tab we've just seen self.tabs ^= 1 << tab_offset; Some(position) } } /// Finds the n-th bit that is set to 1. #[inline(always)] fn nth_set_bit(v: u128, n: usize) -> usize { let low = v as u64; let high = (v >> 64) as u64; let low_count = low.count_ones() as usize; if n > low_count { 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize } else { nth_set_bit_u64(low, n as u64) as usize } } #[inline(always)] fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 { let v = v.reverse_bits(); let mut s: u64 = 64; // Parallel bit count intermediates let a = v - ((v >> 1) & (u64::MAX / 3)); let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5)); let c = (b + (b >> 4)) & (u64::MAX / 0x11); let d = (c + (c >> 8)) & (u64::MAX / 0x101); // Branchless select let t = (d >> 32) + (d >> 48); s -= (t.wrapping_sub(n) & 256) >> 3; n -= t & (t.wrapping_sub(n) >> 8); let t = (d >> (s - 16)) & 0xff; s -= (t.wrapping_sub(n) & 256) >> 4; n -= t & (t.wrapping_sub(n) >> 8); let t = (c >> (s - 8)) & 0xf; s -= (t.wrapping_sub(n) & 256) >> 5; n -= t & (t.wrapping_sub(n) >> 8); let t = (b >> (s - 4)) & 0x7; s -= (t.wrapping_sub(n) & 256) >> 6; n -= t & (t.wrapping_sub(n) >> 8); let t = (a >> (s - 2)) & 0x3; s -= (t.wrapping_sub(n) & 256) >> 7; n -= t & (t.wrapping_sub(n) >> 8); let t = (v >> (s - 1)) & 0x1; s -= (t.wrapping_sub(n) & 256) >> 8; 65 - s - 1 } #[cfg(test)] mod tests { use super::*; use rand::prelude::*; use util::RandomCharIter; #[gpui::test(iterations = 100)] fn test_random_chunks(mut rng: StdRng) { let chunk_len = rng.gen_range(0..=MAX_BASE); let text = RandomCharIter::new(&mut rng) .take(chunk_len) .collect::(); let mut ix = chunk_len; while !text.is_char_boundary(ix) { ix -= 1; } let text = &text[..ix]; log::info!("Chunk: {:?}", text); let chunk = Chunk::new(&text); verify_chunk(chunk.as_slice(), text); for _ in 0..10 { let mut start = rng.gen_range(0..=chunk.text.len()); let mut end = rng.gen_range(start..=chunk.text.len()); while !chunk.text.is_char_boundary(start) { start -= 1; } while !chunk.text.is_char_boundary(end) { end -= 1; } let range = start..end; log::info!("Range: {:?}", range); let text_slice = &text[range.clone()]; let chunk_slice = chunk.slice(range); verify_chunk(chunk_slice, text_slice); } } #[gpui::test(iterations = 1000)] fn test_nth_set_bit_random(mut rng: StdRng) { let set_count = rng.gen_range(0..=128); let mut set_bits = (0..128).choose_multiple(&mut rng, set_count); set_bits.sort(); let mut n = 0; for ix in set_bits.iter().copied() { n |= 1 << ix; } for (mut ix, position) in set_bits.into_iter().enumerate() { ix += 1; assert_eq!( nth_set_bit(n, ix), position, "nth_set_bit({:0128b}, {})", n, ix ); } } fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) { let mut offset = 0; let mut offset_utf16 = OffsetUtf16(0); let mut point = Point::zero(); let mut point_utf16 = PointUtf16::zero(); log::info!("Verifying chunk {:?}", text); assert_eq!(chunk.offset_to_point(0), Point::zero()); let mut expected_tab_positions = Vec::new(); for (char_offset, c) in text.chars().enumerate() { let expected_point = chunk.offset_to_point(offset); assert_eq!(point, expected_point, "mismatch at offset {}", offset); assert_eq!( chunk.point_to_offset(point), offset, "mismatch at point {:?}", point ); assert_eq!( chunk.offset_to_offset_utf16(offset), offset_utf16, "mismatch at offset {}", offset ); assert_eq!( chunk.offset_utf16_to_offset(offset_utf16), offset, "mismatch at offset_utf16 {:?}", offset_utf16 ); assert_eq!( chunk.point_to_point_utf16(point), point_utf16, "mismatch at point {:?}", point ); assert_eq!( chunk.point_utf16_to_offset(point_utf16, false), offset, "mismatch at point_utf16 {:?}", point_utf16 ); assert_eq!( chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)), point, "mismatch for unclipped_point_utf16_to_point at {:?}", point_utf16 ); assert_eq!( chunk.clip_point(point, Bias::Left), point, "incorrect left clip at {:?}", point ); assert_eq!( chunk.clip_point(point, Bias::Right), point, "incorrect right clip at {:?}", point ); for i in 1..c.len_utf8() { let test_point = Point::new(point.row, point.column + i as u32); assert_eq!( chunk.clip_point(test_point, Bias::Left), point, "incorrect left clip within multi-byte char at {:?}", test_point ); assert_eq!( chunk.clip_point(test_point, Bias::Right), Point::new(point.row, point.column + c.len_utf8() as u32), "incorrect right clip within multi-byte char at {:?}", test_point ); } for i in 1..c.len_utf16() { let test_point = Unclipped(PointUtf16::new( point_utf16.row, point_utf16.column + i as u32, )); assert_eq!( chunk.unclipped_point_utf16_to_point(test_point), point, "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}", test_point ); assert_eq!( chunk.clip_point_utf16(test_point, Bias::Left), point_utf16, "incorrect left clip_point_utf16 within multi-byte char at {:?}", test_point ); assert_eq!( chunk.clip_point_utf16(test_point, Bias::Right), PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32), "incorrect right clip_point_utf16 within multi-byte char at {:?}", test_point ); let test_offset = OffsetUtf16(offset_utf16.0 + i); assert_eq!( chunk.clip_offset_utf16(test_offset, Bias::Left), offset_utf16, "incorrect left clip_offset_utf16 within multi-byte char at {:?}", test_offset ); assert_eq!( chunk.clip_offset_utf16(test_offset, Bias::Right), OffsetUtf16(offset_utf16.0 + c.len_utf16()), "incorrect right clip_offset_utf16 within multi-byte char at {:?}", test_offset ); } if c == '\n' { point.row += 1; point.column = 0; point_utf16.row += 1; point_utf16.column = 0; } else { point.column += c.len_utf8() as u32; point_utf16.column += c.len_utf16() as u32; } if c == '\t' { expected_tab_positions.push(TabPosition { byte_offset: offset, char_offset, }); } offset += c.len_utf8(); offset_utf16.0 += c.len_utf16(); } let final_point = chunk.offset_to_point(offset); assert_eq!(point, final_point, "mismatch at final offset {}", offset); assert_eq!( chunk.point_to_offset(point), offset, "mismatch at point {:?}", point ); assert_eq!( chunk.offset_to_offset_utf16(offset), offset_utf16, "mismatch at offset {}", offset ); assert_eq!( chunk.offset_utf16_to_offset(offset_utf16), offset, "mismatch at offset_utf16 {:?}", offset_utf16 ); assert_eq!( chunk.point_to_point_utf16(point), point_utf16, "mismatch at final point {:?}", point ); assert_eq!( chunk.point_utf16_to_offset(point_utf16, false), offset, "mismatch at final point_utf16 {:?}", point_utf16 ); assert_eq!( chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)), point, "mismatch for unclipped_point_utf16_to_point at final point {:?}", point_utf16 ); assert_eq!( chunk.clip_point(point, Bias::Left), point, "incorrect left clip at final point {:?}", point ); assert_eq!( chunk.clip_point(point, Bias::Right), point, "incorrect right clip at final point {:?}", point ); assert_eq!( chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left), point_utf16, "incorrect left clip_point_utf16 at final point {:?}", point_utf16 ); assert_eq!( chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right), point_utf16, "incorrect right clip_point_utf16 at final point {:?}", point_utf16 ); assert_eq!( chunk.clip_offset_utf16(offset_utf16, Bias::Left), offset_utf16, "incorrect left clip_offset_utf16 at final offset {:?}", offset_utf16 ); assert_eq!( chunk.clip_offset_utf16(offset_utf16, Bias::Right), offset_utf16, "incorrect right clip_offset_utf16 at final offset {:?}", offset_utf16 ); // Verify length methods assert_eq!(chunk.len(), text.len()); assert_eq!( chunk.len_utf16().0, text.chars().map(|c| c.len_utf16()).sum::() ); // Verify line counting let lines = chunk.lines(); let mut newline_count = 0; let mut last_line_len = 0; for c in text.chars() { if c == '\n' { newline_count += 1; last_line_len = 0; } else { last_line_len += c.len_utf8() as u32; } } assert_eq!(lines, Point::new(newline_count, last_line_len)); // Verify first/last line chars if !text.is_empty() { let first_line = text.split('\n').next().unwrap(); assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32); let last_line = text.split('\n').last().unwrap(); assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32); assert_eq!( chunk.last_line_len_utf16(), last_line.chars().map(|c| c.len_utf16() as u32).sum::() ); } // Verify longest row let (longest_row, longest_chars) = chunk.longest_row(); let mut max_chars = 0; let mut current_row = 0; let mut current_chars = 0; let mut max_row = 0; for c in text.chars() { if c == '\n' { if current_chars > max_chars { max_chars = current_chars; max_row = current_row; } current_row += 1; current_chars = 0; } else { current_chars += 1; } } if current_chars > max_chars { max_chars = current_chars; max_row = current_row; } assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars)); assert_eq!(chunk.tabs().collect::>(), expected_tab_positions); } }