chunk.rs

   1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
   2use arrayvec::ArrayString;
   3use std::{cmp, ops::Range};
   4use sum_tree::Bias;
   5use unicode_segmentation::GraphemeCursor;
   6use util::debug_panic;
   7
   8#[cfg(not(all(test, not(rust_analyzer))))]
   9pub(crate) type Bitmap = u128;
  10#[cfg(all(test, not(rust_analyzer)))]
  11pub(crate) type Bitmap = u16;
  12
  13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
  14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
  15
  16#[derive(Clone, Debug, Default)]
  17pub struct Chunk {
  18    /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
  19    /// text.
  20    chars: Bitmap,
  21    /// The number of set bits is the number of UTF-16 code units it would take to represent the
  22    /// text.
  23    ///
  24    /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
  25    /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
  26    /// than two UTF-16 code units.)
  27    chars_utf16: Bitmap,
  28    /// If bit[i] is set, then the character at index i is an ascii newline.
  29    newlines: Bitmap,
  30    /// If bit[i] is set, then the character at index i is an ascii tab.
  31    tabs: Bitmap,
  32    pub text: ArrayString<MAX_BASE>,
  33}
  34
  35#[inline(always)]
  36const fn saturating_shl_mask(offset: u32) -> Bitmap {
  37    (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
  38}
  39
  40#[inline(always)]
  41const fn saturating_shr_mask(offset: u32) -> Bitmap {
  42    !Bitmap::MAX.unbounded_shr(offset)
  43}
  44
  45impl Chunk {
  46    pub const MASK_BITS: usize = Bitmap::BITS as usize;
  47
  48    #[inline(always)]
  49    pub fn new(text: &str) -> Self {
  50        let text = ArrayString::from(text).unwrap();
  51
  52        const CHUNK_SIZE: usize = 8;
  53
  54        let mut chars_bytes = [0; MAX_BASE / CHUNK_SIZE];
  55        let mut newlines_bytes = [0; MAX_BASE / CHUNK_SIZE];
  56        let mut tabs_bytes = [0; MAX_BASE / CHUNK_SIZE];
  57        let mut chars_utf16_bytes = [0; MAX_BASE / CHUNK_SIZE];
  58
  59        let mut chunk_ix = 0;
  60
  61        let mut bytes = text.as_bytes();
  62        while !bytes.is_empty() {
  63            let (chunk, rest) = bytes.split_at(bytes.len().min(CHUNK_SIZE));
  64            bytes = rest;
  65
  66            let mut chars = 0;
  67            let mut newlines = 0;
  68            let mut tabs = 0;
  69            let mut chars_utf16 = 0;
  70
  71            for (ix, &b) in chunk.iter().enumerate() {
  72                chars |= (util::is_utf8_char_boundary(b) as u8) << ix;
  73                newlines |= ((b == b'\n') as u8) << ix;
  74                tabs |= ((b == b'\t') as u8) << ix;
  75                // b >= 240 when we are at the first byte of the 4 byte encoded
  76                // utf-8 code point (U+010000 or greater) it means that it would
  77                // be encoded as two 16-bit code units in utf-16
  78                chars_utf16 |= ((b >= 240) as u8) << ix;
  79            }
  80
  81            chars_bytes[chunk_ix] = chars;
  82            newlines_bytes[chunk_ix] = newlines;
  83            tabs_bytes[chunk_ix] = tabs;
  84            chars_utf16_bytes[chunk_ix] = chars_utf16;
  85
  86            chunk_ix += 1;
  87        }
  88
  89        let chars = Bitmap::from_le_bytes(chars_bytes);
  90
  91        Chunk {
  92            text,
  93            chars,
  94            chars_utf16: (Bitmap::from_le_bytes(chars_utf16_bytes) << 1) | chars,
  95            newlines: Bitmap::from_le_bytes(newlines_bytes),
  96            tabs: Bitmap::from_le_bytes(tabs_bytes),
  97        }
  98    }
  99
 100    #[inline(always)]
 101    pub fn push_str(&mut self, text: &str) {
 102        self.append(Chunk::new(text).as_slice());
 103    }
 104
 105    #[inline(always)]
 106    pub fn prepend_str(&mut self, text: &str) {
 107        self.prepend(Chunk::new(text).as_slice());
 108    }
 109
 110    #[inline(always)]
 111    pub fn append(&mut self, slice: ChunkSlice) {
 112        if slice.is_empty() {
 113            return;
 114        };
 115
 116        let base_ix = self.text.len();
 117        self.chars |= slice.chars << base_ix;
 118        self.chars_utf16 |= slice.chars_utf16 << base_ix;
 119        self.newlines |= slice.newlines << base_ix;
 120        self.tabs |= slice.tabs << base_ix;
 121        self.text.push_str(slice.text);
 122    }
 123
 124    #[inline(always)]
 125    pub fn prepend(&mut self, slice: ChunkSlice) {
 126        if slice.is_empty() {
 127            return;
 128        }
 129        if self.text.is_empty() {
 130            *self = Chunk::new(slice.text);
 131            return;
 132        }
 133
 134        let shift = slice.text.len();
 135        self.chars = slice.chars | (self.chars << shift);
 136        self.chars_utf16 = slice.chars_utf16 | (self.chars_utf16 << shift);
 137        self.newlines = slice.newlines | (self.newlines << shift);
 138        self.tabs = slice.tabs | (self.tabs << shift);
 139
 140        let mut new_text = ArrayString::<MAX_BASE>::new();
 141        new_text.push_str(slice.text);
 142        new_text.push_str(&self.text);
 143        self.text = new_text;
 144    }
 145
 146    #[inline(always)]
 147    pub fn as_slice(&self) -> ChunkSlice<'_> {
 148        ChunkSlice {
 149            chars: self.chars,
 150            chars_utf16: self.chars_utf16,
 151            newlines: self.newlines,
 152            tabs: self.tabs,
 153            text: &self.text,
 154        }
 155    }
 156
 157    #[inline(always)]
 158    pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
 159        self.as_slice().slice(range)
 160    }
 161
 162    #[inline(always)]
 163    pub fn chars(&self) -> Bitmap {
 164        self.chars
 165    }
 166
 167    #[inline(always)]
 168    pub fn tabs(&self) -> Bitmap {
 169        self.tabs
 170    }
 171
 172    #[inline(always)]
 173    pub fn newlines(&self) -> Bitmap {
 174        self.newlines
 175    }
 176
 177    #[inline(always)]
 178    pub fn is_char_boundary(&self, offset: usize) -> bool {
 179        (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
 180    }
 181
 182    pub fn floor_char_boundary(&self, index: usize) -> usize {
 183        if index >= self.text.len() {
 184            self.text.len()
 185        } else {
 186            let mut i = index;
 187            while i > 0 {
 188                if util::is_utf8_char_boundary(self.text.as_bytes()[i]) {
 189                    break;
 190                }
 191                i -= 1;
 192            }
 193
 194            i
 195        }
 196    }
 197
 198    #[track_caller]
 199    #[inline(always)]
 200    pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
 201        if self.is_char_boundary(offset) {
 202            return true;
 203        }
 204        if PANIC || cfg!(debug_assertions) {
 205            panic_char_boundary(&self.text, offset);
 206        } else {
 207            log_err_char_boundary(&self.text, offset);
 208            false
 209        }
 210    }
 211}
 212
 213#[derive(Clone, Copy, Debug)]
 214pub struct ChunkSlice<'a> {
 215    chars: Bitmap,
 216    chars_utf16: Bitmap,
 217    newlines: Bitmap,
 218    tabs: Bitmap,
 219    text: &'a str,
 220}
 221
 222impl Into<Chunk> for ChunkSlice<'_> {
 223    fn into(self) -> Chunk {
 224        Chunk {
 225            chars: self.chars,
 226            chars_utf16: self.chars_utf16,
 227            newlines: self.newlines,
 228            tabs: self.tabs,
 229            text: self.text.try_into().unwrap(),
 230        }
 231    }
 232}
 233
 234impl<'a> ChunkSlice<'a> {
 235    #[inline(always)]
 236    pub fn is_empty(&self) -> bool {
 237        self.text.is_empty()
 238    }
 239
 240    #[inline(always)]
 241    pub fn is_char_boundary(&self, offset: usize) -> bool {
 242        (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
 243    }
 244
 245    #[inline(always)]
 246    pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
 247        if mid == MAX_BASE {
 248            let left = self;
 249            let right = ChunkSlice {
 250                chars: 0,
 251                chars_utf16: 0,
 252                newlines: 0,
 253                tabs: 0,
 254                text: "",
 255            };
 256            (left, right)
 257        } else {
 258            let mask = ((1 as Bitmap) << mid) - 1;
 259            let (left_text, right_text) = self.text.split_at(mid);
 260            let left = ChunkSlice {
 261                chars: self.chars & mask,
 262                chars_utf16: self.chars_utf16 & mask,
 263                newlines: self.newlines & mask,
 264                tabs: self.tabs & mask,
 265                text: left_text,
 266            };
 267            let right = ChunkSlice {
 268                chars: self.chars >> mid,
 269                chars_utf16: self.chars_utf16 >> mid,
 270                newlines: self.newlines >> mid,
 271                tabs: self.tabs >> mid,
 272                text: right_text,
 273            };
 274            (left, right)
 275        }
 276    }
 277
 278    #[inline(always)]
 279    pub fn slice(self, mut range: Range<usize>) -> Self {
 280        if range.start == MAX_BASE {
 281            Self {
 282                chars: 0,
 283                chars_utf16: 0,
 284                newlines: 0,
 285                tabs: 0,
 286                text: "",
 287            }
 288        } else {
 289            if !self.assert_char_boundary::<false>(range.start) {
 290                range.start = self.text.ceil_char_boundary(range.start);
 291            }
 292            if !self.assert_char_boundary::<false>(range.end) {
 293                range.end = if range.end < range.start {
 294                    range.start
 295                } else {
 296                    self.text.floor_char_boundary(range.end)
 297                };
 298            }
 299            let mask = (1 as Bitmap)
 300                .unbounded_shl(range.end as u32)
 301                .wrapping_sub(1);
 302            Self {
 303                chars: (self.chars & mask) >> range.start,
 304                chars_utf16: (self.chars_utf16 & mask) >> range.start,
 305                newlines: (self.newlines & mask) >> range.start,
 306                tabs: (self.tabs & mask) >> range.start,
 307                text: &self.text[range],
 308            }
 309        }
 310    }
 311
 312    #[inline(always)]
 313    pub fn text_summary(&self) -> TextSummary {
 314        let mut chars = 0;
 315        let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
 316        TextSummary {
 317            len: self.len(),
 318            chars,
 319            len_utf16: self.len_utf16(),
 320            lines: self.lines(),
 321            first_line_chars: self.first_line_chars(),
 322            last_line_chars: self.last_line_chars(),
 323            last_line_len_utf16: self.last_line_len_utf16(),
 324            longest_row,
 325            longest_row_chars,
 326        }
 327    }
 328
 329    /// Get length in bytes
 330    #[inline(always)]
 331    pub fn len(&self) -> usize {
 332        self.text.len()
 333    }
 334
 335    /// Get length in UTF-16 code units
 336    #[inline(always)]
 337    pub fn len_utf16(&self) -> OffsetUtf16 {
 338        OffsetUtf16(self.chars_utf16.count_ones() as usize)
 339    }
 340
 341    /// Get point representing number of lines and length of last line
 342    #[inline(always)]
 343    pub fn lines(&self) -> Point {
 344        let row = self.newlines.count_ones();
 345        let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
 346        Point::new(row, column)
 347    }
 348
 349    /// Get number of chars in first line
 350    #[inline(always)]
 351    pub fn first_line_chars(&self) -> u32 {
 352        (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
 353    }
 354
 355    /// Get number of chars in last line
 356    #[inline(always)]
 357    pub fn last_line_chars(&self) -> u32 {
 358        (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
 359    }
 360
 361    /// Get number of UTF-16 code units in last line
 362    #[inline(always)]
 363    pub fn last_line_len_utf16(&self) -> u32 {
 364        (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
 365    }
 366
 367    /// Get the longest row in the chunk and its length in characters.
 368    /// Calculate the total number of characters in the chunk along the way.
 369    #[inline(always)]
 370    pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
 371        let mut chars = self.chars;
 372        let mut newlines = self.newlines;
 373        *total_chars = 0;
 374        let mut row = 0;
 375        let mut longest_row = 0;
 376        let mut longest_row_chars = 0;
 377        while newlines > 0 {
 378            let newline_ix = newlines.trailing_zeros();
 379            let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
 380            *total_chars += usize::from(row_chars);
 381            if row_chars > longest_row_chars {
 382                longest_row = row;
 383                longest_row_chars = row_chars;
 384            }
 385
 386            newlines >>= newline_ix;
 387            newlines >>= 1;
 388            chars >>= newline_ix;
 389            chars >>= 1;
 390            row += 1;
 391            *total_chars += 1;
 392        }
 393
 394        let row_chars = chars.count_ones() as u8;
 395        *total_chars += usize::from(row_chars);
 396        if row_chars > longest_row_chars {
 397            (row, row_chars as u32)
 398        } else {
 399            (longest_row, longest_row_chars as u32)
 400        }
 401    }
 402
 403    #[inline(always)]
 404    pub fn offset_to_point(&self, offset: usize) -> Point {
 405        let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
 406        let row = (self.newlines & mask).count_ones();
 407        let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
 408        let column = (offset - newline_ix as usize) as u32;
 409        Point::new(row, column)
 410    }
 411
 412    #[inline(always)]
 413    pub fn point_to_offset(&self, point: Point) -> usize {
 414        if point.row > self.lines().row {
 415            debug_panic!(
 416                "point {:?} extends beyond rows for string {:?}",
 417                point,
 418                self.text
 419            );
 420            return self.len();
 421        }
 422
 423        let row_offset_range = self.offset_range_for_row(point.row);
 424        if point.column > row_offset_range.len() as u32 {
 425            debug_panic!(
 426                "point {:?} extends beyond row for string {:?}",
 427                point,
 428                self.text
 429            );
 430            row_offset_range.end
 431        } else {
 432            row_offset_range.start + point.column as usize
 433        }
 434    }
 435
 436    #[track_caller]
 437    #[inline(always)]
 438    pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
 439        if self.is_char_boundary(offset) {
 440            return true;
 441        }
 442        if PANIC {
 443            panic_char_boundary(self.text, offset);
 444        } else {
 445            log_err_char_boundary(self.text, offset);
 446            false
 447        }
 448    }
 449
 450    pub fn floor_char_boundary(&self, index: usize) -> usize {
 451        self.text.floor_char_boundary(index)
 452    }
 453
 454    #[inline(always)]
 455    pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
 456        if point.row > self.lines().row {
 457            debug_panic!(
 458                "point {:?} extends beyond rows for string {:?}",
 459                point,
 460                self.text
 461            );
 462            return self.len_utf16();
 463        }
 464        self.offset_to_offset_utf16(self.point_to_offset(point))
 465    }
 466
 467    #[inline(always)]
 468    pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
 469        let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
 470        OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
 471    }
 472
 473    #[inline(always)]
 474    pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
 475        if target.0 == 0 {
 476            0
 477        } else {
 478            #[cfg(not(test))]
 479            let chars_utf16 = self.chars_utf16;
 480            #[cfg(test)]
 481            let chars_utf16 = self.chars_utf16 as u128;
 482            let ix = nth_set_bit(chars_utf16, target.0) + 1;
 483            if ix == MAX_BASE {
 484                MAX_BASE
 485            } else {
 486                let utf8_additional_len = cmp::min(
 487                    (self.chars_utf16 >> ix).trailing_zeros() as usize,
 488                    self.text.len() - ix,
 489                );
 490                ix + utf8_additional_len
 491            }
 492        }
 493    }
 494
 495    #[inline(always)]
 496    pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
 497        let mask = saturating_shl_mask(offset as u32);
 498        let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
 499        let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
 500        let column = if newline_ix as usize == MAX_BASE {
 501            0
 502        } else {
 503            ((self.chars_utf16 & mask) >> newline_ix).count_ones()
 504        };
 505        PointUtf16::new(row, column)
 506    }
 507
 508    #[inline(always)]
 509    pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
 510        self.offset_to_point_utf16(self.point_to_offset(point))
 511    }
 512
 513    #[inline(always)]
 514    pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
 515        let lines = self.lines();
 516        if point.row > lines.row {
 517            if !clip {
 518                debug_panic!(
 519                    "point {:?} is beyond this chunk's extent {:?}",
 520                    point,
 521                    self.text
 522                );
 523            }
 524            return self.len();
 525        }
 526
 527        let row_offset_range = self.offset_range_for_row(point.row);
 528        let line = self.slice(row_offset_range.clone());
 529        if point.column > line.last_line_len_utf16() {
 530            if !clip {
 531                debug_panic!(
 532                    "point {:?} is beyond the end of the line in chunk {:?}",
 533                    point,
 534                    self.text
 535                );
 536            }
 537            return row_offset_range.end;
 538        }
 539
 540        let mut offset = row_offset_range.start;
 541        if point.column > 0 {
 542            offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
 543            if !self.text.is_char_boundary(offset) {
 544                offset -= 1;
 545                while !self.text.is_char_boundary(offset) {
 546                    offset -= 1;
 547                }
 548                if !clip {
 549                    debug_panic!(
 550                        "point {:?} is within character in chunk {:?}",
 551                        point,
 552                        self.text,
 553                    );
 554                }
 555            }
 556        }
 557        offset
 558    }
 559
 560    #[inline(always)]
 561    pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
 562        let max_point = self.lines();
 563        if point.0.row > max_point.row {
 564            return max_point;
 565        }
 566
 567        let row_offset_range = self.offset_range_for_row(point.0.row);
 568        let line = self.slice(row_offset_range);
 569        if point.0.column == 0 {
 570            Point::new(point.0.row, 0)
 571        } else if point.0.column >= line.len_utf16().0 as u32 {
 572            Point::new(point.0.row, line.len() as u32)
 573        } else {
 574            let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
 575            while !line.text.is_char_boundary(column) {
 576                column -= 1;
 577            }
 578            Point::new(point.0.row, column as u32)
 579        }
 580    }
 581
 582    #[inline(always)]
 583    pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
 584        let max_point = self.lines();
 585        if point.row > max_point.row {
 586            return max_point;
 587        }
 588
 589        let line = self.slice(self.offset_range_for_row(point.row));
 590        if point.column == 0 {
 591            point
 592        } else if point.column >= line.len() as u32 {
 593            Point::new(point.row, line.len() as u32)
 594        } else {
 595            let mut column = point.column as usize;
 596            let bytes = line.text.as_bytes();
 597            if bytes[column - 1] < 128 && bytes[column] < 128 {
 598                return Point::new(point.row, column as u32);
 599            }
 600
 601            let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
 602            loop {
 603                if line.is_char_boundary(column)
 604                    && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
 605                {
 606                    break;
 607                }
 608
 609                match bias {
 610                    Bias::Left => column -= 1,
 611                    Bias::Right => column += 1,
 612                }
 613                grapheme_cursor.set_cursor(column);
 614            }
 615            Point::new(point.row, column as u32)
 616        }
 617    }
 618
 619    #[inline(always)]
 620    pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
 621        let max_point = self.lines();
 622        if point.0.row > max_point.row {
 623            PointUtf16::new(max_point.row, self.last_line_len_utf16())
 624        } else {
 625            let line = self.slice(self.offset_range_for_row(point.0.row));
 626            let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
 627            PointUtf16::new(point.0.row, column.0 as u32)
 628        }
 629    }
 630
 631    #[inline(always)]
 632    pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
 633        if target == OffsetUtf16::default() {
 634            OffsetUtf16::default()
 635        } else if target >= self.len_utf16() {
 636            self.len_utf16()
 637        } else {
 638            let mut offset = self.offset_utf16_to_offset(target);
 639            while !self.text.is_char_boundary(offset) {
 640                if bias == Bias::Left {
 641                    offset -= 1;
 642                } else {
 643                    offset += 1;
 644                }
 645            }
 646            self.offset_to_offset_utf16(offset)
 647        }
 648    }
 649
 650    #[inline(always)]
 651    fn offset_range_for_row(&self, row: u32) -> Range<usize> {
 652        let row_start = if row > 0 {
 653            #[cfg(not(test))]
 654            let newlines = self.newlines;
 655            #[cfg(test)]
 656            let newlines = self.newlines as u128;
 657            nth_set_bit(newlines, row as usize) + 1
 658        } else {
 659            0
 660        };
 661        let row_len = if row_start == MAX_BASE {
 662            0
 663        } else {
 664            cmp::min(
 665                (self.newlines >> row_start).trailing_zeros(),
 666                (self.text.len() - row_start) as u32,
 667            )
 668        };
 669        row_start..row_start + row_len as usize
 670    }
 671
 672    #[inline(always)]
 673    pub fn tabs(&self) -> Tabs {
 674        Tabs {
 675            tabs: self.tabs,
 676            chars: self.chars,
 677        }
 678    }
 679}
 680
 681pub struct Tabs {
 682    tabs: Bitmap,
 683    chars: Bitmap,
 684}
 685
 686#[derive(Debug, PartialEq, Eq)]
 687pub struct TabPosition {
 688    pub byte_offset: usize,
 689    pub char_offset: usize,
 690}
 691
 692impl Iterator for Tabs {
 693    type Item = TabPosition;
 694
 695    fn next(&mut self) -> Option<Self::Item> {
 696        if self.tabs == 0 {
 697            return None;
 698        }
 699
 700        let tab_offset = self.tabs.trailing_zeros() as usize;
 701        let chars_mask = (1 << tab_offset) - 1;
 702        let char_offset = (self.chars & chars_mask).count_ones() as usize;
 703
 704        // Since tabs are 1 byte the tab offset is the same as the byte offset
 705        let position = TabPosition {
 706            byte_offset: tab_offset,
 707            char_offset,
 708        };
 709        // Remove the tab we've just seen
 710        self.tabs ^= 1 << tab_offset;
 711
 712        Some(position)
 713    }
 714}
 715
 716/// Finds the n-th bit that is set to 1.
 717#[inline(always)]
 718fn nth_set_bit(v: u128, n: usize) -> usize {
 719    let low = v as u64;
 720    let high = (v >> 64) as u64;
 721
 722    let low_count = low.count_ones() as usize;
 723    if n > low_count {
 724        64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
 725    } else {
 726        nth_set_bit_u64(low, n as u64) as usize
 727    }
 728}
 729
 730#[cold]
 731#[inline(never)]
 732#[track_caller]
 733fn panic_char_boundary(text: &str, offset: usize) -> ! {
 734    if offset > text.len() {
 735        panic!(
 736            "byte index {} is out of bounds of `{:?}` (length: {})",
 737            offset,
 738            text,
 739            text.len()
 740        );
 741    }
 742    // find the character
 743    let char_start = text.floor_char_boundary(offset);
 744    // `char_start` must be less than len and a char boundary
 745    let ch = text.get(char_start..).unwrap().chars().next().unwrap();
 746    let char_range = char_start..char_start + ch.len_utf8();
 747    panic!(
 748        "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
 749        offset, ch, char_range,
 750    );
 751}
 752
 753#[cold]
 754#[inline(never)]
 755#[track_caller]
 756fn log_err_char_boundary(text: &str, offset: usize) {
 757    if offset >= text.len() {
 758        log::error!(
 759            "byte index {} is out of bounds of `{:?}` (length: {})",
 760            offset,
 761            text,
 762            text.len()
 763        );
 764        return;
 765    }
 766    // find the character
 767    let char_start = text.floor_char_boundary(offset);
 768    // `char_start` must be less than len and a char boundary
 769    let ch = text.get(char_start..).unwrap().chars().next().unwrap();
 770    let char_range = char_start..char_start + ch.len_utf8();
 771    log::error!(
 772        "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
 773        offset,
 774        ch,
 775        char_range,
 776    );
 777}
 778
 779#[inline(always)]
 780fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
 781    let v = v.reverse_bits();
 782    let mut s: u64 = 64;
 783
 784    // Parallel bit count intermediates
 785    let a = v - ((v >> 1) & (u64::MAX / 3));
 786    let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
 787    let c = (b + (b >> 4)) & (u64::MAX / 0x11);
 788    let d = (c + (c >> 8)) & (u64::MAX / 0x101);
 789
 790    // Branchless select
 791    let t = (d >> 32) + (d >> 48);
 792    s -= (t.wrapping_sub(n) & 256) >> 3;
 793    n -= t & (t.wrapping_sub(n) >> 8);
 794
 795    let t = (d >> (s - 16)) & 0xff;
 796    s -= (t.wrapping_sub(n) & 256) >> 4;
 797    n -= t & (t.wrapping_sub(n) >> 8);
 798
 799    let t = (c >> (s - 8)) & 0xf;
 800    s -= (t.wrapping_sub(n) & 256) >> 5;
 801    n -= t & (t.wrapping_sub(n) >> 8);
 802
 803    let t = (b >> (s - 4)) & 0x7;
 804    s -= (t.wrapping_sub(n) & 256) >> 6;
 805    n -= t & (t.wrapping_sub(n) >> 8);
 806
 807    let t = (a >> (s - 2)) & 0x3;
 808    s -= (t.wrapping_sub(n) & 256) >> 7;
 809    n -= t & (t.wrapping_sub(n) >> 8);
 810
 811    let t = (v >> (s - 1)) & 0x1;
 812    s -= (t.wrapping_sub(n) & 256) >> 8;
 813
 814    65 - s - 1
 815}
 816
 817#[cfg(test)]
 818mod tests {
 819    use super::*;
 820    use rand::prelude::*;
 821    use util::RandomCharIter;
 822
 823    #[gpui::test(iterations = 100)]
 824    fn test_random_chunks(mut rng: StdRng) {
 825        let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
 826        log::info!("Chunk: {:?}", text);
 827        let chunk = Chunk::new(&text);
 828        verify_chunk(chunk.as_slice(), &text);
 829
 830        // Verify Chunk::chars() bitmap
 831        let expected_chars = char_offsets(&text)
 832            .into_iter()
 833            .inspect(|i| assert!(*i < MAX_BASE))
 834            .fold(0 as Bitmap, |acc, i| acc | (1 << i));
 835        assert_eq!(chunk.chars(), expected_chars);
 836
 837        for _ in 0..10 {
 838            let mut start = rng.random_range(0..=chunk.text.len());
 839            let mut end = rng.random_range(start..=chunk.text.len());
 840            while !chunk.text.is_char_boundary(start) {
 841                start -= 1;
 842            }
 843            while !chunk.text.is_char_boundary(end) {
 844                end -= 1;
 845            }
 846            let range = start..end;
 847            log::info!("Range: {:?}", range);
 848            let text_slice = &text[range.clone()];
 849            let chunk_slice = chunk.slice(range);
 850            verify_chunk(chunk_slice, text_slice);
 851        }
 852    }
 853
 854    #[gpui::test(iterations = 100)]
 855    fn test_split_chunk_slice(mut rng: StdRng) {
 856        let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
 857        let chunk = Chunk::new(text);
 858        let offset = char_offsets_with_end(text)
 859            .into_iter()
 860            .choose(&mut rng)
 861            .unwrap();
 862        let (a, b) = chunk.as_slice().split_at(offset);
 863        let (a_str, b_str) = text.split_at(offset);
 864        verify_chunk(a, a_str);
 865        verify_chunk(b, b_str);
 866    }
 867
 868    #[gpui::test(iterations = 1000)]
 869    fn test_nth_set_bit_random(mut rng: StdRng) {
 870        let set_count = rng.random_range(0..=128);
 871        let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
 872        set_bits.sort();
 873        let mut n = 0;
 874        for ix in set_bits.iter().copied() {
 875            n |= 1 << ix;
 876        }
 877
 878        for (mut ix, position) in set_bits.into_iter().enumerate() {
 879            ix += 1;
 880            assert_eq!(
 881                nth_set_bit(n, ix),
 882                position,
 883                "nth_set_bit({:0128b}, {})",
 884                n,
 885                ix
 886            );
 887        }
 888    }
 889
 890    /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
 891    fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
 892        let mut str = String::new();
 893        let mut chars = RandomCharIter::new(rng);
 894        loop {
 895            let ch = chars.next().unwrap();
 896            if str.len() + ch.len_utf8() > len {
 897                break;
 898            }
 899            str.push(ch);
 900        }
 901        str
 902    }
 903
 904    #[gpui::test(iterations = 1000)]
 905    fn test_append_random_strings(mut rng: StdRng) {
 906        let len1 = rng.random_range(0..=MAX_BASE);
 907        let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
 908        let str1 = random_string_with_utf8_len(&mut rng, len1);
 909        let str2 = random_string_with_utf8_len(&mut rng, len2);
 910        let mut chunk1 = Chunk::new(&str1);
 911        let chunk2 = Chunk::new(&str2);
 912        let char_offsets = char_offsets_with_end(&str2);
 913        let start_index = rng.random_range(0..char_offsets.len());
 914        let start_offset = char_offsets[start_index];
 915        let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
 916        chunk1.append(chunk2.slice(start_offset..end_offset));
 917        verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
 918    }
 919
 920    #[gpui::test(iterations = 1000)]
 921    fn test_prepend_random_strings(mut rng: StdRng) {
 922        let len1 = rng.random_range(0..=MAX_BASE);
 923        let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
 924        let str1 = random_string_with_utf8_len(&mut rng, len1);
 925        let str2 = random_string_with_utf8_len(&mut rng, len2);
 926        let mut chunk1 = Chunk::new(&str1);
 927        let chunk2 = Chunk::new(&str2);
 928        let char_offsets = char_offsets_with_end(&str2);
 929        let start_index = rng.random_range(0..char_offsets.len());
 930        let start_offset = char_offsets[start_index];
 931        let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
 932        let slice = chunk2.slice(start_offset..end_offset);
 933        let prefix_text = &str2[start_offset..end_offset];
 934        chunk1.prepend(slice);
 935        verify_chunk(chunk1.as_slice(), &(prefix_text.to_owned() + &str1));
 936    }
 937
 938    /// Return the byte offsets for each character in a string.
 939    ///
 940    /// These are valid offsets to split the string.
 941    fn char_offsets(text: &str) -> Vec<usize> {
 942        text.char_indices().map(|(i, _c)| i).collect()
 943    }
 944
 945    /// Return the byte offsets for each character in a string, plus the offset
 946    /// past the end of the string.
 947    fn char_offsets_with_end(text: &str) -> Vec<usize> {
 948        let mut v = char_offsets(text);
 949        v.push(text.len());
 950        v
 951    }
 952
 953    fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
 954        let mut offset = 0;
 955        let mut offset_utf16 = OffsetUtf16(0);
 956        let mut point = Point::zero();
 957        let mut point_utf16 = PointUtf16::zero();
 958
 959        log::info!("Verifying chunk {:?}", text);
 960        assert_eq!(chunk.offset_to_point(0), Point::zero());
 961
 962        let mut expected_tab_positions = Vec::new();
 963
 964        for (char_offset, c) in text.chars().enumerate() {
 965            let expected_point = chunk.offset_to_point(offset);
 966            assert_eq!(point, expected_point, "mismatch at offset {}", offset);
 967            assert_eq!(
 968                chunk.point_to_offset(point),
 969                offset,
 970                "mismatch at point {:?}",
 971                point
 972            );
 973            assert_eq!(
 974                chunk.offset_to_offset_utf16(offset),
 975                offset_utf16,
 976                "mismatch at offset {}",
 977                offset
 978            );
 979            assert_eq!(
 980                chunk.offset_utf16_to_offset(offset_utf16),
 981                offset,
 982                "mismatch at offset_utf16 {:?}",
 983                offset_utf16
 984            );
 985            assert_eq!(
 986                chunk.point_to_point_utf16(point),
 987                point_utf16,
 988                "mismatch at point {:?}",
 989                point
 990            );
 991            assert_eq!(
 992                chunk.point_utf16_to_offset(point_utf16, false),
 993                offset,
 994                "mismatch at point_utf16 {:?}",
 995                point_utf16
 996            );
 997            assert_eq!(
 998                chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
 999                point,
1000                "mismatch for unclipped_point_utf16_to_point at {:?}",
1001                point_utf16
1002            );
1003
1004            assert_eq!(
1005                chunk.clip_point(point, Bias::Left),
1006                point,
1007                "incorrect left clip at {:?}",
1008                point
1009            );
1010            assert_eq!(
1011                chunk.clip_point(point, Bias::Right),
1012                point,
1013                "incorrect right clip at {:?}",
1014                point
1015            );
1016
1017            for i in 1..c.len_utf8() {
1018                let test_point = Point::new(point.row, point.column + i as u32);
1019                assert_eq!(
1020                    chunk.clip_point(test_point, Bias::Left),
1021                    point,
1022                    "incorrect left clip within multi-byte char at {:?}",
1023                    test_point
1024                );
1025                assert_eq!(
1026                    chunk.clip_point(test_point, Bias::Right),
1027                    Point::new(point.row, point.column + c.len_utf8() as u32),
1028                    "incorrect right clip within multi-byte char at {:?}",
1029                    test_point
1030                );
1031            }
1032
1033            for i in 1..c.len_utf16() {
1034                let test_point = Unclipped(PointUtf16::new(
1035                    point_utf16.row,
1036                    point_utf16.column + i as u32,
1037                ));
1038                assert_eq!(
1039                    chunk.unclipped_point_utf16_to_point(test_point),
1040                    point,
1041                    "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
1042                    test_point
1043                );
1044                assert_eq!(
1045                    chunk.clip_point_utf16(test_point, Bias::Left),
1046                    point_utf16,
1047                    "incorrect left clip_point_utf16 within multi-byte char at {:?}",
1048                    test_point
1049                );
1050                assert_eq!(
1051                    chunk.clip_point_utf16(test_point, Bias::Right),
1052                    PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
1053                    "incorrect right clip_point_utf16 within multi-byte char at {:?}",
1054                    test_point
1055                );
1056
1057                let test_offset = OffsetUtf16(offset_utf16.0 + i);
1058                assert_eq!(
1059                    chunk.clip_offset_utf16(test_offset, Bias::Left),
1060                    offset_utf16,
1061                    "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
1062                    test_offset
1063                );
1064                assert_eq!(
1065                    chunk.clip_offset_utf16(test_offset, Bias::Right),
1066                    OffsetUtf16(offset_utf16.0 + c.len_utf16()),
1067                    "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
1068                    test_offset
1069                );
1070            }
1071
1072            if c == '\n' {
1073                point.row += 1;
1074                point.column = 0;
1075                point_utf16.row += 1;
1076                point_utf16.column = 0;
1077            } else {
1078                point.column += c.len_utf8() as u32;
1079                point_utf16.column += c.len_utf16() as u32;
1080            }
1081
1082            if c == '\t' {
1083                expected_tab_positions.push(TabPosition {
1084                    byte_offset: offset,
1085                    char_offset,
1086                });
1087            }
1088
1089            offset += c.len_utf8();
1090            offset_utf16.0 += c.len_utf16();
1091        }
1092
1093        let final_point = chunk.offset_to_point(offset);
1094        assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1095        assert_eq!(
1096            chunk.point_to_offset(point),
1097            offset,
1098            "mismatch at point {:?}",
1099            point
1100        );
1101        assert_eq!(
1102            chunk.offset_to_offset_utf16(offset),
1103            offset_utf16,
1104            "mismatch at offset {}",
1105            offset
1106        );
1107        assert_eq!(
1108            chunk.offset_utf16_to_offset(offset_utf16),
1109            offset,
1110            "mismatch at offset_utf16 {:?}",
1111            offset_utf16
1112        );
1113        assert_eq!(
1114            chunk.point_to_point_utf16(point),
1115            point_utf16,
1116            "mismatch at final point {:?}",
1117            point
1118        );
1119        assert_eq!(
1120            chunk.point_utf16_to_offset(point_utf16, false),
1121            offset,
1122            "mismatch at final point_utf16 {:?}",
1123            point_utf16
1124        );
1125        assert_eq!(
1126            chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1127            point,
1128            "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1129            point_utf16
1130        );
1131        assert_eq!(
1132            chunk.clip_point(point, Bias::Left),
1133            point,
1134            "incorrect left clip at final point {:?}",
1135            point
1136        );
1137        assert_eq!(
1138            chunk.clip_point(point, Bias::Right),
1139            point,
1140            "incorrect right clip at final point {:?}",
1141            point
1142        );
1143        assert_eq!(
1144            chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1145            point_utf16,
1146            "incorrect left clip_point_utf16 at final point {:?}",
1147            point_utf16
1148        );
1149        assert_eq!(
1150            chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1151            point_utf16,
1152            "incorrect right clip_point_utf16 at final point {:?}",
1153            point_utf16
1154        );
1155        assert_eq!(
1156            chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1157            offset_utf16,
1158            "incorrect left clip_offset_utf16 at final offset {:?}",
1159            offset_utf16
1160        );
1161        assert_eq!(
1162            chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1163            offset_utf16,
1164            "incorrect right clip_offset_utf16 at final offset {:?}",
1165            offset_utf16
1166        );
1167
1168        // Verify length methods
1169        assert_eq!(chunk.len(), text.len());
1170        assert_eq!(
1171            chunk.len_utf16().0,
1172            text.chars().map(|c| c.len_utf16()).sum::<usize>()
1173        );
1174
1175        // Verify line counting
1176        let lines = chunk.lines();
1177        let mut newline_count = 0;
1178        let mut last_line_len = 0;
1179        for c in text.chars() {
1180            if c == '\n' {
1181                newline_count += 1;
1182                last_line_len = 0;
1183            } else {
1184                last_line_len += c.len_utf8() as u32;
1185            }
1186        }
1187        assert_eq!(lines, Point::new(newline_count, last_line_len));
1188
1189        // Verify first/last line chars
1190        if !text.is_empty() {
1191            let first_line = text.split('\n').next().unwrap();
1192            assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1193
1194            let last_line = text.split('\n').next_back().unwrap();
1195            assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1196            assert_eq!(
1197                chunk.last_line_len_utf16(),
1198                last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1199            );
1200        }
1201
1202        // Verify longest row
1203        let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1204        let mut max_chars = 0;
1205        let mut current_row = 0;
1206        let mut current_chars = 0;
1207        let mut max_row = 0;
1208
1209        for c in text.chars() {
1210            if c == '\n' {
1211                if current_chars > max_chars {
1212                    max_chars = current_chars;
1213                    max_row = current_row;
1214                }
1215                current_row += 1;
1216                current_chars = 0;
1217            } else {
1218                current_chars += 1;
1219            }
1220        }
1221
1222        if current_chars > max_chars {
1223            max_chars = current_chars;
1224            max_row = current_row;
1225        }
1226
1227        assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1228        assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1229    }
1230
1231    #[gpui::test]
1232    fn test_point_utf16_to_offset_clips_to_correct_absolute_offset() {
1233        let text = "abc\nde";
1234        let chunk = Chunk::new(text);
1235        let slice = chunk.as_slice();
1236
1237        // Clipping on row 0 (row_offset_range.start == 0, so relative == absolute)
1238        assert_eq!(slice.point_utf16_to_offset(PointUtf16::new(0, 99), true), 3,);
1239
1240        // Clipping on row 1 — this is the case that was buggy.
1241        // Row 1 starts at byte offset 4 ("de" is bytes 4..6), so the
1242        // clipped result must be 6, not 2.
1243        assert_eq!(slice.point_utf16_to_offset(PointUtf16::new(1, 99), true), 6,);
1244    }
1245}