chunk.rs

   1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
   2use arrayvec::ArrayString;
   3use std::{cmp, ops::Range};
   4use sum_tree::Bias;
   5use unicode_segmentation::GraphemeCursor;
   6use util::debug_panic;
   7
   8#[cfg(not(all(test, not(rust_analyzer))))]
   9pub(crate) type Bitmap = u128;
  10#[cfg(all(test, not(rust_analyzer)))]
  11pub(crate) type Bitmap = u16;
  12
  13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
  14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
  15
  16#[derive(Clone, Debug, Default)]
  17pub struct Chunk {
  18    /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
  19    /// text.
  20    chars: Bitmap,
  21    /// The number of set bits is the number of UTF-16 code units it would take to represent the
  22    /// text.
  23    ///
  24    /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
  25    /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
  26    /// than two UTF-16 code units.)
  27    chars_utf16: Bitmap,
  28    /// If bit[i] is set, then the character at index i is an ascii newline.
  29    newlines: Bitmap,
  30    /// If bit[i] is set, then the character at index i is an ascii tab.
  31    tabs: Bitmap,
  32    pub text: ArrayString<MAX_BASE>,
  33}
  34
  35#[inline(always)]
  36const fn saturating_shl_mask(offset: u32) -> Bitmap {
  37    (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
  38}
  39
  40#[inline(always)]
  41const fn saturating_shr_mask(offset: u32) -> Bitmap {
  42    !Bitmap::MAX.unbounded_shr(offset)
  43}
  44
  45impl Chunk {
  46    pub const MASK_BITS: usize = Bitmap::BITS as usize;
  47
  48    #[inline(always)]
  49    pub fn new(text: &str) -> Self {
  50        let text = ArrayString::from(text).unwrap();
  51
  52        const CHUNK_SIZE: usize = 8;
  53
  54        let mut chars_bytes = [0; MAX_BASE / CHUNK_SIZE];
  55        let mut newlines_bytes = [0; MAX_BASE / CHUNK_SIZE];
  56        let mut tabs_bytes = [0; MAX_BASE / CHUNK_SIZE];
  57        let mut chars_utf16_bytes = [0; MAX_BASE / CHUNK_SIZE];
  58
  59        let mut chunk_ix = 0;
  60
  61        let mut bytes = text.as_bytes();
  62        while !bytes.is_empty() {
  63            let (chunk, rest) = bytes.split_at(bytes.len().min(CHUNK_SIZE));
  64            bytes = rest;
  65
  66            let mut chars = 0;
  67            let mut newlines = 0;
  68            let mut tabs = 0;
  69            let mut chars_utf16 = 0;
  70
  71            for (ix, &b) in chunk.iter().enumerate() {
  72                chars |= (util::is_utf8_char_boundary(b) as u8) << ix;
  73                newlines |= ((b == b'\n') as u8) << ix;
  74                tabs |= ((b == b'\t') as u8) << ix;
  75                // b >= 240 when we are at the first byte of the 4 byte encoded
  76                // utf-8 code point (U+010000 or greater) it means that it would
  77                // be encoded as two 16-bit code units in utf-16
  78                chars_utf16 |= ((b >= 240) as u8) << ix;
  79            }
  80
  81            chars_bytes[chunk_ix] = chars;
  82            newlines_bytes[chunk_ix] = newlines;
  83            tabs_bytes[chunk_ix] = tabs;
  84            chars_utf16_bytes[chunk_ix] = chars_utf16;
  85
  86            chunk_ix += 1;
  87        }
  88
  89        let chars = Bitmap::from_le_bytes(chars_bytes);
  90
  91        Chunk {
  92            text,
  93            chars,
  94            chars_utf16: (Bitmap::from_le_bytes(chars_utf16_bytes) << 1) | chars,
  95            newlines: Bitmap::from_le_bytes(newlines_bytes),
  96            tabs: Bitmap::from_le_bytes(tabs_bytes),
  97        }
  98    }
  99
 100    #[inline(always)]
 101    pub fn push_str(&mut self, text: &str) {
 102        self.append(Chunk::new(text).as_slice());
 103    }
 104
 105    #[inline(always)]
 106    pub fn append(&mut self, slice: ChunkSlice) {
 107        if slice.is_empty() {
 108            return;
 109        };
 110
 111        let base_ix = self.text.len();
 112        self.chars |= slice.chars << base_ix;
 113        self.chars_utf16 |= slice.chars_utf16 << base_ix;
 114        self.newlines |= slice.newlines << base_ix;
 115        self.tabs |= slice.tabs << base_ix;
 116        self.text.push_str(slice.text);
 117    }
 118
 119    #[inline(always)]
 120    pub fn as_slice(&self) -> ChunkSlice<'_> {
 121        ChunkSlice {
 122            chars: self.chars,
 123            chars_utf16: self.chars_utf16,
 124            newlines: self.newlines,
 125            tabs: self.tabs,
 126            text: &self.text,
 127        }
 128    }
 129
 130    #[inline(always)]
 131    pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
 132        self.as_slice().slice(range)
 133    }
 134
 135    #[inline(always)]
 136    pub fn chars(&self) -> Bitmap {
 137        self.chars
 138    }
 139
 140    pub fn tabs(&self) -> Bitmap {
 141        self.tabs
 142    }
 143
 144    #[inline(always)]
 145    pub fn is_char_boundary(&self, offset: usize) -> bool {
 146        (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
 147    }
 148
 149    pub fn floor_char_boundary(&self, index: usize) -> usize {
 150        if index >= self.text.len() {
 151            self.text.len()
 152        } else {
 153            let mut i = index;
 154            while i > 0 {
 155                if util::is_utf8_char_boundary(self.text.as_bytes()[i]) {
 156                    break;
 157                }
 158                i -= 1;
 159            }
 160
 161            i
 162        }
 163    }
 164
 165    #[track_caller]
 166    #[inline(always)]
 167    pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
 168        if self.is_char_boundary(offset) {
 169            return true;
 170        }
 171        if PANIC {
 172            panic_char_boundary(&self.text, offset);
 173        } else {
 174            log_err_char_boundary(&self.text, offset);
 175            false
 176        }
 177    }
 178}
 179
 180#[derive(Clone, Copy, Debug)]
 181pub struct ChunkSlice<'a> {
 182    chars: Bitmap,
 183    chars_utf16: Bitmap,
 184    newlines: Bitmap,
 185    tabs: Bitmap,
 186    text: &'a str,
 187}
 188
 189impl Into<Chunk> for ChunkSlice<'_> {
 190    fn into(self) -> Chunk {
 191        Chunk {
 192            chars: self.chars,
 193            chars_utf16: self.chars_utf16,
 194            newlines: self.newlines,
 195            tabs: self.tabs,
 196            text: self.text.try_into().unwrap(),
 197        }
 198    }
 199}
 200
 201impl<'a> ChunkSlice<'a> {
 202    #[inline(always)]
 203    pub fn is_empty(&self) -> bool {
 204        self.text.is_empty()
 205    }
 206
 207    #[inline(always)]
 208    pub fn is_char_boundary(&self, offset: usize) -> bool {
 209        (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
 210    }
 211
 212    #[inline(always)]
 213    pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
 214        if mid == MAX_BASE {
 215            let left = self;
 216            let right = ChunkSlice {
 217                chars: 0,
 218                chars_utf16: 0,
 219                newlines: 0,
 220                tabs: 0,
 221                text: "",
 222            };
 223            (left, right)
 224        } else {
 225            let mask = ((1 as Bitmap) << mid) - 1;
 226            let (left_text, right_text) = self.text.split_at(mid);
 227            let left = ChunkSlice {
 228                chars: self.chars & mask,
 229                chars_utf16: self.chars_utf16 & mask,
 230                newlines: self.newlines & mask,
 231                tabs: self.tabs & mask,
 232                text: left_text,
 233            };
 234            let right = ChunkSlice {
 235                chars: self.chars >> mid,
 236                chars_utf16: self.chars_utf16 >> mid,
 237                newlines: self.newlines >> mid,
 238                tabs: self.tabs >> mid,
 239                text: right_text,
 240            };
 241            (left, right)
 242        }
 243    }
 244
 245    #[inline(always)]
 246    pub fn slice(self, mut range: Range<usize>) -> Self {
 247        if range.start == MAX_BASE {
 248            Self {
 249                chars: 0,
 250                chars_utf16: 0,
 251                newlines: 0,
 252                tabs: 0,
 253                text: "",
 254            }
 255        } else {
 256            if !self.assert_char_boundary::<false>(range.start) {
 257                range.start = self.text.ceil_char_boundary(range.start);
 258            }
 259            if !self.assert_char_boundary::<false>(range.end) {
 260                range.end = if range.end < range.start {
 261                    range.start
 262                } else {
 263                    self.text.floor_char_boundary(range.end)
 264                };
 265            }
 266            let mask = (1 as Bitmap)
 267                .unbounded_shl(range.end as u32)
 268                .wrapping_sub(1);
 269            Self {
 270                chars: (self.chars & mask) >> range.start,
 271                chars_utf16: (self.chars_utf16 & mask) >> range.start,
 272                newlines: (self.newlines & mask) >> range.start,
 273                tabs: (self.tabs & mask) >> range.start,
 274                text: &self.text[range],
 275            }
 276        }
 277    }
 278
 279    #[inline(always)]
 280    pub fn text_summary(&self) -> TextSummary {
 281        let mut chars = 0;
 282        let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
 283        TextSummary {
 284            len: self.len(),
 285            chars,
 286            len_utf16: self.len_utf16(),
 287            lines: self.lines(),
 288            first_line_chars: self.first_line_chars(),
 289            last_line_chars: self.last_line_chars(),
 290            last_line_len_utf16: self.last_line_len_utf16(),
 291            longest_row,
 292            longest_row_chars,
 293        }
 294    }
 295
 296    /// Get length in bytes
 297    #[inline(always)]
 298    pub fn len(&self) -> usize {
 299        self.text.len()
 300    }
 301
 302    /// Get length in UTF-16 code units
 303    #[inline(always)]
 304    pub fn len_utf16(&self) -> OffsetUtf16 {
 305        OffsetUtf16(self.chars_utf16.count_ones() as usize)
 306    }
 307
 308    /// Get point representing number of lines and length of last line
 309    #[inline(always)]
 310    pub fn lines(&self) -> Point {
 311        let row = self.newlines.count_ones();
 312        let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
 313        Point::new(row, column)
 314    }
 315
 316    /// Get number of chars in first line
 317    #[inline(always)]
 318    pub fn first_line_chars(&self) -> u32 {
 319        (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
 320    }
 321
 322    /// Get number of chars in last line
 323    #[inline(always)]
 324    pub fn last_line_chars(&self) -> u32 {
 325        (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
 326    }
 327
 328    /// Get number of UTF-16 code units in last line
 329    #[inline(always)]
 330    pub fn last_line_len_utf16(&self) -> u32 {
 331        (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
 332    }
 333
 334    /// Get the longest row in the chunk and its length in characters.
 335    /// Calculate the total number of characters in the chunk along the way.
 336    #[inline(always)]
 337    pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
 338        let mut chars = self.chars;
 339        let mut newlines = self.newlines;
 340        *total_chars = 0;
 341        let mut row = 0;
 342        let mut longest_row = 0;
 343        let mut longest_row_chars = 0;
 344        while newlines > 0 {
 345            let newline_ix = newlines.trailing_zeros();
 346            let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
 347            *total_chars += usize::from(row_chars);
 348            if row_chars > longest_row_chars {
 349                longest_row = row;
 350                longest_row_chars = row_chars;
 351            }
 352
 353            newlines >>= newline_ix;
 354            newlines >>= 1;
 355            chars >>= newline_ix;
 356            chars >>= 1;
 357            row += 1;
 358            *total_chars += 1;
 359        }
 360
 361        let row_chars = chars.count_ones() as u8;
 362        *total_chars += usize::from(row_chars);
 363        if row_chars > longest_row_chars {
 364            (row, row_chars as u32)
 365        } else {
 366            (longest_row, longest_row_chars as u32)
 367        }
 368    }
 369
 370    #[inline(always)]
 371    pub fn offset_to_point(&self, offset: usize) -> Point {
 372        let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
 373        let row = (self.newlines & mask).count_ones();
 374        let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
 375        let column = (offset - newline_ix as usize) as u32;
 376        Point::new(row, column)
 377    }
 378
 379    #[inline(always)]
 380    pub fn point_to_offset(&self, point: Point) -> usize {
 381        if point.row > self.lines().row {
 382            debug_panic!(
 383                "point {:?} extends beyond rows for string {:?}",
 384                point,
 385                self.text
 386            );
 387            return self.len();
 388        }
 389
 390        let row_offset_range = self.offset_range_for_row(point.row);
 391        if point.column > row_offset_range.len() as u32 {
 392            debug_panic!(
 393                "point {:?} extends beyond row for string {:?}",
 394                point,
 395                self.text
 396            );
 397            row_offset_range.end
 398        } else {
 399            row_offset_range.start + point.column as usize
 400        }
 401    }
 402
 403    #[track_caller]
 404    #[inline(always)]
 405    pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
 406        if self.is_char_boundary(offset) {
 407            return true;
 408        }
 409        if PANIC {
 410            panic_char_boundary(self.text, offset);
 411        } else {
 412            log_err_char_boundary(self.text, offset);
 413            false
 414        }
 415    }
 416
 417    pub fn floor_char_boundary(&self, index: usize) -> usize {
 418        self.text.floor_char_boundary(index)
 419    }
 420
 421    #[inline(always)]
 422    pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
 423        if point.row > self.lines().row {
 424            debug_panic!(
 425                "point {:?} extends beyond rows for string {:?}",
 426                point,
 427                self.text
 428            );
 429            return self.len_utf16();
 430        }
 431        self.offset_to_offset_utf16(self.point_to_offset(point))
 432    }
 433
 434    #[inline(always)]
 435    pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
 436        let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
 437        OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
 438    }
 439
 440    #[inline(always)]
 441    pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
 442        if target.0 == 0 {
 443            0
 444        } else {
 445            #[cfg(not(test))]
 446            let chars_utf16 = self.chars_utf16;
 447            #[cfg(test)]
 448            let chars_utf16 = self.chars_utf16 as u128;
 449            let ix = nth_set_bit(chars_utf16, target.0) + 1;
 450            if ix == MAX_BASE {
 451                MAX_BASE
 452            } else {
 453                let utf8_additional_len = cmp::min(
 454                    (self.chars_utf16 >> ix).trailing_zeros() as usize,
 455                    self.text.len() - ix,
 456                );
 457                ix + utf8_additional_len
 458            }
 459        }
 460    }
 461
 462    #[inline(always)]
 463    pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
 464        let mask = saturating_shl_mask(offset as u32);
 465        let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
 466        let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
 467        let column = if newline_ix as usize == MAX_BASE {
 468            0
 469        } else {
 470            ((self.chars_utf16 & mask) >> newline_ix).count_ones()
 471        };
 472        PointUtf16::new(row, column)
 473    }
 474
 475    #[inline(always)]
 476    pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
 477        self.offset_to_point_utf16(self.point_to_offset(point))
 478    }
 479
 480    #[inline(always)]
 481    pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
 482        let lines = self.lines();
 483        if point.row > lines.row {
 484            if !clip {
 485                debug_panic!(
 486                    "point {:?} is beyond this chunk's extent {:?}",
 487                    point,
 488                    self.text
 489                );
 490            }
 491            return self.len();
 492        }
 493
 494        let row_offset_range = self.offset_range_for_row(point.row);
 495        let line = self.slice(row_offset_range.clone());
 496        if point.column > line.last_line_len_utf16() {
 497            if !clip {
 498                debug_panic!(
 499                    "point {:?} is beyond the end of the line in chunk {:?}",
 500                    point,
 501                    self.text
 502                );
 503            }
 504            return line.len();
 505        }
 506
 507        let mut offset = row_offset_range.start;
 508        if point.column > 0 {
 509            offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
 510            if !self.text.is_char_boundary(offset) {
 511                offset -= 1;
 512                while !self.text.is_char_boundary(offset) {
 513                    offset -= 1;
 514                }
 515                if !clip {
 516                    debug_panic!(
 517                        "point {:?} is within character in chunk {:?}",
 518                        point,
 519                        self.text,
 520                    );
 521                }
 522            }
 523        }
 524        offset
 525    }
 526
 527    #[inline(always)]
 528    pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
 529        let max_point = self.lines();
 530        if point.0.row > max_point.row {
 531            return max_point;
 532        }
 533
 534        let row_offset_range = self.offset_range_for_row(point.0.row);
 535        let line = self.slice(row_offset_range);
 536        if point.0.column == 0 {
 537            Point::new(point.0.row, 0)
 538        } else if point.0.column >= line.len_utf16().0 as u32 {
 539            Point::new(point.0.row, line.len() as u32)
 540        } else {
 541            let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
 542            while !line.text.is_char_boundary(column) {
 543                column -= 1;
 544            }
 545            Point::new(point.0.row, column as u32)
 546        }
 547    }
 548
 549    #[inline(always)]
 550    pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
 551        let max_point = self.lines();
 552        if point.row > max_point.row {
 553            return max_point;
 554        }
 555
 556        let line = self.slice(self.offset_range_for_row(point.row));
 557        if point.column == 0 {
 558            point
 559        } else if point.column >= line.len() as u32 {
 560            Point::new(point.row, line.len() as u32)
 561        } else {
 562            let mut column = point.column as usize;
 563            let bytes = line.text.as_bytes();
 564            if bytes[column - 1] < 128 && bytes[column] < 128 {
 565                return Point::new(point.row, column as u32);
 566            }
 567
 568            let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
 569            loop {
 570                if line.is_char_boundary(column)
 571                    && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
 572                {
 573                    break;
 574                }
 575
 576                match bias {
 577                    Bias::Left => column -= 1,
 578                    Bias::Right => column += 1,
 579                }
 580                grapheme_cursor.set_cursor(column);
 581            }
 582            Point::new(point.row, column as u32)
 583        }
 584    }
 585
 586    #[inline(always)]
 587    pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
 588        let max_point = self.lines();
 589        if point.0.row > max_point.row {
 590            PointUtf16::new(max_point.row, self.last_line_len_utf16())
 591        } else {
 592            let line = self.slice(self.offset_range_for_row(point.0.row));
 593            let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
 594            PointUtf16::new(point.0.row, column.0 as u32)
 595        }
 596    }
 597
 598    #[inline(always)]
 599    pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
 600        if target == OffsetUtf16::default() {
 601            OffsetUtf16::default()
 602        } else if target >= self.len_utf16() {
 603            self.len_utf16()
 604        } else {
 605            let mut offset = self.offset_utf16_to_offset(target);
 606            while !self.text.is_char_boundary(offset) {
 607                if bias == Bias::Left {
 608                    offset -= 1;
 609                } else {
 610                    offset += 1;
 611                }
 612            }
 613            self.offset_to_offset_utf16(offset)
 614        }
 615    }
 616
 617    #[inline(always)]
 618    fn offset_range_for_row(&self, row: u32) -> Range<usize> {
 619        let row_start = if row > 0 {
 620            #[cfg(not(test))]
 621            let newlines = self.newlines;
 622            #[cfg(test)]
 623            let newlines = self.newlines as u128;
 624            nth_set_bit(newlines, row as usize) + 1
 625        } else {
 626            0
 627        };
 628        let row_len = if row_start == MAX_BASE {
 629            0
 630        } else {
 631            cmp::min(
 632                (self.newlines >> row_start).trailing_zeros(),
 633                (self.text.len() - row_start) as u32,
 634            )
 635        };
 636        row_start..row_start + row_len as usize
 637    }
 638
 639    #[inline(always)]
 640    pub fn tabs(&self) -> Tabs {
 641        Tabs {
 642            tabs: self.tabs,
 643            chars: self.chars,
 644        }
 645    }
 646}
 647
 648pub struct Tabs {
 649    tabs: Bitmap,
 650    chars: Bitmap,
 651}
 652
 653#[derive(Debug, PartialEq, Eq)]
 654pub struct TabPosition {
 655    pub byte_offset: usize,
 656    pub char_offset: usize,
 657}
 658
 659impl Iterator for Tabs {
 660    type Item = TabPosition;
 661
 662    fn next(&mut self) -> Option<Self::Item> {
 663        if self.tabs == 0 {
 664            return None;
 665        }
 666
 667        let tab_offset = self.tabs.trailing_zeros() as usize;
 668        let chars_mask = (1 << tab_offset) - 1;
 669        let char_offset = (self.chars & chars_mask).count_ones() as usize;
 670
 671        // Since tabs are 1 byte the tab offset is the same as the byte offset
 672        let position = TabPosition {
 673            byte_offset: tab_offset,
 674            char_offset,
 675        };
 676        // Remove the tab we've just seen
 677        self.tabs ^= 1 << tab_offset;
 678
 679        Some(position)
 680    }
 681}
 682
 683/// Finds the n-th bit that is set to 1.
 684#[inline(always)]
 685fn nth_set_bit(v: u128, n: usize) -> usize {
 686    let low = v as u64;
 687    let high = (v >> 64) as u64;
 688
 689    let low_count = low.count_ones() as usize;
 690    if n > low_count {
 691        64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
 692    } else {
 693        nth_set_bit_u64(low, n as u64) as usize
 694    }
 695}
 696
 697#[cold]
 698#[inline(never)]
 699#[track_caller]
 700fn panic_char_boundary(text: &str, offset: usize) -> ! {
 701    if offset > text.len() {
 702        panic!(
 703            "byte index {} is out of bounds of `{:?}` (length: {})",
 704            offset,
 705            text,
 706            text.len()
 707        );
 708    }
 709    // find the character
 710    let char_start = text.floor_char_boundary(offset);
 711    // `char_start` must be less than len and a char boundary
 712    let ch = text.get(char_start..).unwrap().chars().next().unwrap();
 713    let char_range = char_start..char_start + ch.len_utf8();
 714    panic!(
 715        "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
 716        offset, ch, char_range,
 717    );
 718}
 719
 720#[cold]
 721#[inline(never)]
 722#[track_caller]
 723fn log_err_char_boundary(text: &str, offset: usize) {
 724    if offset > text.len() {
 725        log::error!(
 726            "byte index {} is out of bounds of `{:?}` (length: {})",
 727            offset,
 728            text,
 729            text.len()
 730        );
 731    }
 732    // find the character
 733    let char_start = text.floor_char_boundary(offset);
 734    // `char_start` must be less than len and a char boundary
 735    let ch = text.get(char_start..).unwrap().chars().next().unwrap();
 736    let char_range = char_start..char_start + ch.len_utf8();
 737    log::error!(
 738        "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
 739        offset,
 740        ch,
 741        char_range,
 742    );
 743}
 744
 745#[inline(always)]
 746fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
 747    let v = v.reverse_bits();
 748    let mut s: u64 = 64;
 749
 750    // Parallel bit count intermediates
 751    let a = v - ((v >> 1) & (u64::MAX / 3));
 752    let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
 753    let c = (b + (b >> 4)) & (u64::MAX / 0x11);
 754    let d = (c + (c >> 8)) & (u64::MAX / 0x101);
 755
 756    // Branchless select
 757    let t = (d >> 32) + (d >> 48);
 758    s -= (t.wrapping_sub(n) & 256) >> 3;
 759    n -= t & (t.wrapping_sub(n) >> 8);
 760
 761    let t = (d >> (s - 16)) & 0xff;
 762    s -= (t.wrapping_sub(n) & 256) >> 4;
 763    n -= t & (t.wrapping_sub(n) >> 8);
 764
 765    let t = (c >> (s - 8)) & 0xf;
 766    s -= (t.wrapping_sub(n) & 256) >> 5;
 767    n -= t & (t.wrapping_sub(n) >> 8);
 768
 769    let t = (b >> (s - 4)) & 0x7;
 770    s -= (t.wrapping_sub(n) & 256) >> 6;
 771    n -= t & (t.wrapping_sub(n) >> 8);
 772
 773    let t = (a >> (s - 2)) & 0x3;
 774    s -= (t.wrapping_sub(n) & 256) >> 7;
 775    n -= t & (t.wrapping_sub(n) >> 8);
 776
 777    let t = (v >> (s - 1)) & 0x1;
 778    s -= (t.wrapping_sub(n) & 256) >> 8;
 779
 780    65 - s - 1
 781}
 782
 783#[cfg(test)]
 784mod tests {
 785    use super::*;
 786    use rand::prelude::*;
 787    use util::RandomCharIter;
 788
 789    #[gpui::test(iterations = 100)]
 790    fn test_random_chunks(mut rng: StdRng) {
 791        let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
 792        log::info!("Chunk: {:?}", text);
 793        let chunk = Chunk::new(&text);
 794        verify_chunk(chunk.as_slice(), &text);
 795
 796        // Verify Chunk::chars() bitmap
 797        let expected_chars = char_offsets(&text)
 798            .into_iter()
 799            .inspect(|i| assert!(*i < MAX_BASE))
 800            .fold(0 as Bitmap, |acc, i| acc | (1 << i));
 801        assert_eq!(chunk.chars(), expected_chars);
 802
 803        for _ in 0..10 {
 804            let mut start = rng.random_range(0..=chunk.text.len());
 805            let mut end = rng.random_range(start..=chunk.text.len());
 806            while !chunk.text.is_char_boundary(start) {
 807                start -= 1;
 808            }
 809            while !chunk.text.is_char_boundary(end) {
 810                end -= 1;
 811            }
 812            let range = start..end;
 813            log::info!("Range: {:?}", range);
 814            let text_slice = &text[range.clone()];
 815            let chunk_slice = chunk.slice(range);
 816            verify_chunk(chunk_slice, text_slice);
 817        }
 818    }
 819
 820    #[gpui::test(iterations = 100)]
 821    fn test_split_chunk_slice(mut rng: StdRng) {
 822        let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
 823        let chunk = Chunk::new(text);
 824        let offset = char_offsets_with_end(text)
 825            .into_iter()
 826            .choose(&mut rng)
 827            .unwrap();
 828        let (a, b) = chunk.as_slice().split_at(offset);
 829        let (a_str, b_str) = text.split_at(offset);
 830        verify_chunk(a, a_str);
 831        verify_chunk(b, b_str);
 832    }
 833
 834    #[gpui::test(iterations = 1000)]
 835    fn test_nth_set_bit_random(mut rng: StdRng) {
 836        let set_count = rng.random_range(0..=128);
 837        let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
 838        set_bits.sort();
 839        let mut n = 0;
 840        for ix in set_bits.iter().copied() {
 841            n |= 1 << ix;
 842        }
 843
 844        for (mut ix, position) in set_bits.into_iter().enumerate() {
 845            ix += 1;
 846            assert_eq!(
 847                nth_set_bit(n, ix),
 848                position,
 849                "nth_set_bit({:0128b}, {})",
 850                n,
 851                ix
 852            );
 853        }
 854    }
 855
 856    /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
 857    fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
 858        let mut str = String::new();
 859        let mut chars = RandomCharIter::new(rng);
 860        loop {
 861            let ch = chars.next().unwrap();
 862            if str.len() + ch.len_utf8() > len {
 863                break;
 864            }
 865            str.push(ch);
 866        }
 867        str
 868    }
 869
 870    #[gpui::test(iterations = 1000)]
 871    fn test_append_random_strings(mut rng: StdRng) {
 872        let len1 = rng.random_range(0..=MAX_BASE);
 873        let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
 874        let str1 = random_string_with_utf8_len(&mut rng, len1);
 875        let str2 = random_string_with_utf8_len(&mut rng, len2);
 876        let mut chunk1 = Chunk::new(&str1);
 877        let chunk2 = Chunk::new(&str2);
 878        let char_offsets = char_offsets_with_end(&str2);
 879        let start_index = rng.random_range(0..char_offsets.len());
 880        let start_offset = char_offsets[start_index];
 881        let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
 882        chunk1.append(chunk2.slice(start_offset..end_offset));
 883        verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
 884    }
 885
 886    /// Return the byte offsets for each character in a string.
 887    ///
 888    /// These are valid offsets to split the string.
 889    fn char_offsets(text: &str) -> Vec<usize> {
 890        text.char_indices().map(|(i, _c)| i).collect()
 891    }
 892
 893    /// Return the byte offsets for each character in a string, plus the offset
 894    /// past the end of the string.
 895    fn char_offsets_with_end(text: &str) -> Vec<usize> {
 896        let mut v = char_offsets(text);
 897        v.push(text.len());
 898        v
 899    }
 900
 901    fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
 902        let mut offset = 0;
 903        let mut offset_utf16 = OffsetUtf16(0);
 904        let mut point = Point::zero();
 905        let mut point_utf16 = PointUtf16::zero();
 906
 907        log::info!("Verifying chunk {:?}", text);
 908        assert_eq!(chunk.offset_to_point(0), Point::zero());
 909
 910        let mut expected_tab_positions = Vec::new();
 911
 912        for (char_offset, c) in text.chars().enumerate() {
 913            let expected_point = chunk.offset_to_point(offset);
 914            assert_eq!(point, expected_point, "mismatch at offset {}", offset);
 915            assert_eq!(
 916                chunk.point_to_offset(point),
 917                offset,
 918                "mismatch at point {:?}",
 919                point
 920            );
 921            assert_eq!(
 922                chunk.offset_to_offset_utf16(offset),
 923                offset_utf16,
 924                "mismatch at offset {}",
 925                offset
 926            );
 927            assert_eq!(
 928                chunk.offset_utf16_to_offset(offset_utf16),
 929                offset,
 930                "mismatch at offset_utf16 {:?}",
 931                offset_utf16
 932            );
 933            assert_eq!(
 934                chunk.point_to_point_utf16(point),
 935                point_utf16,
 936                "mismatch at point {:?}",
 937                point
 938            );
 939            assert_eq!(
 940                chunk.point_utf16_to_offset(point_utf16, false),
 941                offset,
 942                "mismatch at point_utf16 {:?}",
 943                point_utf16
 944            );
 945            assert_eq!(
 946                chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
 947                point,
 948                "mismatch for unclipped_point_utf16_to_point at {:?}",
 949                point_utf16
 950            );
 951
 952            assert_eq!(
 953                chunk.clip_point(point, Bias::Left),
 954                point,
 955                "incorrect left clip at {:?}",
 956                point
 957            );
 958            assert_eq!(
 959                chunk.clip_point(point, Bias::Right),
 960                point,
 961                "incorrect right clip at {:?}",
 962                point
 963            );
 964
 965            for i in 1..c.len_utf8() {
 966                let test_point = Point::new(point.row, point.column + i as u32);
 967                assert_eq!(
 968                    chunk.clip_point(test_point, Bias::Left),
 969                    point,
 970                    "incorrect left clip within multi-byte char at {:?}",
 971                    test_point
 972                );
 973                assert_eq!(
 974                    chunk.clip_point(test_point, Bias::Right),
 975                    Point::new(point.row, point.column + c.len_utf8() as u32),
 976                    "incorrect right clip within multi-byte char at {:?}",
 977                    test_point
 978                );
 979            }
 980
 981            for i in 1..c.len_utf16() {
 982                let test_point = Unclipped(PointUtf16::new(
 983                    point_utf16.row,
 984                    point_utf16.column + i as u32,
 985                ));
 986                assert_eq!(
 987                    chunk.unclipped_point_utf16_to_point(test_point),
 988                    point,
 989                    "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
 990                    test_point
 991                );
 992                assert_eq!(
 993                    chunk.clip_point_utf16(test_point, Bias::Left),
 994                    point_utf16,
 995                    "incorrect left clip_point_utf16 within multi-byte char at {:?}",
 996                    test_point
 997                );
 998                assert_eq!(
 999                    chunk.clip_point_utf16(test_point, Bias::Right),
1000                    PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
1001                    "incorrect right clip_point_utf16 within multi-byte char at {:?}",
1002                    test_point
1003                );
1004
1005                let test_offset = OffsetUtf16(offset_utf16.0 + i);
1006                assert_eq!(
1007                    chunk.clip_offset_utf16(test_offset, Bias::Left),
1008                    offset_utf16,
1009                    "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
1010                    test_offset
1011                );
1012                assert_eq!(
1013                    chunk.clip_offset_utf16(test_offset, Bias::Right),
1014                    OffsetUtf16(offset_utf16.0 + c.len_utf16()),
1015                    "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
1016                    test_offset
1017                );
1018            }
1019
1020            if c == '\n' {
1021                point.row += 1;
1022                point.column = 0;
1023                point_utf16.row += 1;
1024                point_utf16.column = 0;
1025            } else {
1026                point.column += c.len_utf8() as u32;
1027                point_utf16.column += c.len_utf16() as u32;
1028            }
1029
1030            if c == '\t' {
1031                expected_tab_positions.push(TabPosition {
1032                    byte_offset: offset,
1033                    char_offset,
1034                });
1035            }
1036
1037            offset += c.len_utf8();
1038            offset_utf16.0 += c.len_utf16();
1039        }
1040
1041        let final_point = chunk.offset_to_point(offset);
1042        assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1043        assert_eq!(
1044            chunk.point_to_offset(point),
1045            offset,
1046            "mismatch at point {:?}",
1047            point
1048        );
1049        assert_eq!(
1050            chunk.offset_to_offset_utf16(offset),
1051            offset_utf16,
1052            "mismatch at offset {}",
1053            offset
1054        );
1055        assert_eq!(
1056            chunk.offset_utf16_to_offset(offset_utf16),
1057            offset,
1058            "mismatch at offset_utf16 {:?}",
1059            offset_utf16
1060        );
1061        assert_eq!(
1062            chunk.point_to_point_utf16(point),
1063            point_utf16,
1064            "mismatch at final point {:?}",
1065            point
1066        );
1067        assert_eq!(
1068            chunk.point_utf16_to_offset(point_utf16, false),
1069            offset,
1070            "mismatch at final point_utf16 {:?}",
1071            point_utf16
1072        );
1073        assert_eq!(
1074            chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1075            point,
1076            "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1077            point_utf16
1078        );
1079        assert_eq!(
1080            chunk.clip_point(point, Bias::Left),
1081            point,
1082            "incorrect left clip at final point {:?}",
1083            point
1084        );
1085        assert_eq!(
1086            chunk.clip_point(point, Bias::Right),
1087            point,
1088            "incorrect right clip at final point {:?}",
1089            point
1090        );
1091        assert_eq!(
1092            chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1093            point_utf16,
1094            "incorrect left clip_point_utf16 at final point {:?}",
1095            point_utf16
1096        );
1097        assert_eq!(
1098            chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1099            point_utf16,
1100            "incorrect right clip_point_utf16 at final point {:?}",
1101            point_utf16
1102        );
1103        assert_eq!(
1104            chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1105            offset_utf16,
1106            "incorrect left clip_offset_utf16 at final offset {:?}",
1107            offset_utf16
1108        );
1109        assert_eq!(
1110            chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1111            offset_utf16,
1112            "incorrect right clip_offset_utf16 at final offset {:?}",
1113            offset_utf16
1114        );
1115
1116        // Verify length methods
1117        assert_eq!(chunk.len(), text.len());
1118        assert_eq!(
1119            chunk.len_utf16().0,
1120            text.chars().map(|c| c.len_utf16()).sum::<usize>()
1121        );
1122
1123        // Verify line counting
1124        let lines = chunk.lines();
1125        let mut newline_count = 0;
1126        let mut last_line_len = 0;
1127        for c in text.chars() {
1128            if c == '\n' {
1129                newline_count += 1;
1130                last_line_len = 0;
1131            } else {
1132                last_line_len += c.len_utf8() as u32;
1133            }
1134        }
1135        assert_eq!(lines, Point::new(newline_count, last_line_len));
1136
1137        // Verify first/last line chars
1138        if !text.is_empty() {
1139            let first_line = text.split('\n').next().unwrap();
1140            assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1141
1142            let last_line = text.split('\n').next_back().unwrap();
1143            assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1144            assert_eq!(
1145                chunk.last_line_len_utf16(),
1146                last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1147            );
1148        }
1149
1150        // Verify longest row
1151        let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1152        let mut max_chars = 0;
1153        let mut current_row = 0;
1154        let mut current_chars = 0;
1155        let mut max_row = 0;
1156
1157        for c in text.chars() {
1158            if c == '\n' {
1159                if current_chars > max_chars {
1160                    max_chars = current_chars;
1161                    max_row = current_row;
1162                }
1163                current_row += 1;
1164                current_chars = 0;
1165            } else {
1166                current_chars += 1;
1167            }
1168        }
1169
1170        if current_chars > max_chars {
1171            max_chars = current_chars;
1172            max_row = current_row;
1173        }
1174
1175        assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1176        assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1177    }
1178}