chunk.rs

   1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
   2use heapless::String as ArrayString;
   3use std::{cmp, ops::Range};
   4use sum_tree::Bias;
   5use unicode_segmentation::GraphemeCursor;
   6use util::debug_panic;
   7
   8#[cfg(not(all(test, not(rust_analyzer))))]
   9pub(crate) type Bitmap = u128;
  10#[cfg(all(test, not(rust_analyzer)))]
  11pub(crate) type Bitmap = u16;
  12
  13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
  14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
  15
  16#[derive(Clone, Debug, Default)]
  17pub struct Chunk {
  18    /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
  19    /// text.
  20    chars: Bitmap,
  21    /// The number of set bits is the number of UTF-16 code units it would take to represent the
  22    /// text.
  23    ///
  24    /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
  25    /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
  26    /// than two UTF-16 code units.)
  27    chars_utf16: Bitmap,
  28    /// If bit[i] is set, then the character at index i is an ascii newline.
  29    newlines: Bitmap,
  30    /// If bit[i] is set, then the character at index i is an ascii tab.
  31    tabs: Bitmap,
  32    pub text: ArrayString<MAX_BASE, u8>,
  33}
  34
  35#[inline(always)]
  36const fn saturating_shl_mask(offset: u32) -> Bitmap {
  37    (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
  38}
  39
  40#[inline(always)]
  41const fn saturating_shr_mask(offset: u32) -> Bitmap {
  42    !Bitmap::MAX.unbounded_shr(offset)
  43}
  44
  45impl Chunk {
  46    pub const MASK_BITS: usize = Bitmap::BITS as usize;
  47
  48    #[inline(always)]
  49    pub fn new(text: &str) -> Self {
  50        let text = {
  51            let mut buf = ArrayString::new();
  52            buf.push_str(text).unwrap();
  53            buf
  54        };
  55
  56        const CHUNK_SIZE: usize = 8;
  57
  58        let mut chars_bytes = [0; MAX_BASE / CHUNK_SIZE];
  59        let mut newlines_bytes = [0; MAX_BASE / CHUNK_SIZE];
  60        let mut tabs_bytes = [0; MAX_BASE / CHUNK_SIZE];
  61        let mut chars_utf16_bytes = [0; MAX_BASE / CHUNK_SIZE];
  62
  63        let mut chunk_ix = 0;
  64
  65        let mut bytes = text.as_bytes();
  66        while !bytes.is_empty() {
  67            let (chunk, rest) = bytes.split_at(bytes.len().min(CHUNK_SIZE));
  68            bytes = rest;
  69
  70            let mut chars = 0;
  71            let mut newlines = 0;
  72            let mut tabs = 0;
  73            let mut chars_utf16 = 0;
  74
  75            for (ix, &b) in chunk.iter().enumerate() {
  76                chars |= (util::is_utf8_char_boundary(b) as u8) << ix;
  77                newlines |= ((b == b'\n') as u8) << ix;
  78                tabs |= ((b == b'\t') as u8) << ix;
  79                // b >= 240 when we are at the first byte of the 4 byte encoded
  80                // utf-8 code point (U+010000 or greater) it means that it would
  81                // be encoded as two 16-bit code units in utf-16
  82                chars_utf16 |= ((b >= 240) as u8) << ix;
  83            }
  84
  85            chars_bytes[chunk_ix] = chars;
  86            newlines_bytes[chunk_ix] = newlines;
  87            tabs_bytes[chunk_ix] = tabs;
  88            chars_utf16_bytes[chunk_ix] = chars_utf16;
  89
  90            chunk_ix += 1;
  91        }
  92
  93        let chars = Bitmap::from_le_bytes(chars_bytes);
  94
  95        Chunk {
  96            text,
  97            chars,
  98            chars_utf16: (Bitmap::from_le_bytes(chars_utf16_bytes) << 1) | chars,
  99            newlines: Bitmap::from_le_bytes(newlines_bytes),
 100            tabs: Bitmap::from_le_bytes(tabs_bytes),
 101        }
 102    }
 103
 104    #[inline(always)]
 105    pub fn push_str(&mut self, text: &str) {
 106        self.append(Chunk::new(text).as_slice());
 107    }
 108
 109    #[inline(always)]
 110    pub fn prepend_str(&mut self, text: &str) {
 111        self.prepend(Chunk::new(text).as_slice());
 112    }
 113
 114    #[inline(always)]
 115    pub fn append(&mut self, slice: ChunkSlice) {
 116        if slice.is_empty() {
 117            return;
 118        };
 119
 120        let base_ix = self.text.len();
 121        self.chars |= slice.chars << base_ix;
 122        self.chars_utf16 |= slice.chars_utf16 << base_ix;
 123        self.newlines |= slice.newlines << base_ix;
 124        self.tabs |= slice.tabs << base_ix;
 125        self.text.push_str(slice.text).unwrap();
 126    }
 127
 128    #[inline(always)]
 129    pub fn prepend(&mut self, slice: ChunkSlice) {
 130        if slice.is_empty() {
 131            return;
 132        }
 133        if self.text.is_empty() {
 134            *self = Chunk::new(slice.text);
 135            return;
 136        }
 137
 138        let shift = slice.text.len();
 139        self.chars = slice.chars | (self.chars << shift);
 140        self.chars_utf16 = slice.chars_utf16 | (self.chars_utf16 << shift);
 141        self.newlines = slice.newlines | (self.newlines << shift);
 142        self.tabs = slice.tabs | (self.tabs << shift);
 143
 144        let mut new_text = ArrayString::<MAX_BASE, u8>::new();
 145        new_text.push_str(slice.text).unwrap();
 146        new_text.push_str(&self.text).unwrap();
 147        self.text = new_text;
 148    }
 149
 150    #[inline(always)]
 151    pub fn as_slice(&self) -> ChunkSlice<'_> {
 152        ChunkSlice {
 153            chars: self.chars,
 154            chars_utf16: self.chars_utf16,
 155            newlines: self.newlines,
 156            tabs: self.tabs,
 157            text: &self.text,
 158        }
 159    }
 160
 161    #[inline(always)]
 162    pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
 163        self.as_slice().slice(range)
 164    }
 165
 166    #[inline(always)]
 167    pub fn chars(&self) -> Bitmap {
 168        self.chars
 169    }
 170
 171    #[inline(always)]
 172    pub fn tabs(&self) -> Bitmap {
 173        self.tabs
 174    }
 175
 176    #[inline(always)]
 177    pub fn newlines(&self) -> Bitmap {
 178        self.newlines
 179    }
 180
 181    #[inline(always)]
 182    pub fn is_char_boundary(&self, offset: usize) -> bool {
 183        (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
 184    }
 185
 186    pub fn floor_char_boundary(&self, index: usize) -> usize {
 187        if index >= self.text.len() {
 188            self.text.len()
 189        } else {
 190            let mut i = index;
 191            while i > 0 {
 192                if util::is_utf8_char_boundary(self.text.as_bytes()[i]) {
 193                    break;
 194                }
 195                i -= 1;
 196            }
 197
 198            i
 199        }
 200    }
 201
 202    #[track_caller]
 203    #[inline(always)]
 204    pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
 205        if self.is_char_boundary(offset) {
 206            return true;
 207        }
 208        if PANIC || cfg!(debug_assertions) {
 209            panic_char_boundary(&self.text, offset);
 210        } else {
 211            log_err_char_boundary(&self.text, offset);
 212            false
 213        }
 214    }
 215}
 216
 217#[derive(Clone, Copy, Debug)]
 218pub struct ChunkSlice<'a> {
 219    chars: Bitmap,
 220    chars_utf16: Bitmap,
 221    newlines: Bitmap,
 222    tabs: Bitmap,
 223    text: &'a str,
 224}
 225
 226impl Into<Chunk> for ChunkSlice<'_> {
 227    fn into(self) -> Chunk {
 228        Chunk {
 229            chars: self.chars,
 230            chars_utf16: self.chars_utf16,
 231            newlines: self.newlines,
 232            tabs: self.tabs,
 233            text: self.text.try_into().unwrap(),
 234        }
 235    }
 236}
 237
 238impl<'a> ChunkSlice<'a> {
 239    #[inline(always)]
 240    pub fn is_empty(&self) -> bool {
 241        self.text.is_empty()
 242    }
 243
 244    #[inline(always)]
 245    pub fn is_char_boundary(&self, offset: usize) -> bool {
 246        (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
 247    }
 248
 249    #[inline(always)]
 250    pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
 251        if mid == MAX_BASE {
 252            let left = self;
 253            let right = ChunkSlice {
 254                chars: 0,
 255                chars_utf16: 0,
 256                newlines: 0,
 257                tabs: 0,
 258                text: "",
 259            };
 260            (left, right)
 261        } else {
 262            let mask = ((1 as Bitmap) << mid) - 1;
 263            let (left_text, right_text) = self.text.split_at(mid);
 264            let left = ChunkSlice {
 265                chars: self.chars & mask,
 266                chars_utf16: self.chars_utf16 & mask,
 267                newlines: self.newlines & mask,
 268                tabs: self.tabs & mask,
 269                text: left_text,
 270            };
 271            let right = ChunkSlice {
 272                chars: self.chars >> mid,
 273                chars_utf16: self.chars_utf16 >> mid,
 274                newlines: self.newlines >> mid,
 275                tabs: self.tabs >> mid,
 276                text: right_text,
 277            };
 278            (left, right)
 279        }
 280    }
 281
 282    #[inline(always)]
 283    pub fn slice(self, mut range: Range<usize>) -> Self {
 284        if range.start == MAX_BASE {
 285            Self {
 286                chars: 0,
 287                chars_utf16: 0,
 288                newlines: 0,
 289                tabs: 0,
 290                text: "",
 291            }
 292        } else {
 293            if !self.assert_char_boundary::<false>(range.start) {
 294                range.start = self.text.ceil_char_boundary(range.start);
 295            }
 296            if !self.assert_char_boundary::<false>(range.end) {
 297                range.end = if range.end < range.start {
 298                    range.start
 299                } else {
 300                    self.text.floor_char_boundary(range.end)
 301                };
 302            }
 303            let mask = (1 as Bitmap)
 304                .unbounded_shl(range.end as u32)
 305                .wrapping_sub(1);
 306            Self {
 307                chars: (self.chars & mask) >> range.start,
 308                chars_utf16: (self.chars_utf16 & mask) >> range.start,
 309                newlines: (self.newlines & mask) >> range.start,
 310                tabs: (self.tabs & mask) >> range.start,
 311                text: &self.text[range],
 312            }
 313        }
 314    }
 315
 316    #[inline(always)]
 317    pub fn text_summary(&self) -> TextSummary {
 318        let mut chars = 0;
 319        let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
 320        TextSummary {
 321            len: self.len(),
 322            chars,
 323            len_utf16: self.len_utf16(),
 324            lines: self.lines(),
 325            first_line_chars: self.first_line_chars(),
 326            last_line_chars: self.last_line_chars(),
 327            last_line_len_utf16: self.last_line_len_utf16(),
 328            longest_row,
 329            longest_row_chars,
 330        }
 331    }
 332
 333    /// Get length in bytes
 334    #[inline(always)]
 335    pub fn len(&self) -> usize {
 336        self.text.len()
 337    }
 338
 339    /// Get length in UTF-16 code units
 340    #[inline(always)]
 341    pub fn len_utf16(&self) -> OffsetUtf16 {
 342        OffsetUtf16(self.chars_utf16.count_ones() as usize)
 343    }
 344
 345    /// Get point representing number of lines and length of last line
 346    #[inline(always)]
 347    pub fn lines(&self) -> Point {
 348        let row = self.newlines.count_ones();
 349        let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
 350        Point::new(row, column)
 351    }
 352
 353    /// Get number of chars in first line
 354    #[inline(always)]
 355    pub fn first_line_chars(&self) -> u32 {
 356        (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
 357    }
 358
 359    /// Get number of chars in last line
 360    #[inline(always)]
 361    pub fn last_line_chars(&self) -> u32 {
 362        (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
 363    }
 364
 365    /// Get number of UTF-16 code units in last line
 366    #[inline(always)]
 367    pub fn last_line_len_utf16(&self) -> u32 {
 368        (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
 369    }
 370
 371    /// Get the longest row in the chunk and its length in characters.
 372    /// Calculate the total number of characters in the chunk along the way.
 373    #[inline(always)]
 374    pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
 375        let mut chars = self.chars;
 376        let mut newlines = self.newlines;
 377        *total_chars = 0;
 378        let mut row = 0;
 379        let mut longest_row = 0;
 380        let mut longest_row_chars = 0;
 381        while newlines > 0 {
 382            let newline_ix = newlines.trailing_zeros();
 383            let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
 384            *total_chars += usize::from(row_chars);
 385            if row_chars > longest_row_chars {
 386                longest_row = row;
 387                longest_row_chars = row_chars;
 388            }
 389
 390            newlines >>= newline_ix;
 391            newlines >>= 1;
 392            chars >>= newline_ix;
 393            chars >>= 1;
 394            row += 1;
 395            *total_chars += 1;
 396        }
 397
 398        let row_chars = chars.count_ones() as u8;
 399        *total_chars += usize::from(row_chars);
 400        if row_chars > longest_row_chars {
 401            (row, row_chars as u32)
 402        } else {
 403            (longest_row, longest_row_chars as u32)
 404        }
 405    }
 406
 407    #[inline(always)]
 408    pub fn offset_to_point(&self, offset: usize) -> Point {
 409        let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
 410        let row = (self.newlines & mask).count_ones();
 411        let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
 412        let column = (offset - newline_ix as usize) as u32;
 413        Point::new(row, column)
 414    }
 415
 416    #[inline(always)]
 417    pub fn point_to_offset(&self, point: Point) -> usize {
 418        if point.row > self.lines().row {
 419            debug_panic!(
 420                "point {:?} extends beyond rows for string {:?}",
 421                point,
 422                self.text
 423            );
 424            return self.len();
 425        }
 426
 427        let row_offset_range = self.offset_range_for_row(point.row);
 428        if point.column > row_offset_range.len() as u32 {
 429            debug_panic!(
 430                "point {:?} extends beyond row for string {:?}",
 431                point,
 432                self.text
 433            );
 434            row_offset_range.end
 435        } else {
 436            row_offset_range.start + point.column as usize
 437        }
 438    }
 439
 440    #[track_caller]
 441    #[inline(always)]
 442    pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
 443        if self.is_char_boundary(offset) {
 444            return true;
 445        }
 446        if PANIC {
 447            panic_char_boundary(self.text, offset);
 448        } else {
 449            log_err_char_boundary(self.text, offset);
 450            false
 451        }
 452    }
 453
 454    pub fn floor_char_boundary(&self, index: usize) -> usize {
 455        self.text.floor_char_boundary(index)
 456    }
 457
 458    #[inline(always)]
 459    pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
 460        if point.row > self.lines().row {
 461            debug_panic!(
 462                "point {:?} extends beyond rows for string {:?}",
 463                point,
 464                self.text
 465            );
 466            return self.len_utf16();
 467        }
 468        self.offset_to_offset_utf16(self.point_to_offset(point))
 469    }
 470
 471    #[inline(always)]
 472    pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
 473        let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
 474        OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
 475    }
 476
 477    #[inline(always)]
 478    pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
 479        if target.0 == 0 {
 480            0
 481        } else {
 482            #[cfg(not(test))]
 483            let chars_utf16 = self.chars_utf16;
 484            #[cfg(test)]
 485            let chars_utf16 = self.chars_utf16 as u128;
 486            let ix = nth_set_bit(chars_utf16, target.0) + 1;
 487            if ix == MAX_BASE {
 488                MAX_BASE
 489            } else {
 490                let utf8_additional_len = cmp::min(
 491                    (self.chars_utf16 >> ix).trailing_zeros() as usize,
 492                    self.text.len() - ix,
 493                );
 494                ix + utf8_additional_len
 495            }
 496        }
 497    }
 498
 499    #[inline(always)]
 500    pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
 501        let mask = saturating_shl_mask(offset as u32);
 502        let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
 503        let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
 504        let column = if newline_ix as usize == MAX_BASE {
 505            0
 506        } else {
 507            ((self.chars_utf16 & mask) >> newline_ix).count_ones()
 508        };
 509        PointUtf16::new(row, column)
 510    }
 511
 512    #[inline(always)]
 513    pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
 514        self.offset_to_point_utf16(self.point_to_offset(point))
 515    }
 516
 517    #[inline(always)]
 518    pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
 519        let lines = self.lines();
 520        if point.row > lines.row {
 521            if !clip {
 522                debug_panic!(
 523                    "point {:?} is beyond this chunk's extent {:?}",
 524                    point,
 525                    self.text
 526                );
 527            }
 528            return self.len();
 529        }
 530
 531        let row_offset_range = self.offset_range_for_row(point.row);
 532        let line = self.slice(row_offset_range.clone());
 533        if point.column > line.last_line_len_utf16() {
 534            if !clip {
 535                debug_panic!(
 536                    "point {:?} is beyond the end of the line in chunk {:?}",
 537                    point,
 538                    self.text
 539                );
 540            }
 541            return row_offset_range.end;
 542        }
 543
 544        let mut offset = row_offset_range.start;
 545        if point.column > 0 {
 546            offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
 547            if !self.text.is_char_boundary(offset) {
 548                offset -= 1;
 549                while !self.text.is_char_boundary(offset) {
 550                    offset -= 1;
 551                }
 552                if !clip {
 553                    debug_panic!(
 554                        "point {:?} is within character in chunk {:?}",
 555                        point,
 556                        self.text,
 557                    );
 558                }
 559            }
 560        }
 561        offset
 562    }
 563
 564    #[inline(always)]
 565    pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
 566        let max_point = self.lines();
 567        if point.0.row > max_point.row {
 568            return max_point;
 569        }
 570
 571        let row_offset_range = self.offset_range_for_row(point.0.row);
 572        let line = self.slice(row_offset_range);
 573        if point.0.column == 0 {
 574            Point::new(point.0.row, 0)
 575        } else if point.0.column >= line.len_utf16().0 as u32 {
 576            Point::new(point.0.row, line.len() as u32)
 577        } else {
 578            let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
 579            while !line.text.is_char_boundary(column) {
 580                column -= 1;
 581            }
 582            Point::new(point.0.row, column as u32)
 583        }
 584    }
 585
 586    #[inline(always)]
 587    pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
 588        let max_point = self.lines();
 589        if point.row > max_point.row {
 590            return max_point;
 591        }
 592
 593        let line = self.slice(self.offset_range_for_row(point.row));
 594        if point.column == 0 {
 595            point
 596        } else if point.column >= line.len() as u32 {
 597            Point::new(point.row, line.len() as u32)
 598        } else {
 599            let mut column = point.column as usize;
 600            let bytes = line.text.as_bytes();
 601            if bytes[column - 1] < 128 && bytes[column] < 128 {
 602                return Point::new(point.row, column as u32);
 603            }
 604
 605            let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
 606            loop {
 607                if line.is_char_boundary(column)
 608                    && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
 609                {
 610                    break;
 611                }
 612
 613                match bias {
 614                    Bias::Left => column -= 1,
 615                    Bias::Right => column += 1,
 616                }
 617                grapheme_cursor.set_cursor(column);
 618            }
 619            Point::new(point.row, column as u32)
 620        }
 621    }
 622
 623    #[inline(always)]
 624    pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
 625        let max_point = self.lines();
 626        if point.0.row > max_point.row {
 627            PointUtf16::new(max_point.row, self.last_line_len_utf16())
 628        } else {
 629            let line = self.slice(self.offset_range_for_row(point.0.row));
 630            let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
 631            PointUtf16::new(point.0.row, column.0 as u32)
 632        }
 633    }
 634
 635    #[inline(always)]
 636    pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
 637        if target == OffsetUtf16::default() {
 638            OffsetUtf16::default()
 639        } else if target >= self.len_utf16() {
 640            self.len_utf16()
 641        } else {
 642            let mut offset = self.offset_utf16_to_offset(target);
 643            while !self.text.is_char_boundary(offset) {
 644                if bias == Bias::Left {
 645                    offset -= 1;
 646                } else {
 647                    offset += 1;
 648                }
 649            }
 650            self.offset_to_offset_utf16(offset)
 651        }
 652    }
 653
 654    #[inline(always)]
 655    fn offset_range_for_row(&self, row: u32) -> Range<usize> {
 656        let row_start = if row > 0 {
 657            #[cfg(not(test))]
 658            let newlines = self.newlines;
 659            #[cfg(test)]
 660            let newlines = self.newlines as u128;
 661            nth_set_bit(newlines, row as usize) + 1
 662        } else {
 663            0
 664        };
 665        let row_len = if row_start == MAX_BASE {
 666            0
 667        } else {
 668            cmp::min(
 669                (self.newlines >> row_start).trailing_zeros(),
 670                (self.text.len() - row_start) as u32,
 671            )
 672        };
 673        row_start..row_start + row_len as usize
 674    }
 675
 676    #[inline(always)]
 677    pub fn tabs(&self) -> Tabs {
 678        Tabs {
 679            tabs: self.tabs,
 680            chars: self.chars,
 681        }
 682    }
 683}
 684
 685pub struct Tabs {
 686    tabs: Bitmap,
 687    chars: Bitmap,
 688}
 689
 690#[derive(Debug, PartialEq, Eq)]
 691pub struct TabPosition {
 692    pub byte_offset: usize,
 693    pub char_offset: usize,
 694}
 695
 696impl Iterator for Tabs {
 697    type Item = TabPosition;
 698
 699    fn next(&mut self) -> Option<Self::Item> {
 700        if self.tabs == 0 {
 701            return None;
 702        }
 703
 704        let tab_offset = self.tabs.trailing_zeros() as usize;
 705        let chars_mask = (1 << tab_offset) - 1;
 706        let char_offset = (self.chars & chars_mask).count_ones() as usize;
 707
 708        // Since tabs are 1 byte the tab offset is the same as the byte offset
 709        let position = TabPosition {
 710            byte_offset: tab_offset,
 711            char_offset,
 712        };
 713        // Remove the tab we've just seen
 714        self.tabs ^= 1 << tab_offset;
 715
 716        Some(position)
 717    }
 718}
 719
 720/// Finds the n-th bit that is set to 1.
 721#[inline(always)]
 722fn nth_set_bit(v: u128, n: usize) -> usize {
 723    let low = v as u64;
 724    let high = (v >> 64) as u64;
 725
 726    let low_count = low.count_ones() as usize;
 727    if n > low_count {
 728        64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
 729    } else {
 730        nth_set_bit_u64(low, n as u64) as usize
 731    }
 732}
 733
 734#[cold]
 735#[inline(never)]
 736#[track_caller]
 737fn panic_char_boundary(text: &str, offset: usize) -> ! {
 738    if offset > text.len() {
 739        panic!(
 740            "byte index {} is out of bounds of `{:?}` (length: {})",
 741            offset,
 742            text,
 743            text.len()
 744        );
 745    }
 746    // find the character
 747    let char_start = text.floor_char_boundary(offset);
 748    // `char_start` must be less than len and a char boundary
 749    let ch = text.get(char_start..).unwrap().chars().next().unwrap();
 750    let char_range = char_start..char_start + ch.len_utf8();
 751    panic!(
 752        "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
 753        offset, ch, char_range,
 754    );
 755}
 756
 757#[cold]
 758#[inline(never)]
 759#[track_caller]
 760fn log_err_char_boundary(text: &str, offset: usize) {
 761    if offset >= text.len() {
 762        log::error!(
 763            "byte index {} is out of bounds of `{:?}` (length: {})",
 764            offset,
 765            text,
 766            text.len()
 767        );
 768        return;
 769    }
 770    // find the character
 771    let char_start = text.floor_char_boundary(offset);
 772    // `char_start` must be less than len and a char boundary
 773    let ch = text.get(char_start..).unwrap().chars().next().unwrap();
 774    let char_range = char_start..char_start + ch.len_utf8();
 775    log::error!(
 776        "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
 777        offset,
 778        ch,
 779        char_range,
 780    );
 781}
 782
 783#[inline(always)]
 784fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
 785    let v = v.reverse_bits();
 786    let mut s: u64 = 64;
 787
 788    // Parallel bit count intermediates
 789    let a = v - ((v >> 1) & (u64::MAX / 3));
 790    let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
 791    let c = (b + (b >> 4)) & (u64::MAX / 0x11);
 792    let d = (c + (c >> 8)) & (u64::MAX / 0x101);
 793
 794    // Branchless select
 795    let t = (d >> 32) + (d >> 48);
 796    s -= (t.wrapping_sub(n) & 256) >> 3;
 797    n -= t & (t.wrapping_sub(n) >> 8);
 798
 799    let t = (d >> (s - 16)) & 0xff;
 800    s -= (t.wrapping_sub(n) & 256) >> 4;
 801    n -= t & (t.wrapping_sub(n) >> 8);
 802
 803    let t = (c >> (s - 8)) & 0xf;
 804    s -= (t.wrapping_sub(n) & 256) >> 5;
 805    n -= t & (t.wrapping_sub(n) >> 8);
 806
 807    let t = (b >> (s - 4)) & 0x7;
 808    s -= (t.wrapping_sub(n) & 256) >> 6;
 809    n -= t & (t.wrapping_sub(n) >> 8);
 810
 811    let t = (a >> (s - 2)) & 0x3;
 812    s -= (t.wrapping_sub(n) & 256) >> 7;
 813    n -= t & (t.wrapping_sub(n) >> 8);
 814
 815    let t = (v >> (s - 1)) & 0x1;
 816    s -= (t.wrapping_sub(n) & 256) >> 8;
 817
 818    65 - s - 1
 819}
 820
 821#[cfg(test)]
 822mod tests {
 823    use super::*;
 824    use rand::prelude::*;
 825    use util::RandomCharIter;
 826
 827    #[gpui::test(iterations = 100)]
 828    fn test_random_chunks(mut rng: StdRng) {
 829        let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
 830        log::info!("Chunk: {:?}", text);
 831        let chunk = Chunk::new(&text);
 832        verify_chunk(chunk.as_slice(), &text);
 833
 834        // Verify Chunk::chars() bitmap
 835        let expected_chars = char_offsets(&text)
 836            .into_iter()
 837            .inspect(|i| assert!(*i < MAX_BASE))
 838            .fold(0 as Bitmap, |acc, i| acc | (1 << i));
 839        assert_eq!(chunk.chars(), expected_chars);
 840
 841        for _ in 0..10 {
 842            let mut start = rng.random_range(0..=chunk.text.len());
 843            let mut end = rng.random_range(start..=chunk.text.len());
 844            while !chunk.text.is_char_boundary(start) {
 845                start -= 1;
 846            }
 847            while !chunk.text.is_char_boundary(end) {
 848                end -= 1;
 849            }
 850            let range = start..end;
 851            log::info!("Range: {:?}", range);
 852            let text_slice = &text[range.clone()];
 853            let chunk_slice = chunk.slice(range);
 854            verify_chunk(chunk_slice, text_slice);
 855        }
 856    }
 857
 858    #[gpui::test(iterations = 100)]
 859    fn test_split_chunk_slice(mut rng: StdRng) {
 860        let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
 861        let chunk = Chunk::new(text);
 862        let offset = char_offsets_with_end(text)
 863            .into_iter()
 864            .choose(&mut rng)
 865            .unwrap();
 866        let (a, b) = chunk.as_slice().split_at(offset);
 867        let (a_str, b_str) = text.split_at(offset);
 868        verify_chunk(a, a_str);
 869        verify_chunk(b, b_str);
 870    }
 871
 872    #[gpui::test(iterations = 1000)]
 873    fn test_nth_set_bit_random(mut rng: StdRng) {
 874        let set_count = rng.random_range(0..=128);
 875        let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
 876        set_bits.sort();
 877        let mut n = 0;
 878        for ix in set_bits.iter().copied() {
 879            n |= 1 << ix;
 880        }
 881
 882        for (mut ix, position) in set_bits.into_iter().enumerate() {
 883            ix += 1;
 884            assert_eq!(
 885                nth_set_bit(n, ix),
 886                position,
 887                "nth_set_bit({:0128b}, {})",
 888                n,
 889                ix
 890            );
 891        }
 892    }
 893
 894    /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
 895    fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
 896        let mut str = String::new();
 897        let mut chars = RandomCharIter::new(rng);
 898        loop {
 899            let ch = chars.next().unwrap();
 900            if str.len() + ch.len_utf8() > len {
 901                break;
 902            }
 903            str.push(ch);
 904        }
 905        str
 906    }
 907
 908    #[gpui::test(iterations = 1000)]
 909    fn test_append_random_strings(mut rng: StdRng) {
 910        let len1 = rng.random_range(0..=MAX_BASE);
 911        let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
 912        let str1 = random_string_with_utf8_len(&mut rng, len1);
 913        let str2 = random_string_with_utf8_len(&mut rng, len2);
 914        let mut chunk1 = Chunk::new(&str1);
 915        let chunk2 = Chunk::new(&str2);
 916        let char_offsets = char_offsets_with_end(&str2);
 917        let start_index = rng.random_range(0..char_offsets.len());
 918        let start_offset = char_offsets[start_index];
 919        let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
 920        chunk1.append(chunk2.slice(start_offset..end_offset));
 921        verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
 922    }
 923
 924    #[gpui::test(iterations = 1000)]
 925    fn test_prepend_random_strings(mut rng: StdRng) {
 926        let len1 = rng.random_range(0..=MAX_BASE);
 927        let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
 928        let str1 = random_string_with_utf8_len(&mut rng, len1);
 929        let str2 = random_string_with_utf8_len(&mut rng, len2);
 930        let mut chunk1 = Chunk::new(&str1);
 931        let chunk2 = Chunk::new(&str2);
 932        let char_offsets = char_offsets_with_end(&str2);
 933        let start_index = rng.random_range(0..char_offsets.len());
 934        let start_offset = char_offsets[start_index];
 935        let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
 936        let slice = chunk2.slice(start_offset..end_offset);
 937        let prefix_text = &str2[start_offset..end_offset];
 938        chunk1.prepend(slice);
 939        verify_chunk(chunk1.as_slice(), &(prefix_text.to_owned() + &str1));
 940    }
 941
 942    /// Return the byte offsets for each character in a string.
 943    ///
 944    /// These are valid offsets to split the string.
 945    fn char_offsets(text: &str) -> Vec<usize> {
 946        text.char_indices().map(|(i, _c)| i).collect()
 947    }
 948
 949    /// Return the byte offsets for each character in a string, plus the offset
 950    /// past the end of the string.
 951    fn char_offsets_with_end(text: &str) -> Vec<usize> {
 952        let mut v = char_offsets(text);
 953        v.push(text.len());
 954        v
 955    }
 956
 957    fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
 958        let mut offset = 0;
 959        let mut offset_utf16 = OffsetUtf16(0);
 960        let mut point = Point::zero();
 961        let mut point_utf16 = PointUtf16::zero();
 962
 963        log::info!("Verifying chunk {:?}", text);
 964        assert_eq!(chunk.offset_to_point(0), Point::zero());
 965
 966        let mut expected_tab_positions = Vec::new();
 967
 968        for (char_offset, c) in text.chars().enumerate() {
 969            let expected_point = chunk.offset_to_point(offset);
 970            assert_eq!(point, expected_point, "mismatch at offset {}", offset);
 971            assert_eq!(
 972                chunk.point_to_offset(point),
 973                offset,
 974                "mismatch at point {:?}",
 975                point
 976            );
 977            assert_eq!(
 978                chunk.offset_to_offset_utf16(offset),
 979                offset_utf16,
 980                "mismatch at offset {}",
 981                offset
 982            );
 983            assert_eq!(
 984                chunk.offset_utf16_to_offset(offset_utf16),
 985                offset,
 986                "mismatch at offset_utf16 {:?}",
 987                offset_utf16
 988            );
 989            assert_eq!(
 990                chunk.point_to_point_utf16(point),
 991                point_utf16,
 992                "mismatch at point {:?}",
 993                point
 994            );
 995            assert_eq!(
 996                chunk.point_utf16_to_offset(point_utf16, false),
 997                offset,
 998                "mismatch at point_utf16 {:?}",
 999                point_utf16
1000            );
1001            assert_eq!(
1002                chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1003                point,
1004                "mismatch for unclipped_point_utf16_to_point at {:?}",
1005                point_utf16
1006            );
1007
1008            assert_eq!(
1009                chunk.clip_point(point, Bias::Left),
1010                point,
1011                "incorrect left clip at {:?}",
1012                point
1013            );
1014            assert_eq!(
1015                chunk.clip_point(point, Bias::Right),
1016                point,
1017                "incorrect right clip at {:?}",
1018                point
1019            );
1020
1021            for i in 1..c.len_utf8() {
1022                let test_point = Point::new(point.row, point.column + i as u32);
1023                assert_eq!(
1024                    chunk.clip_point(test_point, Bias::Left),
1025                    point,
1026                    "incorrect left clip within multi-byte char at {:?}",
1027                    test_point
1028                );
1029                assert_eq!(
1030                    chunk.clip_point(test_point, Bias::Right),
1031                    Point::new(point.row, point.column + c.len_utf8() as u32),
1032                    "incorrect right clip within multi-byte char at {:?}",
1033                    test_point
1034                );
1035            }
1036
1037            for i in 1..c.len_utf16() {
1038                let test_point = Unclipped(PointUtf16::new(
1039                    point_utf16.row,
1040                    point_utf16.column + i as u32,
1041                ));
1042                assert_eq!(
1043                    chunk.unclipped_point_utf16_to_point(test_point),
1044                    point,
1045                    "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
1046                    test_point
1047                );
1048                assert_eq!(
1049                    chunk.clip_point_utf16(test_point, Bias::Left),
1050                    point_utf16,
1051                    "incorrect left clip_point_utf16 within multi-byte char at {:?}",
1052                    test_point
1053                );
1054                assert_eq!(
1055                    chunk.clip_point_utf16(test_point, Bias::Right),
1056                    PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
1057                    "incorrect right clip_point_utf16 within multi-byte char at {:?}",
1058                    test_point
1059                );
1060
1061                let test_offset = OffsetUtf16(offset_utf16.0 + i);
1062                assert_eq!(
1063                    chunk.clip_offset_utf16(test_offset, Bias::Left),
1064                    offset_utf16,
1065                    "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
1066                    test_offset
1067                );
1068                assert_eq!(
1069                    chunk.clip_offset_utf16(test_offset, Bias::Right),
1070                    OffsetUtf16(offset_utf16.0 + c.len_utf16()),
1071                    "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
1072                    test_offset
1073                );
1074            }
1075
1076            if c == '\n' {
1077                point.row += 1;
1078                point.column = 0;
1079                point_utf16.row += 1;
1080                point_utf16.column = 0;
1081            } else {
1082                point.column += c.len_utf8() as u32;
1083                point_utf16.column += c.len_utf16() as u32;
1084            }
1085
1086            if c == '\t' {
1087                expected_tab_positions.push(TabPosition {
1088                    byte_offset: offset,
1089                    char_offset,
1090                });
1091            }
1092
1093            offset += c.len_utf8();
1094            offset_utf16.0 += c.len_utf16();
1095        }
1096
1097        let final_point = chunk.offset_to_point(offset);
1098        assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1099        assert_eq!(
1100            chunk.point_to_offset(point),
1101            offset,
1102            "mismatch at point {:?}",
1103            point
1104        );
1105        assert_eq!(
1106            chunk.offset_to_offset_utf16(offset),
1107            offset_utf16,
1108            "mismatch at offset {}",
1109            offset
1110        );
1111        assert_eq!(
1112            chunk.offset_utf16_to_offset(offset_utf16),
1113            offset,
1114            "mismatch at offset_utf16 {:?}",
1115            offset_utf16
1116        );
1117        assert_eq!(
1118            chunk.point_to_point_utf16(point),
1119            point_utf16,
1120            "mismatch at final point {:?}",
1121            point
1122        );
1123        assert_eq!(
1124            chunk.point_utf16_to_offset(point_utf16, false),
1125            offset,
1126            "mismatch at final point_utf16 {:?}",
1127            point_utf16
1128        );
1129        assert_eq!(
1130            chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1131            point,
1132            "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1133            point_utf16
1134        );
1135        assert_eq!(
1136            chunk.clip_point(point, Bias::Left),
1137            point,
1138            "incorrect left clip at final point {:?}",
1139            point
1140        );
1141        assert_eq!(
1142            chunk.clip_point(point, Bias::Right),
1143            point,
1144            "incorrect right clip at final point {:?}",
1145            point
1146        );
1147        assert_eq!(
1148            chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1149            point_utf16,
1150            "incorrect left clip_point_utf16 at final point {:?}",
1151            point_utf16
1152        );
1153        assert_eq!(
1154            chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1155            point_utf16,
1156            "incorrect right clip_point_utf16 at final point {:?}",
1157            point_utf16
1158        );
1159        assert_eq!(
1160            chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1161            offset_utf16,
1162            "incorrect left clip_offset_utf16 at final offset {:?}",
1163            offset_utf16
1164        );
1165        assert_eq!(
1166            chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1167            offset_utf16,
1168            "incorrect right clip_offset_utf16 at final offset {:?}",
1169            offset_utf16
1170        );
1171
1172        // Verify length methods
1173        assert_eq!(chunk.len(), text.len());
1174        assert_eq!(
1175            chunk.len_utf16().0,
1176            text.chars().map(|c| c.len_utf16()).sum::<usize>()
1177        );
1178
1179        // Verify line counting
1180        let lines = chunk.lines();
1181        let mut newline_count = 0;
1182        let mut last_line_len = 0;
1183        for c in text.chars() {
1184            if c == '\n' {
1185                newline_count += 1;
1186                last_line_len = 0;
1187            } else {
1188                last_line_len += c.len_utf8() as u32;
1189            }
1190        }
1191        assert_eq!(lines, Point::new(newline_count, last_line_len));
1192
1193        // Verify first/last line chars
1194        if !text.is_empty() {
1195            let first_line = text.split('\n').next().unwrap();
1196            assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1197
1198            let last_line = text.split('\n').next_back().unwrap();
1199            assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1200            assert_eq!(
1201                chunk.last_line_len_utf16(),
1202                last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1203            );
1204        }
1205
1206        // Verify longest row
1207        let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1208        let mut max_chars = 0;
1209        let mut current_row = 0;
1210        let mut current_chars = 0;
1211        let mut max_row = 0;
1212
1213        for c in text.chars() {
1214            if c == '\n' {
1215                if current_chars > max_chars {
1216                    max_chars = current_chars;
1217                    max_row = current_row;
1218                }
1219                current_row += 1;
1220                current_chars = 0;
1221            } else {
1222                current_chars += 1;
1223            }
1224        }
1225
1226        if current_chars > max_chars {
1227            max_chars = current_chars;
1228            max_row = current_row;
1229        }
1230
1231        assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1232        assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1233    }
1234
1235    #[gpui::test]
1236    fn test_point_utf16_to_offset_clips_to_correct_absolute_offset() {
1237        let text = "abc\nde";
1238        let chunk = Chunk::new(text);
1239        let slice = chunk.as_slice();
1240
1241        // Clipping on row 0 (row_offset_range.start == 0, so relative == absolute)
1242        assert_eq!(slice.point_utf16_to_offset(PointUtf16::new(0, 99), true), 3,);
1243
1244        // Clipping on row 1 — this is the case that was buggy.
1245        // Row 1 starts at byte offset 4 ("de" is bytes 4..6), so the
1246        // clipped result must be 6, not 2.
1247        assert_eq!(slice.point_utf16_to_offset(PointUtf16::new(1, 99), true), 6,);
1248    }
1249}