chunk.rs

   1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
   2use arrayvec::ArrayString;
   3use std::{cmp, ops::Range};
   4use sum_tree::Bias;
   5use unicode_segmentation::GraphemeCursor;
   6use util::debug_panic;
   7
   8#[cfg(not(all(test, not(rust_analyzer))))]
   9pub(crate) type Bitmap = u128;
  10#[cfg(all(test, not(rust_analyzer)))]
  11pub(crate) type Bitmap = u16;
  12
  13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
  14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
  15
  16#[derive(Clone, Debug, Default)]
  17pub struct Chunk {
  18    /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
  19    /// text.
  20    chars: Bitmap,
  21    /// The number of set bits is the number of UTF-16 code units it would take to represent the
  22    /// text.
  23    ///
  24    /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
  25    /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
  26    /// than two UTF-16 code units.)
  27    chars_utf16: Bitmap,
  28    /// If bit[i] is set, then the character at index i is an ascii newline.
  29    newlines: Bitmap,
  30    /// If bit[i] is set, then the character at index i is an ascii tab.
  31    tabs: Bitmap,
  32    pub text: ArrayString<MAX_BASE>,
  33}
  34
  35impl Chunk {
  36    pub const MASK_BITS: usize = Bitmap::BITS as usize;
  37
  38    #[inline(always)]
  39    pub fn new(text: &str) -> Self {
  40        let mut this = Chunk::default();
  41        this.push_str(text);
  42        this
  43    }
  44
  45    #[inline(always)]
  46    pub fn push_str(&mut self, text: &str) {
  47        for (char_ix, c) in text.char_indices() {
  48            let ix = self.text.len() + char_ix;
  49            self.chars |= 1 << ix;
  50            self.chars_utf16 |= 1 << ix;
  51            self.chars_utf16 |= (c.len_utf16() as Bitmap) << ix;
  52            self.newlines |= ((c == '\n') as Bitmap) << ix;
  53            self.tabs |= ((c == '\t') as Bitmap) << ix;
  54        }
  55        self.text.push_str(text);
  56    }
  57
  58    #[inline(always)]
  59    pub fn append(&mut self, slice: ChunkSlice) {
  60        if slice.is_empty() {
  61            return;
  62        };
  63
  64        let base_ix = self.text.len();
  65        self.chars |= slice.chars << base_ix;
  66        self.chars_utf16 |= slice.chars_utf16 << base_ix;
  67        self.newlines |= slice.newlines << base_ix;
  68        self.tabs |= slice.tabs << base_ix;
  69        self.text.push_str(slice.text);
  70    }
  71
  72    #[inline(always)]
  73    pub fn as_slice(&self) -> ChunkSlice<'_> {
  74        ChunkSlice {
  75            chars: self.chars,
  76            chars_utf16: self.chars_utf16,
  77            newlines: self.newlines,
  78            tabs: self.tabs,
  79            text: &self.text,
  80        }
  81    }
  82
  83    #[inline(always)]
  84    pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
  85        self.as_slice().slice(range)
  86    }
  87
  88    #[inline(always)]
  89    pub fn chars(&self) -> Bitmap {
  90        self.chars
  91    }
  92
  93    pub fn tabs(&self) -> Bitmap {
  94        self.tabs
  95    }
  96
  97    #[inline(always)]
  98    pub fn is_char_boundary(&self, offset: usize) -> bool {
  99        (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
 100    }
 101
 102    pub fn floor_char_boundary(&self, index: usize) -> usize {
 103        #[inline]
 104        pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
 105            // This is bit magic equivalent to: b < 128 || b >= 192
 106            (u8 as i8) >= -0x40
 107        }
 108
 109        if index >= self.text.len() {
 110            self.text.len()
 111        } else {
 112            let mut i = index;
 113            while i > 0 {
 114                if is_utf8_char_boundary(self.text.as_bytes()[i]) {
 115                    break;
 116                }
 117                i -= 1;
 118            }
 119
 120            i
 121        }
 122    }
 123
 124    #[track_caller]
 125    #[inline(always)]
 126    pub fn assert_char_boundary(&self, offset: usize) {
 127        if self.is_char_boundary(offset) {
 128            return;
 129        }
 130        panic_char_boundary(self, offset);
 131
 132        #[cold]
 133        #[inline(never)]
 134        #[track_caller]
 135        fn panic_char_boundary(chunk: &Chunk, offset: usize) {
 136            if offset > chunk.text.len() {
 137                panic!(
 138                    "byte index {} is out of bounds of `{:?}` (length: {})",
 139                    offset,
 140                    chunk.text,
 141                    chunk.text.len()
 142                );
 143            }
 144            // find the character
 145            let char_start = chunk.floor_char_boundary(offset);
 146            // `char_start` must be less than len and a char boundary
 147            let ch = chunk
 148                .text
 149                .get(char_start..)
 150                .unwrap()
 151                .chars()
 152                .next()
 153                .unwrap();
 154            let char_range = char_start..char_start + ch.len_utf8();
 155            panic!(
 156                "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
 157                offset, ch, char_range,
 158            );
 159        }
 160    }
 161}
 162
 163#[derive(Clone, Copy, Debug)]
 164pub struct ChunkSlice<'a> {
 165    chars: Bitmap,
 166    chars_utf16: Bitmap,
 167    newlines: Bitmap,
 168    tabs: Bitmap,
 169    text: &'a str,
 170}
 171
 172impl Into<Chunk> for ChunkSlice<'_> {
 173    fn into(self) -> Chunk {
 174        Chunk {
 175            chars: self.chars,
 176            chars_utf16: self.chars_utf16,
 177            newlines: self.newlines,
 178            tabs: self.tabs,
 179            text: self.text.try_into().unwrap(),
 180        }
 181    }
 182}
 183
 184impl<'a> ChunkSlice<'a> {
 185    #[inline(always)]
 186    pub fn is_empty(&self) -> bool {
 187        self.text.is_empty()
 188    }
 189
 190    #[inline(always)]
 191    pub fn is_char_boundary(&self, offset: usize) -> bool {
 192        (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
 193    }
 194
 195    #[inline(always)]
 196    pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
 197        if mid == MAX_BASE {
 198            let left = self;
 199            let right = ChunkSlice {
 200                chars: 0,
 201                chars_utf16: 0,
 202                newlines: 0,
 203                tabs: 0,
 204                text: "",
 205            };
 206            (left, right)
 207        } else {
 208            let mask = ((1 as Bitmap) << mid) - 1;
 209            let (left_text, right_text) = self.text.split_at(mid);
 210            let left = ChunkSlice {
 211                chars: self.chars & mask,
 212                chars_utf16: self.chars_utf16 & mask,
 213                newlines: self.newlines & mask,
 214                tabs: self.tabs & mask,
 215                text: left_text,
 216            };
 217            let right = ChunkSlice {
 218                chars: self.chars >> mid,
 219                chars_utf16: self.chars_utf16 >> mid,
 220                newlines: self.newlines >> mid,
 221                tabs: self.tabs >> mid,
 222                text: right_text,
 223            };
 224            (left, right)
 225        }
 226    }
 227
 228    #[inline(always)]
 229    pub fn slice(self, range: Range<usize>) -> Self {
 230        let mask = (1 as Bitmap)
 231            .unbounded_shl(range.end as u32)
 232            .wrapping_sub(1);
 233        if range.start == MAX_BASE {
 234            Self {
 235                chars: 0,
 236                chars_utf16: 0,
 237                newlines: 0,
 238                tabs: 0,
 239                text: "",
 240            }
 241        } else {
 242            self.assert_char_boundary(range.start);
 243            self.assert_char_boundary(range.end);
 244            Self {
 245                chars: (self.chars & mask) >> range.start,
 246                chars_utf16: (self.chars_utf16 & mask) >> range.start,
 247                newlines: (self.newlines & mask) >> range.start,
 248                tabs: (self.tabs & mask) >> range.start,
 249                text: &self.text[range],
 250            }
 251        }
 252    }
 253
 254    #[inline(always)]
 255    pub fn text_summary(&self) -> TextSummary {
 256        let mut chars = 0;
 257        let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
 258        TextSummary {
 259            len: self.len(),
 260            chars,
 261            len_utf16: self.len_utf16(),
 262            lines: self.lines(),
 263            first_line_chars: self.first_line_chars(),
 264            last_line_chars: self.last_line_chars(),
 265            last_line_len_utf16: self.last_line_len_utf16(),
 266            longest_row,
 267            longest_row_chars,
 268        }
 269    }
 270
 271    /// Get length in bytes
 272    #[inline(always)]
 273    pub fn len(&self) -> usize {
 274        self.text.len()
 275    }
 276
 277    /// Get length in UTF-16 code units
 278    #[inline(always)]
 279    pub fn len_utf16(&self) -> OffsetUtf16 {
 280        OffsetUtf16(self.chars_utf16.count_ones() as usize)
 281    }
 282
 283    /// Get point representing number of lines and length of last line
 284    #[inline(always)]
 285    pub fn lines(&self) -> Point {
 286        let row = self.newlines.count_ones();
 287        let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
 288        Point::new(row, column)
 289    }
 290
 291    /// Get number of chars in first line
 292    #[inline(always)]
 293    pub fn first_line_chars(&self) -> u32 {
 294        if self.newlines == 0 {
 295            self.chars.count_ones()
 296        } else {
 297            let mask = ((1 as Bitmap) << self.newlines.trailing_zeros()) - 1;
 298            (self.chars & mask).count_ones()
 299        }
 300    }
 301
 302    /// Get number of chars in last line
 303    #[inline(always)]
 304    pub fn last_line_chars(&self) -> u32 {
 305        if self.newlines == 0 {
 306            self.chars.count_ones()
 307        } else {
 308            let mask = !(Bitmap::MAX >> self.newlines.leading_zeros());
 309            (self.chars & mask).count_ones()
 310        }
 311    }
 312
 313    /// Get number of UTF-16 code units in last line
 314    #[inline(always)]
 315    pub fn last_line_len_utf16(&self) -> u32 {
 316        if self.newlines == 0 {
 317            self.chars_utf16.count_ones()
 318        } else {
 319            let mask = !(Bitmap::MAX >> self.newlines.leading_zeros());
 320            (self.chars_utf16 & mask).count_ones()
 321        }
 322    }
 323
 324    /// Get the longest row in the chunk and its length in characters.
 325    /// Calculate the total number of characters in the chunk along the way.
 326    #[inline(always)]
 327    pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
 328        let mut chars = self.chars;
 329        let mut newlines = self.newlines;
 330        *total_chars = 0;
 331        let mut row = 0;
 332        let mut longest_row = 0;
 333        let mut longest_row_chars = 0;
 334        while newlines > 0 {
 335            let newline_ix = newlines.trailing_zeros();
 336            let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
 337            *total_chars += usize::from(row_chars);
 338            if row_chars > longest_row_chars {
 339                longest_row = row;
 340                longest_row_chars = row_chars;
 341            }
 342
 343            newlines >>= newline_ix;
 344            newlines >>= 1;
 345            chars >>= newline_ix;
 346            chars >>= 1;
 347            row += 1;
 348            *total_chars += 1;
 349        }
 350
 351        let row_chars = chars.count_ones() as u8;
 352        *total_chars += usize::from(row_chars);
 353        if row_chars > longest_row_chars {
 354            (row, row_chars as u32)
 355        } else {
 356            (longest_row, longest_row_chars as u32)
 357        }
 358    }
 359
 360    #[inline(always)]
 361    pub fn offset_to_point(&self, offset: usize) -> Point {
 362        let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
 363        let row = (self.newlines & mask).count_ones();
 364        let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
 365        let column = (offset - newline_ix as usize) as u32;
 366        Point::new(row, column)
 367    }
 368
 369    #[inline(always)]
 370    pub fn point_to_offset(&self, point: Point) -> usize {
 371        if point.row > self.lines().row {
 372            debug_panic!(
 373                "point {:?} extends beyond rows for string {:?}",
 374                point,
 375                self.text
 376            );
 377            return self.len();
 378        }
 379
 380        let row_offset_range = self.offset_range_for_row(point.row);
 381        if point.column > row_offset_range.len() as u32 {
 382            debug_panic!(
 383                "point {:?} extends beyond row for string {:?}",
 384                point,
 385                self.text
 386            );
 387            row_offset_range.end
 388        } else {
 389            row_offset_range.start + point.column as usize
 390        }
 391    }
 392
 393    #[track_caller]
 394    #[inline(always)]
 395    pub fn assert_char_boundary(&self, offset: usize) {
 396        if self.is_char_boundary(offset) {
 397            return;
 398        }
 399        panic_char_boundary(self, offset);
 400
 401        #[cold]
 402        #[inline(never)]
 403        fn panic_char_boundary(chunk: &ChunkSlice, offset: usize) {
 404            if offset > chunk.text.len() {
 405                panic!(
 406                    "byte index {} is out of bounds of `{:?}` (length: {})",
 407                    offset,
 408                    chunk.text,
 409                    chunk.text.len()
 410                );
 411            }
 412            // find the character
 413            let char_start = chunk.floor_char_boundary(offset);
 414            // `char_start` must be less than len and a char boundary
 415            let ch = chunk
 416                .text
 417                .get(char_start..)
 418                .unwrap()
 419                .chars()
 420                .next()
 421                .unwrap();
 422            let char_range = char_start..char_start + ch.len_utf8();
 423            panic!(
 424                "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
 425                offset, ch, char_range,
 426            );
 427        }
 428    }
 429
 430    pub fn floor_char_boundary(&self, index: usize) -> usize {
 431        #[inline]
 432        pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
 433            // This is bit magic equivalent to: b < 128 || b >= 192
 434            (u8 as i8) >= -0x40
 435        }
 436
 437        if index >= self.text.len() {
 438            self.text.len()
 439        } else {
 440            let mut i = index;
 441            while i > 0 {
 442                if is_utf8_char_boundary(self.text.as_bytes()[i]) {
 443                    break;
 444                }
 445                i -= 1;
 446            }
 447
 448            i
 449        }
 450    }
 451
 452    #[inline(always)]
 453    pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
 454        if point.row > self.lines().row {
 455            debug_panic!(
 456                "point {:?} extends beyond rows for string {:?}",
 457                point,
 458                self.text
 459            );
 460            return self.len_utf16();
 461        }
 462        self.offset_to_offset_utf16(self.point_to_offset(point))
 463    }
 464
 465    #[inline(always)]
 466    pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
 467        let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
 468        OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
 469    }
 470
 471    #[inline(always)]
 472    pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
 473        if target.0 == 0 {
 474            0
 475        } else {
 476            #[cfg(not(test))]
 477            let chars_utf16 = self.chars_utf16;
 478            #[cfg(test)]
 479            let chars_utf16 = self.chars_utf16 as u128;
 480            let ix = nth_set_bit(chars_utf16, target.0) + 1;
 481            if ix == MAX_BASE {
 482                MAX_BASE
 483            } else {
 484                let utf8_additional_len = cmp::min(
 485                    (self.chars_utf16 >> ix).trailing_zeros() as usize,
 486                    self.text.len() - ix,
 487                );
 488                ix + utf8_additional_len
 489            }
 490        }
 491    }
 492
 493    #[inline(always)]
 494    pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
 495        let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
 496        let row = (self.newlines & mask).count_ones();
 497        let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
 498        let column = if newline_ix as usize == MAX_BASE {
 499            0
 500        } else {
 501            ((self.chars_utf16 & mask) >> newline_ix).count_ones()
 502        };
 503        PointUtf16::new(row, column)
 504    }
 505
 506    #[inline(always)]
 507    pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
 508        self.offset_to_point_utf16(self.point_to_offset(point))
 509    }
 510
 511    #[inline(always)]
 512    pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
 513        let lines = self.lines();
 514        if point.row > lines.row {
 515            if !clip {
 516                debug_panic!(
 517                    "point {:?} is beyond this chunk's extent {:?}",
 518                    point,
 519                    self.text
 520                );
 521            }
 522            return self.len();
 523        }
 524
 525        let row_offset_range = self.offset_range_for_row(point.row);
 526        let line = self.slice(row_offset_range.clone());
 527        if point.column > line.last_line_len_utf16() {
 528            if !clip {
 529                debug_panic!(
 530                    "point {:?} is beyond the end of the line in chunk {:?}",
 531                    point,
 532                    self.text
 533                );
 534            }
 535            return line.len();
 536        }
 537
 538        let mut offset = row_offset_range.start;
 539        if point.column > 0 {
 540            offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
 541            if !self.text.is_char_boundary(offset) {
 542                offset -= 1;
 543                while !self.text.is_char_boundary(offset) {
 544                    offset -= 1;
 545                }
 546                if !clip {
 547                    debug_panic!(
 548                        "point {:?} is within character in chunk {:?}",
 549                        point,
 550                        self.text,
 551                    );
 552                }
 553            }
 554        }
 555        offset
 556    }
 557
 558    #[inline(always)]
 559    pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
 560        let max_point = self.lines();
 561        if point.0.row > max_point.row {
 562            return max_point;
 563        }
 564
 565        let row_offset_range = self.offset_range_for_row(point.0.row);
 566        let line = self.slice(row_offset_range);
 567        if point.0.column == 0 {
 568            Point::new(point.0.row, 0)
 569        } else if point.0.column >= line.len_utf16().0 as u32 {
 570            Point::new(point.0.row, line.len() as u32)
 571        } else {
 572            let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
 573            while !line.text.is_char_boundary(column) {
 574                column -= 1;
 575            }
 576            Point::new(point.0.row, column as u32)
 577        }
 578    }
 579
 580    #[inline(always)]
 581    pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
 582        let max_point = self.lines();
 583        if point.row > max_point.row {
 584            return max_point;
 585        }
 586
 587        let line = self.slice(self.offset_range_for_row(point.row));
 588        if point.column == 0 {
 589            point
 590        } else if point.column >= line.len() as u32 {
 591            Point::new(point.row, line.len() as u32)
 592        } else {
 593            let mut column = point.column as usize;
 594            let bytes = line.text.as_bytes();
 595            if bytes[column - 1] < 128 && bytes[column] < 128 {
 596                return Point::new(point.row, column as u32);
 597            }
 598
 599            let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
 600            loop {
 601                if line.is_char_boundary(column)
 602                    && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
 603                {
 604                    break;
 605                }
 606
 607                match bias {
 608                    Bias::Left => column -= 1,
 609                    Bias::Right => column += 1,
 610                }
 611                grapheme_cursor.set_cursor(column);
 612            }
 613            Point::new(point.row, column as u32)
 614        }
 615    }
 616
 617    #[inline(always)]
 618    pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
 619        let max_point = self.lines();
 620        if point.0.row > max_point.row {
 621            PointUtf16::new(max_point.row, self.last_line_len_utf16())
 622        } else {
 623            let line = self.slice(self.offset_range_for_row(point.0.row));
 624            let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
 625            PointUtf16::new(point.0.row, column.0 as u32)
 626        }
 627    }
 628
 629    #[inline(always)]
 630    pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
 631        if target == OffsetUtf16::default() {
 632            OffsetUtf16::default()
 633        } else if target >= self.len_utf16() {
 634            self.len_utf16()
 635        } else {
 636            let mut offset = self.offset_utf16_to_offset(target);
 637            while !self.text.is_char_boundary(offset) {
 638                if bias == Bias::Left {
 639                    offset -= 1;
 640                } else {
 641                    offset += 1;
 642                }
 643            }
 644            self.offset_to_offset_utf16(offset)
 645        }
 646    }
 647
 648    #[inline(always)]
 649    fn offset_range_for_row(&self, row: u32) -> Range<usize> {
 650        let row_start = if row > 0 {
 651            #[cfg(not(test))]
 652            let newlines = self.newlines;
 653            #[cfg(test)]
 654            let newlines = self.newlines as u128;
 655            nth_set_bit(newlines, row as usize) + 1
 656        } else {
 657            0
 658        };
 659        let row_len = if row_start == MAX_BASE {
 660            0
 661        } else {
 662            cmp::min(
 663                (self.newlines >> row_start).trailing_zeros(),
 664                (self.text.len() - row_start) as u32,
 665            )
 666        };
 667        row_start..row_start + row_len as usize
 668    }
 669
 670    #[inline(always)]
 671    pub fn tabs(&self) -> Tabs {
 672        Tabs {
 673            tabs: self.tabs,
 674            chars: self.chars,
 675        }
 676    }
 677}
 678
 679pub struct Tabs {
 680    tabs: Bitmap,
 681    chars: Bitmap,
 682}
 683
 684#[derive(Debug, PartialEq, Eq)]
 685pub struct TabPosition {
 686    pub byte_offset: usize,
 687    pub char_offset: usize,
 688}
 689
 690impl Iterator for Tabs {
 691    type Item = TabPosition;
 692
 693    fn next(&mut self) -> Option<Self::Item> {
 694        if self.tabs == 0 {
 695            return None;
 696        }
 697
 698        let tab_offset = self.tabs.trailing_zeros() as usize;
 699        let chars_mask = (1 << tab_offset) - 1;
 700        let char_offset = (self.chars & chars_mask).count_ones() as usize;
 701
 702        // Since tabs are 1 byte the tab offset is the same as the byte offset
 703        let position = TabPosition {
 704            byte_offset: tab_offset,
 705            char_offset,
 706        };
 707        // Remove the tab we've just seen
 708        self.tabs ^= 1 << tab_offset;
 709
 710        Some(position)
 711    }
 712}
 713
 714/// Finds the n-th bit that is set to 1.
 715#[inline(always)]
 716fn nth_set_bit(v: u128, n: usize) -> usize {
 717    let low = v as u64;
 718    let high = (v >> 64) as u64;
 719
 720    let low_count = low.count_ones() as usize;
 721    if n > low_count {
 722        64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
 723    } else {
 724        nth_set_bit_u64(low, n as u64) as usize
 725    }
 726}
 727
 728#[inline(always)]
 729fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
 730    let v = v.reverse_bits();
 731    let mut s: u64 = 64;
 732
 733    // Parallel bit count intermediates
 734    let a = v - ((v >> 1) & (u64::MAX / 3));
 735    let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
 736    let c = (b + (b >> 4)) & (u64::MAX / 0x11);
 737    let d = (c + (c >> 8)) & (u64::MAX / 0x101);
 738
 739    // Branchless select
 740    let t = (d >> 32) + (d >> 48);
 741    s -= (t.wrapping_sub(n) & 256) >> 3;
 742    n -= t & (t.wrapping_sub(n) >> 8);
 743
 744    let t = (d >> (s - 16)) & 0xff;
 745    s -= (t.wrapping_sub(n) & 256) >> 4;
 746    n -= t & (t.wrapping_sub(n) >> 8);
 747
 748    let t = (c >> (s - 8)) & 0xf;
 749    s -= (t.wrapping_sub(n) & 256) >> 5;
 750    n -= t & (t.wrapping_sub(n) >> 8);
 751
 752    let t = (b >> (s - 4)) & 0x7;
 753    s -= (t.wrapping_sub(n) & 256) >> 6;
 754    n -= t & (t.wrapping_sub(n) >> 8);
 755
 756    let t = (a >> (s - 2)) & 0x3;
 757    s -= (t.wrapping_sub(n) & 256) >> 7;
 758    n -= t & (t.wrapping_sub(n) >> 8);
 759
 760    let t = (v >> (s - 1)) & 0x1;
 761    s -= (t.wrapping_sub(n) & 256) >> 8;
 762
 763    65 - s - 1
 764}
 765
 766#[cfg(test)]
 767mod tests {
 768    use super::*;
 769    use rand::prelude::*;
 770    use util::RandomCharIter;
 771
 772    #[gpui::test(iterations = 100)]
 773    fn test_random_chunks(mut rng: StdRng) {
 774        let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
 775        log::info!("Chunk: {:?}", text);
 776        let chunk = Chunk::new(&text);
 777        verify_chunk(chunk.as_slice(), &text);
 778
 779        // Verify Chunk::chars() bitmap
 780        let expected_chars = char_offsets(&text)
 781            .into_iter()
 782            .inspect(|i| assert!(*i < MAX_BASE))
 783            .fold(0 as Bitmap, |acc, i| acc | (1 << i));
 784        assert_eq!(chunk.chars(), expected_chars);
 785
 786        for _ in 0..10 {
 787            let mut start = rng.random_range(0..=chunk.text.len());
 788            let mut end = rng.random_range(start..=chunk.text.len());
 789            while !chunk.text.is_char_boundary(start) {
 790                start -= 1;
 791            }
 792            while !chunk.text.is_char_boundary(end) {
 793                end -= 1;
 794            }
 795            let range = start..end;
 796            log::info!("Range: {:?}", range);
 797            let text_slice = &text[range.clone()];
 798            let chunk_slice = chunk.slice(range);
 799            verify_chunk(chunk_slice, text_slice);
 800        }
 801    }
 802
 803    #[gpui::test(iterations = 100)]
 804    fn test_split_chunk_slice(mut rng: StdRng) {
 805        let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
 806        let chunk = Chunk::new(text);
 807        let offset = char_offsets_with_end(text)
 808            .into_iter()
 809            .choose(&mut rng)
 810            .unwrap();
 811        let (a, b) = chunk.as_slice().split_at(offset);
 812        let (a_str, b_str) = text.split_at(offset);
 813        verify_chunk(a, a_str);
 814        verify_chunk(b, b_str);
 815    }
 816
 817    #[gpui::test(iterations = 1000)]
 818    fn test_nth_set_bit_random(mut rng: StdRng) {
 819        let set_count = rng.random_range(0..=128);
 820        let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
 821        set_bits.sort();
 822        let mut n = 0;
 823        for ix in set_bits.iter().copied() {
 824            n |= 1 << ix;
 825        }
 826
 827        for (mut ix, position) in set_bits.into_iter().enumerate() {
 828            ix += 1;
 829            assert_eq!(
 830                nth_set_bit(n, ix),
 831                position,
 832                "nth_set_bit({:0128b}, {})",
 833                n,
 834                ix
 835            );
 836        }
 837    }
 838
 839    /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
 840    fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
 841        let mut str = String::new();
 842        let mut chars = RandomCharIter::new(rng);
 843        loop {
 844            let ch = chars.next().unwrap();
 845            if str.len() + ch.len_utf8() > len {
 846                break;
 847            }
 848            str.push(ch);
 849        }
 850        str
 851    }
 852
 853    #[gpui::test(iterations = 1000)]
 854    fn test_append_random_strings(mut rng: StdRng) {
 855        let len1 = rng.random_range(0..=MAX_BASE);
 856        let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
 857        let str1 = random_string_with_utf8_len(&mut rng, len1);
 858        let str2 = random_string_with_utf8_len(&mut rng, len2);
 859        let mut chunk1 = Chunk::new(&str1);
 860        let chunk2 = Chunk::new(&str2);
 861        let char_offsets = char_offsets_with_end(&str2);
 862        let start_index = rng.random_range(0..char_offsets.len());
 863        let start_offset = char_offsets[start_index];
 864        let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
 865        chunk1.append(chunk2.slice(start_offset..end_offset));
 866        verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
 867    }
 868
 869    /// Return the byte offsets for each character in a string.
 870    ///
 871    /// These are valid offsets to split the string.
 872    fn char_offsets(text: &str) -> Vec<usize> {
 873        text.char_indices().map(|(i, _c)| i).collect()
 874    }
 875
 876    /// Return the byte offsets for each character in a string, plus the offset
 877    /// past the end of the string.
 878    fn char_offsets_with_end(text: &str) -> Vec<usize> {
 879        let mut v = char_offsets(text);
 880        v.push(text.len());
 881        v
 882    }
 883
 884    fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
 885        let mut offset = 0;
 886        let mut offset_utf16 = OffsetUtf16(0);
 887        let mut point = Point::zero();
 888        let mut point_utf16 = PointUtf16::zero();
 889
 890        log::info!("Verifying chunk {:?}", text);
 891        assert_eq!(chunk.offset_to_point(0), Point::zero());
 892
 893        let mut expected_tab_positions = Vec::new();
 894
 895        for (char_offset, c) in text.chars().enumerate() {
 896            let expected_point = chunk.offset_to_point(offset);
 897            assert_eq!(point, expected_point, "mismatch at offset {}", offset);
 898            assert_eq!(
 899                chunk.point_to_offset(point),
 900                offset,
 901                "mismatch at point {:?}",
 902                point
 903            );
 904            assert_eq!(
 905                chunk.offset_to_offset_utf16(offset),
 906                offset_utf16,
 907                "mismatch at offset {}",
 908                offset
 909            );
 910            assert_eq!(
 911                chunk.offset_utf16_to_offset(offset_utf16),
 912                offset,
 913                "mismatch at offset_utf16 {:?}",
 914                offset_utf16
 915            );
 916            assert_eq!(
 917                chunk.point_to_point_utf16(point),
 918                point_utf16,
 919                "mismatch at point {:?}",
 920                point
 921            );
 922            assert_eq!(
 923                chunk.point_utf16_to_offset(point_utf16, false),
 924                offset,
 925                "mismatch at point_utf16 {:?}",
 926                point_utf16
 927            );
 928            assert_eq!(
 929                chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
 930                point,
 931                "mismatch for unclipped_point_utf16_to_point at {:?}",
 932                point_utf16
 933            );
 934
 935            assert_eq!(
 936                chunk.clip_point(point, Bias::Left),
 937                point,
 938                "incorrect left clip at {:?}",
 939                point
 940            );
 941            assert_eq!(
 942                chunk.clip_point(point, Bias::Right),
 943                point,
 944                "incorrect right clip at {:?}",
 945                point
 946            );
 947
 948            for i in 1..c.len_utf8() {
 949                let test_point = Point::new(point.row, point.column + i as u32);
 950                assert_eq!(
 951                    chunk.clip_point(test_point, Bias::Left),
 952                    point,
 953                    "incorrect left clip within multi-byte char at {:?}",
 954                    test_point
 955                );
 956                assert_eq!(
 957                    chunk.clip_point(test_point, Bias::Right),
 958                    Point::new(point.row, point.column + c.len_utf8() as u32),
 959                    "incorrect right clip within multi-byte char at {:?}",
 960                    test_point
 961                );
 962            }
 963
 964            for i in 1..c.len_utf16() {
 965                let test_point = Unclipped(PointUtf16::new(
 966                    point_utf16.row,
 967                    point_utf16.column + i as u32,
 968                ));
 969                assert_eq!(
 970                    chunk.unclipped_point_utf16_to_point(test_point),
 971                    point,
 972                    "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
 973                    test_point
 974                );
 975                assert_eq!(
 976                    chunk.clip_point_utf16(test_point, Bias::Left),
 977                    point_utf16,
 978                    "incorrect left clip_point_utf16 within multi-byte char at {:?}",
 979                    test_point
 980                );
 981                assert_eq!(
 982                    chunk.clip_point_utf16(test_point, Bias::Right),
 983                    PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
 984                    "incorrect right clip_point_utf16 within multi-byte char at {:?}",
 985                    test_point
 986                );
 987
 988                let test_offset = OffsetUtf16(offset_utf16.0 + i);
 989                assert_eq!(
 990                    chunk.clip_offset_utf16(test_offset, Bias::Left),
 991                    offset_utf16,
 992                    "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
 993                    test_offset
 994                );
 995                assert_eq!(
 996                    chunk.clip_offset_utf16(test_offset, Bias::Right),
 997                    OffsetUtf16(offset_utf16.0 + c.len_utf16()),
 998                    "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
 999                    test_offset
1000                );
1001            }
1002
1003            if c == '\n' {
1004                point.row += 1;
1005                point.column = 0;
1006                point_utf16.row += 1;
1007                point_utf16.column = 0;
1008            } else {
1009                point.column += c.len_utf8() as u32;
1010                point_utf16.column += c.len_utf16() as u32;
1011            }
1012
1013            if c == '\t' {
1014                expected_tab_positions.push(TabPosition {
1015                    byte_offset: offset,
1016                    char_offset,
1017                });
1018            }
1019
1020            offset += c.len_utf8();
1021            offset_utf16.0 += c.len_utf16();
1022        }
1023
1024        let final_point = chunk.offset_to_point(offset);
1025        assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1026        assert_eq!(
1027            chunk.point_to_offset(point),
1028            offset,
1029            "mismatch at point {:?}",
1030            point
1031        );
1032        assert_eq!(
1033            chunk.offset_to_offset_utf16(offset),
1034            offset_utf16,
1035            "mismatch at offset {}",
1036            offset
1037        );
1038        assert_eq!(
1039            chunk.offset_utf16_to_offset(offset_utf16),
1040            offset,
1041            "mismatch at offset_utf16 {:?}",
1042            offset_utf16
1043        );
1044        assert_eq!(
1045            chunk.point_to_point_utf16(point),
1046            point_utf16,
1047            "mismatch at final point {:?}",
1048            point
1049        );
1050        assert_eq!(
1051            chunk.point_utf16_to_offset(point_utf16, false),
1052            offset,
1053            "mismatch at final point_utf16 {:?}",
1054            point_utf16
1055        );
1056        assert_eq!(
1057            chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1058            point,
1059            "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1060            point_utf16
1061        );
1062        assert_eq!(
1063            chunk.clip_point(point, Bias::Left),
1064            point,
1065            "incorrect left clip at final point {:?}",
1066            point
1067        );
1068        assert_eq!(
1069            chunk.clip_point(point, Bias::Right),
1070            point,
1071            "incorrect right clip at final point {:?}",
1072            point
1073        );
1074        assert_eq!(
1075            chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1076            point_utf16,
1077            "incorrect left clip_point_utf16 at final point {:?}",
1078            point_utf16
1079        );
1080        assert_eq!(
1081            chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1082            point_utf16,
1083            "incorrect right clip_point_utf16 at final point {:?}",
1084            point_utf16
1085        );
1086        assert_eq!(
1087            chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1088            offset_utf16,
1089            "incorrect left clip_offset_utf16 at final offset {:?}",
1090            offset_utf16
1091        );
1092        assert_eq!(
1093            chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1094            offset_utf16,
1095            "incorrect right clip_offset_utf16 at final offset {:?}",
1096            offset_utf16
1097        );
1098
1099        // Verify length methods
1100        assert_eq!(chunk.len(), text.len());
1101        assert_eq!(
1102            chunk.len_utf16().0,
1103            text.chars().map(|c| c.len_utf16()).sum::<usize>()
1104        );
1105
1106        // Verify line counting
1107        let lines = chunk.lines();
1108        let mut newline_count = 0;
1109        let mut last_line_len = 0;
1110        for c in text.chars() {
1111            if c == '\n' {
1112                newline_count += 1;
1113                last_line_len = 0;
1114            } else {
1115                last_line_len += c.len_utf8() as u32;
1116            }
1117        }
1118        assert_eq!(lines, Point::new(newline_count, last_line_len));
1119
1120        // Verify first/last line chars
1121        if !text.is_empty() {
1122            let first_line = text.split('\n').next().unwrap();
1123            assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1124
1125            let last_line = text.split('\n').next_back().unwrap();
1126            assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1127            assert_eq!(
1128                chunk.last_line_len_utf16(),
1129                last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1130            );
1131        }
1132
1133        // Verify longest row
1134        let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1135        let mut max_chars = 0;
1136        let mut current_row = 0;
1137        let mut current_chars = 0;
1138        let mut max_row = 0;
1139
1140        for c in text.chars() {
1141            if c == '\n' {
1142                if current_chars > max_chars {
1143                    max_chars = current_chars;
1144                    max_row = current_row;
1145                }
1146                current_row += 1;
1147                current_chars = 0;
1148            } else {
1149                current_chars += 1;
1150            }
1151        }
1152
1153        if current_chars > max_chars {
1154            max_chars = current_chars;
1155            max_row = current_row;
1156        }
1157
1158        assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1159        assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1160    }
1161}