chunk.rs

   1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
   2use arrayvec::ArrayString;
   3use std::{cmp, ops::Range};
   4use sum_tree::Bias;
   5use unicode_segmentation::GraphemeCursor;
   6use util::debug_panic;
   7
   8#[cfg(not(all(test, not(rust_analyzer))))]
   9pub(crate) type Bitmap = u128;
  10#[cfg(all(test, not(rust_analyzer)))]
  11pub(crate) type Bitmap = u16;
  12
  13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
  14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
  15
  16#[derive(Clone, Debug, Default)]
  17pub struct Chunk {
  18    /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
  19    /// text.
  20    chars: Bitmap,
  21    /// The number of set bits is the number of UTF-16 code units it would take to represent the
  22    /// text.
  23    ///
  24    /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
  25    /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
  26    /// than two UTF-16 code units.)
  27    chars_utf16: Bitmap,
  28    /// If bit[i] is set, then the character at index i is an ascii newline.
  29    newlines: Bitmap,
  30    /// If bit[i] is set, then the character at index i is an ascii tab.
  31    tabs: Bitmap,
  32    pub text: ArrayString<MAX_BASE>,
  33}
  34
  35#[inline(always)]
  36const fn saturating_shl_mask(offset: u32) -> Bitmap {
  37    (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
  38}
  39
  40#[inline(always)]
  41const fn saturating_shr_mask(offset: u32) -> Bitmap {
  42    !Bitmap::MAX.unbounded_shr(offset)
  43}
  44
  45impl Chunk {
  46    pub const MASK_BITS: usize = Bitmap::BITS as usize;
  47
  48    #[inline(always)]
  49    pub fn new(text: &str) -> Self {
  50        let text = ArrayString::from(text).unwrap();
  51
  52        const CHUNK_SIZE: usize = 8;
  53
  54        let mut chars_bytes = [0; MAX_BASE / CHUNK_SIZE];
  55        let mut newlines_bytes = [0; MAX_BASE / CHUNK_SIZE];
  56        let mut tabs_bytes = [0; MAX_BASE / CHUNK_SIZE];
  57        let mut chars_utf16_bytes = [0; MAX_BASE / CHUNK_SIZE];
  58
  59        let mut chunk_ix = 0;
  60
  61        let mut bytes = text.as_bytes();
  62        while !bytes.is_empty() {
  63            let (chunk, rest) = bytes.split_at(bytes.len().min(CHUNK_SIZE));
  64            bytes = rest;
  65
  66            let mut chars = 0;
  67            let mut newlines = 0;
  68            let mut tabs = 0;
  69            let mut chars_utf16 = 0;
  70
  71            for (ix, &b) in chunk.iter().enumerate() {
  72                chars |= (util::is_utf8_char_boundary(b) as u8) << ix;
  73                newlines |= ((b == b'\n') as u8) << ix;
  74                tabs |= ((b == b'\t') as u8) << ix;
  75                // b >= 240 when we are at the first byte of the 4 byte encoded
  76                // utf-8 code point (U+010000 or greater) it means that it would
  77                // be encoded as two 16-bit code units in utf-16
  78                chars_utf16 |= ((b >= 240) as u8) << ix;
  79            }
  80
  81            chars_bytes[chunk_ix] = chars;
  82            newlines_bytes[chunk_ix] = newlines;
  83            tabs_bytes[chunk_ix] = tabs;
  84            chars_utf16_bytes[chunk_ix] = chars_utf16;
  85
  86            chunk_ix += 1;
  87        }
  88
  89        let chars = Bitmap::from_le_bytes(chars_bytes);
  90
  91        Chunk {
  92            text,
  93            chars,
  94            chars_utf16: (Bitmap::from_le_bytes(chars_utf16_bytes) << 1) | chars,
  95            newlines: Bitmap::from_le_bytes(newlines_bytes),
  96            tabs: Bitmap::from_le_bytes(tabs_bytes),
  97        }
  98    }
  99
 100    #[inline(always)]
 101    pub fn push_str(&mut self, text: &str) {
 102        self.append(Chunk::new(text).as_slice());
 103    }
 104
 105    #[inline(always)]
 106    pub fn append(&mut self, slice: ChunkSlice) {
 107        if slice.is_empty() {
 108            return;
 109        };
 110
 111        let base_ix = self.text.len();
 112        self.chars |= slice.chars << base_ix;
 113        self.chars_utf16 |= slice.chars_utf16 << base_ix;
 114        self.newlines |= slice.newlines << base_ix;
 115        self.tabs |= slice.tabs << base_ix;
 116        self.text.push_str(slice.text);
 117    }
 118
 119    #[inline(always)]
 120    pub fn as_slice(&self) -> ChunkSlice<'_> {
 121        ChunkSlice {
 122            chars: self.chars,
 123            chars_utf16: self.chars_utf16,
 124            newlines: self.newlines,
 125            tabs: self.tabs,
 126            text: &self.text,
 127        }
 128    }
 129
 130    #[inline(always)]
 131    pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
 132        self.as_slice().slice(range)
 133    }
 134
 135    #[inline(always)]
 136    pub fn chars(&self) -> Bitmap {
 137        self.chars
 138    }
 139
 140    #[inline(always)]
 141    pub fn tabs(&self) -> Bitmap {
 142        self.tabs
 143    }
 144
 145    #[inline(always)]
 146    pub fn newlines(&self) -> Bitmap {
 147        self.newlines
 148    }
 149
 150    #[inline(always)]
 151    pub fn is_char_boundary(&self, offset: usize) -> bool {
 152        (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
 153    }
 154
 155    pub fn floor_char_boundary(&self, index: usize) -> usize {
 156        if index >= self.text.len() {
 157            self.text.len()
 158        } else {
 159            let mut i = index;
 160            while i > 0 {
 161                if util::is_utf8_char_boundary(self.text.as_bytes()[i]) {
 162                    break;
 163                }
 164                i -= 1;
 165            }
 166
 167            i
 168        }
 169    }
 170
 171    #[track_caller]
 172    #[inline(always)]
 173    pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
 174        if self.is_char_boundary(offset) {
 175            return true;
 176        }
 177        if PANIC || cfg!(debug_assertions) {
 178            panic_char_boundary(&self.text, offset);
 179        } else {
 180            log_err_char_boundary(&self.text, offset);
 181            false
 182        }
 183    }
 184}
 185
 186#[derive(Clone, Copy, Debug)]
 187pub struct ChunkSlice<'a> {
 188    chars: Bitmap,
 189    chars_utf16: Bitmap,
 190    newlines: Bitmap,
 191    tabs: Bitmap,
 192    text: &'a str,
 193}
 194
 195impl Into<Chunk> for ChunkSlice<'_> {
 196    fn into(self) -> Chunk {
 197        Chunk {
 198            chars: self.chars,
 199            chars_utf16: self.chars_utf16,
 200            newlines: self.newlines,
 201            tabs: self.tabs,
 202            text: self.text.try_into().unwrap(),
 203        }
 204    }
 205}
 206
 207impl<'a> ChunkSlice<'a> {
 208    #[inline(always)]
 209    pub fn is_empty(&self) -> bool {
 210        self.text.is_empty()
 211    }
 212
 213    #[inline(always)]
 214    pub fn is_char_boundary(&self, offset: usize) -> bool {
 215        (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
 216    }
 217
 218    #[inline(always)]
 219    pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
 220        if mid == MAX_BASE {
 221            let left = self;
 222            let right = ChunkSlice {
 223                chars: 0,
 224                chars_utf16: 0,
 225                newlines: 0,
 226                tabs: 0,
 227                text: "",
 228            };
 229            (left, right)
 230        } else {
 231            let mask = ((1 as Bitmap) << mid) - 1;
 232            let (left_text, right_text) = self.text.split_at(mid);
 233            let left = ChunkSlice {
 234                chars: self.chars & mask,
 235                chars_utf16: self.chars_utf16 & mask,
 236                newlines: self.newlines & mask,
 237                tabs: self.tabs & mask,
 238                text: left_text,
 239            };
 240            let right = ChunkSlice {
 241                chars: self.chars >> mid,
 242                chars_utf16: self.chars_utf16 >> mid,
 243                newlines: self.newlines >> mid,
 244                tabs: self.tabs >> mid,
 245                text: right_text,
 246            };
 247            (left, right)
 248        }
 249    }
 250
 251    #[inline(always)]
 252    pub fn slice(self, mut range: Range<usize>) -> Self {
 253        if range.start == MAX_BASE {
 254            Self {
 255                chars: 0,
 256                chars_utf16: 0,
 257                newlines: 0,
 258                tabs: 0,
 259                text: "",
 260            }
 261        } else {
 262            if !self.assert_char_boundary::<false>(range.start) {
 263                range.start = self.text.ceil_char_boundary(range.start);
 264            }
 265            if !self.assert_char_boundary::<false>(range.end) {
 266                range.end = if range.end < range.start {
 267                    range.start
 268                } else {
 269                    self.text.floor_char_boundary(range.end)
 270                };
 271            }
 272            let mask = (1 as Bitmap)
 273                .unbounded_shl(range.end as u32)
 274                .wrapping_sub(1);
 275            Self {
 276                chars: (self.chars & mask) >> range.start,
 277                chars_utf16: (self.chars_utf16 & mask) >> range.start,
 278                newlines: (self.newlines & mask) >> range.start,
 279                tabs: (self.tabs & mask) >> range.start,
 280                text: &self.text[range],
 281            }
 282        }
 283    }
 284
 285    #[inline(always)]
 286    pub fn text_summary(&self) -> TextSummary {
 287        let mut chars = 0;
 288        let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
 289        TextSummary {
 290            len: self.len(),
 291            chars,
 292            len_utf16: self.len_utf16(),
 293            lines: self.lines(),
 294            first_line_chars: self.first_line_chars(),
 295            last_line_chars: self.last_line_chars(),
 296            last_line_len_utf16: self.last_line_len_utf16(),
 297            longest_row,
 298            longest_row_chars,
 299        }
 300    }
 301
 302    /// Get length in bytes
 303    #[inline(always)]
 304    pub fn len(&self) -> usize {
 305        self.text.len()
 306    }
 307
 308    /// Get length in UTF-16 code units
 309    #[inline(always)]
 310    pub fn len_utf16(&self) -> OffsetUtf16 {
 311        OffsetUtf16(self.chars_utf16.count_ones() as usize)
 312    }
 313
 314    /// Get point representing number of lines and length of last line
 315    #[inline(always)]
 316    pub fn lines(&self) -> Point {
 317        let row = self.newlines.count_ones();
 318        let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
 319        Point::new(row, column)
 320    }
 321
 322    /// Get number of chars in first line
 323    #[inline(always)]
 324    pub fn first_line_chars(&self) -> u32 {
 325        (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
 326    }
 327
 328    /// Get number of chars in last line
 329    #[inline(always)]
 330    pub fn last_line_chars(&self) -> u32 {
 331        (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
 332    }
 333
 334    /// Get number of UTF-16 code units in last line
 335    #[inline(always)]
 336    pub fn last_line_len_utf16(&self) -> u32 {
 337        (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
 338    }
 339
 340    /// Get the longest row in the chunk and its length in characters.
 341    /// Calculate the total number of characters in the chunk along the way.
 342    #[inline(always)]
 343    pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
 344        let mut chars = self.chars;
 345        let mut newlines = self.newlines;
 346        *total_chars = 0;
 347        let mut row = 0;
 348        let mut longest_row = 0;
 349        let mut longest_row_chars = 0;
 350        while newlines > 0 {
 351            let newline_ix = newlines.trailing_zeros();
 352            let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
 353            *total_chars += usize::from(row_chars);
 354            if row_chars > longest_row_chars {
 355                longest_row = row;
 356                longest_row_chars = row_chars;
 357            }
 358
 359            newlines >>= newline_ix;
 360            newlines >>= 1;
 361            chars >>= newline_ix;
 362            chars >>= 1;
 363            row += 1;
 364            *total_chars += 1;
 365        }
 366
 367        let row_chars = chars.count_ones() as u8;
 368        *total_chars += usize::from(row_chars);
 369        if row_chars > longest_row_chars {
 370            (row, row_chars as u32)
 371        } else {
 372            (longest_row, longest_row_chars as u32)
 373        }
 374    }
 375
 376    #[inline(always)]
 377    pub fn offset_to_point(&self, offset: usize) -> Point {
 378        let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
 379        let row = (self.newlines & mask).count_ones();
 380        let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
 381        let column = (offset - newline_ix as usize) as u32;
 382        Point::new(row, column)
 383    }
 384
 385    #[inline(always)]
 386    pub fn point_to_offset(&self, point: Point) -> usize {
 387        if point.row > self.lines().row {
 388            debug_panic!(
 389                "point {:?} extends beyond rows for string {:?}",
 390                point,
 391                self.text
 392            );
 393            return self.len();
 394        }
 395
 396        let row_offset_range = self.offset_range_for_row(point.row);
 397        if point.column > row_offset_range.len() as u32 {
 398            debug_panic!(
 399                "point {:?} extends beyond row for string {:?}",
 400                point,
 401                self.text
 402            );
 403            row_offset_range.end
 404        } else {
 405            row_offset_range.start + point.column as usize
 406        }
 407    }
 408
 409    #[track_caller]
 410    #[inline(always)]
 411    pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
 412        if self.is_char_boundary(offset) {
 413            return true;
 414        }
 415        if PANIC {
 416            panic_char_boundary(self.text, offset);
 417        } else {
 418            log_err_char_boundary(self.text, offset);
 419            false
 420        }
 421    }
 422
 423    pub fn floor_char_boundary(&self, index: usize) -> usize {
 424        self.text.floor_char_boundary(index)
 425    }
 426
 427    #[inline(always)]
 428    pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
 429        if point.row > self.lines().row {
 430            debug_panic!(
 431                "point {:?} extends beyond rows for string {:?}",
 432                point,
 433                self.text
 434            );
 435            return self.len_utf16();
 436        }
 437        self.offset_to_offset_utf16(self.point_to_offset(point))
 438    }
 439
 440    #[inline(always)]
 441    pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
 442        let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
 443        OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
 444    }
 445
 446    #[inline(always)]
 447    pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
 448        if target.0 == 0 {
 449            0
 450        } else {
 451            #[cfg(not(test))]
 452            let chars_utf16 = self.chars_utf16;
 453            #[cfg(test)]
 454            let chars_utf16 = self.chars_utf16 as u128;
 455            let ix = nth_set_bit(chars_utf16, target.0) + 1;
 456            if ix == MAX_BASE {
 457                MAX_BASE
 458            } else {
 459                let utf8_additional_len = cmp::min(
 460                    (self.chars_utf16 >> ix).trailing_zeros() as usize,
 461                    self.text.len() - ix,
 462                );
 463                ix + utf8_additional_len
 464            }
 465        }
 466    }
 467
 468    #[inline(always)]
 469    pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
 470        let mask = saturating_shl_mask(offset as u32);
 471        let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
 472        let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
 473        let column = if newline_ix as usize == MAX_BASE {
 474            0
 475        } else {
 476            ((self.chars_utf16 & mask) >> newline_ix).count_ones()
 477        };
 478        PointUtf16::new(row, column)
 479    }
 480
 481    #[inline(always)]
 482    pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
 483        self.offset_to_point_utf16(self.point_to_offset(point))
 484    }
 485
 486    #[inline(always)]
 487    pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
 488        let lines = self.lines();
 489        if point.row > lines.row {
 490            if !clip {
 491                debug_panic!(
 492                    "point {:?} is beyond this chunk's extent {:?}",
 493                    point,
 494                    self.text
 495                );
 496            }
 497            return self.len();
 498        }
 499
 500        let row_offset_range = self.offset_range_for_row(point.row);
 501        let line = self.slice(row_offset_range.clone());
 502        if point.column > line.last_line_len_utf16() {
 503            if !clip {
 504                debug_panic!(
 505                    "point {:?} is beyond the end of the line in chunk {:?}",
 506                    point,
 507                    self.text
 508                );
 509            }
 510            return row_offset_range.end;
 511        }
 512
 513        let mut offset = row_offset_range.start;
 514        if point.column > 0 {
 515            offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
 516            if !self.text.is_char_boundary(offset) {
 517                offset -= 1;
 518                while !self.text.is_char_boundary(offset) {
 519                    offset -= 1;
 520                }
 521                if !clip {
 522                    debug_panic!(
 523                        "point {:?} is within character in chunk {:?}",
 524                        point,
 525                        self.text,
 526                    );
 527                }
 528            }
 529        }
 530        offset
 531    }
 532
 533    #[inline(always)]
 534    pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
 535        let max_point = self.lines();
 536        if point.0.row > max_point.row {
 537            return max_point;
 538        }
 539
 540        let row_offset_range = self.offset_range_for_row(point.0.row);
 541        let line = self.slice(row_offset_range);
 542        if point.0.column == 0 {
 543            Point::new(point.0.row, 0)
 544        } else if point.0.column >= line.len_utf16().0 as u32 {
 545            Point::new(point.0.row, line.len() as u32)
 546        } else {
 547            let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
 548            while !line.text.is_char_boundary(column) {
 549                column -= 1;
 550            }
 551            Point::new(point.0.row, column as u32)
 552        }
 553    }
 554
 555    #[inline(always)]
 556    pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
 557        let max_point = self.lines();
 558        if point.row > max_point.row {
 559            return max_point;
 560        }
 561
 562        let line = self.slice(self.offset_range_for_row(point.row));
 563        if point.column == 0 {
 564            point
 565        } else if point.column >= line.len() as u32 {
 566            Point::new(point.row, line.len() as u32)
 567        } else {
 568            let mut column = point.column as usize;
 569            let bytes = line.text.as_bytes();
 570            if bytes[column - 1] < 128 && bytes[column] < 128 {
 571                return Point::new(point.row, column as u32);
 572            }
 573
 574            let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
 575            loop {
 576                if line.is_char_boundary(column)
 577                    && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
 578                {
 579                    break;
 580                }
 581
 582                match bias {
 583                    Bias::Left => column -= 1,
 584                    Bias::Right => column += 1,
 585                }
 586                grapheme_cursor.set_cursor(column);
 587            }
 588            Point::new(point.row, column as u32)
 589        }
 590    }
 591
 592    #[inline(always)]
 593    pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
 594        let max_point = self.lines();
 595        if point.0.row > max_point.row {
 596            PointUtf16::new(max_point.row, self.last_line_len_utf16())
 597        } else {
 598            let line = self.slice(self.offset_range_for_row(point.0.row));
 599            let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
 600            PointUtf16::new(point.0.row, column.0 as u32)
 601        }
 602    }
 603
 604    #[inline(always)]
 605    pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
 606        if target == OffsetUtf16::default() {
 607            OffsetUtf16::default()
 608        } else if target >= self.len_utf16() {
 609            self.len_utf16()
 610        } else {
 611            let mut offset = self.offset_utf16_to_offset(target);
 612            while !self.text.is_char_boundary(offset) {
 613                if bias == Bias::Left {
 614                    offset -= 1;
 615                } else {
 616                    offset += 1;
 617                }
 618            }
 619            self.offset_to_offset_utf16(offset)
 620        }
 621    }
 622
 623    #[inline(always)]
 624    fn offset_range_for_row(&self, row: u32) -> Range<usize> {
 625        let row_start = if row > 0 {
 626            #[cfg(not(test))]
 627            let newlines = self.newlines;
 628            #[cfg(test)]
 629            let newlines = self.newlines as u128;
 630            nth_set_bit(newlines, row as usize) + 1
 631        } else {
 632            0
 633        };
 634        let row_len = if row_start == MAX_BASE {
 635            0
 636        } else {
 637            cmp::min(
 638                (self.newlines >> row_start).trailing_zeros(),
 639                (self.text.len() - row_start) as u32,
 640            )
 641        };
 642        row_start..row_start + row_len as usize
 643    }
 644
 645    #[inline(always)]
 646    pub fn tabs(&self) -> Tabs {
 647        Tabs {
 648            tabs: self.tabs,
 649            chars: self.chars,
 650        }
 651    }
 652}
 653
 654pub struct Tabs {
 655    tabs: Bitmap,
 656    chars: Bitmap,
 657}
 658
 659#[derive(Debug, PartialEq, Eq)]
 660pub struct TabPosition {
 661    pub byte_offset: usize,
 662    pub char_offset: usize,
 663}
 664
 665impl Iterator for Tabs {
 666    type Item = TabPosition;
 667
 668    fn next(&mut self) -> Option<Self::Item> {
 669        if self.tabs == 0 {
 670            return None;
 671        }
 672
 673        let tab_offset = self.tabs.trailing_zeros() as usize;
 674        let chars_mask = (1 << tab_offset) - 1;
 675        let char_offset = (self.chars & chars_mask).count_ones() as usize;
 676
 677        // Since tabs are 1 byte the tab offset is the same as the byte offset
 678        let position = TabPosition {
 679            byte_offset: tab_offset,
 680            char_offset,
 681        };
 682        // Remove the tab we've just seen
 683        self.tabs ^= 1 << tab_offset;
 684
 685        Some(position)
 686    }
 687}
 688
 689/// Finds the n-th bit that is set to 1.
 690#[inline(always)]
 691fn nth_set_bit(v: u128, n: usize) -> usize {
 692    let low = v as u64;
 693    let high = (v >> 64) as u64;
 694
 695    let low_count = low.count_ones() as usize;
 696    if n > low_count {
 697        64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
 698    } else {
 699        nth_set_bit_u64(low, n as u64) as usize
 700    }
 701}
 702
 703#[cold]
 704#[inline(never)]
 705#[track_caller]
 706fn panic_char_boundary(text: &str, offset: usize) -> ! {
 707    if offset > text.len() {
 708        panic!(
 709            "byte index {} is out of bounds of `{:?}` (length: {})",
 710            offset,
 711            text,
 712            text.len()
 713        );
 714    }
 715    // find the character
 716    let char_start = text.floor_char_boundary(offset);
 717    // `char_start` must be less than len and a char boundary
 718    let ch = text.get(char_start..).unwrap().chars().next().unwrap();
 719    let char_range = char_start..char_start + ch.len_utf8();
 720    panic!(
 721        "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
 722        offset, ch, char_range,
 723    );
 724}
 725
 726#[cold]
 727#[inline(never)]
 728#[track_caller]
 729fn log_err_char_boundary(text: &str, offset: usize) {
 730    if offset >= text.len() {
 731        log::error!(
 732            "byte index {} is out of bounds of `{:?}` (length: {})",
 733            offset,
 734            text,
 735            text.len()
 736        );
 737        return;
 738    }
 739    // find the character
 740    let char_start = text.floor_char_boundary(offset);
 741    // `char_start` must be less than len and a char boundary
 742    let ch = text.get(char_start..).unwrap().chars().next().unwrap();
 743    let char_range = char_start..char_start + ch.len_utf8();
 744    log::error!(
 745        "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
 746        offset,
 747        ch,
 748        char_range,
 749    );
 750}
 751
 752#[inline(always)]
 753fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
 754    let v = v.reverse_bits();
 755    let mut s: u64 = 64;
 756
 757    // Parallel bit count intermediates
 758    let a = v - ((v >> 1) & (u64::MAX / 3));
 759    let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
 760    let c = (b + (b >> 4)) & (u64::MAX / 0x11);
 761    let d = (c + (c >> 8)) & (u64::MAX / 0x101);
 762
 763    // Branchless select
 764    let t = (d >> 32) + (d >> 48);
 765    s -= (t.wrapping_sub(n) & 256) >> 3;
 766    n -= t & (t.wrapping_sub(n) >> 8);
 767
 768    let t = (d >> (s - 16)) & 0xff;
 769    s -= (t.wrapping_sub(n) & 256) >> 4;
 770    n -= t & (t.wrapping_sub(n) >> 8);
 771
 772    let t = (c >> (s - 8)) & 0xf;
 773    s -= (t.wrapping_sub(n) & 256) >> 5;
 774    n -= t & (t.wrapping_sub(n) >> 8);
 775
 776    let t = (b >> (s - 4)) & 0x7;
 777    s -= (t.wrapping_sub(n) & 256) >> 6;
 778    n -= t & (t.wrapping_sub(n) >> 8);
 779
 780    let t = (a >> (s - 2)) & 0x3;
 781    s -= (t.wrapping_sub(n) & 256) >> 7;
 782    n -= t & (t.wrapping_sub(n) >> 8);
 783
 784    let t = (v >> (s - 1)) & 0x1;
 785    s -= (t.wrapping_sub(n) & 256) >> 8;
 786
 787    65 - s - 1
 788}
 789
 790#[cfg(test)]
 791mod tests {
 792    use super::*;
 793    use rand::prelude::*;
 794    use util::RandomCharIter;
 795
 796    #[gpui::test(iterations = 100)]
 797    fn test_random_chunks(mut rng: StdRng) {
 798        let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
 799        log::info!("Chunk: {:?}", text);
 800        let chunk = Chunk::new(&text);
 801        verify_chunk(chunk.as_slice(), &text);
 802
 803        // Verify Chunk::chars() bitmap
 804        let expected_chars = char_offsets(&text)
 805            .into_iter()
 806            .inspect(|i| assert!(*i < MAX_BASE))
 807            .fold(0 as Bitmap, |acc, i| acc | (1 << i));
 808        assert_eq!(chunk.chars(), expected_chars);
 809
 810        for _ in 0..10 {
 811            let mut start = rng.random_range(0..=chunk.text.len());
 812            let mut end = rng.random_range(start..=chunk.text.len());
 813            while !chunk.text.is_char_boundary(start) {
 814                start -= 1;
 815            }
 816            while !chunk.text.is_char_boundary(end) {
 817                end -= 1;
 818            }
 819            let range = start..end;
 820            log::info!("Range: {:?}", range);
 821            let text_slice = &text[range.clone()];
 822            let chunk_slice = chunk.slice(range);
 823            verify_chunk(chunk_slice, text_slice);
 824        }
 825    }
 826
 827    #[gpui::test(iterations = 100)]
 828    fn test_split_chunk_slice(mut rng: StdRng) {
 829        let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
 830        let chunk = Chunk::new(text);
 831        let offset = char_offsets_with_end(text)
 832            .into_iter()
 833            .choose(&mut rng)
 834            .unwrap();
 835        let (a, b) = chunk.as_slice().split_at(offset);
 836        let (a_str, b_str) = text.split_at(offset);
 837        verify_chunk(a, a_str);
 838        verify_chunk(b, b_str);
 839    }
 840
 841    #[gpui::test(iterations = 1000)]
 842    fn test_nth_set_bit_random(mut rng: StdRng) {
 843        let set_count = rng.random_range(0..=128);
 844        let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
 845        set_bits.sort();
 846        let mut n = 0;
 847        for ix in set_bits.iter().copied() {
 848            n |= 1 << ix;
 849        }
 850
 851        for (mut ix, position) in set_bits.into_iter().enumerate() {
 852            ix += 1;
 853            assert_eq!(
 854                nth_set_bit(n, ix),
 855                position,
 856                "nth_set_bit({:0128b}, {})",
 857                n,
 858                ix
 859            );
 860        }
 861    }
 862
 863    /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
 864    fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
 865        let mut str = String::new();
 866        let mut chars = RandomCharIter::new(rng);
 867        loop {
 868            let ch = chars.next().unwrap();
 869            if str.len() + ch.len_utf8() > len {
 870                break;
 871            }
 872            str.push(ch);
 873        }
 874        str
 875    }
 876
 877    #[gpui::test(iterations = 1000)]
 878    fn test_append_random_strings(mut rng: StdRng) {
 879        let len1 = rng.random_range(0..=MAX_BASE);
 880        let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
 881        let str1 = random_string_with_utf8_len(&mut rng, len1);
 882        let str2 = random_string_with_utf8_len(&mut rng, len2);
 883        let mut chunk1 = Chunk::new(&str1);
 884        let chunk2 = Chunk::new(&str2);
 885        let char_offsets = char_offsets_with_end(&str2);
 886        let start_index = rng.random_range(0..char_offsets.len());
 887        let start_offset = char_offsets[start_index];
 888        let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
 889        chunk1.append(chunk2.slice(start_offset..end_offset));
 890        verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
 891    }
 892
 893    /// Return the byte offsets for each character in a string.
 894    ///
 895    /// These are valid offsets to split the string.
 896    fn char_offsets(text: &str) -> Vec<usize> {
 897        text.char_indices().map(|(i, _c)| i).collect()
 898    }
 899
 900    /// Return the byte offsets for each character in a string, plus the offset
 901    /// past the end of the string.
 902    fn char_offsets_with_end(text: &str) -> Vec<usize> {
 903        let mut v = char_offsets(text);
 904        v.push(text.len());
 905        v
 906    }
 907
 908    fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
 909        let mut offset = 0;
 910        let mut offset_utf16 = OffsetUtf16(0);
 911        let mut point = Point::zero();
 912        let mut point_utf16 = PointUtf16::zero();
 913
 914        log::info!("Verifying chunk {:?}", text);
 915        assert_eq!(chunk.offset_to_point(0), Point::zero());
 916
 917        let mut expected_tab_positions = Vec::new();
 918
 919        for (char_offset, c) in text.chars().enumerate() {
 920            let expected_point = chunk.offset_to_point(offset);
 921            assert_eq!(point, expected_point, "mismatch at offset {}", offset);
 922            assert_eq!(
 923                chunk.point_to_offset(point),
 924                offset,
 925                "mismatch at point {:?}",
 926                point
 927            );
 928            assert_eq!(
 929                chunk.offset_to_offset_utf16(offset),
 930                offset_utf16,
 931                "mismatch at offset {}",
 932                offset
 933            );
 934            assert_eq!(
 935                chunk.offset_utf16_to_offset(offset_utf16),
 936                offset,
 937                "mismatch at offset_utf16 {:?}",
 938                offset_utf16
 939            );
 940            assert_eq!(
 941                chunk.point_to_point_utf16(point),
 942                point_utf16,
 943                "mismatch at point {:?}",
 944                point
 945            );
 946            assert_eq!(
 947                chunk.point_utf16_to_offset(point_utf16, false),
 948                offset,
 949                "mismatch at point_utf16 {:?}",
 950                point_utf16
 951            );
 952            assert_eq!(
 953                chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
 954                point,
 955                "mismatch for unclipped_point_utf16_to_point at {:?}",
 956                point_utf16
 957            );
 958
 959            assert_eq!(
 960                chunk.clip_point(point, Bias::Left),
 961                point,
 962                "incorrect left clip at {:?}",
 963                point
 964            );
 965            assert_eq!(
 966                chunk.clip_point(point, Bias::Right),
 967                point,
 968                "incorrect right clip at {:?}",
 969                point
 970            );
 971
 972            for i in 1..c.len_utf8() {
 973                let test_point = Point::new(point.row, point.column + i as u32);
 974                assert_eq!(
 975                    chunk.clip_point(test_point, Bias::Left),
 976                    point,
 977                    "incorrect left clip within multi-byte char at {:?}",
 978                    test_point
 979                );
 980                assert_eq!(
 981                    chunk.clip_point(test_point, Bias::Right),
 982                    Point::new(point.row, point.column + c.len_utf8() as u32),
 983                    "incorrect right clip within multi-byte char at {:?}",
 984                    test_point
 985                );
 986            }
 987
 988            for i in 1..c.len_utf16() {
 989                let test_point = Unclipped(PointUtf16::new(
 990                    point_utf16.row,
 991                    point_utf16.column + i as u32,
 992                ));
 993                assert_eq!(
 994                    chunk.unclipped_point_utf16_to_point(test_point),
 995                    point,
 996                    "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
 997                    test_point
 998                );
 999                assert_eq!(
1000                    chunk.clip_point_utf16(test_point, Bias::Left),
1001                    point_utf16,
1002                    "incorrect left clip_point_utf16 within multi-byte char at {:?}",
1003                    test_point
1004                );
1005                assert_eq!(
1006                    chunk.clip_point_utf16(test_point, Bias::Right),
1007                    PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
1008                    "incorrect right clip_point_utf16 within multi-byte char at {:?}",
1009                    test_point
1010                );
1011
1012                let test_offset = OffsetUtf16(offset_utf16.0 + i);
1013                assert_eq!(
1014                    chunk.clip_offset_utf16(test_offset, Bias::Left),
1015                    offset_utf16,
1016                    "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
1017                    test_offset
1018                );
1019                assert_eq!(
1020                    chunk.clip_offset_utf16(test_offset, Bias::Right),
1021                    OffsetUtf16(offset_utf16.0 + c.len_utf16()),
1022                    "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
1023                    test_offset
1024                );
1025            }
1026
1027            if c == '\n' {
1028                point.row += 1;
1029                point.column = 0;
1030                point_utf16.row += 1;
1031                point_utf16.column = 0;
1032            } else {
1033                point.column += c.len_utf8() as u32;
1034                point_utf16.column += c.len_utf16() as u32;
1035            }
1036
1037            if c == '\t' {
1038                expected_tab_positions.push(TabPosition {
1039                    byte_offset: offset,
1040                    char_offset,
1041                });
1042            }
1043
1044            offset += c.len_utf8();
1045            offset_utf16.0 += c.len_utf16();
1046        }
1047
1048        let final_point = chunk.offset_to_point(offset);
1049        assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1050        assert_eq!(
1051            chunk.point_to_offset(point),
1052            offset,
1053            "mismatch at point {:?}",
1054            point
1055        );
1056        assert_eq!(
1057            chunk.offset_to_offset_utf16(offset),
1058            offset_utf16,
1059            "mismatch at offset {}",
1060            offset
1061        );
1062        assert_eq!(
1063            chunk.offset_utf16_to_offset(offset_utf16),
1064            offset,
1065            "mismatch at offset_utf16 {:?}",
1066            offset_utf16
1067        );
1068        assert_eq!(
1069            chunk.point_to_point_utf16(point),
1070            point_utf16,
1071            "mismatch at final point {:?}",
1072            point
1073        );
1074        assert_eq!(
1075            chunk.point_utf16_to_offset(point_utf16, false),
1076            offset,
1077            "mismatch at final point_utf16 {:?}",
1078            point_utf16
1079        );
1080        assert_eq!(
1081            chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1082            point,
1083            "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1084            point_utf16
1085        );
1086        assert_eq!(
1087            chunk.clip_point(point, Bias::Left),
1088            point,
1089            "incorrect left clip at final point {:?}",
1090            point
1091        );
1092        assert_eq!(
1093            chunk.clip_point(point, Bias::Right),
1094            point,
1095            "incorrect right clip at final point {:?}",
1096            point
1097        );
1098        assert_eq!(
1099            chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1100            point_utf16,
1101            "incorrect left clip_point_utf16 at final point {:?}",
1102            point_utf16
1103        );
1104        assert_eq!(
1105            chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1106            point_utf16,
1107            "incorrect right clip_point_utf16 at final point {:?}",
1108            point_utf16
1109        );
1110        assert_eq!(
1111            chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1112            offset_utf16,
1113            "incorrect left clip_offset_utf16 at final offset {:?}",
1114            offset_utf16
1115        );
1116        assert_eq!(
1117            chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1118            offset_utf16,
1119            "incorrect right clip_offset_utf16 at final offset {:?}",
1120            offset_utf16
1121        );
1122
1123        // Verify length methods
1124        assert_eq!(chunk.len(), text.len());
1125        assert_eq!(
1126            chunk.len_utf16().0,
1127            text.chars().map(|c| c.len_utf16()).sum::<usize>()
1128        );
1129
1130        // Verify line counting
1131        let lines = chunk.lines();
1132        let mut newline_count = 0;
1133        let mut last_line_len = 0;
1134        for c in text.chars() {
1135            if c == '\n' {
1136                newline_count += 1;
1137                last_line_len = 0;
1138            } else {
1139                last_line_len += c.len_utf8() as u32;
1140            }
1141        }
1142        assert_eq!(lines, Point::new(newline_count, last_line_len));
1143
1144        // Verify first/last line chars
1145        if !text.is_empty() {
1146            let first_line = text.split('\n').next().unwrap();
1147            assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1148
1149            let last_line = text.split('\n').next_back().unwrap();
1150            assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1151            assert_eq!(
1152                chunk.last_line_len_utf16(),
1153                last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1154            );
1155        }
1156
1157        // Verify longest row
1158        let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1159        let mut max_chars = 0;
1160        let mut current_row = 0;
1161        let mut current_chars = 0;
1162        let mut max_row = 0;
1163
1164        for c in text.chars() {
1165            if c == '\n' {
1166                if current_chars > max_chars {
1167                    max_chars = current_chars;
1168                    max_row = current_row;
1169                }
1170                current_row += 1;
1171                current_chars = 0;
1172            } else {
1173                current_chars += 1;
1174            }
1175        }
1176
1177        if current_chars > max_chars {
1178            max_chars = current_chars;
1179            max_row = current_row;
1180        }
1181
1182        assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1183        assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1184    }
1185
1186    #[gpui::test]
1187    fn test_point_utf16_to_offset_clips_to_correct_absolute_offset() {
1188        let text = "abc\nde";
1189        let chunk = Chunk::new(text);
1190        let slice = chunk.as_slice();
1191
1192        // Clipping on row 0 (row_offset_range.start == 0, so relative == absolute)
1193        assert_eq!(slice.point_utf16_to_offset(PointUtf16::new(0, 99), true), 3,);
1194
1195        // Clipping on row 1 — this is the case that was buggy.
1196        // Row 1 starts at byte offset 4 ("de" is bytes 4..6), so the
1197        // clipped result must be 6, not 2.
1198        assert_eq!(slice.point_utf16_to_offset(PointUtf16::new(1, 99), true), 6,);
1199    }
1200}