1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
2use arrayvec::ArrayString;
3use std::{cmp, ops::Range};
4use sum_tree::Bias;
5use unicode_segmentation::GraphemeCursor;
6use util::debug_panic;
7
8#[cfg(not(all(test, not(rust_analyzer))))]
9pub(crate) type Bitmap = u128;
10#[cfg(all(test, not(rust_analyzer)))]
11pub(crate) type Bitmap = u16;
12
13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
15
16#[derive(Clone, Debug, Default)]
17pub struct Chunk {
18 /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
19 /// text.
20 chars: Bitmap,
21 /// The number of set bits is the number of UTF-16 code units it would take to represent the
22 /// text.
23 ///
24 /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
25 /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
26 /// than two UTF-16 code units.)
27 chars_utf16: Bitmap,
28 /// If bit[i] is set, then the character at index i is an ascii newline.
29 newlines: Bitmap,
30 /// If bit[i] is set, then the character at index i is an ascii tab.
31 tabs: Bitmap,
32 pub text: ArrayString<MAX_BASE>,
33}
34
35#[inline(always)]
36const fn saturating_shl_mask(offset: u32) -> Bitmap {
37 (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
38}
39
40#[inline(always)]
41const fn saturating_shr_mask(offset: u32) -> Bitmap {
42 !Bitmap::MAX.unbounded_shr(offset)
43}
44
45impl Chunk {
46 pub const MASK_BITS: usize = Bitmap::BITS as usize;
47
48 #[inline(always)]
49 pub fn new(text: &str) -> Self {
50 let text = ArrayString::from(text).unwrap();
51
52 const CHUNK_SIZE: usize = 8;
53
54 let mut chars_bytes = [0; MAX_BASE / CHUNK_SIZE];
55 let mut newlines_bytes = [0; MAX_BASE / CHUNK_SIZE];
56 let mut tabs_bytes = [0; MAX_BASE / CHUNK_SIZE];
57 let mut chars_utf16_bytes = [0; MAX_BASE / CHUNK_SIZE];
58
59 let mut chunk_ix = 0;
60
61 let mut bytes = text.as_bytes();
62 while !bytes.is_empty() {
63 let (chunk, rest) = bytes.split_at(bytes.len().min(CHUNK_SIZE));
64 bytes = rest;
65
66 let mut chars = 0;
67 let mut newlines = 0;
68 let mut tabs = 0;
69 let mut chars_utf16 = 0;
70
71 for (ix, &b) in chunk.iter().enumerate() {
72 chars |= (util::is_utf8_char_boundary(b) as u8) << ix;
73 newlines |= ((b == b'\n') as u8) << ix;
74 tabs |= ((b == b'\t') as u8) << ix;
75 // b >= 240 when we are at the first byte of the 4 byte encoded
76 // utf-8 code point (U+010000 or greater) it means that it would
77 // be encoded as two 16-bit code units in utf-16
78 chars_utf16 |= ((b >= 240) as u8) << ix;
79 }
80
81 chars_bytes[chunk_ix] = chars;
82 newlines_bytes[chunk_ix] = newlines;
83 tabs_bytes[chunk_ix] = tabs;
84 chars_utf16_bytes[chunk_ix] = chars_utf16;
85
86 chunk_ix += 1;
87 }
88
89 let chars = Bitmap::from_le_bytes(chars_bytes);
90
91 Chunk {
92 text,
93 chars,
94 chars_utf16: (Bitmap::from_le_bytes(chars_utf16_bytes) << 1) | chars,
95 newlines: Bitmap::from_le_bytes(newlines_bytes),
96 tabs: Bitmap::from_le_bytes(tabs_bytes),
97 }
98 }
99
100 #[inline(always)]
101 pub fn push_str(&mut self, text: &str) {
102 self.append(Chunk::new(text).as_slice());
103 }
104
105 #[inline(always)]
106 pub fn prepend_str(&mut self, text: &str) {
107 self.prepend(Chunk::new(text).as_slice());
108 }
109
110 #[inline(always)]
111 pub fn append(&mut self, slice: ChunkSlice) {
112 if slice.is_empty() {
113 return;
114 };
115
116 let base_ix = self.text.len();
117 self.chars |= slice.chars << base_ix;
118 self.chars_utf16 |= slice.chars_utf16 << base_ix;
119 self.newlines |= slice.newlines << base_ix;
120 self.tabs |= slice.tabs << base_ix;
121 self.text.push_str(slice.text);
122 }
123
124 #[inline(always)]
125 pub fn prepend(&mut self, slice: ChunkSlice) {
126 if slice.is_empty() {
127 return;
128 }
129 if self.text.is_empty() {
130 *self = Chunk::new(slice.text);
131 return;
132 }
133
134 let shift = slice.text.len();
135 self.chars = slice.chars | (self.chars << shift);
136 self.chars_utf16 = slice.chars_utf16 | (self.chars_utf16 << shift);
137 self.newlines = slice.newlines | (self.newlines << shift);
138 self.tabs = slice.tabs | (self.tabs << shift);
139
140 let mut new_text = ArrayString::<MAX_BASE>::new();
141 new_text.push_str(slice.text);
142 new_text.push_str(&self.text);
143 self.text = new_text;
144 }
145
146 #[inline(always)]
147 pub fn as_slice(&self) -> ChunkSlice<'_> {
148 ChunkSlice {
149 chars: self.chars,
150 chars_utf16: self.chars_utf16,
151 newlines: self.newlines,
152 tabs: self.tabs,
153 text: &self.text,
154 }
155 }
156
157 #[inline(always)]
158 pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
159 self.as_slice().slice(range)
160 }
161
162 #[inline(always)]
163 pub fn chars(&self) -> Bitmap {
164 self.chars
165 }
166
167 #[inline(always)]
168 pub fn tabs(&self) -> Bitmap {
169 self.tabs
170 }
171
172 #[inline(always)]
173 pub fn newlines(&self) -> Bitmap {
174 self.newlines
175 }
176
177 #[inline(always)]
178 pub fn is_char_boundary(&self, offset: usize) -> bool {
179 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
180 }
181
182 pub fn floor_char_boundary(&self, index: usize) -> usize {
183 if index >= self.text.len() {
184 self.text.len()
185 } else {
186 let mut i = index;
187 while i > 0 {
188 if util::is_utf8_char_boundary(self.text.as_bytes()[i]) {
189 break;
190 }
191 i -= 1;
192 }
193
194 i
195 }
196 }
197
198 #[track_caller]
199 #[inline(always)]
200 pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
201 if self.is_char_boundary(offset) {
202 return true;
203 }
204 if PANIC || cfg!(debug_assertions) {
205 panic_char_boundary(&self.text, offset);
206 } else {
207 log_err_char_boundary(&self.text, offset);
208 false
209 }
210 }
211}
212
213#[derive(Clone, Copy, Debug)]
214pub struct ChunkSlice<'a> {
215 chars: Bitmap,
216 chars_utf16: Bitmap,
217 newlines: Bitmap,
218 tabs: Bitmap,
219 text: &'a str,
220}
221
222impl Into<Chunk> for ChunkSlice<'_> {
223 fn into(self) -> Chunk {
224 Chunk {
225 chars: self.chars,
226 chars_utf16: self.chars_utf16,
227 newlines: self.newlines,
228 tabs: self.tabs,
229 text: self.text.try_into().unwrap(),
230 }
231 }
232}
233
234impl<'a> ChunkSlice<'a> {
235 #[inline(always)]
236 pub fn is_empty(&self) -> bool {
237 self.text.is_empty()
238 }
239
240 #[inline(always)]
241 pub fn is_char_boundary(&self, offset: usize) -> bool {
242 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
243 }
244
245 #[inline(always)]
246 pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
247 if mid == MAX_BASE {
248 let left = self;
249 let right = ChunkSlice {
250 chars: 0,
251 chars_utf16: 0,
252 newlines: 0,
253 tabs: 0,
254 text: "",
255 };
256 (left, right)
257 } else {
258 let mask = ((1 as Bitmap) << mid) - 1;
259 let (left_text, right_text) = self.text.split_at(mid);
260 let left = ChunkSlice {
261 chars: self.chars & mask,
262 chars_utf16: self.chars_utf16 & mask,
263 newlines: self.newlines & mask,
264 tabs: self.tabs & mask,
265 text: left_text,
266 };
267 let right = ChunkSlice {
268 chars: self.chars >> mid,
269 chars_utf16: self.chars_utf16 >> mid,
270 newlines: self.newlines >> mid,
271 tabs: self.tabs >> mid,
272 text: right_text,
273 };
274 (left, right)
275 }
276 }
277
278 #[inline(always)]
279 pub fn slice(self, mut range: Range<usize>) -> Self {
280 if range.start == MAX_BASE {
281 Self {
282 chars: 0,
283 chars_utf16: 0,
284 newlines: 0,
285 tabs: 0,
286 text: "",
287 }
288 } else {
289 if !self.assert_char_boundary::<false>(range.start) {
290 range.start = self.text.ceil_char_boundary(range.start);
291 }
292 if !self.assert_char_boundary::<false>(range.end) {
293 range.end = if range.end < range.start {
294 range.start
295 } else {
296 self.text.floor_char_boundary(range.end)
297 };
298 }
299 let mask = (1 as Bitmap)
300 .unbounded_shl(range.end as u32)
301 .wrapping_sub(1);
302 Self {
303 chars: (self.chars & mask) >> range.start,
304 chars_utf16: (self.chars_utf16 & mask) >> range.start,
305 newlines: (self.newlines & mask) >> range.start,
306 tabs: (self.tabs & mask) >> range.start,
307 text: &self.text[range],
308 }
309 }
310 }
311
312 #[inline(always)]
313 pub fn text_summary(&self) -> TextSummary {
314 let mut chars = 0;
315 let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
316 TextSummary {
317 len: self.len(),
318 chars,
319 len_utf16: self.len_utf16(),
320 lines: self.lines(),
321 first_line_chars: self.first_line_chars(),
322 last_line_chars: self.last_line_chars(),
323 last_line_len_utf16: self.last_line_len_utf16(),
324 longest_row,
325 longest_row_chars,
326 }
327 }
328
329 /// Get length in bytes
330 #[inline(always)]
331 pub fn len(&self) -> usize {
332 self.text.len()
333 }
334
335 /// Get length in UTF-16 code units
336 #[inline(always)]
337 pub fn len_utf16(&self) -> OffsetUtf16 {
338 OffsetUtf16(self.chars_utf16.count_ones() as usize)
339 }
340
341 /// Get point representing number of lines and length of last line
342 #[inline(always)]
343 pub fn lines(&self) -> Point {
344 let row = self.newlines.count_ones();
345 let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
346 Point::new(row, column)
347 }
348
349 /// Get number of chars in first line
350 #[inline(always)]
351 pub fn first_line_chars(&self) -> u32 {
352 (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
353 }
354
355 /// Get number of chars in last line
356 #[inline(always)]
357 pub fn last_line_chars(&self) -> u32 {
358 (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
359 }
360
361 /// Get number of UTF-16 code units in last line
362 #[inline(always)]
363 pub fn last_line_len_utf16(&self) -> u32 {
364 (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
365 }
366
367 /// Get the longest row in the chunk and its length in characters.
368 /// Calculate the total number of characters in the chunk along the way.
369 #[inline(always)]
370 pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
371 let mut chars = self.chars;
372 let mut newlines = self.newlines;
373 *total_chars = 0;
374 let mut row = 0;
375 let mut longest_row = 0;
376 let mut longest_row_chars = 0;
377 while newlines > 0 {
378 let newline_ix = newlines.trailing_zeros();
379 let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
380 *total_chars += usize::from(row_chars);
381 if row_chars > longest_row_chars {
382 longest_row = row;
383 longest_row_chars = row_chars;
384 }
385
386 newlines >>= newline_ix;
387 newlines >>= 1;
388 chars >>= newline_ix;
389 chars >>= 1;
390 row += 1;
391 *total_chars += 1;
392 }
393
394 let row_chars = chars.count_ones() as u8;
395 *total_chars += usize::from(row_chars);
396 if row_chars > longest_row_chars {
397 (row, row_chars as u32)
398 } else {
399 (longest_row, longest_row_chars as u32)
400 }
401 }
402
403 #[inline(always)]
404 pub fn offset_to_point(&self, offset: usize) -> Point {
405 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
406 let row = (self.newlines & mask).count_ones();
407 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
408 let column = (offset - newline_ix as usize) as u32;
409 Point::new(row, column)
410 }
411
412 #[inline(always)]
413 pub fn point_to_offset(&self, point: Point) -> usize {
414 if point.row > self.lines().row {
415 debug_panic!(
416 "point {:?} extends beyond rows for string {:?}",
417 point,
418 self.text
419 );
420 return self.len();
421 }
422
423 let row_offset_range = self.offset_range_for_row(point.row);
424 if point.column > row_offset_range.len() as u32 {
425 debug_panic!(
426 "point {:?} extends beyond row for string {:?}",
427 point,
428 self.text
429 );
430 row_offset_range.end
431 } else {
432 row_offset_range.start + point.column as usize
433 }
434 }
435
436 #[track_caller]
437 #[inline(always)]
438 pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
439 if self.is_char_boundary(offset) {
440 return true;
441 }
442 if PANIC {
443 panic_char_boundary(self.text, offset);
444 } else {
445 log_err_char_boundary(self.text, offset);
446 false
447 }
448 }
449
450 pub fn floor_char_boundary(&self, index: usize) -> usize {
451 self.text.floor_char_boundary(index)
452 }
453
454 #[inline(always)]
455 pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
456 if point.row > self.lines().row {
457 debug_panic!(
458 "point {:?} extends beyond rows for string {:?}",
459 point,
460 self.text
461 );
462 return self.len_utf16();
463 }
464 self.offset_to_offset_utf16(self.point_to_offset(point))
465 }
466
467 #[inline(always)]
468 pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
469 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
470 OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
471 }
472
473 #[inline(always)]
474 pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
475 if target.0 == 0 {
476 0
477 } else {
478 #[cfg(not(test))]
479 let chars_utf16 = self.chars_utf16;
480 #[cfg(test)]
481 let chars_utf16 = self.chars_utf16 as u128;
482 let ix = nth_set_bit(chars_utf16, target.0) + 1;
483 if ix == MAX_BASE {
484 MAX_BASE
485 } else {
486 let utf8_additional_len = cmp::min(
487 (self.chars_utf16 >> ix).trailing_zeros() as usize,
488 self.text.len() - ix,
489 );
490 ix + utf8_additional_len
491 }
492 }
493 }
494
495 #[inline(always)]
496 pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
497 let mask = saturating_shl_mask(offset as u32);
498 let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
499 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
500 let column = if newline_ix as usize == MAX_BASE {
501 0
502 } else {
503 ((self.chars_utf16 & mask) >> newline_ix).count_ones()
504 };
505 PointUtf16::new(row, column)
506 }
507
508 #[inline(always)]
509 pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
510 self.offset_to_point_utf16(self.point_to_offset(point))
511 }
512
513 #[inline(always)]
514 pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
515 let lines = self.lines();
516 if point.row > lines.row {
517 if !clip {
518 debug_panic!(
519 "point {:?} is beyond this chunk's extent {:?}",
520 point,
521 self.text
522 );
523 }
524 return self.len();
525 }
526
527 let row_offset_range = self.offset_range_for_row(point.row);
528 let line = self.slice(row_offset_range.clone());
529 if point.column > line.last_line_len_utf16() {
530 if !clip {
531 debug_panic!(
532 "point {:?} is beyond the end of the line in chunk {:?}",
533 point,
534 self.text
535 );
536 }
537 return row_offset_range.end;
538 }
539
540 let mut offset = row_offset_range.start;
541 if point.column > 0 {
542 offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
543 if !self.text.is_char_boundary(offset) {
544 offset -= 1;
545 while !self.text.is_char_boundary(offset) {
546 offset -= 1;
547 }
548 if !clip {
549 debug_panic!(
550 "point {:?} is within character in chunk {:?}",
551 point,
552 self.text,
553 );
554 }
555 }
556 }
557 offset
558 }
559
560 #[inline(always)]
561 pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
562 let max_point = self.lines();
563 if point.0.row > max_point.row {
564 return max_point;
565 }
566
567 let row_offset_range = self.offset_range_for_row(point.0.row);
568 let line = self.slice(row_offset_range);
569 if point.0.column == 0 {
570 Point::new(point.0.row, 0)
571 } else if point.0.column >= line.len_utf16().0 as u32 {
572 Point::new(point.0.row, line.len() as u32)
573 } else {
574 let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
575 while !line.text.is_char_boundary(column) {
576 column -= 1;
577 }
578 Point::new(point.0.row, column as u32)
579 }
580 }
581
582 #[inline(always)]
583 pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
584 let max_point = self.lines();
585 if point.row > max_point.row {
586 return max_point;
587 }
588
589 let line = self.slice(self.offset_range_for_row(point.row));
590 if point.column == 0 {
591 point
592 } else if point.column >= line.len() as u32 {
593 Point::new(point.row, line.len() as u32)
594 } else {
595 let mut column = point.column as usize;
596 let bytes = line.text.as_bytes();
597 if bytes[column - 1] < 128 && bytes[column] < 128 {
598 return Point::new(point.row, column as u32);
599 }
600
601 let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
602 loop {
603 if line.is_char_boundary(column)
604 && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
605 {
606 break;
607 }
608
609 match bias {
610 Bias::Left => column -= 1,
611 Bias::Right => column += 1,
612 }
613 grapheme_cursor.set_cursor(column);
614 }
615 Point::new(point.row, column as u32)
616 }
617 }
618
619 #[inline(always)]
620 pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
621 let max_point = self.lines();
622 if point.0.row > max_point.row {
623 PointUtf16::new(max_point.row, self.last_line_len_utf16())
624 } else {
625 let line = self.slice(self.offset_range_for_row(point.0.row));
626 let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
627 PointUtf16::new(point.0.row, column.0 as u32)
628 }
629 }
630
631 #[inline(always)]
632 pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
633 if target == OffsetUtf16::default() {
634 OffsetUtf16::default()
635 } else if target >= self.len_utf16() {
636 self.len_utf16()
637 } else {
638 let mut offset = self.offset_utf16_to_offset(target);
639 while !self.text.is_char_boundary(offset) {
640 if bias == Bias::Left {
641 offset -= 1;
642 } else {
643 offset += 1;
644 }
645 }
646 self.offset_to_offset_utf16(offset)
647 }
648 }
649
650 #[inline(always)]
651 fn offset_range_for_row(&self, row: u32) -> Range<usize> {
652 let row_start = if row > 0 {
653 #[cfg(not(test))]
654 let newlines = self.newlines;
655 #[cfg(test)]
656 let newlines = self.newlines as u128;
657 nth_set_bit(newlines, row as usize) + 1
658 } else {
659 0
660 };
661 let row_len = if row_start == MAX_BASE {
662 0
663 } else {
664 cmp::min(
665 (self.newlines >> row_start).trailing_zeros(),
666 (self.text.len() - row_start) as u32,
667 )
668 };
669 row_start..row_start + row_len as usize
670 }
671
672 #[inline(always)]
673 pub fn tabs(&self) -> Tabs {
674 Tabs {
675 tabs: self.tabs,
676 chars: self.chars,
677 }
678 }
679}
680
681pub struct Tabs {
682 tabs: Bitmap,
683 chars: Bitmap,
684}
685
686#[derive(Debug, PartialEq, Eq)]
687pub struct TabPosition {
688 pub byte_offset: usize,
689 pub char_offset: usize,
690}
691
692impl Iterator for Tabs {
693 type Item = TabPosition;
694
695 fn next(&mut self) -> Option<Self::Item> {
696 if self.tabs == 0 {
697 return None;
698 }
699
700 let tab_offset = self.tabs.trailing_zeros() as usize;
701 let chars_mask = (1 << tab_offset) - 1;
702 let char_offset = (self.chars & chars_mask).count_ones() as usize;
703
704 // Since tabs are 1 byte the tab offset is the same as the byte offset
705 let position = TabPosition {
706 byte_offset: tab_offset,
707 char_offset,
708 };
709 // Remove the tab we've just seen
710 self.tabs ^= 1 << tab_offset;
711
712 Some(position)
713 }
714}
715
716/// Finds the n-th bit that is set to 1.
717#[inline(always)]
718fn nth_set_bit(v: u128, n: usize) -> usize {
719 let low = v as u64;
720 let high = (v >> 64) as u64;
721
722 let low_count = low.count_ones() as usize;
723 if n > low_count {
724 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
725 } else {
726 nth_set_bit_u64(low, n as u64) as usize
727 }
728}
729
730#[cold]
731#[inline(never)]
732#[track_caller]
733fn panic_char_boundary(text: &str, offset: usize) -> ! {
734 if offset > text.len() {
735 panic!(
736 "byte index {} is out of bounds of `{:?}` (length: {})",
737 offset,
738 text,
739 text.len()
740 );
741 }
742 // find the character
743 let char_start = text.floor_char_boundary(offset);
744 // `char_start` must be less than len and a char boundary
745 let ch = text.get(char_start..).unwrap().chars().next().unwrap();
746 let char_range = char_start..char_start + ch.len_utf8();
747 panic!(
748 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
749 offset, ch, char_range,
750 );
751}
752
753#[cold]
754#[inline(never)]
755#[track_caller]
756fn log_err_char_boundary(text: &str, offset: usize) {
757 if offset >= text.len() {
758 log::error!(
759 "byte index {} is out of bounds of `{:?}` (length: {})",
760 offset,
761 text,
762 text.len()
763 );
764 return;
765 }
766 // find the character
767 let char_start = text.floor_char_boundary(offset);
768 // `char_start` must be less than len and a char boundary
769 let ch = text.get(char_start..).unwrap().chars().next().unwrap();
770 let char_range = char_start..char_start + ch.len_utf8();
771 log::error!(
772 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
773 offset,
774 ch,
775 char_range,
776 );
777}
778
779#[inline(always)]
780fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
781 let v = v.reverse_bits();
782 let mut s: u64 = 64;
783
784 // Parallel bit count intermediates
785 let a = v - ((v >> 1) & (u64::MAX / 3));
786 let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
787 let c = (b + (b >> 4)) & (u64::MAX / 0x11);
788 let d = (c + (c >> 8)) & (u64::MAX / 0x101);
789
790 // Branchless select
791 let t = (d >> 32) + (d >> 48);
792 s -= (t.wrapping_sub(n) & 256) >> 3;
793 n -= t & (t.wrapping_sub(n) >> 8);
794
795 let t = (d >> (s - 16)) & 0xff;
796 s -= (t.wrapping_sub(n) & 256) >> 4;
797 n -= t & (t.wrapping_sub(n) >> 8);
798
799 let t = (c >> (s - 8)) & 0xf;
800 s -= (t.wrapping_sub(n) & 256) >> 5;
801 n -= t & (t.wrapping_sub(n) >> 8);
802
803 let t = (b >> (s - 4)) & 0x7;
804 s -= (t.wrapping_sub(n) & 256) >> 6;
805 n -= t & (t.wrapping_sub(n) >> 8);
806
807 let t = (a >> (s - 2)) & 0x3;
808 s -= (t.wrapping_sub(n) & 256) >> 7;
809 n -= t & (t.wrapping_sub(n) >> 8);
810
811 let t = (v >> (s - 1)) & 0x1;
812 s -= (t.wrapping_sub(n) & 256) >> 8;
813
814 65 - s - 1
815}
816
817#[cfg(test)]
818mod tests {
819 use super::*;
820 use rand::prelude::*;
821 use util::RandomCharIter;
822
823 #[gpui::test(iterations = 100)]
824 fn test_random_chunks(mut rng: StdRng) {
825 let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
826 log::info!("Chunk: {:?}", text);
827 let chunk = Chunk::new(&text);
828 verify_chunk(chunk.as_slice(), &text);
829
830 // Verify Chunk::chars() bitmap
831 let expected_chars = char_offsets(&text)
832 .into_iter()
833 .inspect(|i| assert!(*i < MAX_BASE))
834 .fold(0 as Bitmap, |acc, i| acc | (1 << i));
835 assert_eq!(chunk.chars(), expected_chars);
836
837 for _ in 0..10 {
838 let mut start = rng.random_range(0..=chunk.text.len());
839 let mut end = rng.random_range(start..=chunk.text.len());
840 while !chunk.text.is_char_boundary(start) {
841 start -= 1;
842 }
843 while !chunk.text.is_char_boundary(end) {
844 end -= 1;
845 }
846 let range = start..end;
847 log::info!("Range: {:?}", range);
848 let text_slice = &text[range.clone()];
849 let chunk_slice = chunk.slice(range);
850 verify_chunk(chunk_slice, text_slice);
851 }
852 }
853
854 #[gpui::test(iterations = 100)]
855 fn test_split_chunk_slice(mut rng: StdRng) {
856 let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
857 let chunk = Chunk::new(text);
858 let offset = char_offsets_with_end(text)
859 .into_iter()
860 .choose(&mut rng)
861 .unwrap();
862 let (a, b) = chunk.as_slice().split_at(offset);
863 let (a_str, b_str) = text.split_at(offset);
864 verify_chunk(a, a_str);
865 verify_chunk(b, b_str);
866 }
867
868 #[gpui::test(iterations = 1000)]
869 fn test_nth_set_bit_random(mut rng: StdRng) {
870 let set_count = rng.random_range(0..=128);
871 let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
872 set_bits.sort();
873 let mut n = 0;
874 for ix in set_bits.iter().copied() {
875 n |= 1 << ix;
876 }
877
878 for (mut ix, position) in set_bits.into_iter().enumerate() {
879 ix += 1;
880 assert_eq!(
881 nth_set_bit(n, ix),
882 position,
883 "nth_set_bit({:0128b}, {})",
884 n,
885 ix
886 );
887 }
888 }
889
890 /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
891 fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
892 let mut str = String::new();
893 let mut chars = RandomCharIter::new(rng);
894 loop {
895 let ch = chars.next().unwrap();
896 if str.len() + ch.len_utf8() > len {
897 break;
898 }
899 str.push(ch);
900 }
901 str
902 }
903
904 #[gpui::test(iterations = 1000)]
905 fn test_append_random_strings(mut rng: StdRng) {
906 let len1 = rng.random_range(0..=MAX_BASE);
907 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
908 let str1 = random_string_with_utf8_len(&mut rng, len1);
909 let str2 = random_string_with_utf8_len(&mut rng, len2);
910 let mut chunk1 = Chunk::new(&str1);
911 let chunk2 = Chunk::new(&str2);
912 let char_offsets = char_offsets_with_end(&str2);
913 let start_index = rng.random_range(0..char_offsets.len());
914 let start_offset = char_offsets[start_index];
915 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
916 chunk1.append(chunk2.slice(start_offset..end_offset));
917 verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
918 }
919
920 #[gpui::test(iterations = 1000)]
921 fn test_prepend_random_strings(mut rng: StdRng) {
922 let len1 = rng.random_range(0..=MAX_BASE);
923 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
924 let str1 = random_string_with_utf8_len(&mut rng, len1);
925 let str2 = random_string_with_utf8_len(&mut rng, len2);
926 let mut chunk1 = Chunk::new(&str1);
927 let chunk2 = Chunk::new(&str2);
928 let char_offsets = char_offsets_with_end(&str2);
929 let start_index = rng.random_range(0..char_offsets.len());
930 let start_offset = char_offsets[start_index];
931 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
932 let slice = chunk2.slice(start_offset..end_offset);
933 let prefix_text = &str2[start_offset..end_offset];
934 chunk1.prepend(slice);
935 verify_chunk(chunk1.as_slice(), &(prefix_text.to_owned() + &str1));
936 }
937
938 /// Return the byte offsets for each character in a string.
939 ///
940 /// These are valid offsets to split the string.
941 fn char_offsets(text: &str) -> Vec<usize> {
942 text.char_indices().map(|(i, _c)| i).collect()
943 }
944
945 /// Return the byte offsets for each character in a string, plus the offset
946 /// past the end of the string.
947 fn char_offsets_with_end(text: &str) -> Vec<usize> {
948 let mut v = char_offsets(text);
949 v.push(text.len());
950 v
951 }
952
953 fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
954 let mut offset = 0;
955 let mut offset_utf16 = OffsetUtf16(0);
956 let mut point = Point::zero();
957 let mut point_utf16 = PointUtf16::zero();
958
959 log::info!("Verifying chunk {:?}", text);
960 assert_eq!(chunk.offset_to_point(0), Point::zero());
961
962 let mut expected_tab_positions = Vec::new();
963
964 for (char_offset, c) in text.chars().enumerate() {
965 let expected_point = chunk.offset_to_point(offset);
966 assert_eq!(point, expected_point, "mismatch at offset {}", offset);
967 assert_eq!(
968 chunk.point_to_offset(point),
969 offset,
970 "mismatch at point {:?}",
971 point
972 );
973 assert_eq!(
974 chunk.offset_to_offset_utf16(offset),
975 offset_utf16,
976 "mismatch at offset {}",
977 offset
978 );
979 assert_eq!(
980 chunk.offset_utf16_to_offset(offset_utf16),
981 offset,
982 "mismatch at offset_utf16 {:?}",
983 offset_utf16
984 );
985 assert_eq!(
986 chunk.point_to_point_utf16(point),
987 point_utf16,
988 "mismatch at point {:?}",
989 point
990 );
991 assert_eq!(
992 chunk.point_utf16_to_offset(point_utf16, false),
993 offset,
994 "mismatch at point_utf16 {:?}",
995 point_utf16
996 );
997 assert_eq!(
998 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
999 point,
1000 "mismatch for unclipped_point_utf16_to_point at {:?}",
1001 point_utf16
1002 );
1003
1004 assert_eq!(
1005 chunk.clip_point(point, Bias::Left),
1006 point,
1007 "incorrect left clip at {:?}",
1008 point
1009 );
1010 assert_eq!(
1011 chunk.clip_point(point, Bias::Right),
1012 point,
1013 "incorrect right clip at {:?}",
1014 point
1015 );
1016
1017 for i in 1..c.len_utf8() {
1018 let test_point = Point::new(point.row, point.column + i as u32);
1019 assert_eq!(
1020 chunk.clip_point(test_point, Bias::Left),
1021 point,
1022 "incorrect left clip within multi-byte char at {:?}",
1023 test_point
1024 );
1025 assert_eq!(
1026 chunk.clip_point(test_point, Bias::Right),
1027 Point::new(point.row, point.column + c.len_utf8() as u32),
1028 "incorrect right clip within multi-byte char at {:?}",
1029 test_point
1030 );
1031 }
1032
1033 for i in 1..c.len_utf16() {
1034 let test_point = Unclipped(PointUtf16::new(
1035 point_utf16.row,
1036 point_utf16.column + i as u32,
1037 ));
1038 assert_eq!(
1039 chunk.unclipped_point_utf16_to_point(test_point),
1040 point,
1041 "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
1042 test_point
1043 );
1044 assert_eq!(
1045 chunk.clip_point_utf16(test_point, Bias::Left),
1046 point_utf16,
1047 "incorrect left clip_point_utf16 within multi-byte char at {:?}",
1048 test_point
1049 );
1050 assert_eq!(
1051 chunk.clip_point_utf16(test_point, Bias::Right),
1052 PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
1053 "incorrect right clip_point_utf16 within multi-byte char at {:?}",
1054 test_point
1055 );
1056
1057 let test_offset = OffsetUtf16(offset_utf16.0 + i);
1058 assert_eq!(
1059 chunk.clip_offset_utf16(test_offset, Bias::Left),
1060 offset_utf16,
1061 "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
1062 test_offset
1063 );
1064 assert_eq!(
1065 chunk.clip_offset_utf16(test_offset, Bias::Right),
1066 OffsetUtf16(offset_utf16.0 + c.len_utf16()),
1067 "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
1068 test_offset
1069 );
1070 }
1071
1072 if c == '\n' {
1073 point.row += 1;
1074 point.column = 0;
1075 point_utf16.row += 1;
1076 point_utf16.column = 0;
1077 } else {
1078 point.column += c.len_utf8() as u32;
1079 point_utf16.column += c.len_utf16() as u32;
1080 }
1081
1082 if c == '\t' {
1083 expected_tab_positions.push(TabPosition {
1084 byte_offset: offset,
1085 char_offset,
1086 });
1087 }
1088
1089 offset += c.len_utf8();
1090 offset_utf16.0 += c.len_utf16();
1091 }
1092
1093 let final_point = chunk.offset_to_point(offset);
1094 assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1095 assert_eq!(
1096 chunk.point_to_offset(point),
1097 offset,
1098 "mismatch at point {:?}",
1099 point
1100 );
1101 assert_eq!(
1102 chunk.offset_to_offset_utf16(offset),
1103 offset_utf16,
1104 "mismatch at offset {}",
1105 offset
1106 );
1107 assert_eq!(
1108 chunk.offset_utf16_to_offset(offset_utf16),
1109 offset,
1110 "mismatch at offset_utf16 {:?}",
1111 offset_utf16
1112 );
1113 assert_eq!(
1114 chunk.point_to_point_utf16(point),
1115 point_utf16,
1116 "mismatch at final point {:?}",
1117 point
1118 );
1119 assert_eq!(
1120 chunk.point_utf16_to_offset(point_utf16, false),
1121 offset,
1122 "mismatch at final point_utf16 {:?}",
1123 point_utf16
1124 );
1125 assert_eq!(
1126 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1127 point,
1128 "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1129 point_utf16
1130 );
1131 assert_eq!(
1132 chunk.clip_point(point, Bias::Left),
1133 point,
1134 "incorrect left clip at final point {:?}",
1135 point
1136 );
1137 assert_eq!(
1138 chunk.clip_point(point, Bias::Right),
1139 point,
1140 "incorrect right clip at final point {:?}",
1141 point
1142 );
1143 assert_eq!(
1144 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1145 point_utf16,
1146 "incorrect left clip_point_utf16 at final point {:?}",
1147 point_utf16
1148 );
1149 assert_eq!(
1150 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1151 point_utf16,
1152 "incorrect right clip_point_utf16 at final point {:?}",
1153 point_utf16
1154 );
1155 assert_eq!(
1156 chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1157 offset_utf16,
1158 "incorrect left clip_offset_utf16 at final offset {:?}",
1159 offset_utf16
1160 );
1161 assert_eq!(
1162 chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1163 offset_utf16,
1164 "incorrect right clip_offset_utf16 at final offset {:?}",
1165 offset_utf16
1166 );
1167
1168 // Verify length methods
1169 assert_eq!(chunk.len(), text.len());
1170 assert_eq!(
1171 chunk.len_utf16().0,
1172 text.chars().map(|c| c.len_utf16()).sum::<usize>()
1173 );
1174
1175 // Verify line counting
1176 let lines = chunk.lines();
1177 let mut newline_count = 0;
1178 let mut last_line_len = 0;
1179 for c in text.chars() {
1180 if c == '\n' {
1181 newline_count += 1;
1182 last_line_len = 0;
1183 } else {
1184 last_line_len += c.len_utf8() as u32;
1185 }
1186 }
1187 assert_eq!(lines, Point::new(newline_count, last_line_len));
1188
1189 // Verify first/last line chars
1190 if !text.is_empty() {
1191 let first_line = text.split('\n').next().unwrap();
1192 assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1193
1194 let last_line = text.split('\n').next_back().unwrap();
1195 assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1196 assert_eq!(
1197 chunk.last_line_len_utf16(),
1198 last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1199 );
1200 }
1201
1202 // Verify longest row
1203 let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1204 let mut max_chars = 0;
1205 let mut current_row = 0;
1206 let mut current_chars = 0;
1207 let mut max_row = 0;
1208
1209 for c in text.chars() {
1210 if c == '\n' {
1211 if current_chars > max_chars {
1212 max_chars = current_chars;
1213 max_row = current_row;
1214 }
1215 current_row += 1;
1216 current_chars = 0;
1217 } else {
1218 current_chars += 1;
1219 }
1220 }
1221
1222 if current_chars > max_chars {
1223 max_chars = current_chars;
1224 max_row = current_row;
1225 }
1226
1227 assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1228 assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1229 }
1230
1231 #[gpui::test]
1232 fn test_point_utf16_to_offset_clips_to_correct_absolute_offset() {
1233 let text = "abc\nde";
1234 let chunk = Chunk::new(text);
1235 let slice = chunk.as_slice();
1236
1237 // Clipping on row 0 (row_offset_range.start == 0, so relative == absolute)
1238 assert_eq!(slice.point_utf16_to_offset(PointUtf16::new(0, 99), true), 3,);
1239
1240 // Clipping on row 1 — this is the case that was buggy.
1241 // Row 1 starts at byte offset 4 ("de" is bytes 4..6), so the
1242 // clipped result must be 6, not 2.
1243 assert_eq!(slice.point_utf16_to_offset(PointUtf16::new(1, 99), true), 6,);
1244 }
1245}