1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
2use arrayvec::ArrayString;
3use std::{cmp, ops::Range};
4use sum_tree::Bias;
5use unicode_segmentation::GraphemeCursor;
6use util::debug_panic;
7
8#[cfg(not(all(test, not(rust_analyzer))))]
9pub(crate) type Bitmap = u128;
10#[cfg(all(test, not(rust_analyzer)))]
11pub(crate) type Bitmap = u16;
12
13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
15
16#[derive(Clone, Debug, Default)]
17pub struct Chunk {
18 /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
19 /// text.
20 chars: Bitmap,
21 /// The number of set bits is the number of UTF-16 code units it would take to represent the
22 /// text.
23 ///
24 /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
25 /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
26 /// than two UTF-16 code units.)
27 chars_utf16: Bitmap,
28 /// If bit[i] is set, then the character at index i is an ascii newline.
29 newlines: Bitmap,
30 /// If bit[i] is set, then the character at index i is an ascii tab.
31 tabs: Bitmap,
32 pub text: ArrayString<MAX_BASE>,
33}
34
35#[inline(always)]
36const fn saturating_shl_mask(offset: u32) -> Bitmap {
37 (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
38}
39
40#[inline(always)]
41const fn saturating_shr_mask(offset: u32) -> Bitmap {
42 !Bitmap::MAX.unbounded_shr(offset)
43}
44
45impl Chunk {
46 pub const MASK_BITS: usize = Bitmap::BITS as usize;
47
48 #[inline(always)]
49 pub fn new(text: &str) -> Self {
50 let mut this = Chunk::default();
51 this.push_str(text);
52 this
53 }
54
55 #[inline(always)]
56 pub fn push_str(&mut self, text: &str) {
57 for (char_ix, c) in text.char_indices() {
58 let ix = self.text.len() + char_ix;
59 self.chars |= 1 << ix;
60 self.chars_utf16 |= 1 << ix;
61 self.chars_utf16 |= (c.len_utf16() as Bitmap) << ix;
62 self.newlines |= ((c == '\n') as Bitmap) << ix;
63 self.tabs |= ((c == '\t') as Bitmap) << ix;
64 }
65 self.text.push_str(text);
66 }
67
68 #[inline(always)]
69 pub fn append(&mut self, slice: ChunkSlice) {
70 if slice.is_empty() {
71 return;
72 };
73
74 let base_ix = self.text.len();
75 self.chars |= slice.chars << base_ix;
76 self.chars_utf16 |= slice.chars_utf16 << base_ix;
77 self.newlines |= slice.newlines << base_ix;
78 self.tabs |= slice.tabs << base_ix;
79 self.text.push_str(slice.text);
80 }
81
82 #[inline(always)]
83 pub fn as_slice(&self) -> ChunkSlice<'_> {
84 ChunkSlice {
85 chars: self.chars,
86 chars_utf16: self.chars_utf16,
87 newlines: self.newlines,
88 tabs: self.tabs,
89 text: &self.text,
90 }
91 }
92
93 #[inline(always)]
94 pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
95 self.as_slice().slice(range)
96 }
97
98 #[inline(always)]
99 pub fn chars(&self) -> Bitmap {
100 self.chars
101 }
102
103 pub fn tabs(&self) -> Bitmap {
104 self.tabs
105 }
106
107 #[inline(always)]
108 pub fn is_char_boundary(&self, offset: usize) -> bool {
109 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
110 }
111
112 pub fn floor_char_boundary(&self, index: usize) -> usize {
113 #[inline]
114 pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
115 // This is bit magic equivalent to: b < 128 || b >= 192
116 (u8 as i8) >= -0x40
117 }
118
119 if index >= self.text.len() {
120 self.text.len()
121 } else {
122 let mut i = index;
123 while i > 0 {
124 if is_utf8_char_boundary(self.text.as_bytes()[i]) {
125 break;
126 }
127 i -= 1;
128 }
129
130 i
131 }
132 }
133
134 #[track_caller]
135 #[inline(always)]
136 pub fn assert_char_boundary(&self, offset: usize) {
137 if self.is_char_boundary(offset) {
138 return;
139 }
140 panic_char_boundary(self, offset);
141
142 #[cold]
143 #[inline(never)]
144 #[track_caller]
145 fn panic_char_boundary(chunk: &Chunk, offset: usize) {
146 if offset > chunk.text.len() {
147 panic!(
148 "byte index {} is out of bounds of `{:?}` (length: {})",
149 offset,
150 chunk.text,
151 chunk.text.len()
152 );
153 }
154 // find the character
155 let char_start = chunk.floor_char_boundary(offset);
156 // `char_start` must be less than len and a char boundary
157 let ch = chunk
158 .text
159 .get(char_start..)
160 .unwrap()
161 .chars()
162 .next()
163 .unwrap();
164 let char_range = char_start..char_start + ch.len_utf8();
165 panic!(
166 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
167 offset, ch, char_range,
168 );
169 }
170 }
171}
172
173#[derive(Clone, Copy, Debug)]
174pub struct ChunkSlice<'a> {
175 chars: Bitmap,
176 chars_utf16: Bitmap,
177 newlines: Bitmap,
178 tabs: Bitmap,
179 text: &'a str,
180}
181
182impl Into<Chunk> for ChunkSlice<'_> {
183 fn into(self) -> Chunk {
184 Chunk {
185 chars: self.chars,
186 chars_utf16: self.chars_utf16,
187 newlines: self.newlines,
188 tabs: self.tabs,
189 text: self.text.try_into().unwrap(),
190 }
191 }
192}
193
194impl<'a> ChunkSlice<'a> {
195 #[inline(always)]
196 pub fn is_empty(&self) -> bool {
197 self.text.is_empty()
198 }
199
200 #[inline(always)]
201 pub fn is_char_boundary(&self, offset: usize) -> bool {
202 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
203 }
204
205 #[inline(always)]
206 pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
207 if mid == MAX_BASE {
208 let left = self;
209 let right = ChunkSlice {
210 chars: 0,
211 chars_utf16: 0,
212 newlines: 0,
213 tabs: 0,
214 text: "",
215 };
216 (left, right)
217 } else {
218 let mask = ((1 as Bitmap) << mid) - 1;
219 let (left_text, right_text) = self.text.split_at(mid);
220 let left = ChunkSlice {
221 chars: self.chars & mask,
222 chars_utf16: self.chars_utf16 & mask,
223 newlines: self.newlines & mask,
224 tabs: self.tabs & mask,
225 text: left_text,
226 };
227 let right = ChunkSlice {
228 chars: self.chars >> mid,
229 chars_utf16: self.chars_utf16 >> mid,
230 newlines: self.newlines >> mid,
231 tabs: self.tabs >> mid,
232 text: right_text,
233 };
234 (left, right)
235 }
236 }
237
238 #[inline(always)]
239 pub fn slice(self, range: Range<usize>) -> Self {
240 let mask = (1 as Bitmap)
241 .unbounded_shl(range.end as u32)
242 .wrapping_sub(1);
243 if range.start == MAX_BASE {
244 Self {
245 chars: 0,
246 chars_utf16: 0,
247 newlines: 0,
248 tabs: 0,
249 text: "",
250 }
251 } else {
252 self.assert_char_boundary(range.start);
253 self.assert_char_boundary(range.end);
254 Self {
255 chars: (self.chars & mask) >> range.start,
256 chars_utf16: (self.chars_utf16 & mask) >> range.start,
257 newlines: (self.newlines & mask) >> range.start,
258 tabs: (self.tabs & mask) >> range.start,
259 text: &self.text[range],
260 }
261 }
262 }
263
264 #[inline(always)]
265 pub fn text_summary(&self) -> TextSummary {
266 let mut chars = 0;
267 let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
268 TextSummary {
269 len: self.len(),
270 chars,
271 len_utf16: self.len_utf16(),
272 lines: self.lines(),
273 first_line_chars: self.first_line_chars(),
274 last_line_chars: self.last_line_chars(),
275 last_line_len_utf16: self.last_line_len_utf16(),
276 longest_row,
277 longest_row_chars,
278 }
279 }
280
281 /// Get length in bytes
282 #[inline(always)]
283 pub fn len(&self) -> usize {
284 self.text.len()
285 }
286
287 /// Get length in UTF-16 code units
288 #[inline(always)]
289 pub fn len_utf16(&self) -> OffsetUtf16 {
290 OffsetUtf16(self.chars_utf16.count_ones() as usize)
291 }
292
293 /// Get point representing number of lines and length of last line
294 #[inline(always)]
295 pub fn lines(&self) -> Point {
296 let row = self.newlines.count_ones();
297 let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
298 Point::new(row, column)
299 }
300
301 /// Get number of chars in first line
302 #[inline(always)]
303 pub fn first_line_chars(&self) -> u32 {
304 (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
305 }
306
307 /// Get number of chars in last line
308 #[inline(always)]
309 pub fn last_line_chars(&self) -> u32 {
310 (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
311 }
312
313 /// Get number of UTF-16 code units in last line
314 #[inline(always)]
315 pub fn last_line_len_utf16(&self) -> u32 {
316 (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
317 }
318
319 /// Get the longest row in the chunk and its length in characters.
320 /// Calculate the total number of characters in the chunk along the way.
321 #[inline(always)]
322 pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
323 let mut chars = self.chars;
324 let mut newlines = self.newlines;
325 *total_chars = 0;
326 let mut row = 0;
327 let mut longest_row = 0;
328 let mut longest_row_chars = 0;
329 while newlines > 0 {
330 let newline_ix = newlines.trailing_zeros();
331 let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
332 *total_chars += usize::from(row_chars);
333 if row_chars > longest_row_chars {
334 longest_row = row;
335 longest_row_chars = row_chars;
336 }
337
338 newlines >>= newline_ix;
339 newlines >>= 1;
340 chars >>= newline_ix;
341 chars >>= 1;
342 row += 1;
343 *total_chars += 1;
344 }
345
346 let row_chars = chars.count_ones() as u8;
347 *total_chars += usize::from(row_chars);
348 if row_chars > longest_row_chars {
349 (row, row_chars as u32)
350 } else {
351 (longest_row, longest_row_chars as u32)
352 }
353 }
354
355 #[inline(always)]
356 pub fn offset_to_point(&self, offset: usize) -> Point {
357 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
358 let row = (self.newlines & mask).count_ones();
359 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
360 let column = (offset - newline_ix as usize) as u32;
361 Point::new(row, column)
362 }
363
364 #[inline(always)]
365 pub fn point_to_offset(&self, point: Point) -> usize {
366 if point.row > self.lines().row {
367 debug_panic!(
368 "point {:?} extends beyond rows for string {:?}",
369 point,
370 self.text
371 );
372 return self.len();
373 }
374
375 let row_offset_range = self.offset_range_for_row(point.row);
376 if point.column > row_offset_range.len() as u32 {
377 debug_panic!(
378 "point {:?} extends beyond row for string {:?}",
379 point,
380 self.text
381 );
382 row_offset_range.end
383 } else {
384 row_offset_range.start + point.column as usize
385 }
386 }
387
388 #[track_caller]
389 #[inline(always)]
390 pub fn assert_char_boundary(&self, offset: usize) {
391 if self.is_char_boundary(offset) {
392 return;
393 }
394 panic_char_boundary(self, offset);
395
396 #[cold]
397 #[inline(never)]
398 fn panic_char_boundary(chunk: &ChunkSlice, offset: usize) {
399 if offset > chunk.text.len() {
400 panic!(
401 "byte index {} is out of bounds of `{:?}` (length: {})",
402 offset,
403 chunk.text,
404 chunk.text.len()
405 );
406 }
407 // find the character
408 let char_start = chunk.floor_char_boundary(offset);
409 // `char_start` must be less than len and a char boundary
410 let ch = chunk
411 .text
412 .get(char_start..)
413 .unwrap()
414 .chars()
415 .next()
416 .unwrap();
417 let char_range = char_start..char_start + ch.len_utf8();
418 panic!(
419 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
420 offset, ch, char_range,
421 );
422 }
423 }
424
425 pub fn floor_char_boundary(&self, index: usize) -> usize {
426 #[inline]
427 pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
428 // This is bit magic equivalent to: b < 128 || b >= 192
429 (u8 as i8) >= -0x40
430 }
431
432 if index >= self.text.len() {
433 self.text.len()
434 } else {
435 let mut i = index;
436 while i > 0 {
437 if is_utf8_char_boundary(self.text.as_bytes()[i]) {
438 break;
439 }
440 i -= 1;
441 }
442
443 i
444 }
445 }
446
447 #[inline(always)]
448 pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
449 if point.row > self.lines().row {
450 debug_panic!(
451 "point {:?} extends beyond rows for string {:?}",
452 point,
453 self.text
454 );
455 return self.len_utf16();
456 }
457 self.offset_to_offset_utf16(self.point_to_offset(point))
458 }
459
460 #[inline(always)]
461 pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
462 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
463 OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
464 }
465
466 #[inline(always)]
467 pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
468 if target.0 == 0 {
469 0
470 } else {
471 #[cfg(not(test))]
472 let chars_utf16 = self.chars_utf16;
473 #[cfg(test)]
474 let chars_utf16 = self.chars_utf16 as u128;
475 let ix = nth_set_bit(chars_utf16, target.0) + 1;
476 if ix == MAX_BASE {
477 MAX_BASE
478 } else {
479 let utf8_additional_len = cmp::min(
480 (self.chars_utf16 >> ix).trailing_zeros() as usize,
481 self.text.len() - ix,
482 );
483 ix + utf8_additional_len
484 }
485 }
486 }
487
488 #[inline(always)]
489 pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
490 let mask = saturating_shl_mask(offset as u32);
491 let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
492 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
493 let column = if newline_ix as usize == MAX_BASE {
494 0
495 } else {
496 ((self.chars_utf16 & mask) >> newline_ix).count_ones()
497 };
498 PointUtf16::new(row, column)
499 }
500
501 #[inline(always)]
502 pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
503 self.offset_to_point_utf16(self.point_to_offset(point))
504 }
505
506 #[inline(always)]
507 pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
508 let lines = self.lines();
509 if point.row > lines.row {
510 if !clip {
511 debug_panic!(
512 "point {:?} is beyond this chunk's extent {:?}",
513 point,
514 self.text
515 );
516 }
517 return self.len();
518 }
519
520 let row_offset_range = self.offset_range_for_row(point.row);
521 let line = self.slice(row_offset_range.clone());
522 if point.column > line.last_line_len_utf16() {
523 if !clip {
524 debug_panic!(
525 "point {:?} is beyond the end of the line in chunk {:?}",
526 point,
527 self.text
528 );
529 }
530 return line.len();
531 }
532
533 let mut offset = row_offset_range.start;
534 if point.column > 0 {
535 offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
536 if !self.text.is_char_boundary(offset) {
537 offset -= 1;
538 while !self.text.is_char_boundary(offset) {
539 offset -= 1;
540 }
541 if !clip {
542 debug_panic!(
543 "point {:?} is within character in chunk {:?}",
544 point,
545 self.text,
546 );
547 }
548 }
549 }
550 offset
551 }
552
553 #[inline(always)]
554 pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
555 let max_point = self.lines();
556 if point.0.row > max_point.row {
557 return max_point;
558 }
559
560 let row_offset_range = self.offset_range_for_row(point.0.row);
561 let line = self.slice(row_offset_range);
562 if point.0.column == 0 {
563 Point::new(point.0.row, 0)
564 } else if point.0.column >= line.len_utf16().0 as u32 {
565 Point::new(point.0.row, line.len() as u32)
566 } else {
567 let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
568 while !line.text.is_char_boundary(column) {
569 column -= 1;
570 }
571 Point::new(point.0.row, column as u32)
572 }
573 }
574
575 #[inline(always)]
576 pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
577 let max_point = self.lines();
578 if point.row > max_point.row {
579 return max_point;
580 }
581
582 let line = self.slice(self.offset_range_for_row(point.row));
583 if point.column == 0 {
584 point
585 } else if point.column >= line.len() as u32 {
586 Point::new(point.row, line.len() as u32)
587 } else {
588 let mut column = point.column as usize;
589 let bytes = line.text.as_bytes();
590 if bytes[column - 1] < 128 && bytes[column] < 128 {
591 return Point::new(point.row, column as u32);
592 }
593
594 let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
595 loop {
596 if line.is_char_boundary(column)
597 && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
598 {
599 break;
600 }
601
602 match bias {
603 Bias::Left => column -= 1,
604 Bias::Right => column += 1,
605 }
606 grapheme_cursor.set_cursor(column);
607 }
608 Point::new(point.row, column as u32)
609 }
610 }
611
612 #[inline(always)]
613 pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
614 let max_point = self.lines();
615 if point.0.row > max_point.row {
616 PointUtf16::new(max_point.row, self.last_line_len_utf16())
617 } else {
618 let line = self.slice(self.offset_range_for_row(point.0.row));
619 let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
620 PointUtf16::new(point.0.row, column.0 as u32)
621 }
622 }
623
624 #[inline(always)]
625 pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
626 if target == OffsetUtf16::default() {
627 OffsetUtf16::default()
628 } else if target >= self.len_utf16() {
629 self.len_utf16()
630 } else {
631 let mut offset = self.offset_utf16_to_offset(target);
632 while !self.text.is_char_boundary(offset) {
633 if bias == Bias::Left {
634 offset -= 1;
635 } else {
636 offset += 1;
637 }
638 }
639 self.offset_to_offset_utf16(offset)
640 }
641 }
642
643 #[inline(always)]
644 fn offset_range_for_row(&self, row: u32) -> Range<usize> {
645 let row_start = if row > 0 {
646 #[cfg(not(test))]
647 let newlines = self.newlines;
648 #[cfg(test)]
649 let newlines = self.newlines as u128;
650 nth_set_bit(newlines, row as usize) + 1
651 } else {
652 0
653 };
654 let row_len = if row_start == MAX_BASE {
655 0
656 } else {
657 cmp::min(
658 (self.newlines >> row_start).trailing_zeros(),
659 (self.text.len() - row_start) as u32,
660 )
661 };
662 row_start..row_start + row_len as usize
663 }
664
665 #[inline(always)]
666 pub fn tabs(&self) -> Tabs {
667 Tabs {
668 tabs: self.tabs,
669 chars: self.chars,
670 }
671 }
672}
673
674pub struct Tabs {
675 tabs: Bitmap,
676 chars: Bitmap,
677}
678
679#[derive(Debug, PartialEq, Eq)]
680pub struct TabPosition {
681 pub byte_offset: usize,
682 pub char_offset: usize,
683}
684
685impl Iterator for Tabs {
686 type Item = TabPosition;
687
688 fn next(&mut self) -> Option<Self::Item> {
689 if self.tabs == 0 {
690 return None;
691 }
692
693 let tab_offset = self.tabs.trailing_zeros() as usize;
694 let chars_mask = (1 << tab_offset) - 1;
695 let char_offset = (self.chars & chars_mask).count_ones() as usize;
696
697 // Since tabs are 1 byte the tab offset is the same as the byte offset
698 let position = TabPosition {
699 byte_offset: tab_offset,
700 char_offset,
701 };
702 // Remove the tab we've just seen
703 self.tabs ^= 1 << tab_offset;
704
705 Some(position)
706 }
707}
708
709/// Finds the n-th bit that is set to 1.
710#[inline(always)]
711fn nth_set_bit(v: u128, n: usize) -> usize {
712 let low = v as u64;
713 let high = (v >> 64) as u64;
714
715 let low_count = low.count_ones() as usize;
716 if n > low_count {
717 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
718 } else {
719 nth_set_bit_u64(low, n as u64) as usize
720 }
721}
722
723#[inline(always)]
724fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
725 let v = v.reverse_bits();
726 let mut s: u64 = 64;
727
728 // Parallel bit count intermediates
729 let a = v - ((v >> 1) & (u64::MAX / 3));
730 let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
731 let c = (b + (b >> 4)) & (u64::MAX / 0x11);
732 let d = (c + (c >> 8)) & (u64::MAX / 0x101);
733
734 // Branchless select
735 let t = (d >> 32) + (d >> 48);
736 s -= (t.wrapping_sub(n) & 256) >> 3;
737 n -= t & (t.wrapping_sub(n) >> 8);
738
739 let t = (d >> (s - 16)) & 0xff;
740 s -= (t.wrapping_sub(n) & 256) >> 4;
741 n -= t & (t.wrapping_sub(n) >> 8);
742
743 let t = (c >> (s - 8)) & 0xf;
744 s -= (t.wrapping_sub(n) & 256) >> 5;
745 n -= t & (t.wrapping_sub(n) >> 8);
746
747 let t = (b >> (s - 4)) & 0x7;
748 s -= (t.wrapping_sub(n) & 256) >> 6;
749 n -= t & (t.wrapping_sub(n) >> 8);
750
751 let t = (a >> (s - 2)) & 0x3;
752 s -= (t.wrapping_sub(n) & 256) >> 7;
753 n -= t & (t.wrapping_sub(n) >> 8);
754
755 let t = (v >> (s - 1)) & 0x1;
756 s -= (t.wrapping_sub(n) & 256) >> 8;
757
758 65 - s - 1
759}
760
761#[cfg(test)]
762mod tests {
763 use super::*;
764 use rand::prelude::*;
765 use util::RandomCharIter;
766
767 #[gpui::test(iterations = 100)]
768 fn test_random_chunks(mut rng: StdRng) {
769 let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
770 log::info!("Chunk: {:?}", text);
771 let chunk = Chunk::new(&text);
772 verify_chunk(chunk.as_slice(), &text);
773
774 // Verify Chunk::chars() bitmap
775 let expected_chars = char_offsets(&text)
776 .into_iter()
777 .inspect(|i| assert!(*i < MAX_BASE))
778 .fold(0 as Bitmap, |acc, i| acc | (1 << i));
779 assert_eq!(chunk.chars(), expected_chars);
780
781 for _ in 0..10 {
782 let mut start = rng.random_range(0..=chunk.text.len());
783 let mut end = rng.random_range(start..=chunk.text.len());
784 while !chunk.text.is_char_boundary(start) {
785 start -= 1;
786 }
787 while !chunk.text.is_char_boundary(end) {
788 end -= 1;
789 }
790 let range = start..end;
791 log::info!("Range: {:?}", range);
792 let text_slice = &text[range.clone()];
793 let chunk_slice = chunk.slice(range);
794 verify_chunk(chunk_slice, text_slice);
795 }
796 }
797
798 #[gpui::test(iterations = 100)]
799 fn test_split_chunk_slice(mut rng: StdRng) {
800 let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
801 let chunk = Chunk::new(text);
802 let offset = char_offsets_with_end(text)
803 .into_iter()
804 .choose(&mut rng)
805 .unwrap();
806 let (a, b) = chunk.as_slice().split_at(offset);
807 let (a_str, b_str) = text.split_at(offset);
808 verify_chunk(a, a_str);
809 verify_chunk(b, b_str);
810 }
811
812 #[gpui::test(iterations = 1000)]
813 fn test_nth_set_bit_random(mut rng: StdRng) {
814 let set_count = rng.random_range(0..=128);
815 let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
816 set_bits.sort();
817 let mut n = 0;
818 for ix in set_bits.iter().copied() {
819 n |= 1 << ix;
820 }
821
822 for (mut ix, position) in set_bits.into_iter().enumerate() {
823 ix += 1;
824 assert_eq!(
825 nth_set_bit(n, ix),
826 position,
827 "nth_set_bit({:0128b}, {})",
828 n,
829 ix
830 );
831 }
832 }
833
834 /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
835 fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
836 let mut str = String::new();
837 let mut chars = RandomCharIter::new(rng);
838 loop {
839 let ch = chars.next().unwrap();
840 if str.len() + ch.len_utf8() > len {
841 break;
842 }
843 str.push(ch);
844 }
845 str
846 }
847
848 #[gpui::test(iterations = 1000)]
849 fn test_append_random_strings(mut rng: StdRng) {
850 let len1 = rng.random_range(0..=MAX_BASE);
851 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
852 let str1 = random_string_with_utf8_len(&mut rng, len1);
853 let str2 = random_string_with_utf8_len(&mut rng, len2);
854 let mut chunk1 = Chunk::new(&str1);
855 let chunk2 = Chunk::new(&str2);
856 let char_offsets = char_offsets_with_end(&str2);
857 let start_index = rng.random_range(0..char_offsets.len());
858 let start_offset = char_offsets[start_index];
859 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
860 chunk1.append(chunk2.slice(start_offset..end_offset));
861 verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
862 }
863
864 /// Return the byte offsets for each character in a string.
865 ///
866 /// These are valid offsets to split the string.
867 fn char_offsets(text: &str) -> Vec<usize> {
868 text.char_indices().map(|(i, _c)| i).collect()
869 }
870
871 /// Return the byte offsets for each character in a string, plus the offset
872 /// past the end of the string.
873 fn char_offsets_with_end(text: &str) -> Vec<usize> {
874 let mut v = char_offsets(text);
875 v.push(text.len());
876 v
877 }
878
879 fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
880 let mut offset = 0;
881 let mut offset_utf16 = OffsetUtf16(0);
882 let mut point = Point::zero();
883 let mut point_utf16 = PointUtf16::zero();
884
885 log::info!("Verifying chunk {:?}", text);
886 assert_eq!(chunk.offset_to_point(0), Point::zero());
887
888 let mut expected_tab_positions = Vec::new();
889
890 for (char_offset, c) in text.chars().enumerate() {
891 let expected_point = chunk.offset_to_point(offset);
892 assert_eq!(point, expected_point, "mismatch at offset {}", offset);
893 assert_eq!(
894 chunk.point_to_offset(point),
895 offset,
896 "mismatch at point {:?}",
897 point
898 );
899 assert_eq!(
900 chunk.offset_to_offset_utf16(offset),
901 offset_utf16,
902 "mismatch at offset {}",
903 offset
904 );
905 assert_eq!(
906 chunk.offset_utf16_to_offset(offset_utf16),
907 offset,
908 "mismatch at offset_utf16 {:?}",
909 offset_utf16
910 );
911 assert_eq!(
912 chunk.point_to_point_utf16(point),
913 point_utf16,
914 "mismatch at point {:?}",
915 point
916 );
917 assert_eq!(
918 chunk.point_utf16_to_offset(point_utf16, false),
919 offset,
920 "mismatch at point_utf16 {:?}",
921 point_utf16
922 );
923 assert_eq!(
924 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
925 point,
926 "mismatch for unclipped_point_utf16_to_point at {:?}",
927 point_utf16
928 );
929
930 assert_eq!(
931 chunk.clip_point(point, Bias::Left),
932 point,
933 "incorrect left clip at {:?}",
934 point
935 );
936 assert_eq!(
937 chunk.clip_point(point, Bias::Right),
938 point,
939 "incorrect right clip at {:?}",
940 point
941 );
942
943 for i in 1..c.len_utf8() {
944 let test_point = Point::new(point.row, point.column + i as u32);
945 assert_eq!(
946 chunk.clip_point(test_point, Bias::Left),
947 point,
948 "incorrect left clip within multi-byte char at {:?}",
949 test_point
950 );
951 assert_eq!(
952 chunk.clip_point(test_point, Bias::Right),
953 Point::new(point.row, point.column + c.len_utf8() as u32),
954 "incorrect right clip within multi-byte char at {:?}",
955 test_point
956 );
957 }
958
959 for i in 1..c.len_utf16() {
960 let test_point = Unclipped(PointUtf16::new(
961 point_utf16.row,
962 point_utf16.column + i as u32,
963 ));
964 assert_eq!(
965 chunk.unclipped_point_utf16_to_point(test_point),
966 point,
967 "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
968 test_point
969 );
970 assert_eq!(
971 chunk.clip_point_utf16(test_point, Bias::Left),
972 point_utf16,
973 "incorrect left clip_point_utf16 within multi-byte char at {:?}",
974 test_point
975 );
976 assert_eq!(
977 chunk.clip_point_utf16(test_point, Bias::Right),
978 PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
979 "incorrect right clip_point_utf16 within multi-byte char at {:?}",
980 test_point
981 );
982
983 let test_offset = OffsetUtf16(offset_utf16.0 + i);
984 assert_eq!(
985 chunk.clip_offset_utf16(test_offset, Bias::Left),
986 offset_utf16,
987 "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
988 test_offset
989 );
990 assert_eq!(
991 chunk.clip_offset_utf16(test_offset, Bias::Right),
992 OffsetUtf16(offset_utf16.0 + c.len_utf16()),
993 "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
994 test_offset
995 );
996 }
997
998 if c == '\n' {
999 point.row += 1;
1000 point.column = 0;
1001 point_utf16.row += 1;
1002 point_utf16.column = 0;
1003 } else {
1004 point.column += c.len_utf8() as u32;
1005 point_utf16.column += c.len_utf16() as u32;
1006 }
1007
1008 if c == '\t' {
1009 expected_tab_positions.push(TabPosition {
1010 byte_offset: offset,
1011 char_offset,
1012 });
1013 }
1014
1015 offset += c.len_utf8();
1016 offset_utf16.0 += c.len_utf16();
1017 }
1018
1019 let final_point = chunk.offset_to_point(offset);
1020 assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1021 assert_eq!(
1022 chunk.point_to_offset(point),
1023 offset,
1024 "mismatch at point {:?}",
1025 point
1026 );
1027 assert_eq!(
1028 chunk.offset_to_offset_utf16(offset),
1029 offset_utf16,
1030 "mismatch at offset {}",
1031 offset
1032 );
1033 assert_eq!(
1034 chunk.offset_utf16_to_offset(offset_utf16),
1035 offset,
1036 "mismatch at offset_utf16 {:?}",
1037 offset_utf16
1038 );
1039 assert_eq!(
1040 chunk.point_to_point_utf16(point),
1041 point_utf16,
1042 "mismatch at final point {:?}",
1043 point
1044 );
1045 assert_eq!(
1046 chunk.point_utf16_to_offset(point_utf16, false),
1047 offset,
1048 "mismatch at final point_utf16 {:?}",
1049 point_utf16
1050 );
1051 assert_eq!(
1052 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1053 point,
1054 "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1055 point_utf16
1056 );
1057 assert_eq!(
1058 chunk.clip_point(point, Bias::Left),
1059 point,
1060 "incorrect left clip at final point {:?}",
1061 point
1062 );
1063 assert_eq!(
1064 chunk.clip_point(point, Bias::Right),
1065 point,
1066 "incorrect right clip at final point {:?}",
1067 point
1068 );
1069 assert_eq!(
1070 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1071 point_utf16,
1072 "incorrect left clip_point_utf16 at final point {:?}",
1073 point_utf16
1074 );
1075 assert_eq!(
1076 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1077 point_utf16,
1078 "incorrect right clip_point_utf16 at final point {:?}",
1079 point_utf16
1080 );
1081 assert_eq!(
1082 chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1083 offset_utf16,
1084 "incorrect left clip_offset_utf16 at final offset {:?}",
1085 offset_utf16
1086 );
1087 assert_eq!(
1088 chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1089 offset_utf16,
1090 "incorrect right clip_offset_utf16 at final offset {:?}",
1091 offset_utf16
1092 );
1093
1094 // Verify length methods
1095 assert_eq!(chunk.len(), text.len());
1096 assert_eq!(
1097 chunk.len_utf16().0,
1098 text.chars().map(|c| c.len_utf16()).sum::<usize>()
1099 );
1100
1101 // Verify line counting
1102 let lines = chunk.lines();
1103 let mut newline_count = 0;
1104 let mut last_line_len = 0;
1105 for c in text.chars() {
1106 if c == '\n' {
1107 newline_count += 1;
1108 last_line_len = 0;
1109 } else {
1110 last_line_len += c.len_utf8() as u32;
1111 }
1112 }
1113 assert_eq!(lines, Point::new(newline_count, last_line_len));
1114
1115 // Verify first/last line chars
1116 if !text.is_empty() {
1117 let first_line = text.split('\n').next().unwrap();
1118 assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1119
1120 let last_line = text.split('\n').next_back().unwrap();
1121 assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1122 assert_eq!(
1123 chunk.last_line_len_utf16(),
1124 last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1125 );
1126 }
1127
1128 // Verify longest row
1129 let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1130 let mut max_chars = 0;
1131 let mut current_row = 0;
1132 let mut current_chars = 0;
1133 let mut max_row = 0;
1134
1135 for c in text.chars() {
1136 if c == '\n' {
1137 if current_chars > max_chars {
1138 max_chars = current_chars;
1139 max_row = current_row;
1140 }
1141 current_row += 1;
1142 current_chars = 0;
1143 } else {
1144 current_chars += 1;
1145 }
1146 }
1147
1148 if current_chars > max_chars {
1149 max_chars = current_chars;
1150 max_row = current_row;
1151 }
1152
1153 assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1154 assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1155 }
1156}