1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
2use arrayvec::ArrayString;
3use std::{cmp, ops::Range};
4use sum_tree::Bias;
5use unicode_segmentation::GraphemeCursor;
6use util::debug_panic;
7
8#[cfg(not(all(test, not(rust_analyzer))))]
9pub(crate) type Bitmap = u128;
10#[cfg(all(test, not(rust_analyzer)))]
11pub(crate) type Bitmap = u16;
12
13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
15
16#[derive(Clone, Debug, Default)]
17pub struct Chunk {
18 /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
19 /// text.
20 chars: Bitmap,
21 /// The number of set bits is the number of UTF-16 code units it would take to represent the
22 /// text.
23 ///
24 /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
25 /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
26 /// than two UTF-16 code units.)
27 chars_utf16: Bitmap,
28 /// If bit[i] is set, then the character at index i is an ascii newline.
29 newlines: Bitmap,
30 /// If bit[i] is set, then the character at index i is an ascii tab.
31 tabs: Bitmap,
32 pub text: ArrayString<MAX_BASE>,
33}
34
35impl Chunk {
36 pub const MASK_BITS: usize = Bitmap::BITS as usize;
37
38 #[inline(always)]
39 pub fn new(text: &str) -> Self {
40 let mut this = Chunk::default();
41 this.push_str(text);
42 this
43 }
44
45 #[inline(always)]
46 pub fn push_str(&mut self, text: &str) {
47 for (char_ix, c) in text.char_indices() {
48 let ix = self.text.len() + char_ix;
49 self.chars |= 1 << ix;
50 self.chars_utf16 |= 1 << ix;
51 self.chars_utf16 |= (c.len_utf16() as Bitmap) << ix;
52 self.newlines |= ((c == '\n') as Bitmap) << ix;
53 self.tabs |= ((c == '\t') as Bitmap) << ix;
54 }
55 self.text.push_str(text);
56 }
57
58 #[inline(always)]
59 pub fn append(&mut self, slice: ChunkSlice) {
60 if slice.is_empty() {
61 return;
62 };
63
64 let base_ix = self.text.len();
65 self.chars |= slice.chars << base_ix;
66 self.chars_utf16 |= slice.chars_utf16 << base_ix;
67 self.newlines |= slice.newlines << base_ix;
68 self.tabs |= slice.tabs << base_ix;
69 self.text.push_str(slice.text);
70 }
71
72 #[inline(always)]
73 pub fn as_slice(&self) -> ChunkSlice<'_> {
74 ChunkSlice {
75 chars: self.chars,
76 chars_utf16: self.chars_utf16,
77 newlines: self.newlines,
78 tabs: self.tabs,
79 text: &self.text,
80 }
81 }
82
83 #[inline(always)]
84 pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
85 self.as_slice().slice(range)
86 }
87
88 #[inline(always)]
89 pub fn chars(&self) -> Bitmap {
90 self.chars
91 }
92
93 pub fn tabs(&self) -> Bitmap {
94 self.tabs
95 }
96
97 #[inline(always)]
98 pub fn is_char_boundary(&self, offset: usize) -> bool {
99 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
100 }
101
102 pub fn floor_char_boundary(&self, index: usize) -> usize {
103 #[inline]
104 pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
105 // This is bit magic equivalent to: b < 128 || b >= 192
106 (u8 as i8) >= -0x40
107 }
108
109 if index >= self.text.len() {
110 self.text.len()
111 } else {
112 let mut i = index;
113 while i > 0 {
114 if is_utf8_char_boundary(self.text.as_bytes()[i]) {
115 break;
116 }
117 i -= 1;
118 }
119
120 i
121 }
122 }
123
124 #[track_caller]
125 #[inline(always)]
126 pub fn assert_char_boundary(&self, offset: usize) {
127 if self.is_char_boundary(offset) {
128 return;
129 }
130 panic_char_boundary(self, offset);
131
132 #[cold]
133 #[inline(never)]
134 fn panic_char_boundary(chunk: &Chunk, offset: usize) {
135 if offset > chunk.text.len() {
136 panic!(
137 "byte index {} is out of bounds of `{:?}` (length: {})",
138 offset,
139 chunk.text,
140 chunk.text.len()
141 );
142 }
143 // find the character
144 let char_start = chunk.floor_char_boundary(offset);
145 // `char_start` must be less than len and a char boundary
146 let ch = chunk
147 .text
148 .get(char_start..)
149 .unwrap()
150 .chars()
151 .next()
152 .unwrap();
153 let char_range = char_start..char_start + ch.len_utf8();
154 panic!(
155 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
156 offset, ch, char_range,
157 );
158 }
159 }
160}
161
162#[derive(Clone, Copy, Debug)]
163pub struct ChunkSlice<'a> {
164 chars: Bitmap,
165 chars_utf16: Bitmap,
166 newlines: Bitmap,
167 tabs: Bitmap,
168 text: &'a str,
169}
170
171impl Into<Chunk> for ChunkSlice<'_> {
172 fn into(self) -> Chunk {
173 Chunk {
174 chars: self.chars,
175 chars_utf16: self.chars_utf16,
176 newlines: self.newlines,
177 tabs: self.tabs,
178 text: self.text.try_into().unwrap(),
179 }
180 }
181}
182
183impl<'a> ChunkSlice<'a> {
184 #[inline(always)]
185 pub fn is_empty(&self) -> bool {
186 self.text.is_empty()
187 }
188
189 #[inline(always)]
190 pub fn is_char_boundary(&self, offset: usize) -> bool {
191 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
192 }
193
194 #[inline(always)]
195 pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
196 if mid == MAX_BASE {
197 let left = self;
198 let right = ChunkSlice {
199 chars: 0,
200 chars_utf16: 0,
201 newlines: 0,
202 tabs: 0,
203 text: "",
204 };
205 (left, right)
206 } else {
207 let mask = ((1 as Bitmap) << mid) - 1;
208 let (left_text, right_text) = self.text.split_at(mid);
209 let left = ChunkSlice {
210 chars: self.chars & mask,
211 chars_utf16: self.chars_utf16 & mask,
212 newlines: self.newlines & mask,
213 tabs: self.tabs & mask,
214 text: left_text,
215 };
216 let right = ChunkSlice {
217 chars: self.chars >> mid,
218 chars_utf16: self.chars_utf16 >> mid,
219 newlines: self.newlines >> mid,
220 tabs: self.tabs >> mid,
221 text: right_text,
222 };
223 (left, right)
224 }
225 }
226
227 #[inline(always)]
228 pub fn slice(self, range: Range<usize>) -> Self {
229 let mask = (1 as Bitmap)
230 .unbounded_shl(range.end as u32)
231 .wrapping_sub(1);
232 if range.start == MAX_BASE {
233 Self {
234 chars: 0,
235 chars_utf16: 0,
236 newlines: 0,
237 tabs: 0,
238 text: "",
239 }
240 } else {
241 self.assert_char_boundary(range.start);
242 self.assert_char_boundary(range.end);
243 Self {
244 chars: (self.chars & mask) >> range.start,
245 chars_utf16: (self.chars_utf16 & mask) >> range.start,
246 newlines: (self.newlines & mask) >> range.start,
247 tabs: (self.tabs & mask) >> range.start,
248 text: &self.text[range],
249 }
250 }
251 }
252
253 #[inline(always)]
254 pub fn text_summary(&self) -> TextSummary {
255 let mut chars = 0;
256 let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
257 TextSummary {
258 len: self.len(),
259 chars,
260 len_utf16: self.len_utf16(),
261 lines: self.lines(),
262 first_line_chars: self.first_line_chars(),
263 last_line_chars: self.last_line_chars(),
264 last_line_len_utf16: self.last_line_len_utf16(),
265 longest_row,
266 longest_row_chars,
267 }
268 }
269
270 /// Get length in bytes
271 #[inline(always)]
272 pub fn len(&self) -> usize {
273 self.text.len()
274 }
275
276 /// Get length in UTF-16 code units
277 #[inline(always)]
278 pub fn len_utf16(&self) -> OffsetUtf16 {
279 OffsetUtf16(self.chars_utf16.count_ones() as usize)
280 }
281
282 /// Get point representing number of lines and length of last line
283 #[inline(always)]
284 pub fn lines(&self) -> Point {
285 let row = self.newlines.count_ones();
286 let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
287 Point::new(row, column)
288 }
289
290 /// Get number of chars in first line
291 #[inline(always)]
292 pub fn first_line_chars(&self) -> u32 {
293 if self.newlines == 0 {
294 self.chars.count_ones()
295 } else {
296 let mask = ((1 as Bitmap) << self.newlines.trailing_zeros()) - 1;
297 (self.chars & mask).count_ones()
298 }
299 }
300
301 /// Get number of chars in last line
302 #[inline(always)]
303 pub fn last_line_chars(&self) -> u32 {
304 if self.newlines == 0 {
305 self.chars.count_ones()
306 } else {
307 let mask = !(Bitmap::MAX >> self.newlines.leading_zeros());
308 (self.chars & mask).count_ones()
309 }
310 }
311
312 /// Get number of UTF-16 code units in last line
313 #[inline(always)]
314 pub fn last_line_len_utf16(&self) -> u32 {
315 if self.newlines == 0 {
316 self.chars_utf16.count_ones()
317 } else {
318 let mask = !(Bitmap::MAX >> self.newlines.leading_zeros());
319 (self.chars_utf16 & mask).count_ones()
320 }
321 }
322
323 /// Get the longest row in the chunk and its length in characters.
324 /// Calculate the total number of characters in the chunk along the way.
325 #[inline(always)]
326 pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
327 let mut chars = self.chars;
328 let mut newlines = self.newlines;
329 *total_chars = 0;
330 let mut row = 0;
331 let mut longest_row = 0;
332 let mut longest_row_chars = 0;
333 while newlines > 0 {
334 let newline_ix = newlines.trailing_zeros();
335 let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
336 *total_chars += usize::from(row_chars);
337 if row_chars > longest_row_chars {
338 longest_row = row;
339 longest_row_chars = row_chars;
340 }
341
342 newlines >>= newline_ix;
343 newlines >>= 1;
344 chars >>= newline_ix;
345 chars >>= 1;
346 row += 1;
347 *total_chars += 1;
348 }
349
350 let row_chars = chars.count_ones() as u8;
351 *total_chars += usize::from(row_chars);
352 if row_chars > longest_row_chars {
353 (row, row_chars as u32)
354 } else {
355 (longest_row, longest_row_chars as u32)
356 }
357 }
358
359 #[inline(always)]
360 pub fn offset_to_point(&self, offset: usize) -> Point {
361 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
362 let row = (self.newlines & mask).count_ones();
363 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
364 let column = (offset - newline_ix as usize) as u32;
365 Point::new(row, column)
366 }
367
368 #[inline(always)]
369 pub fn point_to_offset(&self, point: Point) -> usize {
370 if point.row > self.lines().row {
371 debug_panic!(
372 "point {:?} extends beyond rows for string {:?}",
373 point,
374 self.text
375 );
376 return self.len();
377 }
378
379 let row_offset_range = self.offset_range_for_row(point.row);
380 if point.column > row_offset_range.len() as u32 {
381 debug_panic!(
382 "point {:?} extends beyond row for string {:?}",
383 point,
384 self.text
385 );
386 row_offset_range.end
387 } else {
388 row_offset_range.start + point.column as usize
389 }
390 }
391
392 #[track_caller]
393 #[inline(always)]
394 pub fn assert_char_boundary(&self, offset: usize) {
395 if self.is_char_boundary(offset) {
396 return;
397 }
398 panic_char_boundary(self, offset);
399
400 #[cold]
401 #[inline(never)]
402 fn panic_char_boundary(chunk: &ChunkSlice, offset: usize) {
403 if offset > chunk.text.len() {
404 panic!(
405 "byte index {} is out of bounds of `{:?}` (length: {})",
406 offset,
407 chunk.text,
408 chunk.text.len()
409 );
410 }
411 // find the character
412 let char_start = chunk.floor_char_boundary(offset);
413 // `char_start` must be less than len and a char boundary
414 let ch = chunk
415 .text
416 .get(char_start..)
417 .unwrap()
418 .chars()
419 .next()
420 .unwrap();
421 let char_range = char_start..char_start + ch.len_utf8();
422 panic!(
423 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
424 offset, ch, char_range,
425 );
426 }
427 }
428
429 pub fn floor_char_boundary(&self, index: usize) -> usize {
430 #[inline]
431 pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
432 // This is bit magic equivalent to: b < 128 || b >= 192
433 (u8 as i8) >= -0x40
434 }
435
436 if index >= self.text.len() {
437 self.text.len()
438 } else {
439 let mut i = index;
440 while i > 0 {
441 if is_utf8_char_boundary(self.text.as_bytes()[i]) {
442 break;
443 }
444 i -= 1;
445 }
446
447 i
448 }
449 }
450
451 #[inline(always)]
452 pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
453 if point.row > self.lines().row {
454 debug_panic!(
455 "point {:?} extends beyond rows for string {:?}",
456 point,
457 self.text
458 );
459 return self.len_utf16();
460 }
461 self.offset_to_offset_utf16(self.point_to_offset(point))
462 }
463
464 #[inline(always)]
465 pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
466 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
467 OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
468 }
469
470 #[inline(always)]
471 pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
472 if target.0 == 0 {
473 0
474 } else {
475 #[cfg(not(test))]
476 let chars_utf16 = self.chars_utf16;
477 #[cfg(test)]
478 let chars_utf16 = self.chars_utf16 as u128;
479 let ix = nth_set_bit(chars_utf16, target.0) + 1;
480 if ix == MAX_BASE {
481 MAX_BASE
482 } else {
483 let utf8_additional_len = cmp::min(
484 (self.chars_utf16 >> ix).trailing_zeros() as usize,
485 self.text.len() - ix,
486 );
487 ix + utf8_additional_len
488 }
489 }
490 }
491
492 #[inline(always)]
493 pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
494 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
495 let row = (self.newlines & mask).count_ones();
496 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
497 let column = if newline_ix as usize == MAX_BASE {
498 0
499 } else {
500 ((self.chars_utf16 & mask) >> newline_ix).count_ones()
501 };
502 PointUtf16::new(row, column)
503 }
504
505 #[inline(always)]
506 pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
507 self.offset_to_point_utf16(self.point_to_offset(point))
508 }
509
510 #[inline(always)]
511 pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
512 let lines = self.lines();
513 if point.row > lines.row {
514 if !clip {
515 debug_panic!(
516 "point {:?} is beyond this chunk's extent {:?}",
517 point,
518 self.text
519 );
520 }
521 return self.len();
522 }
523
524 let row_offset_range = self.offset_range_for_row(point.row);
525 let line = self.slice(row_offset_range.clone());
526 if point.column > line.last_line_len_utf16() {
527 if !clip {
528 debug_panic!(
529 "point {:?} is beyond the end of the line in chunk {:?}",
530 point,
531 self.text
532 );
533 }
534 return line.len();
535 }
536
537 let mut offset = row_offset_range.start;
538 if point.column > 0 {
539 offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
540 if !self.text.is_char_boundary(offset) {
541 offset -= 1;
542 while !self.text.is_char_boundary(offset) {
543 offset -= 1;
544 }
545 if !clip {
546 debug_panic!(
547 "point {:?} is within character in chunk {:?}",
548 point,
549 self.text,
550 );
551 }
552 }
553 }
554 offset
555 }
556
557 #[inline(always)]
558 pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
559 let max_point = self.lines();
560 if point.0.row > max_point.row {
561 return max_point;
562 }
563
564 let row_offset_range = self.offset_range_for_row(point.0.row);
565 let line = self.slice(row_offset_range);
566 if point.0.column == 0 {
567 Point::new(point.0.row, 0)
568 } else if point.0.column >= line.len_utf16().0 as u32 {
569 Point::new(point.0.row, line.len() as u32)
570 } else {
571 let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
572 while !line.text.is_char_boundary(column) {
573 column -= 1;
574 }
575 Point::new(point.0.row, column as u32)
576 }
577 }
578
579 #[inline(always)]
580 pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
581 let max_point = self.lines();
582 if point.row > max_point.row {
583 return max_point;
584 }
585
586 let line = self.slice(self.offset_range_for_row(point.row));
587 if point.column == 0 {
588 point
589 } else if point.column >= line.len() as u32 {
590 Point::new(point.row, line.len() as u32)
591 } else {
592 let mut column = point.column as usize;
593 let bytes = line.text.as_bytes();
594 if bytes[column - 1] < 128 && bytes[column] < 128 {
595 return Point::new(point.row, column as u32);
596 }
597
598 let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
599 loop {
600 if line.is_char_boundary(column)
601 && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
602 {
603 break;
604 }
605
606 match bias {
607 Bias::Left => column -= 1,
608 Bias::Right => column += 1,
609 }
610 grapheme_cursor.set_cursor(column);
611 }
612 Point::new(point.row, column as u32)
613 }
614 }
615
616 #[inline(always)]
617 pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
618 let max_point = self.lines();
619 if point.0.row > max_point.row {
620 PointUtf16::new(max_point.row, self.last_line_len_utf16())
621 } else {
622 let line = self.slice(self.offset_range_for_row(point.0.row));
623 let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
624 PointUtf16::new(point.0.row, column.0 as u32)
625 }
626 }
627
628 #[inline(always)]
629 pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
630 if target == OffsetUtf16::default() {
631 OffsetUtf16::default()
632 } else if target >= self.len_utf16() {
633 self.len_utf16()
634 } else {
635 let mut offset = self.offset_utf16_to_offset(target);
636 while !self.text.is_char_boundary(offset) {
637 if bias == Bias::Left {
638 offset -= 1;
639 } else {
640 offset += 1;
641 }
642 }
643 self.offset_to_offset_utf16(offset)
644 }
645 }
646
647 #[inline(always)]
648 fn offset_range_for_row(&self, row: u32) -> Range<usize> {
649 let row_start = if row > 0 {
650 #[cfg(not(test))]
651 let newlines = self.newlines;
652 #[cfg(test)]
653 let newlines = self.newlines as u128;
654 nth_set_bit(newlines, row as usize) + 1
655 } else {
656 0
657 };
658 let row_len = if row_start == MAX_BASE {
659 0
660 } else {
661 cmp::min(
662 (self.newlines >> row_start).trailing_zeros(),
663 (self.text.len() - row_start) as u32,
664 )
665 };
666 row_start..row_start + row_len as usize
667 }
668
669 #[inline(always)]
670 pub fn tabs(&self) -> Tabs {
671 Tabs {
672 tabs: self.tabs,
673 chars: self.chars,
674 }
675 }
676}
677
678pub struct Tabs {
679 tabs: Bitmap,
680 chars: Bitmap,
681}
682
683#[derive(Debug, PartialEq, Eq)]
684pub struct TabPosition {
685 pub byte_offset: usize,
686 pub char_offset: usize,
687}
688
689impl Iterator for Tabs {
690 type Item = TabPosition;
691
692 fn next(&mut self) -> Option<Self::Item> {
693 if self.tabs == 0 {
694 return None;
695 }
696
697 let tab_offset = self.tabs.trailing_zeros() as usize;
698 let chars_mask = (1 << tab_offset) - 1;
699 let char_offset = (self.chars & chars_mask).count_ones() as usize;
700
701 // Since tabs are 1 byte the tab offset is the same as the byte offset
702 let position = TabPosition {
703 byte_offset: tab_offset,
704 char_offset,
705 };
706 // Remove the tab we've just seen
707 self.tabs ^= 1 << tab_offset;
708
709 Some(position)
710 }
711}
712
713/// Finds the n-th bit that is set to 1.
714#[inline(always)]
715fn nth_set_bit(v: u128, n: usize) -> usize {
716 let low = v as u64;
717 let high = (v >> 64) as u64;
718
719 let low_count = low.count_ones() as usize;
720 if n > low_count {
721 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
722 } else {
723 nth_set_bit_u64(low, n as u64) as usize
724 }
725}
726
727#[inline(always)]
728fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
729 let v = v.reverse_bits();
730 let mut s: u64 = 64;
731
732 // Parallel bit count intermediates
733 let a = v - ((v >> 1) & (u64::MAX / 3));
734 let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
735 let c = (b + (b >> 4)) & (u64::MAX / 0x11);
736 let d = (c + (c >> 8)) & (u64::MAX / 0x101);
737
738 // Branchless select
739 let t = (d >> 32) + (d >> 48);
740 s -= (t.wrapping_sub(n) & 256) >> 3;
741 n -= t & (t.wrapping_sub(n) >> 8);
742
743 let t = (d >> (s - 16)) & 0xff;
744 s -= (t.wrapping_sub(n) & 256) >> 4;
745 n -= t & (t.wrapping_sub(n) >> 8);
746
747 let t = (c >> (s - 8)) & 0xf;
748 s -= (t.wrapping_sub(n) & 256) >> 5;
749 n -= t & (t.wrapping_sub(n) >> 8);
750
751 let t = (b >> (s - 4)) & 0x7;
752 s -= (t.wrapping_sub(n) & 256) >> 6;
753 n -= t & (t.wrapping_sub(n) >> 8);
754
755 let t = (a >> (s - 2)) & 0x3;
756 s -= (t.wrapping_sub(n) & 256) >> 7;
757 n -= t & (t.wrapping_sub(n) >> 8);
758
759 let t = (v >> (s - 1)) & 0x1;
760 s -= (t.wrapping_sub(n) & 256) >> 8;
761
762 65 - s - 1
763}
764
765#[cfg(test)]
766mod tests {
767 use super::*;
768 use rand::prelude::*;
769 use util::RandomCharIter;
770
771 #[gpui::test(iterations = 100)]
772 fn test_random_chunks(mut rng: StdRng) {
773 let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
774 log::info!("Chunk: {:?}", text);
775 let chunk = Chunk::new(&text);
776 verify_chunk(chunk.as_slice(), &text);
777
778 // Verify Chunk::chars() bitmap
779 let expected_chars = char_offsets(&text)
780 .into_iter()
781 .inspect(|i| assert!(*i < MAX_BASE))
782 .fold(0 as Bitmap, |acc, i| acc | (1 << i));
783 assert_eq!(chunk.chars(), expected_chars);
784
785 for _ in 0..10 {
786 let mut start = rng.random_range(0..=chunk.text.len());
787 let mut end = rng.random_range(start..=chunk.text.len());
788 while !chunk.text.is_char_boundary(start) {
789 start -= 1;
790 }
791 while !chunk.text.is_char_boundary(end) {
792 end -= 1;
793 }
794 let range = start..end;
795 log::info!("Range: {:?}", range);
796 let text_slice = &text[range.clone()];
797 let chunk_slice = chunk.slice(range);
798 verify_chunk(chunk_slice, text_slice);
799 }
800 }
801
802 #[gpui::test(iterations = 100)]
803 fn test_split_chunk_slice(mut rng: StdRng) {
804 let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
805 let chunk = Chunk::new(text);
806 let offset = char_offsets_with_end(text)
807 .into_iter()
808 .choose(&mut rng)
809 .unwrap();
810 let (a, b) = chunk.as_slice().split_at(offset);
811 let (a_str, b_str) = text.split_at(offset);
812 verify_chunk(a, a_str);
813 verify_chunk(b, b_str);
814 }
815
816 #[gpui::test(iterations = 1000)]
817 fn test_nth_set_bit_random(mut rng: StdRng) {
818 let set_count = rng.random_range(0..=128);
819 let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
820 set_bits.sort();
821 let mut n = 0;
822 for ix in set_bits.iter().copied() {
823 n |= 1 << ix;
824 }
825
826 for (mut ix, position) in set_bits.into_iter().enumerate() {
827 ix += 1;
828 assert_eq!(
829 nth_set_bit(n, ix),
830 position,
831 "nth_set_bit({:0128b}, {})",
832 n,
833 ix
834 );
835 }
836 }
837
838 /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
839 fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
840 let mut str = String::new();
841 let mut chars = RandomCharIter::new(rng);
842 loop {
843 let ch = chars.next().unwrap();
844 if str.len() + ch.len_utf8() > len {
845 break;
846 }
847 str.push(ch);
848 }
849 str
850 }
851
852 #[gpui::test(iterations = 1000)]
853 fn test_append_random_strings(mut rng: StdRng) {
854 let len1 = rng.random_range(0..=MAX_BASE);
855 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
856 let str1 = random_string_with_utf8_len(&mut rng, len1);
857 let str2 = random_string_with_utf8_len(&mut rng, len2);
858 let mut chunk1 = Chunk::new(&str1);
859 let chunk2 = Chunk::new(&str2);
860 let char_offsets = char_offsets_with_end(&str2);
861 let start_index = rng.random_range(0..char_offsets.len());
862 let start_offset = char_offsets[start_index];
863 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
864 chunk1.append(chunk2.slice(start_offset..end_offset));
865 verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
866 }
867
868 /// Return the byte offsets for each character in a string.
869 ///
870 /// These are valid offsets to split the string.
871 fn char_offsets(text: &str) -> Vec<usize> {
872 text.char_indices().map(|(i, _c)| i).collect()
873 }
874
875 /// Return the byte offsets for each character in a string, plus the offset
876 /// past the end of the string.
877 fn char_offsets_with_end(text: &str) -> Vec<usize> {
878 let mut v = char_offsets(text);
879 v.push(text.len());
880 v
881 }
882
883 fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
884 let mut offset = 0;
885 let mut offset_utf16 = OffsetUtf16(0);
886 let mut point = Point::zero();
887 let mut point_utf16 = PointUtf16::zero();
888
889 log::info!("Verifying chunk {:?}", text);
890 assert_eq!(chunk.offset_to_point(0), Point::zero());
891
892 let mut expected_tab_positions = Vec::new();
893
894 for (char_offset, c) in text.chars().enumerate() {
895 let expected_point = chunk.offset_to_point(offset);
896 assert_eq!(point, expected_point, "mismatch at offset {}", offset);
897 assert_eq!(
898 chunk.point_to_offset(point),
899 offset,
900 "mismatch at point {:?}",
901 point
902 );
903 assert_eq!(
904 chunk.offset_to_offset_utf16(offset),
905 offset_utf16,
906 "mismatch at offset {}",
907 offset
908 );
909 assert_eq!(
910 chunk.offset_utf16_to_offset(offset_utf16),
911 offset,
912 "mismatch at offset_utf16 {:?}",
913 offset_utf16
914 );
915 assert_eq!(
916 chunk.point_to_point_utf16(point),
917 point_utf16,
918 "mismatch at point {:?}",
919 point
920 );
921 assert_eq!(
922 chunk.point_utf16_to_offset(point_utf16, false),
923 offset,
924 "mismatch at point_utf16 {:?}",
925 point_utf16
926 );
927 assert_eq!(
928 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
929 point,
930 "mismatch for unclipped_point_utf16_to_point at {:?}",
931 point_utf16
932 );
933
934 assert_eq!(
935 chunk.clip_point(point, Bias::Left),
936 point,
937 "incorrect left clip at {:?}",
938 point
939 );
940 assert_eq!(
941 chunk.clip_point(point, Bias::Right),
942 point,
943 "incorrect right clip at {:?}",
944 point
945 );
946
947 for i in 1..c.len_utf8() {
948 let test_point = Point::new(point.row, point.column + i as u32);
949 assert_eq!(
950 chunk.clip_point(test_point, Bias::Left),
951 point,
952 "incorrect left clip within multi-byte char at {:?}",
953 test_point
954 );
955 assert_eq!(
956 chunk.clip_point(test_point, Bias::Right),
957 Point::new(point.row, point.column + c.len_utf8() as u32),
958 "incorrect right clip within multi-byte char at {:?}",
959 test_point
960 );
961 }
962
963 for i in 1..c.len_utf16() {
964 let test_point = Unclipped(PointUtf16::new(
965 point_utf16.row,
966 point_utf16.column + i as u32,
967 ));
968 assert_eq!(
969 chunk.unclipped_point_utf16_to_point(test_point),
970 point,
971 "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
972 test_point
973 );
974 assert_eq!(
975 chunk.clip_point_utf16(test_point, Bias::Left),
976 point_utf16,
977 "incorrect left clip_point_utf16 within multi-byte char at {:?}",
978 test_point
979 );
980 assert_eq!(
981 chunk.clip_point_utf16(test_point, Bias::Right),
982 PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
983 "incorrect right clip_point_utf16 within multi-byte char at {:?}",
984 test_point
985 );
986
987 let test_offset = OffsetUtf16(offset_utf16.0 + i);
988 assert_eq!(
989 chunk.clip_offset_utf16(test_offset, Bias::Left),
990 offset_utf16,
991 "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
992 test_offset
993 );
994 assert_eq!(
995 chunk.clip_offset_utf16(test_offset, Bias::Right),
996 OffsetUtf16(offset_utf16.0 + c.len_utf16()),
997 "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
998 test_offset
999 );
1000 }
1001
1002 if c == '\n' {
1003 point.row += 1;
1004 point.column = 0;
1005 point_utf16.row += 1;
1006 point_utf16.column = 0;
1007 } else {
1008 point.column += c.len_utf8() as u32;
1009 point_utf16.column += c.len_utf16() as u32;
1010 }
1011
1012 if c == '\t' {
1013 expected_tab_positions.push(TabPosition {
1014 byte_offset: offset,
1015 char_offset,
1016 });
1017 }
1018
1019 offset += c.len_utf8();
1020 offset_utf16.0 += c.len_utf16();
1021 }
1022
1023 let final_point = chunk.offset_to_point(offset);
1024 assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1025 assert_eq!(
1026 chunk.point_to_offset(point),
1027 offset,
1028 "mismatch at point {:?}",
1029 point
1030 );
1031 assert_eq!(
1032 chunk.offset_to_offset_utf16(offset),
1033 offset_utf16,
1034 "mismatch at offset {}",
1035 offset
1036 );
1037 assert_eq!(
1038 chunk.offset_utf16_to_offset(offset_utf16),
1039 offset,
1040 "mismatch at offset_utf16 {:?}",
1041 offset_utf16
1042 );
1043 assert_eq!(
1044 chunk.point_to_point_utf16(point),
1045 point_utf16,
1046 "mismatch at final point {:?}",
1047 point
1048 );
1049 assert_eq!(
1050 chunk.point_utf16_to_offset(point_utf16, false),
1051 offset,
1052 "mismatch at final point_utf16 {:?}",
1053 point_utf16
1054 );
1055 assert_eq!(
1056 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1057 point,
1058 "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1059 point_utf16
1060 );
1061 assert_eq!(
1062 chunk.clip_point(point, Bias::Left),
1063 point,
1064 "incorrect left clip at final point {:?}",
1065 point
1066 );
1067 assert_eq!(
1068 chunk.clip_point(point, Bias::Right),
1069 point,
1070 "incorrect right clip at final point {:?}",
1071 point
1072 );
1073 assert_eq!(
1074 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1075 point_utf16,
1076 "incorrect left clip_point_utf16 at final point {:?}",
1077 point_utf16
1078 );
1079 assert_eq!(
1080 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1081 point_utf16,
1082 "incorrect right clip_point_utf16 at final point {:?}",
1083 point_utf16
1084 );
1085 assert_eq!(
1086 chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1087 offset_utf16,
1088 "incorrect left clip_offset_utf16 at final offset {:?}",
1089 offset_utf16
1090 );
1091 assert_eq!(
1092 chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1093 offset_utf16,
1094 "incorrect right clip_offset_utf16 at final offset {:?}",
1095 offset_utf16
1096 );
1097
1098 // Verify length methods
1099 assert_eq!(chunk.len(), text.len());
1100 assert_eq!(
1101 chunk.len_utf16().0,
1102 text.chars().map(|c| c.len_utf16()).sum::<usize>()
1103 );
1104
1105 // Verify line counting
1106 let lines = chunk.lines();
1107 let mut newline_count = 0;
1108 let mut last_line_len = 0;
1109 for c in text.chars() {
1110 if c == '\n' {
1111 newline_count += 1;
1112 last_line_len = 0;
1113 } else {
1114 last_line_len += c.len_utf8() as u32;
1115 }
1116 }
1117 assert_eq!(lines, Point::new(newline_count, last_line_len));
1118
1119 // Verify first/last line chars
1120 if !text.is_empty() {
1121 let first_line = text.split('\n').next().unwrap();
1122 assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1123
1124 let last_line = text.split('\n').next_back().unwrap();
1125 assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1126 assert_eq!(
1127 chunk.last_line_len_utf16(),
1128 last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1129 );
1130 }
1131
1132 // Verify longest row
1133 let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1134 let mut max_chars = 0;
1135 let mut current_row = 0;
1136 let mut current_chars = 0;
1137 let mut max_row = 0;
1138
1139 for c in text.chars() {
1140 if c == '\n' {
1141 if current_chars > max_chars {
1142 max_chars = current_chars;
1143 max_row = current_row;
1144 }
1145 current_row += 1;
1146 current_chars = 0;
1147 } else {
1148 current_chars += 1;
1149 }
1150 }
1151
1152 if current_chars > max_chars {
1153 max_chars = current_chars;
1154 max_row = current_row;
1155 }
1156
1157 assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1158 assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1159 }
1160}