1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
2use arrayvec::ArrayString;
3use std::{cmp, ops::Range};
4use sum_tree::Bias;
5use unicode_segmentation::GraphemeCursor;
6use util::debug_panic;
7
8#[cfg(not(all(test, not(rust_analyzer))))]
9pub(crate) type Bitmap = u128;
10#[cfg(all(test, not(rust_analyzer)))]
11pub(crate) type Bitmap = u16;
12
13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
15
16#[derive(Clone, Debug, Default)]
17pub struct Chunk {
18 /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
19 /// text.
20 chars: Bitmap,
21 /// The number of set bits is the number of UTF-16 code units it would take to represent the
22 /// text.
23 ///
24 /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
25 /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
26 /// than two UTF-16 code units.)
27 chars_utf16: Bitmap,
28 /// If bit[i] is set, then the character at index i is an ascii newline.
29 newlines: Bitmap,
30 /// If bit[i] is set, then the character at index i is an ascii tab.
31 tabs: Bitmap,
32 pub text: ArrayString<MAX_BASE>,
33}
34
35impl Chunk {
36 pub const MASK_BITS: usize = Bitmap::BITS as usize;
37
38 #[inline(always)]
39 pub fn new(text: &str) -> Self {
40 let mut this = Chunk::default();
41 this.push_str(text);
42 this
43 }
44
45 #[inline(always)]
46 pub fn push_str(&mut self, text: &str) {
47 for (char_ix, c) in text.char_indices() {
48 let ix = self.text.len() + char_ix;
49 self.chars |= 1 << ix;
50 self.chars_utf16 |= 1 << ix;
51 self.chars_utf16 |= (c.len_utf16() as Bitmap) << ix;
52 self.newlines |= ((c == '\n') as Bitmap) << ix;
53 self.tabs |= ((c == '\t') as Bitmap) << ix;
54 }
55 self.text.push_str(text);
56 }
57
58 #[inline(always)]
59 pub fn append(&mut self, slice: ChunkSlice) {
60 if slice.is_empty() {
61 return;
62 };
63
64 let base_ix = self.text.len();
65 self.chars |= slice.chars << base_ix;
66 self.chars_utf16 |= slice.chars_utf16 << base_ix;
67 self.newlines |= slice.newlines << base_ix;
68 self.tabs |= slice.tabs << base_ix;
69 self.text.push_str(slice.text);
70 }
71
72 #[inline(always)]
73 pub fn as_slice(&self) -> ChunkSlice<'_> {
74 ChunkSlice {
75 chars: self.chars,
76 chars_utf16: self.chars_utf16,
77 newlines: self.newlines,
78 tabs: self.tabs,
79 text: &self.text,
80 }
81 }
82
83 #[inline(always)]
84 pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
85 self.as_slice().slice(range)
86 }
87
88 #[inline(always)]
89 pub fn chars(&self) -> Bitmap {
90 self.chars
91 }
92
93 pub fn tabs(&self) -> Bitmap {
94 self.tabs
95 }
96
97 #[inline(always)]
98 pub fn is_char_boundary(&self, offset: usize) -> bool {
99 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
100 }
101
102 pub fn floor_char_boundary(&self, index: usize) -> usize {
103 #[inline]
104 pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
105 // This is bit magic equivalent to: b < 128 || b >= 192
106 (u8 as i8) >= -0x40
107 }
108
109 if index >= self.text.len() {
110 self.text.len()
111 } else {
112 let mut i = index;
113 while i > 0 {
114 if is_utf8_char_boundary(self.text.as_bytes()[i]) {
115 break;
116 }
117 i -= 1;
118 }
119
120 i
121 }
122 }
123
124 #[track_caller]
125 #[inline(always)]
126 pub fn assert_char_boundary(&self, offset: usize) {
127 if self.is_char_boundary(offset) {
128 return;
129 }
130 panic_char_boundary(self, offset);
131
132 #[cold]
133 #[inline(never)]
134 #[track_caller]
135 fn panic_char_boundary(chunk: &Chunk, offset: usize) {
136 if offset > chunk.text.len() {
137 panic!(
138 "byte index {} is out of bounds of `{:?}` (length: {})",
139 offset,
140 chunk.text,
141 chunk.text.len()
142 );
143 }
144 // find the character
145 let char_start = chunk.floor_char_boundary(offset);
146 // `char_start` must be less than len and a char boundary
147 let ch = chunk
148 .text
149 .get(char_start..)
150 .unwrap()
151 .chars()
152 .next()
153 .unwrap();
154 let char_range = char_start..char_start + ch.len_utf8();
155 panic!(
156 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
157 offset, ch, char_range,
158 );
159 }
160 }
161}
162
163#[derive(Clone, Copy, Debug)]
164pub struct ChunkSlice<'a> {
165 chars: Bitmap,
166 chars_utf16: Bitmap,
167 newlines: Bitmap,
168 tabs: Bitmap,
169 text: &'a str,
170}
171
172impl Into<Chunk> for ChunkSlice<'_> {
173 fn into(self) -> Chunk {
174 Chunk {
175 chars: self.chars,
176 chars_utf16: self.chars_utf16,
177 newlines: self.newlines,
178 tabs: self.tabs,
179 text: self.text.try_into().unwrap(),
180 }
181 }
182}
183
184impl<'a> ChunkSlice<'a> {
185 #[inline(always)]
186 pub fn is_empty(&self) -> bool {
187 self.text.is_empty()
188 }
189
190 #[inline(always)]
191 pub fn is_char_boundary(&self, offset: usize) -> bool {
192 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
193 }
194
195 #[inline(always)]
196 pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
197 if mid == MAX_BASE {
198 let left = self;
199 let right = ChunkSlice {
200 chars: 0,
201 chars_utf16: 0,
202 newlines: 0,
203 tabs: 0,
204 text: "",
205 };
206 (left, right)
207 } else {
208 let mask = ((1 as Bitmap) << mid) - 1;
209 let (left_text, right_text) = self.text.split_at(mid);
210 let left = ChunkSlice {
211 chars: self.chars & mask,
212 chars_utf16: self.chars_utf16 & mask,
213 newlines: self.newlines & mask,
214 tabs: self.tabs & mask,
215 text: left_text,
216 };
217 let right = ChunkSlice {
218 chars: self.chars >> mid,
219 chars_utf16: self.chars_utf16 >> mid,
220 newlines: self.newlines >> mid,
221 tabs: self.tabs >> mid,
222 text: right_text,
223 };
224 (left, right)
225 }
226 }
227
228 #[inline(always)]
229 pub fn slice(self, range: Range<usize>) -> Self {
230 let mask = (1 as Bitmap)
231 .unbounded_shl(range.end as u32)
232 .wrapping_sub(1);
233 if range.start == MAX_BASE {
234 Self {
235 chars: 0,
236 chars_utf16: 0,
237 newlines: 0,
238 tabs: 0,
239 text: "",
240 }
241 } else {
242 self.assert_char_boundary(range.start);
243 self.assert_char_boundary(range.end);
244 Self {
245 chars: (self.chars & mask) >> range.start,
246 chars_utf16: (self.chars_utf16 & mask) >> range.start,
247 newlines: (self.newlines & mask) >> range.start,
248 tabs: (self.tabs & mask) >> range.start,
249 text: &self.text[range],
250 }
251 }
252 }
253
254 #[inline(always)]
255 pub fn text_summary(&self) -> TextSummary {
256 let mut chars = 0;
257 let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
258 TextSummary {
259 len: self.len(),
260 chars,
261 len_utf16: self.len_utf16(),
262 lines: self.lines(),
263 first_line_chars: self.first_line_chars(),
264 last_line_chars: self.last_line_chars(),
265 last_line_len_utf16: self.last_line_len_utf16(),
266 longest_row,
267 longest_row_chars,
268 }
269 }
270
271 /// Get length in bytes
272 #[inline(always)]
273 pub fn len(&self) -> usize {
274 self.text.len()
275 }
276
277 /// Get length in UTF-16 code units
278 #[inline(always)]
279 pub fn len_utf16(&self) -> OffsetUtf16 {
280 OffsetUtf16(self.chars_utf16.count_ones() as usize)
281 }
282
283 /// Get point representing number of lines and length of last line
284 #[inline(always)]
285 pub fn lines(&self) -> Point {
286 let row = self.newlines.count_ones();
287 let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
288 Point::new(row, column)
289 }
290
291 /// Get number of chars in first line
292 #[inline(always)]
293 pub fn first_line_chars(&self) -> u32 {
294 if self.newlines == 0 {
295 self.chars.count_ones()
296 } else {
297 let mask = ((1 as Bitmap) << self.newlines.trailing_zeros()) - 1;
298 (self.chars & mask).count_ones()
299 }
300 }
301
302 /// Get number of chars in last line
303 #[inline(always)]
304 pub fn last_line_chars(&self) -> u32 {
305 if self.newlines == 0 {
306 self.chars.count_ones()
307 } else {
308 let mask = !(Bitmap::MAX >> self.newlines.leading_zeros());
309 (self.chars & mask).count_ones()
310 }
311 }
312
313 /// Get number of UTF-16 code units in last line
314 #[inline(always)]
315 pub fn last_line_len_utf16(&self) -> u32 {
316 if self.newlines == 0 {
317 self.chars_utf16.count_ones()
318 } else {
319 let mask = !(Bitmap::MAX >> self.newlines.leading_zeros());
320 (self.chars_utf16 & mask).count_ones()
321 }
322 }
323
324 /// Get the longest row in the chunk and its length in characters.
325 /// Calculate the total number of characters in the chunk along the way.
326 #[inline(always)]
327 pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
328 let mut chars = self.chars;
329 let mut newlines = self.newlines;
330 *total_chars = 0;
331 let mut row = 0;
332 let mut longest_row = 0;
333 let mut longest_row_chars = 0;
334 while newlines > 0 {
335 let newline_ix = newlines.trailing_zeros();
336 let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
337 *total_chars += usize::from(row_chars);
338 if row_chars > longest_row_chars {
339 longest_row = row;
340 longest_row_chars = row_chars;
341 }
342
343 newlines >>= newline_ix;
344 newlines >>= 1;
345 chars >>= newline_ix;
346 chars >>= 1;
347 row += 1;
348 *total_chars += 1;
349 }
350
351 let row_chars = chars.count_ones() as u8;
352 *total_chars += usize::from(row_chars);
353 if row_chars > longest_row_chars {
354 (row, row_chars as u32)
355 } else {
356 (longest_row, longest_row_chars as u32)
357 }
358 }
359
360 #[inline(always)]
361 pub fn offset_to_point(&self, offset: usize) -> Point {
362 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
363 let row = (self.newlines & mask).count_ones();
364 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
365 let column = (offset - newline_ix as usize) as u32;
366 Point::new(row, column)
367 }
368
369 #[inline(always)]
370 pub fn point_to_offset(&self, point: Point) -> usize {
371 if point.row > self.lines().row {
372 debug_panic!(
373 "point {:?} extends beyond rows for string {:?}",
374 point,
375 self.text
376 );
377 return self.len();
378 }
379
380 let row_offset_range = self.offset_range_for_row(point.row);
381 if point.column > row_offset_range.len() as u32 {
382 debug_panic!(
383 "point {:?} extends beyond row for string {:?}",
384 point,
385 self.text
386 );
387 row_offset_range.end
388 } else {
389 row_offset_range.start + point.column as usize
390 }
391 }
392
393 #[track_caller]
394 #[inline(always)]
395 pub fn assert_char_boundary(&self, offset: usize) {
396 if self.is_char_boundary(offset) {
397 return;
398 }
399 panic_char_boundary(self, offset);
400
401 #[cold]
402 #[inline(never)]
403 fn panic_char_boundary(chunk: &ChunkSlice, offset: usize) {
404 if offset > chunk.text.len() {
405 panic!(
406 "byte index {} is out of bounds of `{:?}` (length: {})",
407 offset,
408 chunk.text,
409 chunk.text.len()
410 );
411 }
412 // find the character
413 let char_start = chunk.floor_char_boundary(offset);
414 // `char_start` must be less than len and a char boundary
415 let ch = chunk
416 .text
417 .get(char_start..)
418 .unwrap()
419 .chars()
420 .next()
421 .unwrap();
422 let char_range = char_start..char_start + ch.len_utf8();
423 panic!(
424 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
425 offset, ch, char_range,
426 );
427 }
428 }
429
430 pub fn floor_char_boundary(&self, index: usize) -> usize {
431 #[inline]
432 pub(crate) const fn is_utf8_char_boundary(u8: u8) -> bool {
433 // This is bit magic equivalent to: b < 128 || b >= 192
434 (u8 as i8) >= -0x40
435 }
436
437 if index >= self.text.len() {
438 self.text.len()
439 } else {
440 let mut i = index;
441 while i > 0 {
442 if is_utf8_char_boundary(self.text.as_bytes()[i]) {
443 break;
444 }
445 i -= 1;
446 }
447
448 i
449 }
450 }
451
452 #[inline(always)]
453 pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
454 if point.row > self.lines().row {
455 debug_panic!(
456 "point {:?} extends beyond rows for string {:?}",
457 point,
458 self.text
459 );
460 return self.len_utf16();
461 }
462 self.offset_to_offset_utf16(self.point_to_offset(point))
463 }
464
465 #[inline(always)]
466 pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
467 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
468 OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
469 }
470
471 #[inline(always)]
472 pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
473 if target.0 == 0 {
474 0
475 } else {
476 #[cfg(not(test))]
477 let chars_utf16 = self.chars_utf16;
478 #[cfg(test)]
479 let chars_utf16 = self.chars_utf16 as u128;
480 let ix = nth_set_bit(chars_utf16, target.0) + 1;
481 if ix == MAX_BASE {
482 MAX_BASE
483 } else {
484 let utf8_additional_len = cmp::min(
485 (self.chars_utf16 >> ix).trailing_zeros() as usize,
486 self.text.len() - ix,
487 );
488 ix + utf8_additional_len
489 }
490 }
491 }
492
493 #[inline(always)]
494 pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
495 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
496 let row = (self.newlines & mask).count_ones();
497 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
498 let column = if newline_ix as usize == MAX_BASE {
499 0
500 } else {
501 ((self.chars_utf16 & mask) >> newline_ix).count_ones()
502 };
503 PointUtf16::new(row, column)
504 }
505
506 #[inline(always)]
507 pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
508 self.offset_to_point_utf16(self.point_to_offset(point))
509 }
510
511 #[inline(always)]
512 pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
513 let lines = self.lines();
514 if point.row > lines.row {
515 if !clip {
516 debug_panic!(
517 "point {:?} is beyond this chunk's extent {:?}",
518 point,
519 self.text
520 );
521 }
522 return self.len();
523 }
524
525 let row_offset_range = self.offset_range_for_row(point.row);
526 let line = self.slice(row_offset_range.clone());
527 if point.column > line.last_line_len_utf16() {
528 if !clip {
529 debug_panic!(
530 "point {:?} is beyond the end of the line in chunk {:?}",
531 point,
532 self.text
533 );
534 }
535 return line.len();
536 }
537
538 let mut offset = row_offset_range.start;
539 if point.column > 0 {
540 offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
541 if !self.text.is_char_boundary(offset) {
542 offset -= 1;
543 while !self.text.is_char_boundary(offset) {
544 offset -= 1;
545 }
546 if !clip {
547 debug_panic!(
548 "point {:?} is within character in chunk {:?}",
549 point,
550 self.text,
551 );
552 }
553 }
554 }
555 offset
556 }
557
558 #[inline(always)]
559 pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
560 let max_point = self.lines();
561 if point.0.row > max_point.row {
562 return max_point;
563 }
564
565 let row_offset_range = self.offset_range_for_row(point.0.row);
566 let line = self.slice(row_offset_range);
567 if point.0.column == 0 {
568 Point::new(point.0.row, 0)
569 } else if point.0.column >= line.len_utf16().0 as u32 {
570 Point::new(point.0.row, line.len() as u32)
571 } else {
572 let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
573 while !line.text.is_char_boundary(column) {
574 column -= 1;
575 }
576 Point::new(point.0.row, column as u32)
577 }
578 }
579
580 #[inline(always)]
581 pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
582 let max_point = self.lines();
583 if point.row > max_point.row {
584 return max_point;
585 }
586
587 let line = self.slice(self.offset_range_for_row(point.row));
588 if point.column == 0 {
589 point
590 } else if point.column >= line.len() as u32 {
591 Point::new(point.row, line.len() as u32)
592 } else {
593 let mut column = point.column as usize;
594 let bytes = line.text.as_bytes();
595 if bytes[column - 1] < 128 && bytes[column] < 128 {
596 return Point::new(point.row, column as u32);
597 }
598
599 let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
600 loop {
601 if line.is_char_boundary(column)
602 && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
603 {
604 break;
605 }
606
607 match bias {
608 Bias::Left => column -= 1,
609 Bias::Right => column += 1,
610 }
611 grapheme_cursor.set_cursor(column);
612 }
613 Point::new(point.row, column as u32)
614 }
615 }
616
617 #[inline(always)]
618 pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
619 let max_point = self.lines();
620 if point.0.row > max_point.row {
621 PointUtf16::new(max_point.row, self.last_line_len_utf16())
622 } else {
623 let line = self.slice(self.offset_range_for_row(point.0.row));
624 let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
625 PointUtf16::new(point.0.row, column.0 as u32)
626 }
627 }
628
629 #[inline(always)]
630 pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
631 if target == OffsetUtf16::default() {
632 OffsetUtf16::default()
633 } else if target >= self.len_utf16() {
634 self.len_utf16()
635 } else {
636 let mut offset = self.offset_utf16_to_offset(target);
637 while !self.text.is_char_boundary(offset) {
638 if bias == Bias::Left {
639 offset -= 1;
640 } else {
641 offset += 1;
642 }
643 }
644 self.offset_to_offset_utf16(offset)
645 }
646 }
647
648 #[inline(always)]
649 fn offset_range_for_row(&self, row: u32) -> Range<usize> {
650 let row_start = if row > 0 {
651 #[cfg(not(test))]
652 let newlines = self.newlines;
653 #[cfg(test)]
654 let newlines = self.newlines as u128;
655 nth_set_bit(newlines, row as usize) + 1
656 } else {
657 0
658 };
659 let row_len = if row_start == MAX_BASE {
660 0
661 } else {
662 cmp::min(
663 (self.newlines >> row_start).trailing_zeros(),
664 (self.text.len() - row_start) as u32,
665 )
666 };
667 row_start..row_start + row_len as usize
668 }
669
670 #[inline(always)]
671 pub fn tabs(&self) -> Tabs {
672 Tabs {
673 tabs: self.tabs,
674 chars: self.chars,
675 }
676 }
677}
678
679pub struct Tabs {
680 tabs: Bitmap,
681 chars: Bitmap,
682}
683
684#[derive(Debug, PartialEq, Eq)]
685pub struct TabPosition {
686 pub byte_offset: usize,
687 pub char_offset: usize,
688}
689
690impl Iterator for Tabs {
691 type Item = TabPosition;
692
693 fn next(&mut self) -> Option<Self::Item> {
694 if self.tabs == 0 {
695 return None;
696 }
697
698 let tab_offset = self.tabs.trailing_zeros() as usize;
699 let chars_mask = (1 << tab_offset) - 1;
700 let char_offset = (self.chars & chars_mask).count_ones() as usize;
701
702 // Since tabs are 1 byte the tab offset is the same as the byte offset
703 let position = TabPosition {
704 byte_offset: tab_offset,
705 char_offset,
706 };
707 // Remove the tab we've just seen
708 self.tabs ^= 1 << tab_offset;
709
710 Some(position)
711 }
712}
713
714/// Finds the n-th bit that is set to 1.
715#[inline(always)]
716fn nth_set_bit(v: u128, n: usize) -> usize {
717 let low = v as u64;
718 let high = (v >> 64) as u64;
719
720 let low_count = low.count_ones() as usize;
721 if n > low_count {
722 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
723 } else {
724 nth_set_bit_u64(low, n as u64) as usize
725 }
726}
727
728#[inline(always)]
729fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
730 let v = v.reverse_bits();
731 let mut s: u64 = 64;
732
733 // Parallel bit count intermediates
734 let a = v - ((v >> 1) & (u64::MAX / 3));
735 let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
736 let c = (b + (b >> 4)) & (u64::MAX / 0x11);
737 let d = (c + (c >> 8)) & (u64::MAX / 0x101);
738
739 // Branchless select
740 let t = (d >> 32) + (d >> 48);
741 s -= (t.wrapping_sub(n) & 256) >> 3;
742 n -= t & (t.wrapping_sub(n) >> 8);
743
744 let t = (d >> (s - 16)) & 0xff;
745 s -= (t.wrapping_sub(n) & 256) >> 4;
746 n -= t & (t.wrapping_sub(n) >> 8);
747
748 let t = (c >> (s - 8)) & 0xf;
749 s -= (t.wrapping_sub(n) & 256) >> 5;
750 n -= t & (t.wrapping_sub(n) >> 8);
751
752 let t = (b >> (s - 4)) & 0x7;
753 s -= (t.wrapping_sub(n) & 256) >> 6;
754 n -= t & (t.wrapping_sub(n) >> 8);
755
756 let t = (a >> (s - 2)) & 0x3;
757 s -= (t.wrapping_sub(n) & 256) >> 7;
758 n -= t & (t.wrapping_sub(n) >> 8);
759
760 let t = (v >> (s - 1)) & 0x1;
761 s -= (t.wrapping_sub(n) & 256) >> 8;
762
763 65 - s - 1
764}
765
766#[cfg(test)]
767mod tests {
768 use super::*;
769 use rand::prelude::*;
770 use util::RandomCharIter;
771
772 #[gpui::test(iterations = 100)]
773 fn test_random_chunks(mut rng: StdRng) {
774 let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
775 log::info!("Chunk: {:?}", text);
776 let chunk = Chunk::new(&text);
777 verify_chunk(chunk.as_slice(), &text);
778
779 // Verify Chunk::chars() bitmap
780 let expected_chars = char_offsets(&text)
781 .into_iter()
782 .inspect(|i| assert!(*i < MAX_BASE))
783 .fold(0 as Bitmap, |acc, i| acc | (1 << i));
784 assert_eq!(chunk.chars(), expected_chars);
785
786 for _ in 0..10 {
787 let mut start = rng.random_range(0..=chunk.text.len());
788 let mut end = rng.random_range(start..=chunk.text.len());
789 while !chunk.text.is_char_boundary(start) {
790 start -= 1;
791 }
792 while !chunk.text.is_char_boundary(end) {
793 end -= 1;
794 }
795 let range = start..end;
796 log::info!("Range: {:?}", range);
797 let text_slice = &text[range.clone()];
798 let chunk_slice = chunk.slice(range);
799 verify_chunk(chunk_slice, text_slice);
800 }
801 }
802
803 #[gpui::test(iterations = 100)]
804 fn test_split_chunk_slice(mut rng: StdRng) {
805 let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
806 let chunk = Chunk::new(text);
807 let offset = char_offsets_with_end(text)
808 .into_iter()
809 .choose(&mut rng)
810 .unwrap();
811 let (a, b) = chunk.as_slice().split_at(offset);
812 let (a_str, b_str) = text.split_at(offset);
813 verify_chunk(a, a_str);
814 verify_chunk(b, b_str);
815 }
816
817 #[gpui::test(iterations = 1000)]
818 fn test_nth_set_bit_random(mut rng: StdRng) {
819 let set_count = rng.random_range(0..=128);
820 let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
821 set_bits.sort();
822 let mut n = 0;
823 for ix in set_bits.iter().copied() {
824 n |= 1 << ix;
825 }
826
827 for (mut ix, position) in set_bits.into_iter().enumerate() {
828 ix += 1;
829 assert_eq!(
830 nth_set_bit(n, ix),
831 position,
832 "nth_set_bit({:0128b}, {})",
833 n,
834 ix
835 );
836 }
837 }
838
839 /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
840 fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
841 let mut str = String::new();
842 let mut chars = RandomCharIter::new(rng);
843 loop {
844 let ch = chars.next().unwrap();
845 if str.len() + ch.len_utf8() > len {
846 break;
847 }
848 str.push(ch);
849 }
850 str
851 }
852
853 #[gpui::test(iterations = 1000)]
854 fn test_append_random_strings(mut rng: StdRng) {
855 let len1 = rng.random_range(0..=MAX_BASE);
856 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
857 let str1 = random_string_with_utf8_len(&mut rng, len1);
858 let str2 = random_string_with_utf8_len(&mut rng, len2);
859 let mut chunk1 = Chunk::new(&str1);
860 let chunk2 = Chunk::new(&str2);
861 let char_offsets = char_offsets_with_end(&str2);
862 let start_index = rng.random_range(0..char_offsets.len());
863 let start_offset = char_offsets[start_index];
864 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
865 chunk1.append(chunk2.slice(start_offset..end_offset));
866 verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
867 }
868
869 /// Return the byte offsets for each character in a string.
870 ///
871 /// These are valid offsets to split the string.
872 fn char_offsets(text: &str) -> Vec<usize> {
873 text.char_indices().map(|(i, _c)| i).collect()
874 }
875
876 /// Return the byte offsets for each character in a string, plus the offset
877 /// past the end of the string.
878 fn char_offsets_with_end(text: &str) -> Vec<usize> {
879 let mut v = char_offsets(text);
880 v.push(text.len());
881 v
882 }
883
884 fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
885 let mut offset = 0;
886 let mut offset_utf16 = OffsetUtf16(0);
887 let mut point = Point::zero();
888 let mut point_utf16 = PointUtf16::zero();
889
890 log::info!("Verifying chunk {:?}", text);
891 assert_eq!(chunk.offset_to_point(0), Point::zero());
892
893 let mut expected_tab_positions = Vec::new();
894
895 for (char_offset, c) in text.chars().enumerate() {
896 let expected_point = chunk.offset_to_point(offset);
897 assert_eq!(point, expected_point, "mismatch at offset {}", offset);
898 assert_eq!(
899 chunk.point_to_offset(point),
900 offset,
901 "mismatch at point {:?}",
902 point
903 );
904 assert_eq!(
905 chunk.offset_to_offset_utf16(offset),
906 offset_utf16,
907 "mismatch at offset {}",
908 offset
909 );
910 assert_eq!(
911 chunk.offset_utf16_to_offset(offset_utf16),
912 offset,
913 "mismatch at offset_utf16 {:?}",
914 offset_utf16
915 );
916 assert_eq!(
917 chunk.point_to_point_utf16(point),
918 point_utf16,
919 "mismatch at point {:?}",
920 point
921 );
922 assert_eq!(
923 chunk.point_utf16_to_offset(point_utf16, false),
924 offset,
925 "mismatch at point_utf16 {:?}",
926 point_utf16
927 );
928 assert_eq!(
929 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
930 point,
931 "mismatch for unclipped_point_utf16_to_point at {:?}",
932 point_utf16
933 );
934
935 assert_eq!(
936 chunk.clip_point(point, Bias::Left),
937 point,
938 "incorrect left clip at {:?}",
939 point
940 );
941 assert_eq!(
942 chunk.clip_point(point, Bias::Right),
943 point,
944 "incorrect right clip at {:?}",
945 point
946 );
947
948 for i in 1..c.len_utf8() {
949 let test_point = Point::new(point.row, point.column + i as u32);
950 assert_eq!(
951 chunk.clip_point(test_point, Bias::Left),
952 point,
953 "incorrect left clip within multi-byte char at {:?}",
954 test_point
955 );
956 assert_eq!(
957 chunk.clip_point(test_point, Bias::Right),
958 Point::new(point.row, point.column + c.len_utf8() as u32),
959 "incorrect right clip within multi-byte char at {:?}",
960 test_point
961 );
962 }
963
964 for i in 1..c.len_utf16() {
965 let test_point = Unclipped(PointUtf16::new(
966 point_utf16.row,
967 point_utf16.column + i as u32,
968 ));
969 assert_eq!(
970 chunk.unclipped_point_utf16_to_point(test_point),
971 point,
972 "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
973 test_point
974 );
975 assert_eq!(
976 chunk.clip_point_utf16(test_point, Bias::Left),
977 point_utf16,
978 "incorrect left clip_point_utf16 within multi-byte char at {:?}",
979 test_point
980 );
981 assert_eq!(
982 chunk.clip_point_utf16(test_point, Bias::Right),
983 PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
984 "incorrect right clip_point_utf16 within multi-byte char at {:?}",
985 test_point
986 );
987
988 let test_offset = OffsetUtf16(offset_utf16.0 + i);
989 assert_eq!(
990 chunk.clip_offset_utf16(test_offset, Bias::Left),
991 offset_utf16,
992 "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
993 test_offset
994 );
995 assert_eq!(
996 chunk.clip_offset_utf16(test_offset, Bias::Right),
997 OffsetUtf16(offset_utf16.0 + c.len_utf16()),
998 "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
999 test_offset
1000 );
1001 }
1002
1003 if c == '\n' {
1004 point.row += 1;
1005 point.column = 0;
1006 point_utf16.row += 1;
1007 point_utf16.column = 0;
1008 } else {
1009 point.column += c.len_utf8() as u32;
1010 point_utf16.column += c.len_utf16() as u32;
1011 }
1012
1013 if c == '\t' {
1014 expected_tab_positions.push(TabPosition {
1015 byte_offset: offset,
1016 char_offset,
1017 });
1018 }
1019
1020 offset += c.len_utf8();
1021 offset_utf16.0 += c.len_utf16();
1022 }
1023
1024 let final_point = chunk.offset_to_point(offset);
1025 assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1026 assert_eq!(
1027 chunk.point_to_offset(point),
1028 offset,
1029 "mismatch at point {:?}",
1030 point
1031 );
1032 assert_eq!(
1033 chunk.offset_to_offset_utf16(offset),
1034 offset_utf16,
1035 "mismatch at offset {}",
1036 offset
1037 );
1038 assert_eq!(
1039 chunk.offset_utf16_to_offset(offset_utf16),
1040 offset,
1041 "mismatch at offset_utf16 {:?}",
1042 offset_utf16
1043 );
1044 assert_eq!(
1045 chunk.point_to_point_utf16(point),
1046 point_utf16,
1047 "mismatch at final point {:?}",
1048 point
1049 );
1050 assert_eq!(
1051 chunk.point_utf16_to_offset(point_utf16, false),
1052 offset,
1053 "mismatch at final point_utf16 {:?}",
1054 point_utf16
1055 );
1056 assert_eq!(
1057 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1058 point,
1059 "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1060 point_utf16
1061 );
1062 assert_eq!(
1063 chunk.clip_point(point, Bias::Left),
1064 point,
1065 "incorrect left clip at final point {:?}",
1066 point
1067 );
1068 assert_eq!(
1069 chunk.clip_point(point, Bias::Right),
1070 point,
1071 "incorrect right clip at final point {:?}",
1072 point
1073 );
1074 assert_eq!(
1075 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1076 point_utf16,
1077 "incorrect left clip_point_utf16 at final point {:?}",
1078 point_utf16
1079 );
1080 assert_eq!(
1081 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1082 point_utf16,
1083 "incorrect right clip_point_utf16 at final point {:?}",
1084 point_utf16
1085 );
1086 assert_eq!(
1087 chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1088 offset_utf16,
1089 "incorrect left clip_offset_utf16 at final offset {:?}",
1090 offset_utf16
1091 );
1092 assert_eq!(
1093 chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1094 offset_utf16,
1095 "incorrect right clip_offset_utf16 at final offset {:?}",
1096 offset_utf16
1097 );
1098
1099 // Verify length methods
1100 assert_eq!(chunk.len(), text.len());
1101 assert_eq!(
1102 chunk.len_utf16().0,
1103 text.chars().map(|c| c.len_utf16()).sum::<usize>()
1104 );
1105
1106 // Verify line counting
1107 let lines = chunk.lines();
1108 let mut newline_count = 0;
1109 let mut last_line_len = 0;
1110 for c in text.chars() {
1111 if c == '\n' {
1112 newline_count += 1;
1113 last_line_len = 0;
1114 } else {
1115 last_line_len += c.len_utf8() as u32;
1116 }
1117 }
1118 assert_eq!(lines, Point::new(newline_count, last_line_len));
1119
1120 // Verify first/last line chars
1121 if !text.is_empty() {
1122 let first_line = text.split('\n').next().unwrap();
1123 assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1124
1125 let last_line = text.split('\n').next_back().unwrap();
1126 assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1127 assert_eq!(
1128 chunk.last_line_len_utf16(),
1129 last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1130 );
1131 }
1132
1133 // Verify longest row
1134 let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1135 let mut max_chars = 0;
1136 let mut current_row = 0;
1137 let mut current_chars = 0;
1138 let mut max_row = 0;
1139
1140 for c in text.chars() {
1141 if c == '\n' {
1142 if current_chars > max_chars {
1143 max_chars = current_chars;
1144 max_row = current_row;
1145 }
1146 current_row += 1;
1147 current_chars = 0;
1148 } else {
1149 current_chars += 1;
1150 }
1151 }
1152
1153 if current_chars > max_chars {
1154 max_chars = current_chars;
1155 max_row = current_row;
1156 }
1157
1158 assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1159 assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1160 }
1161}