1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
2use arrayvec::ArrayString;
3use std::{cmp, ops::Range};
4use sum_tree::Bias;
5use unicode_segmentation::GraphemeCursor;
6use util::debug_panic;
7
8#[cfg(not(all(test, not(rust_analyzer))))]
9pub(crate) type Bitmap = u128;
10#[cfg(all(test, not(rust_analyzer)))]
11pub(crate) type Bitmap = u16;
12
13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
15
16#[derive(Clone, Debug, Default)]
17pub struct Chunk {
18 /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
19 /// text.
20 chars: Bitmap,
21 /// The number of set bits is the number of UTF-16 code units it would take to represent the
22 /// text.
23 ///
24 /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
25 /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
26 /// than two UTF-16 code units.)
27 chars_utf16: Bitmap,
28 /// If bit[i] is set, then the character at index i is an ascii newline.
29 newlines: Bitmap,
30 /// If bit[i] is set, then the character at index i is an ascii tab.
31 tabs: Bitmap,
32 pub text: ArrayString<MAX_BASE>,
33}
34
35#[inline(always)]
36const fn saturating_shl_mask(offset: u32) -> Bitmap {
37 (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
38}
39
40#[inline(always)]
41const fn saturating_shr_mask(offset: u32) -> Bitmap {
42 !Bitmap::MAX.unbounded_shr(offset)
43}
44
45impl Chunk {
46 pub const MASK_BITS: usize = Bitmap::BITS as usize;
47
48 #[inline(always)]
49 pub fn new(text: &str) -> Self {
50 let mut this = Chunk::default();
51 this.push_str(text);
52 this
53 }
54
55 #[inline(always)]
56 pub fn push_str(&mut self, text: &str) {
57 for (char_ix, c) in text.char_indices() {
58 let ix = self.text.len() + char_ix;
59 self.chars |= 1 << ix;
60 self.chars_utf16 |= 1 << ix;
61 self.chars_utf16 |= (c.len_utf16() as Bitmap) << ix;
62 self.newlines |= ((c == '\n') as Bitmap) << ix;
63 self.tabs |= ((c == '\t') as Bitmap) << ix;
64 }
65 self.text.push_str(text);
66 }
67
68 #[inline(always)]
69 pub fn append(&mut self, slice: ChunkSlice) {
70 if slice.is_empty() {
71 return;
72 };
73
74 let base_ix = self.text.len();
75 self.chars |= slice.chars << base_ix;
76 self.chars_utf16 |= slice.chars_utf16 << base_ix;
77 self.newlines |= slice.newlines << base_ix;
78 self.tabs |= slice.tabs << base_ix;
79 self.text.push_str(slice.text);
80 }
81
82 #[inline(always)]
83 pub fn as_slice(&self) -> ChunkSlice<'_> {
84 ChunkSlice {
85 chars: self.chars,
86 chars_utf16: self.chars_utf16,
87 newlines: self.newlines,
88 tabs: self.tabs,
89 text: &self.text,
90 }
91 }
92
93 #[inline(always)]
94 pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
95 self.as_slice().slice(range)
96 }
97
98 #[inline(always)]
99 pub fn chars(&self) -> Bitmap {
100 self.chars
101 }
102
103 pub fn tabs(&self) -> Bitmap {
104 self.tabs
105 }
106
107 #[inline(always)]
108 pub fn is_char_boundary(&self, offset: usize) -> bool {
109 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
110 }
111
112 pub fn floor_char_boundary(&self, index: usize) -> usize {
113 if index >= self.text.len() {
114 self.text.len()
115 } else {
116 let mut i = index;
117 while i > 0 {
118 if util::is_utf8_char_boundary(self.text.as_bytes()[i]) {
119 break;
120 }
121 i -= 1;
122 }
123
124 i
125 }
126 }
127
128 #[track_caller]
129 #[inline(always)]
130 pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
131 if self.is_char_boundary(offset) {
132 return true;
133 }
134 if PANIC {
135 panic_char_boundary(&self.text, offset);
136 } else {
137 log_err_char_boundary(&self.text, offset);
138 false
139 }
140 }
141}
142
143#[derive(Clone, Copy, Debug)]
144pub struct ChunkSlice<'a> {
145 chars: Bitmap,
146 chars_utf16: Bitmap,
147 newlines: Bitmap,
148 tabs: Bitmap,
149 text: &'a str,
150}
151
152impl Into<Chunk> for ChunkSlice<'_> {
153 fn into(self) -> Chunk {
154 Chunk {
155 chars: self.chars,
156 chars_utf16: self.chars_utf16,
157 newlines: self.newlines,
158 tabs: self.tabs,
159 text: self.text.try_into().unwrap(),
160 }
161 }
162}
163
164impl<'a> ChunkSlice<'a> {
165 #[inline(always)]
166 pub fn is_empty(&self) -> bool {
167 self.text.is_empty()
168 }
169
170 #[inline(always)]
171 pub fn is_char_boundary(&self, offset: usize) -> bool {
172 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
173 }
174
175 #[inline(always)]
176 pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
177 if mid == MAX_BASE {
178 let left = self;
179 let right = ChunkSlice {
180 chars: 0,
181 chars_utf16: 0,
182 newlines: 0,
183 tabs: 0,
184 text: "",
185 };
186 (left, right)
187 } else {
188 let mask = ((1 as Bitmap) << mid) - 1;
189 let (left_text, right_text) = self.text.split_at(mid);
190 let left = ChunkSlice {
191 chars: self.chars & mask,
192 chars_utf16: self.chars_utf16 & mask,
193 newlines: self.newlines & mask,
194 tabs: self.tabs & mask,
195 text: left_text,
196 };
197 let right = ChunkSlice {
198 chars: self.chars >> mid,
199 chars_utf16: self.chars_utf16 >> mid,
200 newlines: self.newlines >> mid,
201 tabs: self.tabs >> mid,
202 text: right_text,
203 };
204 (left, right)
205 }
206 }
207
208 #[inline(always)]
209 pub fn slice(self, mut range: Range<usize>) -> Self {
210 if range.start == MAX_BASE {
211 Self {
212 chars: 0,
213 chars_utf16: 0,
214 newlines: 0,
215 tabs: 0,
216 text: "",
217 }
218 } else {
219 if !self.assert_char_boundary::<false>(range.start) {
220 range.start = self.text.ceil_char_boundary(range.start);
221 }
222 if !self.assert_char_boundary::<false>(range.end) {
223 range.end = self.text.floor_char_boundary(range.end);
224 }
225 let mask = (1 as Bitmap)
226 .unbounded_shl(range.end as u32)
227 .wrapping_sub(1);
228 Self {
229 chars: (self.chars & mask) >> range.start,
230 chars_utf16: (self.chars_utf16 & mask) >> range.start,
231 newlines: (self.newlines & mask) >> range.start,
232 tabs: (self.tabs & mask) >> range.start,
233 text: &self.text[range],
234 }
235 }
236 }
237
238 #[inline(always)]
239 pub fn text_summary(&self) -> TextSummary {
240 let mut chars = 0;
241 let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
242 TextSummary {
243 len: self.len(),
244 chars,
245 len_utf16: self.len_utf16(),
246 lines: self.lines(),
247 first_line_chars: self.first_line_chars(),
248 last_line_chars: self.last_line_chars(),
249 last_line_len_utf16: self.last_line_len_utf16(),
250 longest_row,
251 longest_row_chars,
252 }
253 }
254
255 /// Get length in bytes
256 #[inline(always)]
257 pub fn len(&self) -> usize {
258 self.text.len()
259 }
260
261 /// Get length in UTF-16 code units
262 #[inline(always)]
263 pub fn len_utf16(&self) -> OffsetUtf16 {
264 OffsetUtf16(self.chars_utf16.count_ones() as usize)
265 }
266
267 /// Get point representing number of lines and length of last line
268 #[inline(always)]
269 pub fn lines(&self) -> Point {
270 let row = self.newlines.count_ones();
271 let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
272 Point::new(row, column)
273 }
274
275 /// Get number of chars in first line
276 #[inline(always)]
277 pub fn first_line_chars(&self) -> u32 {
278 (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
279 }
280
281 /// Get number of chars in last line
282 #[inline(always)]
283 pub fn last_line_chars(&self) -> u32 {
284 (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
285 }
286
287 /// Get number of UTF-16 code units in last line
288 #[inline(always)]
289 pub fn last_line_len_utf16(&self) -> u32 {
290 (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
291 }
292
293 /// Get the longest row in the chunk and its length in characters.
294 /// Calculate the total number of characters in the chunk along the way.
295 #[inline(always)]
296 pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
297 let mut chars = self.chars;
298 let mut newlines = self.newlines;
299 *total_chars = 0;
300 let mut row = 0;
301 let mut longest_row = 0;
302 let mut longest_row_chars = 0;
303 while newlines > 0 {
304 let newline_ix = newlines.trailing_zeros();
305 let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
306 *total_chars += usize::from(row_chars);
307 if row_chars > longest_row_chars {
308 longest_row = row;
309 longest_row_chars = row_chars;
310 }
311
312 newlines >>= newline_ix;
313 newlines >>= 1;
314 chars >>= newline_ix;
315 chars >>= 1;
316 row += 1;
317 *total_chars += 1;
318 }
319
320 let row_chars = chars.count_ones() as u8;
321 *total_chars += usize::from(row_chars);
322 if row_chars > longest_row_chars {
323 (row, row_chars as u32)
324 } else {
325 (longest_row, longest_row_chars as u32)
326 }
327 }
328
329 #[inline(always)]
330 pub fn offset_to_point(&self, offset: usize) -> Point {
331 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
332 let row = (self.newlines & mask).count_ones();
333 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
334 let column = (offset - newline_ix as usize) as u32;
335 Point::new(row, column)
336 }
337
338 #[inline(always)]
339 pub fn point_to_offset(&self, point: Point) -> usize {
340 if point.row > self.lines().row {
341 debug_panic!(
342 "point {:?} extends beyond rows for string {:?}",
343 point,
344 self.text
345 );
346 return self.len();
347 }
348
349 let row_offset_range = self.offset_range_for_row(point.row);
350 if point.column > row_offset_range.len() as u32 {
351 debug_panic!(
352 "point {:?} extends beyond row for string {:?}",
353 point,
354 self.text
355 );
356 row_offset_range.end
357 } else {
358 row_offset_range.start + point.column as usize
359 }
360 }
361
362 #[track_caller]
363 #[inline(always)]
364 pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
365 if self.is_char_boundary(offset) {
366 return true;
367 }
368 if PANIC {
369 panic_char_boundary(self.text, offset);
370 } else {
371 log_err_char_boundary(self.text, offset);
372 false
373 }
374 }
375
376 pub fn floor_char_boundary(&self, index: usize) -> usize {
377 self.text.floor_char_boundary(index)
378 }
379
380 #[inline(always)]
381 pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
382 if point.row > self.lines().row {
383 debug_panic!(
384 "point {:?} extends beyond rows for string {:?}",
385 point,
386 self.text
387 );
388 return self.len_utf16();
389 }
390 self.offset_to_offset_utf16(self.point_to_offset(point))
391 }
392
393 #[inline(always)]
394 pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
395 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
396 OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
397 }
398
399 #[inline(always)]
400 pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
401 if target.0 == 0 {
402 0
403 } else {
404 #[cfg(not(test))]
405 let chars_utf16 = self.chars_utf16;
406 #[cfg(test)]
407 let chars_utf16 = self.chars_utf16 as u128;
408 let ix = nth_set_bit(chars_utf16, target.0) + 1;
409 if ix == MAX_BASE {
410 MAX_BASE
411 } else {
412 let utf8_additional_len = cmp::min(
413 (self.chars_utf16 >> ix).trailing_zeros() as usize,
414 self.text.len() - ix,
415 );
416 ix + utf8_additional_len
417 }
418 }
419 }
420
421 #[inline(always)]
422 pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
423 let mask = saturating_shl_mask(offset as u32);
424 let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
425 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
426 let column = if newline_ix as usize == MAX_BASE {
427 0
428 } else {
429 ((self.chars_utf16 & mask) >> newline_ix).count_ones()
430 };
431 PointUtf16::new(row, column)
432 }
433
434 #[inline(always)]
435 pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
436 self.offset_to_point_utf16(self.point_to_offset(point))
437 }
438
439 #[inline(always)]
440 pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
441 let lines = self.lines();
442 if point.row > lines.row {
443 if !clip {
444 debug_panic!(
445 "point {:?} is beyond this chunk's extent {:?}",
446 point,
447 self.text
448 );
449 }
450 return self.len();
451 }
452
453 let row_offset_range = self.offset_range_for_row(point.row);
454 let line = self.slice(row_offset_range.clone());
455 if point.column > line.last_line_len_utf16() {
456 if !clip {
457 debug_panic!(
458 "point {:?} is beyond the end of the line in chunk {:?}",
459 point,
460 self.text
461 );
462 }
463 return line.len();
464 }
465
466 let mut offset = row_offset_range.start;
467 if point.column > 0 {
468 offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
469 if !self.text.is_char_boundary(offset) {
470 offset -= 1;
471 while !self.text.is_char_boundary(offset) {
472 offset -= 1;
473 }
474 if !clip {
475 debug_panic!(
476 "point {:?} is within character in chunk {:?}",
477 point,
478 self.text,
479 );
480 }
481 }
482 }
483 offset
484 }
485
486 #[inline(always)]
487 pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
488 let max_point = self.lines();
489 if point.0.row > max_point.row {
490 return max_point;
491 }
492
493 let row_offset_range = self.offset_range_for_row(point.0.row);
494 let line = self.slice(row_offset_range);
495 if point.0.column == 0 {
496 Point::new(point.0.row, 0)
497 } else if point.0.column >= line.len_utf16().0 as u32 {
498 Point::new(point.0.row, line.len() as u32)
499 } else {
500 let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
501 while !line.text.is_char_boundary(column) {
502 column -= 1;
503 }
504 Point::new(point.0.row, column as u32)
505 }
506 }
507
508 #[inline(always)]
509 pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
510 let max_point = self.lines();
511 if point.row > max_point.row {
512 return max_point;
513 }
514
515 let line = self.slice(self.offset_range_for_row(point.row));
516 if point.column == 0 {
517 point
518 } else if point.column >= line.len() as u32 {
519 Point::new(point.row, line.len() as u32)
520 } else {
521 let mut column = point.column as usize;
522 let bytes = line.text.as_bytes();
523 if bytes[column - 1] < 128 && bytes[column] < 128 {
524 return Point::new(point.row, column as u32);
525 }
526
527 let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
528 loop {
529 if line.is_char_boundary(column)
530 && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
531 {
532 break;
533 }
534
535 match bias {
536 Bias::Left => column -= 1,
537 Bias::Right => column += 1,
538 }
539 grapheme_cursor.set_cursor(column);
540 }
541 Point::new(point.row, column as u32)
542 }
543 }
544
545 #[inline(always)]
546 pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
547 let max_point = self.lines();
548 if point.0.row > max_point.row {
549 PointUtf16::new(max_point.row, self.last_line_len_utf16())
550 } else {
551 let line = self.slice(self.offset_range_for_row(point.0.row));
552 let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
553 PointUtf16::new(point.0.row, column.0 as u32)
554 }
555 }
556
557 #[inline(always)]
558 pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
559 if target == OffsetUtf16::default() {
560 OffsetUtf16::default()
561 } else if target >= self.len_utf16() {
562 self.len_utf16()
563 } else {
564 let mut offset = self.offset_utf16_to_offset(target);
565 while !self.text.is_char_boundary(offset) {
566 if bias == Bias::Left {
567 offset -= 1;
568 } else {
569 offset += 1;
570 }
571 }
572 self.offset_to_offset_utf16(offset)
573 }
574 }
575
576 #[inline(always)]
577 fn offset_range_for_row(&self, row: u32) -> Range<usize> {
578 let row_start = if row > 0 {
579 #[cfg(not(test))]
580 let newlines = self.newlines;
581 #[cfg(test)]
582 let newlines = self.newlines as u128;
583 nth_set_bit(newlines, row as usize) + 1
584 } else {
585 0
586 };
587 let row_len = if row_start == MAX_BASE {
588 0
589 } else {
590 cmp::min(
591 (self.newlines >> row_start).trailing_zeros(),
592 (self.text.len() - row_start) as u32,
593 )
594 };
595 row_start..row_start + row_len as usize
596 }
597
598 #[inline(always)]
599 pub fn tabs(&self) -> Tabs {
600 Tabs {
601 tabs: self.tabs,
602 chars: self.chars,
603 }
604 }
605}
606
607pub struct Tabs {
608 tabs: Bitmap,
609 chars: Bitmap,
610}
611
612#[derive(Debug, PartialEq, Eq)]
613pub struct TabPosition {
614 pub byte_offset: usize,
615 pub char_offset: usize,
616}
617
618impl Iterator for Tabs {
619 type Item = TabPosition;
620
621 fn next(&mut self) -> Option<Self::Item> {
622 if self.tabs == 0 {
623 return None;
624 }
625
626 let tab_offset = self.tabs.trailing_zeros() as usize;
627 let chars_mask = (1 << tab_offset) - 1;
628 let char_offset = (self.chars & chars_mask).count_ones() as usize;
629
630 // Since tabs are 1 byte the tab offset is the same as the byte offset
631 let position = TabPosition {
632 byte_offset: tab_offset,
633 char_offset,
634 };
635 // Remove the tab we've just seen
636 self.tabs ^= 1 << tab_offset;
637
638 Some(position)
639 }
640}
641
642/// Finds the n-th bit that is set to 1.
643#[inline(always)]
644fn nth_set_bit(v: u128, n: usize) -> usize {
645 let low = v as u64;
646 let high = (v >> 64) as u64;
647
648 let low_count = low.count_ones() as usize;
649 if n > low_count {
650 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
651 } else {
652 nth_set_bit_u64(low, n as u64) as usize
653 }
654}
655
656#[cold]
657#[inline(never)]
658#[track_caller]
659fn panic_char_boundary(text: &str, offset: usize) -> ! {
660 if offset > text.len() {
661 panic!(
662 "byte index {} is out of bounds of `{:?}` (length: {})",
663 offset,
664 text,
665 text.len()
666 );
667 }
668 // find the character
669 let char_start = text.floor_char_boundary(offset);
670 // `char_start` must be less than len and a char boundary
671 let ch = text.get(char_start..).unwrap().chars().next().unwrap();
672 let char_range = char_start..char_start + ch.len_utf8();
673 panic!(
674 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
675 offset, ch, char_range,
676 );
677}
678
679#[cold]
680#[inline(never)]
681#[track_caller]
682fn log_err_char_boundary(text: &str, offset: usize) {
683 if offset > text.len() {
684 log::error!(
685 "byte index {} is out of bounds of `{:?}` (length: {})",
686 offset,
687 text,
688 text.len()
689 );
690 }
691 // find the character
692 let char_start = text.floor_char_boundary(offset);
693 // `char_start` must be less than len and a char boundary
694 let ch = text.get(char_start..).unwrap().chars().next().unwrap();
695 let char_range = char_start..char_start + ch.len_utf8();
696 log::error!(
697 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
698 offset,
699 ch,
700 char_range,
701 );
702}
703
704#[inline(always)]
705fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
706 let v = v.reverse_bits();
707 let mut s: u64 = 64;
708
709 // Parallel bit count intermediates
710 let a = v - ((v >> 1) & (u64::MAX / 3));
711 let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
712 let c = (b + (b >> 4)) & (u64::MAX / 0x11);
713 let d = (c + (c >> 8)) & (u64::MAX / 0x101);
714
715 // Branchless select
716 let t = (d >> 32) + (d >> 48);
717 s -= (t.wrapping_sub(n) & 256) >> 3;
718 n -= t & (t.wrapping_sub(n) >> 8);
719
720 let t = (d >> (s - 16)) & 0xff;
721 s -= (t.wrapping_sub(n) & 256) >> 4;
722 n -= t & (t.wrapping_sub(n) >> 8);
723
724 let t = (c >> (s - 8)) & 0xf;
725 s -= (t.wrapping_sub(n) & 256) >> 5;
726 n -= t & (t.wrapping_sub(n) >> 8);
727
728 let t = (b >> (s - 4)) & 0x7;
729 s -= (t.wrapping_sub(n) & 256) >> 6;
730 n -= t & (t.wrapping_sub(n) >> 8);
731
732 let t = (a >> (s - 2)) & 0x3;
733 s -= (t.wrapping_sub(n) & 256) >> 7;
734 n -= t & (t.wrapping_sub(n) >> 8);
735
736 let t = (v >> (s - 1)) & 0x1;
737 s -= (t.wrapping_sub(n) & 256) >> 8;
738
739 65 - s - 1
740}
741
742#[cfg(test)]
743mod tests {
744 use super::*;
745 use rand::prelude::*;
746 use util::RandomCharIter;
747
748 #[gpui::test(iterations = 100)]
749 fn test_random_chunks(mut rng: StdRng) {
750 let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
751 log::info!("Chunk: {:?}", text);
752 let chunk = Chunk::new(&text);
753 verify_chunk(chunk.as_slice(), &text);
754
755 // Verify Chunk::chars() bitmap
756 let expected_chars = char_offsets(&text)
757 .into_iter()
758 .inspect(|i| assert!(*i < MAX_BASE))
759 .fold(0 as Bitmap, |acc, i| acc | (1 << i));
760 assert_eq!(chunk.chars(), expected_chars);
761
762 for _ in 0..10 {
763 let mut start = rng.random_range(0..=chunk.text.len());
764 let mut end = rng.random_range(start..=chunk.text.len());
765 while !chunk.text.is_char_boundary(start) {
766 start -= 1;
767 }
768 while !chunk.text.is_char_boundary(end) {
769 end -= 1;
770 }
771 let range = start..end;
772 log::info!("Range: {:?}", range);
773 let text_slice = &text[range.clone()];
774 let chunk_slice = chunk.slice(range);
775 verify_chunk(chunk_slice, text_slice);
776 }
777 }
778
779 #[gpui::test(iterations = 100)]
780 fn test_split_chunk_slice(mut rng: StdRng) {
781 let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
782 let chunk = Chunk::new(text);
783 let offset = char_offsets_with_end(text)
784 .into_iter()
785 .choose(&mut rng)
786 .unwrap();
787 let (a, b) = chunk.as_slice().split_at(offset);
788 let (a_str, b_str) = text.split_at(offset);
789 verify_chunk(a, a_str);
790 verify_chunk(b, b_str);
791 }
792
793 #[gpui::test(iterations = 1000)]
794 fn test_nth_set_bit_random(mut rng: StdRng) {
795 let set_count = rng.random_range(0..=128);
796 let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
797 set_bits.sort();
798 let mut n = 0;
799 for ix in set_bits.iter().copied() {
800 n |= 1 << ix;
801 }
802
803 for (mut ix, position) in set_bits.into_iter().enumerate() {
804 ix += 1;
805 assert_eq!(
806 nth_set_bit(n, ix),
807 position,
808 "nth_set_bit({:0128b}, {})",
809 n,
810 ix
811 );
812 }
813 }
814
815 /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
816 fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
817 let mut str = String::new();
818 let mut chars = RandomCharIter::new(rng);
819 loop {
820 let ch = chars.next().unwrap();
821 if str.len() + ch.len_utf8() > len {
822 break;
823 }
824 str.push(ch);
825 }
826 str
827 }
828
829 #[gpui::test(iterations = 1000)]
830 fn test_append_random_strings(mut rng: StdRng) {
831 let len1 = rng.random_range(0..=MAX_BASE);
832 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
833 let str1 = random_string_with_utf8_len(&mut rng, len1);
834 let str2 = random_string_with_utf8_len(&mut rng, len2);
835 let mut chunk1 = Chunk::new(&str1);
836 let chunk2 = Chunk::new(&str2);
837 let char_offsets = char_offsets_with_end(&str2);
838 let start_index = rng.random_range(0..char_offsets.len());
839 let start_offset = char_offsets[start_index];
840 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
841 chunk1.append(chunk2.slice(start_offset..end_offset));
842 verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
843 }
844
845 /// Return the byte offsets for each character in a string.
846 ///
847 /// These are valid offsets to split the string.
848 fn char_offsets(text: &str) -> Vec<usize> {
849 text.char_indices().map(|(i, _c)| i).collect()
850 }
851
852 /// Return the byte offsets for each character in a string, plus the offset
853 /// past the end of the string.
854 fn char_offsets_with_end(text: &str) -> Vec<usize> {
855 let mut v = char_offsets(text);
856 v.push(text.len());
857 v
858 }
859
860 fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
861 let mut offset = 0;
862 let mut offset_utf16 = OffsetUtf16(0);
863 let mut point = Point::zero();
864 let mut point_utf16 = PointUtf16::zero();
865
866 log::info!("Verifying chunk {:?}", text);
867 assert_eq!(chunk.offset_to_point(0), Point::zero());
868
869 let mut expected_tab_positions = Vec::new();
870
871 for (char_offset, c) in text.chars().enumerate() {
872 let expected_point = chunk.offset_to_point(offset);
873 assert_eq!(point, expected_point, "mismatch at offset {}", offset);
874 assert_eq!(
875 chunk.point_to_offset(point),
876 offset,
877 "mismatch at point {:?}",
878 point
879 );
880 assert_eq!(
881 chunk.offset_to_offset_utf16(offset),
882 offset_utf16,
883 "mismatch at offset {}",
884 offset
885 );
886 assert_eq!(
887 chunk.offset_utf16_to_offset(offset_utf16),
888 offset,
889 "mismatch at offset_utf16 {:?}",
890 offset_utf16
891 );
892 assert_eq!(
893 chunk.point_to_point_utf16(point),
894 point_utf16,
895 "mismatch at point {:?}",
896 point
897 );
898 assert_eq!(
899 chunk.point_utf16_to_offset(point_utf16, false),
900 offset,
901 "mismatch at point_utf16 {:?}",
902 point_utf16
903 );
904 assert_eq!(
905 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
906 point,
907 "mismatch for unclipped_point_utf16_to_point at {:?}",
908 point_utf16
909 );
910
911 assert_eq!(
912 chunk.clip_point(point, Bias::Left),
913 point,
914 "incorrect left clip at {:?}",
915 point
916 );
917 assert_eq!(
918 chunk.clip_point(point, Bias::Right),
919 point,
920 "incorrect right clip at {:?}",
921 point
922 );
923
924 for i in 1..c.len_utf8() {
925 let test_point = Point::new(point.row, point.column + i as u32);
926 assert_eq!(
927 chunk.clip_point(test_point, Bias::Left),
928 point,
929 "incorrect left clip within multi-byte char at {:?}",
930 test_point
931 );
932 assert_eq!(
933 chunk.clip_point(test_point, Bias::Right),
934 Point::new(point.row, point.column + c.len_utf8() as u32),
935 "incorrect right clip within multi-byte char at {:?}",
936 test_point
937 );
938 }
939
940 for i in 1..c.len_utf16() {
941 let test_point = Unclipped(PointUtf16::new(
942 point_utf16.row,
943 point_utf16.column + i as u32,
944 ));
945 assert_eq!(
946 chunk.unclipped_point_utf16_to_point(test_point),
947 point,
948 "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
949 test_point
950 );
951 assert_eq!(
952 chunk.clip_point_utf16(test_point, Bias::Left),
953 point_utf16,
954 "incorrect left clip_point_utf16 within multi-byte char at {:?}",
955 test_point
956 );
957 assert_eq!(
958 chunk.clip_point_utf16(test_point, Bias::Right),
959 PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
960 "incorrect right clip_point_utf16 within multi-byte char at {:?}",
961 test_point
962 );
963
964 let test_offset = OffsetUtf16(offset_utf16.0 + i);
965 assert_eq!(
966 chunk.clip_offset_utf16(test_offset, Bias::Left),
967 offset_utf16,
968 "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
969 test_offset
970 );
971 assert_eq!(
972 chunk.clip_offset_utf16(test_offset, Bias::Right),
973 OffsetUtf16(offset_utf16.0 + c.len_utf16()),
974 "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
975 test_offset
976 );
977 }
978
979 if c == '\n' {
980 point.row += 1;
981 point.column = 0;
982 point_utf16.row += 1;
983 point_utf16.column = 0;
984 } else {
985 point.column += c.len_utf8() as u32;
986 point_utf16.column += c.len_utf16() as u32;
987 }
988
989 if c == '\t' {
990 expected_tab_positions.push(TabPosition {
991 byte_offset: offset,
992 char_offset,
993 });
994 }
995
996 offset += c.len_utf8();
997 offset_utf16.0 += c.len_utf16();
998 }
999
1000 let final_point = chunk.offset_to_point(offset);
1001 assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1002 assert_eq!(
1003 chunk.point_to_offset(point),
1004 offset,
1005 "mismatch at point {:?}",
1006 point
1007 );
1008 assert_eq!(
1009 chunk.offset_to_offset_utf16(offset),
1010 offset_utf16,
1011 "mismatch at offset {}",
1012 offset
1013 );
1014 assert_eq!(
1015 chunk.offset_utf16_to_offset(offset_utf16),
1016 offset,
1017 "mismatch at offset_utf16 {:?}",
1018 offset_utf16
1019 );
1020 assert_eq!(
1021 chunk.point_to_point_utf16(point),
1022 point_utf16,
1023 "mismatch at final point {:?}",
1024 point
1025 );
1026 assert_eq!(
1027 chunk.point_utf16_to_offset(point_utf16, false),
1028 offset,
1029 "mismatch at final point_utf16 {:?}",
1030 point_utf16
1031 );
1032 assert_eq!(
1033 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1034 point,
1035 "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1036 point_utf16
1037 );
1038 assert_eq!(
1039 chunk.clip_point(point, Bias::Left),
1040 point,
1041 "incorrect left clip at final point {:?}",
1042 point
1043 );
1044 assert_eq!(
1045 chunk.clip_point(point, Bias::Right),
1046 point,
1047 "incorrect right clip at final point {:?}",
1048 point
1049 );
1050 assert_eq!(
1051 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1052 point_utf16,
1053 "incorrect left clip_point_utf16 at final point {:?}",
1054 point_utf16
1055 );
1056 assert_eq!(
1057 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1058 point_utf16,
1059 "incorrect right clip_point_utf16 at final point {:?}",
1060 point_utf16
1061 );
1062 assert_eq!(
1063 chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1064 offset_utf16,
1065 "incorrect left clip_offset_utf16 at final offset {:?}",
1066 offset_utf16
1067 );
1068 assert_eq!(
1069 chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1070 offset_utf16,
1071 "incorrect right clip_offset_utf16 at final offset {:?}",
1072 offset_utf16
1073 );
1074
1075 // Verify length methods
1076 assert_eq!(chunk.len(), text.len());
1077 assert_eq!(
1078 chunk.len_utf16().0,
1079 text.chars().map(|c| c.len_utf16()).sum::<usize>()
1080 );
1081
1082 // Verify line counting
1083 let lines = chunk.lines();
1084 let mut newline_count = 0;
1085 let mut last_line_len = 0;
1086 for c in text.chars() {
1087 if c == '\n' {
1088 newline_count += 1;
1089 last_line_len = 0;
1090 } else {
1091 last_line_len += c.len_utf8() as u32;
1092 }
1093 }
1094 assert_eq!(lines, Point::new(newline_count, last_line_len));
1095
1096 // Verify first/last line chars
1097 if !text.is_empty() {
1098 let first_line = text.split('\n').next().unwrap();
1099 assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1100
1101 let last_line = text.split('\n').next_back().unwrap();
1102 assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1103 assert_eq!(
1104 chunk.last_line_len_utf16(),
1105 last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1106 );
1107 }
1108
1109 // Verify longest row
1110 let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1111 let mut max_chars = 0;
1112 let mut current_row = 0;
1113 let mut current_chars = 0;
1114 let mut max_row = 0;
1115
1116 for c in text.chars() {
1117 if c == '\n' {
1118 if current_chars > max_chars {
1119 max_chars = current_chars;
1120 max_row = current_row;
1121 }
1122 current_row += 1;
1123 current_chars = 0;
1124 } else {
1125 current_chars += 1;
1126 }
1127 }
1128
1129 if current_chars > max_chars {
1130 max_chars = current_chars;
1131 max_row = current_row;
1132 }
1133
1134 assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1135 assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1136 }
1137}