1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
2use arrayvec::ArrayString;
3use std::{cmp, ops::Range};
4use sum_tree::Bias;
5use unicode_segmentation::GraphemeCursor;
6use util::debug_panic;
7
8#[cfg(not(all(test, not(rust_analyzer))))]
9pub(crate) type Bitmap = u128;
10#[cfg(all(test, not(rust_analyzer)))]
11pub(crate) type Bitmap = u16;
12
13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
15
16#[derive(Clone, Debug, Default)]
17pub struct Chunk {
18 /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
19 /// text.
20 chars: Bitmap,
21 /// The number of set bits is the number of UTF-16 code units it would take to represent the
22 /// text.
23 ///
24 /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
25 /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
26 /// than two UTF-16 code units.)
27 chars_utf16: Bitmap,
28 /// If bit[i] is set, then the character at index i is an ascii newline.
29 newlines: Bitmap,
30 /// If bit[i] is set, then the character at index i is an ascii tab.
31 tabs: Bitmap,
32 pub text: ArrayString<MAX_BASE>,
33}
34
35#[inline(always)]
36const fn saturating_shl_mask(offset: u32) -> Bitmap {
37 (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
38}
39
40#[inline(always)]
41const fn saturating_shr_mask(offset: u32) -> Bitmap {
42 !Bitmap::MAX.unbounded_shr(offset)
43}
44
45impl Chunk {
46 pub const MASK_BITS: usize = Bitmap::BITS as usize;
47
48 #[inline(always)]
49 pub fn new(text: &str) -> Self {
50 let mut this = Chunk::default();
51 this.push_str(text);
52 this
53 }
54
55 #[inline(always)]
56 pub fn push_str(&mut self, text: &str) {
57 for (char_ix, c) in text.char_indices() {
58 let ix = self.text.len() + char_ix;
59 self.chars |= 1 << ix;
60 self.chars_utf16 |= 1 << ix;
61 self.chars_utf16 |= (c.len_utf16() as Bitmap) << ix;
62 self.newlines |= ((c == '\n') as Bitmap) << ix;
63 self.tabs |= ((c == '\t') as Bitmap) << ix;
64 }
65 self.text.push_str(text);
66 }
67
68 #[inline(always)]
69 pub fn append(&mut self, slice: ChunkSlice) {
70 if slice.is_empty() {
71 return;
72 };
73
74 let base_ix = self.text.len();
75 self.chars |= slice.chars << base_ix;
76 self.chars_utf16 |= slice.chars_utf16 << base_ix;
77 self.newlines |= slice.newlines << base_ix;
78 self.tabs |= slice.tabs << base_ix;
79 self.text.push_str(slice.text);
80 }
81
82 #[inline(always)]
83 pub fn as_slice(&self) -> ChunkSlice<'_> {
84 ChunkSlice {
85 chars: self.chars,
86 chars_utf16: self.chars_utf16,
87 newlines: self.newlines,
88 tabs: self.tabs,
89 text: &self.text,
90 }
91 }
92
93 #[inline(always)]
94 pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
95 self.as_slice().slice(range)
96 }
97
98 #[inline(always)]
99 pub fn chars(&self) -> Bitmap {
100 self.chars
101 }
102
103 pub fn tabs(&self) -> Bitmap {
104 self.tabs
105 }
106
107 #[inline(always)]
108 pub fn is_char_boundary(&self, offset: usize) -> bool {
109 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
110 }
111
112 pub fn floor_char_boundary(&self, index: usize) -> usize {
113 if index >= self.text.len() {
114 self.text.len()
115 } else {
116 let mut i = index;
117 while i > 0 {
118 if util::is_utf8_char_boundary(self.text.as_bytes()[i]) {
119 break;
120 }
121 i -= 1;
122 }
123
124 i
125 }
126 }
127
128 #[track_caller]
129 #[inline(always)]
130 pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
131 if self.is_char_boundary(offset) {
132 return true;
133 }
134 if PANIC {
135 panic_char_boundary(&self.text, offset);
136 } else {
137 log_err_char_boundary(&self.text, offset);
138 false
139 }
140 }
141}
142
143#[derive(Clone, Copy, Debug)]
144pub struct ChunkSlice<'a> {
145 chars: Bitmap,
146 chars_utf16: Bitmap,
147 newlines: Bitmap,
148 tabs: Bitmap,
149 text: &'a str,
150}
151
152impl Into<Chunk> for ChunkSlice<'_> {
153 fn into(self) -> Chunk {
154 Chunk {
155 chars: self.chars,
156 chars_utf16: self.chars_utf16,
157 newlines: self.newlines,
158 tabs: self.tabs,
159 text: self.text.try_into().unwrap(),
160 }
161 }
162}
163
164impl<'a> ChunkSlice<'a> {
165 #[inline(always)]
166 pub fn is_empty(&self) -> bool {
167 self.text.is_empty()
168 }
169
170 #[inline(always)]
171 pub fn is_char_boundary(&self, offset: usize) -> bool {
172 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
173 }
174
175 #[inline(always)]
176 pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
177 if mid == MAX_BASE {
178 let left = self;
179 let right = ChunkSlice {
180 chars: 0,
181 chars_utf16: 0,
182 newlines: 0,
183 tabs: 0,
184 text: "",
185 };
186 (left, right)
187 } else {
188 let mask = ((1 as Bitmap) << mid) - 1;
189 let (left_text, right_text) = self.text.split_at(mid);
190 let left = ChunkSlice {
191 chars: self.chars & mask,
192 chars_utf16: self.chars_utf16 & mask,
193 newlines: self.newlines & mask,
194 tabs: self.tabs & mask,
195 text: left_text,
196 };
197 let right = ChunkSlice {
198 chars: self.chars >> mid,
199 chars_utf16: self.chars_utf16 >> mid,
200 newlines: self.newlines >> mid,
201 tabs: self.tabs >> mid,
202 text: right_text,
203 };
204 (left, right)
205 }
206 }
207
208 #[inline(always)]
209 pub fn slice(self, mut range: Range<usize>) -> Self {
210 if range.start == MAX_BASE {
211 Self {
212 chars: 0,
213 chars_utf16: 0,
214 newlines: 0,
215 tabs: 0,
216 text: "",
217 }
218 } else {
219 if !self.assert_char_boundary::<false>(range.start) {
220 range.start = self.text.ceil_char_boundary(range.start);
221 }
222 if !self.assert_char_boundary::<false>(range.end) {
223 range.end = if range.end < range.start {
224 range.start
225 } else {
226 self.text.floor_char_boundary(range.end)
227 };
228 }
229 let mask = (1 as Bitmap)
230 .unbounded_shl(range.end as u32)
231 .wrapping_sub(1);
232 Self {
233 chars: (self.chars & mask) >> range.start,
234 chars_utf16: (self.chars_utf16 & mask) >> range.start,
235 newlines: (self.newlines & mask) >> range.start,
236 tabs: (self.tabs & mask) >> range.start,
237 text: &self.text[range],
238 }
239 }
240 }
241
242 #[inline(always)]
243 pub fn text_summary(&self) -> TextSummary {
244 let mut chars = 0;
245 let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
246 TextSummary {
247 len: self.len(),
248 chars,
249 len_utf16: self.len_utf16(),
250 lines: self.lines(),
251 first_line_chars: self.first_line_chars(),
252 last_line_chars: self.last_line_chars(),
253 last_line_len_utf16: self.last_line_len_utf16(),
254 longest_row,
255 longest_row_chars,
256 }
257 }
258
259 /// Get length in bytes
260 #[inline(always)]
261 pub fn len(&self) -> usize {
262 self.text.len()
263 }
264
265 /// Get length in UTF-16 code units
266 #[inline(always)]
267 pub fn len_utf16(&self) -> OffsetUtf16 {
268 OffsetUtf16(self.chars_utf16.count_ones() as usize)
269 }
270
271 /// Get point representing number of lines and length of last line
272 #[inline(always)]
273 pub fn lines(&self) -> Point {
274 let row = self.newlines.count_ones();
275 let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
276 Point::new(row, column)
277 }
278
279 /// Get number of chars in first line
280 #[inline(always)]
281 pub fn first_line_chars(&self) -> u32 {
282 (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
283 }
284
285 /// Get number of chars in last line
286 #[inline(always)]
287 pub fn last_line_chars(&self) -> u32 {
288 (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
289 }
290
291 /// Get number of UTF-16 code units in last line
292 #[inline(always)]
293 pub fn last_line_len_utf16(&self) -> u32 {
294 (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
295 }
296
297 /// Get the longest row in the chunk and its length in characters.
298 /// Calculate the total number of characters in the chunk along the way.
299 #[inline(always)]
300 pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
301 let mut chars = self.chars;
302 let mut newlines = self.newlines;
303 *total_chars = 0;
304 let mut row = 0;
305 let mut longest_row = 0;
306 let mut longest_row_chars = 0;
307 while newlines > 0 {
308 let newline_ix = newlines.trailing_zeros();
309 let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
310 *total_chars += usize::from(row_chars);
311 if row_chars > longest_row_chars {
312 longest_row = row;
313 longest_row_chars = row_chars;
314 }
315
316 newlines >>= newline_ix;
317 newlines >>= 1;
318 chars >>= newline_ix;
319 chars >>= 1;
320 row += 1;
321 *total_chars += 1;
322 }
323
324 let row_chars = chars.count_ones() as u8;
325 *total_chars += usize::from(row_chars);
326 if row_chars > longest_row_chars {
327 (row, row_chars as u32)
328 } else {
329 (longest_row, longest_row_chars as u32)
330 }
331 }
332
333 #[inline(always)]
334 pub fn offset_to_point(&self, offset: usize) -> Point {
335 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
336 let row = (self.newlines & mask).count_ones();
337 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
338 let column = (offset - newline_ix as usize) as u32;
339 Point::new(row, column)
340 }
341
342 #[inline(always)]
343 pub fn point_to_offset(&self, point: Point) -> usize {
344 if point.row > self.lines().row {
345 debug_panic!(
346 "point {:?} extends beyond rows for string {:?}",
347 point,
348 self.text
349 );
350 return self.len();
351 }
352
353 let row_offset_range = self.offset_range_for_row(point.row);
354 if point.column > row_offset_range.len() as u32 {
355 debug_panic!(
356 "point {:?} extends beyond row for string {:?}",
357 point,
358 self.text
359 );
360 row_offset_range.end
361 } else {
362 row_offset_range.start + point.column as usize
363 }
364 }
365
366 #[track_caller]
367 #[inline(always)]
368 pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
369 if self.is_char_boundary(offset) {
370 return true;
371 }
372 if PANIC {
373 panic_char_boundary(self.text, offset);
374 } else {
375 log_err_char_boundary(self.text, offset);
376 false
377 }
378 }
379
380 pub fn floor_char_boundary(&self, index: usize) -> usize {
381 self.text.floor_char_boundary(index)
382 }
383
384 #[inline(always)]
385 pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
386 if point.row > self.lines().row {
387 debug_panic!(
388 "point {:?} extends beyond rows for string {:?}",
389 point,
390 self.text
391 );
392 return self.len_utf16();
393 }
394 self.offset_to_offset_utf16(self.point_to_offset(point))
395 }
396
397 #[inline(always)]
398 pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
399 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
400 OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
401 }
402
403 #[inline(always)]
404 pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
405 if target.0 == 0 {
406 0
407 } else {
408 #[cfg(not(test))]
409 let chars_utf16 = self.chars_utf16;
410 #[cfg(test)]
411 let chars_utf16 = self.chars_utf16 as u128;
412 let ix = nth_set_bit(chars_utf16, target.0) + 1;
413 if ix == MAX_BASE {
414 MAX_BASE
415 } else {
416 let utf8_additional_len = cmp::min(
417 (self.chars_utf16 >> ix).trailing_zeros() as usize,
418 self.text.len() - ix,
419 );
420 ix + utf8_additional_len
421 }
422 }
423 }
424
425 #[inline(always)]
426 pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
427 let mask = saturating_shl_mask(offset as u32);
428 let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
429 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
430 let column = if newline_ix as usize == MAX_BASE {
431 0
432 } else {
433 ((self.chars_utf16 & mask) >> newline_ix).count_ones()
434 };
435 PointUtf16::new(row, column)
436 }
437
438 #[inline(always)]
439 pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
440 self.offset_to_point_utf16(self.point_to_offset(point))
441 }
442
443 #[inline(always)]
444 pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
445 let lines = self.lines();
446 if point.row > lines.row {
447 if !clip {
448 debug_panic!(
449 "point {:?} is beyond this chunk's extent {:?}",
450 point,
451 self.text
452 );
453 }
454 return self.len();
455 }
456
457 let row_offset_range = self.offset_range_for_row(point.row);
458 let line = self.slice(row_offset_range.clone());
459 if point.column > line.last_line_len_utf16() {
460 if !clip {
461 debug_panic!(
462 "point {:?} is beyond the end of the line in chunk {:?}",
463 point,
464 self.text
465 );
466 }
467 return line.len();
468 }
469
470 let mut offset = row_offset_range.start;
471 if point.column > 0 {
472 offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
473 if !self.text.is_char_boundary(offset) {
474 offset -= 1;
475 while !self.text.is_char_boundary(offset) {
476 offset -= 1;
477 }
478 if !clip {
479 debug_panic!(
480 "point {:?} is within character in chunk {:?}",
481 point,
482 self.text,
483 );
484 }
485 }
486 }
487 offset
488 }
489
490 #[inline(always)]
491 pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
492 let max_point = self.lines();
493 if point.0.row > max_point.row {
494 return max_point;
495 }
496
497 let row_offset_range = self.offset_range_for_row(point.0.row);
498 let line = self.slice(row_offset_range);
499 if point.0.column == 0 {
500 Point::new(point.0.row, 0)
501 } else if point.0.column >= line.len_utf16().0 as u32 {
502 Point::new(point.0.row, line.len() as u32)
503 } else {
504 let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
505 while !line.text.is_char_boundary(column) {
506 column -= 1;
507 }
508 Point::new(point.0.row, column as u32)
509 }
510 }
511
512 #[inline(always)]
513 pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
514 let max_point = self.lines();
515 if point.row > max_point.row {
516 return max_point;
517 }
518
519 let line = self.slice(self.offset_range_for_row(point.row));
520 if point.column == 0 {
521 point
522 } else if point.column >= line.len() as u32 {
523 Point::new(point.row, line.len() as u32)
524 } else {
525 let mut column = point.column as usize;
526 let bytes = line.text.as_bytes();
527 if bytes[column - 1] < 128 && bytes[column] < 128 {
528 return Point::new(point.row, column as u32);
529 }
530
531 let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
532 loop {
533 if line.is_char_boundary(column)
534 && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
535 {
536 break;
537 }
538
539 match bias {
540 Bias::Left => column -= 1,
541 Bias::Right => column += 1,
542 }
543 grapheme_cursor.set_cursor(column);
544 }
545 Point::new(point.row, column as u32)
546 }
547 }
548
549 #[inline(always)]
550 pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
551 let max_point = self.lines();
552 if point.0.row > max_point.row {
553 PointUtf16::new(max_point.row, self.last_line_len_utf16())
554 } else {
555 let line = self.slice(self.offset_range_for_row(point.0.row));
556 let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
557 PointUtf16::new(point.0.row, column.0 as u32)
558 }
559 }
560
561 #[inline(always)]
562 pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
563 if target == OffsetUtf16::default() {
564 OffsetUtf16::default()
565 } else if target >= self.len_utf16() {
566 self.len_utf16()
567 } else {
568 let mut offset = self.offset_utf16_to_offset(target);
569 while !self.text.is_char_boundary(offset) {
570 if bias == Bias::Left {
571 offset -= 1;
572 } else {
573 offset += 1;
574 }
575 }
576 self.offset_to_offset_utf16(offset)
577 }
578 }
579
580 #[inline(always)]
581 fn offset_range_for_row(&self, row: u32) -> Range<usize> {
582 let row_start = if row > 0 {
583 #[cfg(not(test))]
584 let newlines = self.newlines;
585 #[cfg(test)]
586 let newlines = self.newlines as u128;
587 nth_set_bit(newlines, row as usize) + 1
588 } else {
589 0
590 };
591 let row_len = if row_start == MAX_BASE {
592 0
593 } else {
594 cmp::min(
595 (self.newlines >> row_start).trailing_zeros(),
596 (self.text.len() - row_start) as u32,
597 )
598 };
599 row_start..row_start + row_len as usize
600 }
601
602 #[inline(always)]
603 pub fn tabs(&self) -> Tabs {
604 Tabs {
605 tabs: self.tabs,
606 chars: self.chars,
607 }
608 }
609}
610
611pub struct Tabs {
612 tabs: Bitmap,
613 chars: Bitmap,
614}
615
616#[derive(Debug, PartialEq, Eq)]
617pub struct TabPosition {
618 pub byte_offset: usize,
619 pub char_offset: usize,
620}
621
622impl Iterator for Tabs {
623 type Item = TabPosition;
624
625 fn next(&mut self) -> Option<Self::Item> {
626 if self.tabs == 0 {
627 return None;
628 }
629
630 let tab_offset = self.tabs.trailing_zeros() as usize;
631 let chars_mask = (1 << tab_offset) - 1;
632 let char_offset = (self.chars & chars_mask).count_ones() as usize;
633
634 // Since tabs are 1 byte the tab offset is the same as the byte offset
635 let position = TabPosition {
636 byte_offset: tab_offset,
637 char_offset,
638 };
639 // Remove the tab we've just seen
640 self.tabs ^= 1 << tab_offset;
641
642 Some(position)
643 }
644}
645
646/// Finds the n-th bit that is set to 1.
647#[inline(always)]
648fn nth_set_bit(v: u128, n: usize) -> usize {
649 let low = v as u64;
650 let high = (v >> 64) as u64;
651
652 let low_count = low.count_ones() as usize;
653 if n > low_count {
654 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
655 } else {
656 nth_set_bit_u64(low, n as u64) as usize
657 }
658}
659
660#[cold]
661#[inline(never)]
662#[track_caller]
663fn panic_char_boundary(text: &str, offset: usize) -> ! {
664 if offset > text.len() {
665 panic!(
666 "byte index {} is out of bounds of `{:?}` (length: {})",
667 offset,
668 text,
669 text.len()
670 );
671 }
672 // find the character
673 let char_start = text.floor_char_boundary(offset);
674 // `char_start` must be less than len and a char boundary
675 let ch = text.get(char_start..).unwrap().chars().next().unwrap();
676 let char_range = char_start..char_start + ch.len_utf8();
677 panic!(
678 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
679 offset, ch, char_range,
680 );
681}
682
683#[cold]
684#[inline(never)]
685#[track_caller]
686fn log_err_char_boundary(text: &str, offset: usize) {
687 if offset > text.len() {
688 log::error!(
689 "byte index {} is out of bounds of `{:?}` (length: {})",
690 offset,
691 text,
692 text.len()
693 );
694 }
695 // find the character
696 let char_start = text.floor_char_boundary(offset);
697 // `char_start` must be less than len and a char boundary
698 let ch = text.get(char_start..).unwrap().chars().next().unwrap();
699 let char_range = char_start..char_start + ch.len_utf8();
700 log::error!(
701 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
702 offset,
703 ch,
704 char_range,
705 );
706}
707
708#[inline(always)]
709fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
710 let v = v.reverse_bits();
711 let mut s: u64 = 64;
712
713 // Parallel bit count intermediates
714 let a = v - ((v >> 1) & (u64::MAX / 3));
715 let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
716 let c = (b + (b >> 4)) & (u64::MAX / 0x11);
717 let d = (c + (c >> 8)) & (u64::MAX / 0x101);
718
719 // Branchless select
720 let t = (d >> 32) + (d >> 48);
721 s -= (t.wrapping_sub(n) & 256) >> 3;
722 n -= t & (t.wrapping_sub(n) >> 8);
723
724 let t = (d >> (s - 16)) & 0xff;
725 s -= (t.wrapping_sub(n) & 256) >> 4;
726 n -= t & (t.wrapping_sub(n) >> 8);
727
728 let t = (c >> (s - 8)) & 0xf;
729 s -= (t.wrapping_sub(n) & 256) >> 5;
730 n -= t & (t.wrapping_sub(n) >> 8);
731
732 let t = (b >> (s - 4)) & 0x7;
733 s -= (t.wrapping_sub(n) & 256) >> 6;
734 n -= t & (t.wrapping_sub(n) >> 8);
735
736 let t = (a >> (s - 2)) & 0x3;
737 s -= (t.wrapping_sub(n) & 256) >> 7;
738 n -= t & (t.wrapping_sub(n) >> 8);
739
740 let t = (v >> (s - 1)) & 0x1;
741 s -= (t.wrapping_sub(n) & 256) >> 8;
742
743 65 - s - 1
744}
745
746#[cfg(test)]
747mod tests {
748 use super::*;
749 use rand::prelude::*;
750 use util::RandomCharIter;
751
752 #[gpui::test(iterations = 100)]
753 fn test_random_chunks(mut rng: StdRng) {
754 let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
755 log::info!("Chunk: {:?}", text);
756 let chunk = Chunk::new(&text);
757 verify_chunk(chunk.as_slice(), &text);
758
759 // Verify Chunk::chars() bitmap
760 let expected_chars = char_offsets(&text)
761 .into_iter()
762 .inspect(|i| assert!(*i < MAX_BASE))
763 .fold(0 as Bitmap, |acc, i| acc | (1 << i));
764 assert_eq!(chunk.chars(), expected_chars);
765
766 for _ in 0..10 {
767 let mut start = rng.random_range(0..=chunk.text.len());
768 let mut end = rng.random_range(start..=chunk.text.len());
769 while !chunk.text.is_char_boundary(start) {
770 start -= 1;
771 }
772 while !chunk.text.is_char_boundary(end) {
773 end -= 1;
774 }
775 let range = start..end;
776 log::info!("Range: {:?}", range);
777 let text_slice = &text[range.clone()];
778 let chunk_slice = chunk.slice(range);
779 verify_chunk(chunk_slice, text_slice);
780 }
781 }
782
783 #[gpui::test(iterations = 100)]
784 fn test_split_chunk_slice(mut rng: StdRng) {
785 let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
786 let chunk = Chunk::new(text);
787 let offset = char_offsets_with_end(text)
788 .into_iter()
789 .choose(&mut rng)
790 .unwrap();
791 let (a, b) = chunk.as_slice().split_at(offset);
792 let (a_str, b_str) = text.split_at(offset);
793 verify_chunk(a, a_str);
794 verify_chunk(b, b_str);
795 }
796
797 #[gpui::test(iterations = 1000)]
798 fn test_nth_set_bit_random(mut rng: StdRng) {
799 let set_count = rng.random_range(0..=128);
800 let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
801 set_bits.sort();
802 let mut n = 0;
803 for ix in set_bits.iter().copied() {
804 n |= 1 << ix;
805 }
806
807 for (mut ix, position) in set_bits.into_iter().enumerate() {
808 ix += 1;
809 assert_eq!(
810 nth_set_bit(n, ix),
811 position,
812 "nth_set_bit({:0128b}, {})",
813 n,
814 ix
815 );
816 }
817 }
818
819 /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
820 fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
821 let mut str = String::new();
822 let mut chars = RandomCharIter::new(rng);
823 loop {
824 let ch = chars.next().unwrap();
825 if str.len() + ch.len_utf8() > len {
826 break;
827 }
828 str.push(ch);
829 }
830 str
831 }
832
833 #[gpui::test(iterations = 1000)]
834 fn test_append_random_strings(mut rng: StdRng) {
835 let len1 = rng.random_range(0..=MAX_BASE);
836 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
837 let str1 = random_string_with_utf8_len(&mut rng, len1);
838 let str2 = random_string_with_utf8_len(&mut rng, len2);
839 let mut chunk1 = Chunk::new(&str1);
840 let chunk2 = Chunk::new(&str2);
841 let char_offsets = char_offsets_with_end(&str2);
842 let start_index = rng.random_range(0..char_offsets.len());
843 let start_offset = char_offsets[start_index];
844 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
845 chunk1.append(chunk2.slice(start_offset..end_offset));
846 verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
847 }
848
849 /// Return the byte offsets for each character in a string.
850 ///
851 /// These are valid offsets to split the string.
852 fn char_offsets(text: &str) -> Vec<usize> {
853 text.char_indices().map(|(i, _c)| i).collect()
854 }
855
856 /// Return the byte offsets for each character in a string, plus the offset
857 /// past the end of the string.
858 fn char_offsets_with_end(text: &str) -> Vec<usize> {
859 let mut v = char_offsets(text);
860 v.push(text.len());
861 v
862 }
863
864 fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
865 let mut offset = 0;
866 let mut offset_utf16 = OffsetUtf16(0);
867 let mut point = Point::zero();
868 let mut point_utf16 = PointUtf16::zero();
869
870 log::info!("Verifying chunk {:?}", text);
871 assert_eq!(chunk.offset_to_point(0), Point::zero());
872
873 let mut expected_tab_positions = Vec::new();
874
875 for (char_offset, c) in text.chars().enumerate() {
876 let expected_point = chunk.offset_to_point(offset);
877 assert_eq!(point, expected_point, "mismatch at offset {}", offset);
878 assert_eq!(
879 chunk.point_to_offset(point),
880 offset,
881 "mismatch at point {:?}",
882 point
883 );
884 assert_eq!(
885 chunk.offset_to_offset_utf16(offset),
886 offset_utf16,
887 "mismatch at offset {}",
888 offset
889 );
890 assert_eq!(
891 chunk.offset_utf16_to_offset(offset_utf16),
892 offset,
893 "mismatch at offset_utf16 {:?}",
894 offset_utf16
895 );
896 assert_eq!(
897 chunk.point_to_point_utf16(point),
898 point_utf16,
899 "mismatch at point {:?}",
900 point
901 );
902 assert_eq!(
903 chunk.point_utf16_to_offset(point_utf16, false),
904 offset,
905 "mismatch at point_utf16 {:?}",
906 point_utf16
907 );
908 assert_eq!(
909 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
910 point,
911 "mismatch for unclipped_point_utf16_to_point at {:?}",
912 point_utf16
913 );
914
915 assert_eq!(
916 chunk.clip_point(point, Bias::Left),
917 point,
918 "incorrect left clip at {:?}",
919 point
920 );
921 assert_eq!(
922 chunk.clip_point(point, Bias::Right),
923 point,
924 "incorrect right clip at {:?}",
925 point
926 );
927
928 for i in 1..c.len_utf8() {
929 let test_point = Point::new(point.row, point.column + i as u32);
930 assert_eq!(
931 chunk.clip_point(test_point, Bias::Left),
932 point,
933 "incorrect left clip within multi-byte char at {:?}",
934 test_point
935 );
936 assert_eq!(
937 chunk.clip_point(test_point, Bias::Right),
938 Point::new(point.row, point.column + c.len_utf8() as u32),
939 "incorrect right clip within multi-byte char at {:?}",
940 test_point
941 );
942 }
943
944 for i in 1..c.len_utf16() {
945 let test_point = Unclipped(PointUtf16::new(
946 point_utf16.row,
947 point_utf16.column + i as u32,
948 ));
949 assert_eq!(
950 chunk.unclipped_point_utf16_to_point(test_point),
951 point,
952 "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
953 test_point
954 );
955 assert_eq!(
956 chunk.clip_point_utf16(test_point, Bias::Left),
957 point_utf16,
958 "incorrect left clip_point_utf16 within multi-byte char at {:?}",
959 test_point
960 );
961 assert_eq!(
962 chunk.clip_point_utf16(test_point, Bias::Right),
963 PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
964 "incorrect right clip_point_utf16 within multi-byte char at {:?}",
965 test_point
966 );
967
968 let test_offset = OffsetUtf16(offset_utf16.0 + i);
969 assert_eq!(
970 chunk.clip_offset_utf16(test_offset, Bias::Left),
971 offset_utf16,
972 "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
973 test_offset
974 );
975 assert_eq!(
976 chunk.clip_offset_utf16(test_offset, Bias::Right),
977 OffsetUtf16(offset_utf16.0 + c.len_utf16()),
978 "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
979 test_offset
980 );
981 }
982
983 if c == '\n' {
984 point.row += 1;
985 point.column = 0;
986 point_utf16.row += 1;
987 point_utf16.column = 0;
988 } else {
989 point.column += c.len_utf8() as u32;
990 point_utf16.column += c.len_utf16() as u32;
991 }
992
993 if c == '\t' {
994 expected_tab_positions.push(TabPosition {
995 byte_offset: offset,
996 char_offset,
997 });
998 }
999
1000 offset += c.len_utf8();
1001 offset_utf16.0 += c.len_utf16();
1002 }
1003
1004 let final_point = chunk.offset_to_point(offset);
1005 assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1006 assert_eq!(
1007 chunk.point_to_offset(point),
1008 offset,
1009 "mismatch at point {:?}",
1010 point
1011 );
1012 assert_eq!(
1013 chunk.offset_to_offset_utf16(offset),
1014 offset_utf16,
1015 "mismatch at offset {}",
1016 offset
1017 );
1018 assert_eq!(
1019 chunk.offset_utf16_to_offset(offset_utf16),
1020 offset,
1021 "mismatch at offset_utf16 {:?}",
1022 offset_utf16
1023 );
1024 assert_eq!(
1025 chunk.point_to_point_utf16(point),
1026 point_utf16,
1027 "mismatch at final point {:?}",
1028 point
1029 );
1030 assert_eq!(
1031 chunk.point_utf16_to_offset(point_utf16, false),
1032 offset,
1033 "mismatch at final point_utf16 {:?}",
1034 point_utf16
1035 );
1036 assert_eq!(
1037 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1038 point,
1039 "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1040 point_utf16
1041 );
1042 assert_eq!(
1043 chunk.clip_point(point, Bias::Left),
1044 point,
1045 "incorrect left clip at final point {:?}",
1046 point
1047 );
1048 assert_eq!(
1049 chunk.clip_point(point, Bias::Right),
1050 point,
1051 "incorrect right clip at final point {:?}",
1052 point
1053 );
1054 assert_eq!(
1055 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1056 point_utf16,
1057 "incorrect left clip_point_utf16 at final point {:?}",
1058 point_utf16
1059 );
1060 assert_eq!(
1061 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1062 point_utf16,
1063 "incorrect right clip_point_utf16 at final point {:?}",
1064 point_utf16
1065 );
1066 assert_eq!(
1067 chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1068 offset_utf16,
1069 "incorrect left clip_offset_utf16 at final offset {:?}",
1070 offset_utf16
1071 );
1072 assert_eq!(
1073 chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1074 offset_utf16,
1075 "incorrect right clip_offset_utf16 at final offset {:?}",
1076 offset_utf16
1077 );
1078
1079 // Verify length methods
1080 assert_eq!(chunk.len(), text.len());
1081 assert_eq!(
1082 chunk.len_utf16().0,
1083 text.chars().map(|c| c.len_utf16()).sum::<usize>()
1084 );
1085
1086 // Verify line counting
1087 let lines = chunk.lines();
1088 let mut newline_count = 0;
1089 let mut last_line_len = 0;
1090 for c in text.chars() {
1091 if c == '\n' {
1092 newline_count += 1;
1093 last_line_len = 0;
1094 } else {
1095 last_line_len += c.len_utf8() as u32;
1096 }
1097 }
1098 assert_eq!(lines, Point::new(newline_count, last_line_len));
1099
1100 // Verify first/last line chars
1101 if !text.is_empty() {
1102 let first_line = text.split('\n').next().unwrap();
1103 assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1104
1105 let last_line = text.split('\n').next_back().unwrap();
1106 assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1107 assert_eq!(
1108 chunk.last_line_len_utf16(),
1109 last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1110 );
1111 }
1112
1113 // Verify longest row
1114 let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1115 let mut max_chars = 0;
1116 let mut current_row = 0;
1117 let mut current_chars = 0;
1118 let mut max_row = 0;
1119
1120 for c in text.chars() {
1121 if c == '\n' {
1122 if current_chars > max_chars {
1123 max_chars = current_chars;
1124 max_row = current_row;
1125 }
1126 current_row += 1;
1127 current_chars = 0;
1128 } else {
1129 current_chars += 1;
1130 }
1131 }
1132
1133 if current_chars > max_chars {
1134 max_chars = current_chars;
1135 max_row = current_row;
1136 }
1137
1138 assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1139 assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1140 }
1141}