1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
2use arrayvec::ArrayString;
3use std::{cmp, ops::Range};
4use sum_tree::Bias;
5use unicode_segmentation::GraphemeCursor;
6use util::debug_panic;
7
8#[cfg(not(all(test, not(rust_analyzer))))]
9pub(crate) type Bitmap = u128;
10#[cfg(all(test, not(rust_analyzer)))]
11pub(crate) type Bitmap = u16;
12
13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
15
16#[derive(Clone, Debug, Default)]
17pub struct Chunk {
18 /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
19 /// text.
20 chars: Bitmap,
21 /// The number of set bits is the number of UTF-16 code units it would take to represent the
22 /// text.
23 ///
24 /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
25 /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
26 /// than two UTF-16 code units.)
27 chars_utf16: Bitmap,
28 /// If bit[i] is set, then the character at index i is an ascii newline.
29 newlines: Bitmap,
30 /// If bit[i] is set, then the character at index i is an ascii tab.
31 tabs: Bitmap,
32 pub text: ArrayString<MAX_BASE>,
33}
34
35#[inline(always)]
36const fn saturating_shl_mask(offset: u32) -> Bitmap {
37 (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
38}
39
40#[inline(always)]
41const fn saturating_shr_mask(offset: u32) -> Bitmap {
42 !Bitmap::MAX.unbounded_shr(offset)
43}
44
45impl Chunk {
46 pub const MASK_BITS: usize = Bitmap::BITS as usize;
47
48 #[inline(always)]
49 pub fn new(text: &str) -> Self {
50 let mut this = Chunk::default();
51 this.push_str(text);
52 this
53 }
54
55 #[inline(always)]
56 pub fn push_str(&mut self, text: &str) {
57 for (char_ix, c) in text.char_indices() {
58 let ix = self.text.len() + char_ix;
59 self.chars |= 1 << ix;
60 self.chars_utf16 |= 1 << ix;
61 self.chars_utf16 |= (c.len_utf16() as Bitmap) << ix;
62 self.newlines |= ((c == '\n') as Bitmap) << ix;
63 self.tabs |= ((c == '\t') as Bitmap) << ix;
64 }
65 self.text.push_str(text);
66 }
67
68 #[inline(always)]
69 pub fn append(&mut self, slice: ChunkSlice) {
70 if slice.is_empty() {
71 return;
72 };
73
74 let base_ix = self.text.len();
75 self.chars |= slice.chars << base_ix;
76 self.chars_utf16 |= slice.chars_utf16 << base_ix;
77 self.newlines |= slice.newlines << base_ix;
78 self.tabs |= slice.tabs << base_ix;
79 self.text.push_str(slice.text);
80 }
81
82 #[inline(always)]
83 pub fn as_slice(&self) -> ChunkSlice<'_> {
84 ChunkSlice {
85 chars: self.chars,
86 chars_utf16: self.chars_utf16,
87 newlines: self.newlines,
88 tabs: self.tabs,
89 text: &self.text,
90 }
91 }
92
93 #[inline(always)]
94 pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
95 self.as_slice().slice(range)
96 }
97
98 #[inline(always)]
99 pub fn chars(&self) -> Bitmap {
100 self.chars
101 }
102
103 pub fn tabs(&self) -> Bitmap {
104 self.tabs
105 }
106
107 #[inline(always)]
108 pub fn is_char_boundary(&self, offset: usize) -> bool {
109 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
110 }
111
112 pub fn floor_char_boundary(&self, index: usize) -> usize {
113 if index >= self.text.len() {
114 self.text.len()
115 } else {
116 let mut i = index;
117 while i > 0 {
118 if util::is_utf8_char_boundary(self.text.as_bytes()[i]) {
119 break;
120 }
121 i -= 1;
122 }
123
124 i
125 }
126 }
127
128 #[track_caller]
129 #[inline(always)]
130 pub fn assert_char_boundary(&self, offset: usize) {
131 if self.is_char_boundary(offset) {
132 return;
133 }
134 panic_char_boundary(self, offset);
135
136 #[cold]
137 #[inline(never)]
138 #[track_caller]
139 fn panic_char_boundary(chunk: &Chunk, offset: usize) {
140 if offset > chunk.text.len() {
141 panic!(
142 "byte index {} is out of bounds of `{:?}` (length: {})",
143 offset,
144 chunk.text,
145 chunk.text.len()
146 );
147 }
148 // find the character
149 let char_start = chunk.floor_char_boundary(offset);
150 // `char_start` must be less than len and a char boundary
151 let ch = chunk
152 .text
153 .get(char_start..)
154 .unwrap()
155 .chars()
156 .next()
157 .unwrap();
158 let char_range = char_start..char_start + ch.len_utf8();
159 panic!(
160 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
161 offset, ch, char_range,
162 );
163 }
164 }
165}
166
167#[derive(Clone, Copy, Debug)]
168pub struct ChunkSlice<'a> {
169 chars: Bitmap,
170 chars_utf16: Bitmap,
171 newlines: Bitmap,
172 tabs: Bitmap,
173 text: &'a str,
174}
175
176impl Into<Chunk> for ChunkSlice<'_> {
177 fn into(self) -> Chunk {
178 Chunk {
179 chars: self.chars,
180 chars_utf16: self.chars_utf16,
181 newlines: self.newlines,
182 tabs: self.tabs,
183 text: self.text.try_into().unwrap(),
184 }
185 }
186}
187
188impl<'a> ChunkSlice<'a> {
189 #[inline(always)]
190 pub fn is_empty(&self) -> bool {
191 self.text.is_empty()
192 }
193
194 #[inline(always)]
195 pub fn is_char_boundary(&self, offset: usize) -> bool {
196 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
197 }
198
199 #[inline(always)]
200 pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
201 if mid == MAX_BASE {
202 let left = self;
203 let right = ChunkSlice {
204 chars: 0,
205 chars_utf16: 0,
206 newlines: 0,
207 tabs: 0,
208 text: "",
209 };
210 (left, right)
211 } else {
212 let mask = ((1 as Bitmap) << mid) - 1;
213 let (left_text, right_text) = self.text.split_at(mid);
214 let left = ChunkSlice {
215 chars: self.chars & mask,
216 chars_utf16: self.chars_utf16 & mask,
217 newlines: self.newlines & mask,
218 tabs: self.tabs & mask,
219 text: left_text,
220 };
221 let right = ChunkSlice {
222 chars: self.chars >> mid,
223 chars_utf16: self.chars_utf16 >> mid,
224 newlines: self.newlines >> mid,
225 tabs: self.tabs >> mid,
226 text: right_text,
227 };
228 (left, right)
229 }
230 }
231
232 #[inline(always)]
233 pub fn slice(self, range: Range<usize>) -> Self {
234 let mask = (1 as Bitmap)
235 .unbounded_shl(range.end as u32)
236 .wrapping_sub(1);
237 if range.start == MAX_BASE {
238 Self {
239 chars: 0,
240 chars_utf16: 0,
241 newlines: 0,
242 tabs: 0,
243 text: "",
244 }
245 } else {
246 self.assert_char_boundary(range.start);
247 self.assert_char_boundary(range.end);
248 Self {
249 chars: (self.chars & mask) >> range.start,
250 chars_utf16: (self.chars_utf16 & mask) >> range.start,
251 newlines: (self.newlines & mask) >> range.start,
252 tabs: (self.tabs & mask) >> range.start,
253 text: &self.text[range],
254 }
255 }
256 }
257
258 #[inline(always)]
259 pub fn text_summary(&self) -> TextSummary {
260 let mut chars = 0;
261 let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
262 TextSummary {
263 len: self.len(),
264 chars,
265 len_utf16: self.len_utf16(),
266 lines: self.lines(),
267 first_line_chars: self.first_line_chars(),
268 last_line_chars: self.last_line_chars(),
269 last_line_len_utf16: self.last_line_len_utf16(),
270 longest_row,
271 longest_row_chars,
272 }
273 }
274
275 /// Get length in bytes
276 #[inline(always)]
277 pub fn len(&self) -> usize {
278 self.text.len()
279 }
280
281 /// Get length in UTF-16 code units
282 #[inline(always)]
283 pub fn len_utf16(&self) -> OffsetUtf16 {
284 OffsetUtf16(self.chars_utf16.count_ones() as usize)
285 }
286
287 /// Get point representing number of lines and length of last line
288 #[inline(always)]
289 pub fn lines(&self) -> Point {
290 let row = self.newlines.count_ones();
291 let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
292 Point::new(row, column)
293 }
294
295 /// Get number of chars in first line
296 #[inline(always)]
297 pub fn first_line_chars(&self) -> u32 {
298 (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
299 }
300
301 /// Get number of chars in last line
302 #[inline(always)]
303 pub fn last_line_chars(&self) -> u32 {
304 (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
305 }
306
307 /// Get number of UTF-16 code units in last line
308 #[inline(always)]
309 pub fn last_line_len_utf16(&self) -> u32 {
310 (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
311 }
312
313 /// Get the longest row in the chunk and its length in characters.
314 /// Calculate the total number of characters in the chunk along the way.
315 #[inline(always)]
316 pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
317 let mut chars = self.chars;
318 let mut newlines = self.newlines;
319 *total_chars = 0;
320 let mut row = 0;
321 let mut longest_row = 0;
322 let mut longest_row_chars = 0;
323 while newlines > 0 {
324 let newline_ix = newlines.trailing_zeros();
325 let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
326 *total_chars += usize::from(row_chars);
327 if row_chars > longest_row_chars {
328 longest_row = row;
329 longest_row_chars = row_chars;
330 }
331
332 newlines >>= newline_ix;
333 newlines >>= 1;
334 chars >>= newline_ix;
335 chars >>= 1;
336 row += 1;
337 *total_chars += 1;
338 }
339
340 let row_chars = chars.count_ones() as u8;
341 *total_chars += usize::from(row_chars);
342 if row_chars > longest_row_chars {
343 (row, row_chars as u32)
344 } else {
345 (longest_row, longest_row_chars as u32)
346 }
347 }
348
349 #[inline(always)]
350 pub fn offset_to_point(&self, offset: usize) -> Point {
351 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
352 let row = (self.newlines & mask).count_ones();
353 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
354 let column = (offset - newline_ix as usize) as u32;
355 Point::new(row, column)
356 }
357
358 #[inline(always)]
359 pub fn point_to_offset(&self, point: Point) -> usize {
360 if point.row > self.lines().row {
361 debug_panic!(
362 "point {:?} extends beyond rows for string {:?}",
363 point,
364 self.text
365 );
366 return self.len();
367 }
368
369 let row_offset_range = self.offset_range_for_row(point.row);
370 if point.column > row_offset_range.len() as u32 {
371 debug_panic!(
372 "point {:?} extends beyond row for string {:?}",
373 point,
374 self.text
375 );
376 row_offset_range.end
377 } else {
378 row_offset_range.start + point.column as usize
379 }
380 }
381
382 #[track_caller]
383 #[inline(always)]
384 pub fn assert_char_boundary(&self, offset: usize) {
385 if self.is_char_boundary(offset) {
386 return;
387 }
388 panic_char_boundary(self, offset);
389
390 #[cold]
391 #[inline(never)]
392 fn panic_char_boundary(chunk: &ChunkSlice, offset: usize) {
393 if offset > chunk.text.len() {
394 panic!(
395 "byte index {} is out of bounds of `{:?}` (length: {})",
396 offset,
397 chunk.text,
398 chunk.text.len()
399 );
400 }
401 // find the character
402 let char_start = chunk.floor_char_boundary(offset);
403 // `char_start` must be less than len and a char boundary
404 let ch = chunk
405 .text
406 .get(char_start..)
407 .unwrap()
408 .chars()
409 .next()
410 .unwrap();
411 let char_range = char_start..char_start + ch.len_utf8();
412 panic!(
413 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
414 offset, ch, char_range,
415 );
416 }
417 }
418
419 pub fn floor_char_boundary(&self, index: usize) -> usize {
420 self.text.floor_char_boundary(index)
421 }
422
423 #[inline(always)]
424 pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
425 if point.row > self.lines().row {
426 debug_panic!(
427 "point {:?} extends beyond rows for string {:?}",
428 point,
429 self.text
430 );
431 return self.len_utf16();
432 }
433 self.offset_to_offset_utf16(self.point_to_offset(point))
434 }
435
436 #[inline(always)]
437 pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
438 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
439 OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
440 }
441
442 #[inline(always)]
443 pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
444 if target.0 == 0 {
445 0
446 } else {
447 #[cfg(not(test))]
448 let chars_utf16 = self.chars_utf16;
449 #[cfg(test)]
450 let chars_utf16 = self.chars_utf16 as u128;
451 let ix = nth_set_bit(chars_utf16, target.0) + 1;
452 if ix == MAX_BASE {
453 MAX_BASE
454 } else {
455 let utf8_additional_len = cmp::min(
456 (self.chars_utf16 >> ix).trailing_zeros() as usize,
457 self.text.len() - ix,
458 );
459 ix + utf8_additional_len
460 }
461 }
462 }
463
464 #[inline(always)]
465 pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
466 let mask = saturating_shl_mask(offset as u32);
467 let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
468 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
469 let column = if newline_ix as usize == MAX_BASE {
470 0
471 } else {
472 ((self.chars_utf16 & mask) >> newline_ix).count_ones()
473 };
474 PointUtf16::new(row, column)
475 }
476
477 #[inline(always)]
478 pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
479 self.offset_to_point_utf16(self.point_to_offset(point))
480 }
481
482 #[inline(always)]
483 pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
484 let lines = self.lines();
485 if point.row > lines.row {
486 if !clip {
487 debug_panic!(
488 "point {:?} is beyond this chunk's extent {:?}",
489 point,
490 self.text
491 );
492 }
493 return self.len();
494 }
495
496 let row_offset_range = self.offset_range_for_row(point.row);
497 let line = self.slice(row_offset_range.clone());
498 if point.column > line.last_line_len_utf16() {
499 if !clip {
500 debug_panic!(
501 "point {:?} is beyond the end of the line in chunk {:?}",
502 point,
503 self.text
504 );
505 }
506 return line.len();
507 }
508
509 let mut offset = row_offset_range.start;
510 if point.column > 0 {
511 offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
512 if !self.text.is_char_boundary(offset) {
513 offset -= 1;
514 while !self.text.is_char_boundary(offset) {
515 offset -= 1;
516 }
517 if !clip {
518 debug_panic!(
519 "point {:?} is within character in chunk {:?}",
520 point,
521 self.text,
522 );
523 }
524 }
525 }
526 offset
527 }
528
529 #[inline(always)]
530 pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
531 let max_point = self.lines();
532 if point.0.row > max_point.row {
533 return max_point;
534 }
535
536 let row_offset_range = self.offset_range_for_row(point.0.row);
537 let line = self.slice(row_offset_range);
538 if point.0.column == 0 {
539 Point::new(point.0.row, 0)
540 } else if point.0.column >= line.len_utf16().0 as u32 {
541 Point::new(point.0.row, line.len() as u32)
542 } else {
543 let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
544 while !line.text.is_char_boundary(column) {
545 column -= 1;
546 }
547 Point::new(point.0.row, column as u32)
548 }
549 }
550
551 #[inline(always)]
552 pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
553 let max_point = self.lines();
554 if point.row > max_point.row {
555 return max_point;
556 }
557
558 let line = self.slice(self.offset_range_for_row(point.row));
559 if point.column == 0 {
560 point
561 } else if point.column >= line.len() as u32 {
562 Point::new(point.row, line.len() as u32)
563 } else {
564 let mut column = point.column as usize;
565 let bytes = line.text.as_bytes();
566 if bytes[column - 1] < 128 && bytes[column] < 128 {
567 return Point::new(point.row, column as u32);
568 }
569
570 let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
571 loop {
572 if line.is_char_boundary(column)
573 && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
574 {
575 break;
576 }
577
578 match bias {
579 Bias::Left => column -= 1,
580 Bias::Right => column += 1,
581 }
582 grapheme_cursor.set_cursor(column);
583 }
584 Point::new(point.row, column as u32)
585 }
586 }
587
588 #[inline(always)]
589 pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
590 let max_point = self.lines();
591 if point.0.row > max_point.row {
592 PointUtf16::new(max_point.row, self.last_line_len_utf16())
593 } else {
594 let line = self.slice(self.offset_range_for_row(point.0.row));
595 let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
596 PointUtf16::new(point.0.row, column.0 as u32)
597 }
598 }
599
600 #[inline(always)]
601 pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
602 if target == OffsetUtf16::default() {
603 OffsetUtf16::default()
604 } else if target >= self.len_utf16() {
605 self.len_utf16()
606 } else {
607 let mut offset = self.offset_utf16_to_offset(target);
608 while !self.text.is_char_boundary(offset) {
609 if bias == Bias::Left {
610 offset -= 1;
611 } else {
612 offset += 1;
613 }
614 }
615 self.offset_to_offset_utf16(offset)
616 }
617 }
618
619 #[inline(always)]
620 fn offset_range_for_row(&self, row: u32) -> Range<usize> {
621 let row_start = if row > 0 {
622 #[cfg(not(test))]
623 let newlines = self.newlines;
624 #[cfg(test)]
625 let newlines = self.newlines as u128;
626 nth_set_bit(newlines, row as usize) + 1
627 } else {
628 0
629 };
630 let row_len = if row_start == MAX_BASE {
631 0
632 } else {
633 cmp::min(
634 (self.newlines >> row_start).trailing_zeros(),
635 (self.text.len() - row_start) as u32,
636 )
637 };
638 row_start..row_start + row_len as usize
639 }
640
641 #[inline(always)]
642 pub fn tabs(&self) -> Tabs {
643 Tabs {
644 tabs: self.tabs,
645 chars: self.chars,
646 }
647 }
648}
649
650pub struct Tabs {
651 tabs: Bitmap,
652 chars: Bitmap,
653}
654
655#[derive(Debug, PartialEq, Eq)]
656pub struct TabPosition {
657 pub byte_offset: usize,
658 pub char_offset: usize,
659}
660
661impl Iterator for Tabs {
662 type Item = TabPosition;
663
664 fn next(&mut self) -> Option<Self::Item> {
665 if self.tabs == 0 {
666 return None;
667 }
668
669 let tab_offset = self.tabs.trailing_zeros() as usize;
670 let chars_mask = (1 << tab_offset) - 1;
671 let char_offset = (self.chars & chars_mask).count_ones() as usize;
672
673 // Since tabs are 1 byte the tab offset is the same as the byte offset
674 let position = TabPosition {
675 byte_offset: tab_offset,
676 char_offset,
677 };
678 // Remove the tab we've just seen
679 self.tabs ^= 1 << tab_offset;
680
681 Some(position)
682 }
683}
684
685/// Finds the n-th bit that is set to 1.
686#[inline(always)]
687fn nth_set_bit(v: u128, n: usize) -> usize {
688 let low = v as u64;
689 let high = (v >> 64) as u64;
690
691 let low_count = low.count_ones() as usize;
692 if n > low_count {
693 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
694 } else {
695 nth_set_bit_u64(low, n as u64) as usize
696 }
697}
698
699#[inline(always)]
700fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
701 let v = v.reverse_bits();
702 let mut s: u64 = 64;
703
704 // Parallel bit count intermediates
705 let a = v - ((v >> 1) & (u64::MAX / 3));
706 let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
707 let c = (b + (b >> 4)) & (u64::MAX / 0x11);
708 let d = (c + (c >> 8)) & (u64::MAX / 0x101);
709
710 // Branchless select
711 let t = (d >> 32) + (d >> 48);
712 s -= (t.wrapping_sub(n) & 256) >> 3;
713 n -= t & (t.wrapping_sub(n) >> 8);
714
715 let t = (d >> (s - 16)) & 0xff;
716 s -= (t.wrapping_sub(n) & 256) >> 4;
717 n -= t & (t.wrapping_sub(n) >> 8);
718
719 let t = (c >> (s - 8)) & 0xf;
720 s -= (t.wrapping_sub(n) & 256) >> 5;
721 n -= t & (t.wrapping_sub(n) >> 8);
722
723 let t = (b >> (s - 4)) & 0x7;
724 s -= (t.wrapping_sub(n) & 256) >> 6;
725 n -= t & (t.wrapping_sub(n) >> 8);
726
727 let t = (a >> (s - 2)) & 0x3;
728 s -= (t.wrapping_sub(n) & 256) >> 7;
729 n -= t & (t.wrapping_sub(n) >> 8);
730
731 let t = (v >> (s - 1)) & 0x1;
732 s -= (t.wrapping_sub(n) & 256) >> 8;
733
734 65 - s - 1
735}
736
737#[cfg(test)]
738mod tests {
739 use super::*;
740 use rand::prelude::*;
741 use util::RandomCharIter;
742
743 #[gpui::test(iterations = 100)]
744 fn test_random_chunks(mut rng: StdRng) {
745 let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
746 log::info!("Chunk: {:?}", text);
747 let chunk = Chunk::new(&text);
748 verify_chunk(chunk.as_slice(), &text);
749
750 // Verify Chunk::chars() bitmap
751 let expected_chars = char_offsets(&text)
752 .into_iter()
753 .inspect(|i| assert!(*i < MAX_BASE))
754 .fold(0 as Bitmap, |acc, i| acc | (1 << i));
755 assert_eq!(chunk.chars(), expected_chars);
756
757 for _ in 0..10 {
758 let mut start = rng.random_range(0..=chunk.text.len());
759 let mut end = rng.random_range(start..=chunk.text.len());
760 while !chunk.text.is_char_boundary(start) {
761 start -= 1;
762 }
763 while !chunk.text.is_char_boundary(end) {
764 end -= 1;
765 }
766 let range = start..end;
767 log::info!("Range: {:?}", range);
768 let text_slice = &text[range.clone()];
769 let chunk_slice = chunk.slice(range);
770 verify_chunk(chunk_slice, text_slice);
771 }
772 }
773
774 #[gpui::test(iterations = 100)]
775 fn test_split_chunk_slice(mut rng: StdRng) {
776 let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
777 let chunk = Chunk::new(text);
778 let offset = char_offsets_with_end(text)
779 .into_iter()
780 .choose(&mut rng)
781 .unwrap();
782 let (a, b) = chunk.as_slice().split_at(offset);
783 let (a_str, b_str) = text.split_at(offset);
784 verify_chunk(a, a_str);
785 verify_chunk(b, b_str);
786 }
787
788 #[gpui::test(iterations = 1000)]
789 fn test_nth_set_bit_random(mut rng: StdRng) {
790 let set_count = rng.random_range(0..=128);
791 let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
792 set_bits.sort();
793 let mut n = 0;
794 for ix in set_bits.iter().copied() {
795 n |= 1 << ix;
796 }
797
798 for (mut ix, position) in set_bits.into_iter().enumerate() {
799 ix += 1;
800 assert_eq!(
801 nth_set_bit(n, ix),
802 position,
803 "nth_set_bit({:0128b}, {})",
804 n,
805 ix
806 );
807 }
808 }
809
810 /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
811 fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
812 let mut str = String::new();
813 let mut chars = RandomCharIter::new(rng);
814 loop {
815 let ch = chars.next().unwrap();
816 if str.len() + ch.len_utf8() > len {
817 break;
818 }
819 str.push(ch);
820 }
821 str
822 }
823
824 #[gpui::test(iterations = 1000)]
825 fn test_append_random_strings(mut rng: StdRng) {
826 let len1 = rng.random_range(0..=MAX_BASE);
827 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
828 let str1 = random_string_with_utf8_len(&mut rng, len1);
829 let str2 = random_string_with_utf8_len(&mut rng, len2);
830 let mut chunk1 = Chunk::new(&str1);
831 let chunk2 = Chunk::new(&str2);
832 let char_offsets = char_offsets_with_end(&str2);
833 let start_index = rng.random_range(0..char_offsets.len());
834 let start_offset = char_offsets[start_index];
835 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
836 chunk1.append(chunk2.slice(start_offset..end_offset));
837 verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
838 }
839
840 /// Return the byte offsets for each character in a string.
841 ///
842 /// These are valid offsets to split the string.
843 fn char_offsets(text: &str) -> Vec<usize> {
844 text.char_indices().map(|(i, _c)| i).collect()
845 }
846
847 /// Return the byte offsets for each character in a string, plus the offset
848 /// past the end of the string.
849 fn char_offsets_with_end(text: &str) -> Vec<usize> {
850 let mut v = char_offsets(text);
851 v.push(text.len());
852 v
853 }
854
855 fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
856 let mut offset = 0;
857 let mut offset_utf16 = OffsetUtf16(0);
858 let mut point = Point::zero();
859 let mut point_utf16 = PointUtf16::zero();
860
861 log::info!("Verifying chunk {:?}", text);
862 assert_eq!(chunk.offset_to_point(0), Point::zero());
863
864 let mut expected_tab_positions = Vec::new();
865
866 for (char_offset, c) in text.chars().enumerate() {
867 let expected_point = chunk.offset_to_point(offset);
868 assert_eq!(point, expected_point, "mismatch at offset {}", offset);
869 assert_eq!(
870 chunk.point_to_offset(point),
871 offset,
872 "mismatch at point {:?}",
873 point
874 );
875 assert_eq!(
876 chunk.offset_to_offset_utf16(offset),
877 offset_utf16,
878 "mismatch at offset {}",
879 offset
880 );
881 assert_eq!(
882 chunk.offset_utf16_to_offset(offset_utf16),
883 offset,
884 "mismatch at offset_utf16 {:?}",
885 offset_utf16
886 );
887 assert_eq!(
888 chunk.point_to_point_utf16(point),
889 point_utf16,
890 "mismatch at point {:?}",
891 point
892 );
893 assert_eq!(
894 chunk.point_utf16_to_offset(point_utf16, false),
895 offset,
896 "mismatch at point_utf16 {:?}",
897 point_utf16
898 );
899 assert_eq!(
900 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
901 point,
902 "mismatch for unclipped_point_utf16_to_point at {:?}",
903 point_utf16
904 );
905
906 assert_eq!(
907 chunk.clip_point(point, Bias::Left),
908 point,
909 "incorrect left clip at {:?}",
910 point
911 );
912 assert_eq!(
913 chunk.clip_point(point, Bias::Right),
914 point,
915 "incorrect right clip at {:?}",
916 point
917 );
918
919 for i in 1..c.len_utf8() {
920 let test_point = Point::new(point.row, point.column + i as u32);
921 assert_eq!(
922 chunk.clip_point(test_point, Bias::Left),
923 point,
924 "incorrect left clip within multi-byte char at {:?}",
925 test_point
926 );
927 assert_eq!(
928 chunk.clip_point(test_point, Bias::Right),
929 Point::new(point.row, point.column + c.len_utf8() as u32),
930 "incorrect right clip within multi-byte char at {:?}",
931 test_point
932 );
933 }
934
935 for i in 1..c.len_utf16() {
936 let test_point = Unclipped(PointUtf16::new(
937 point_utf16.row,
938 point_utf16.column + i as u32,
939 ));
940 assert_eq!(
941 chunk.unclipped_point_utf16_to_point(test_point),
942 point,
943 "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
944 test_point
945 );
946 assert_eq!(
947 chunk.clip_point_utf16(test_point, Bias::Left),
948 point_utf16,
949 "incorrect left clip_point_utf16 within multi-byte char at {:?}",
950 test_point
951 );
952 assert_eq!(
953 chunk.clip_point_utf16(test_point, Bias::Right),
954 PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
955 "incorrect right clip_point_utf16 within multi-byte char at {:?}",
956 test_point
957 );
958
959 let test_offset = OffsetUtf16(offset_utf16.0 + i);
960 assert_eq!(
961 chunk.clip_offset_utf16(test_offset, Bias::Left),
962 offset_utf16,
963 "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
964 test_offset
965 );
966 assert_eq!(
967 chunk.clip_offset_utf16(test_offset, Bias::Right),
968 OffsetUtf16(offset_utf16.0 + c.len_utf16()),
969 "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
970 test_offset
971 );
972 }
973
974 if c == '\n' {
975 point.row += 1;
976 point.column = 0;
977 point_utf16.row += 1;
978 point_utf16.column = 0;
979 } else {
980 point.column += c.len_utf8() as u32;
981 point_utf16.column += c.len_utf16() as u32;
982 }
983
984 if c == '\t' {
985 expected_tab_positions.push(TabPosition {
986 byte_offset: offset,
987 char_offset,
988 });
989 }
990
991 offset += c.len_utf8();
992 offset_utf16.0 += c.len_utf16();
993 }
994
995 let final_point = chunk.offset_to_point(offset);
996 assert_eq!(point, final_point, "mismatch at final offset {}", offset);
997 assert_eq!(
998 chunk.point_to_offset(point),
999 offset,
1000 "mismatch at point {:?}",
1001 point
1002 );
1003 assert_eq!(
1004 chunk.offset_to_offset_utf16(offset),
1005 offset_utf16,
1006 "mismatch at offset {}",
1007 offset
1008 );
1009 assert_eq!(
1010 chunk.offset_utf16_to_offset(offset_utf16),
1011 offset,
1012 "mismatch at offset_utf16 {:?}",
1013 offset_utf16
1014 );
1015 assert_eq!(
1016 chunk.point_to_point_utf16(point),
1017 point_utf16,
1018 "mismatch at final point {:?}",
1019 point
1020 );
1021 assert_eq!(
1022 chunk.point_utf16_to_offset(point_utf16, false),
1023 offset,
1024 "mismatch at final point_utf16 {:?}",
1025 point_utf16
1026 );
1027 assert_eq!(
1028 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1029 point,
1030 "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1031 point_utf16
1032 );
1033 assert_eq!(
1034 chunk.clip_point(point, Bias::Left),
1035 point,
1036 "incorrect left clip at final point {:?}",
1037 point
1038 );
1039 assert_eq!(
1040 chunk.clip_point(point, Bias::Right),
1041 point,
1042 "incorrect right clip at final point {:?}",
1043 point
1044 );
1045 assert_eq!(
1046 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1047 point_utf16,
1048 "incorrect left clip_point_utf16 at final point {:?}",
1049 point_utf16
1050 );
1051 assert_eq!(
1052 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1053 point_utf16,
1054 "incorrect right clip_point_utf16 at final point {:?}",
1055 point_utf16
1056 );
1057 assert_eq!(
1058 chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1059 offset_utf16,
1060 "incorrect left clip_offset_utf16 at final offset {:?}",
1061 offset_utf16
1062 );
1063 assert_eq!(
1064 chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1065 offset_utf16,
1066 "incorrect right clip_offset_utf16 at final offset {:?}",
1067 offset_utf16
1068 );
1069
1070 // Verify length methods
1071 assert_eq!(chunk.len(), text.len());
1072 assert_eq!(
1073 chunk.len_utf16().0,
1074 text.chars().map(|c| c.len_utf16()).sum::<usize>()
1075 );
1076
1077 // Verify line counting
1078 let lines = chunk.lines();
1079 let mut newline_count = 0;
1080 let mut last_line_len = 0;
1081 for c in text.chars() {
1082 if c == '\n' {
1083 newline_count += 1;
1084 last_line_len = 0;
1085 } else {
1086 last_line_len += c.len_utf8() as u32;
1087 }
1088 }
1089 assert_eq!(lines, Point::new(newline_count, last_line_len));
1090
1091 // Verify first/last line chars
1092 if !text.is_empty() {
1093 let first_line = text.split('\n').next().unwrap();
1094 assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1095
1096 let last_line = text.split('\n').next_back().unwrap();
1097 assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1098 assert_eq!(
1099 chunk.last_line_len_utf16(),
1100 last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1101 );
1102 }
1103
1104 // Verify longest row
1105 let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1106 let mut max_chars = 0;
1107 let mut current_row = 0;
1108 let mut current_chars = 0;
1109 let mut max_row = 0;
1110
1111 for c in text.chars() {
1112 if c == '\n' {
1113 if current_chars > max_chars {
1114 max_chars = current_chars;
1115 max_row = current_row;
1116 }
1117 current_row += 1;
1118 current_chars = 0;
1119 } else {
1120 current_chars += 1;
1121 }
1122 }
1123
1124 if current_chars > max_chars {
1125 max_chars = current_chars;
1126 max_row = current_row;
1127 }
1128
1129 assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1130 assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1131 }
1132}