1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
2use arrayvec::ArrayString;
3use std::{cmp, ops::Range};
4use sum_tree::Bias;
5use unicode_segmentation::GraphemeCursor;
6use util::debug_panic;
7
8#[cfg(not(all(test, not(rust_analyzer))))]
9pub(crate) type Bitmap = u128;
10#[cfg(all(test, not(rust_analyzer)))]
11pub(crate) type Bitmap = u16;
12
13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
15
16#[derive(Clone, Debug, Default)]
17pub struct Chunk {
18 /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
19 /// text.
20 chars: Bitmap,
21 /// The number of set bits is the number of UTF-16 code units it would take to represent the
22 /// text.
23 ///
24 /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
25 /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
26 /// than two UTF-16 code units.)
27 chars_utf16: Bitmap,
28 /// If bit[i] is set, then the character at index i is an ascii newline.
29 newlines: Bitmap,
30 /// If bit[i] is set, then the character at index i is an ascii tab.
31 tabs: Bitmap,
32 pub text: ArrayString<MAX_BASE>,
33}
34
35#[inline(always)]
36const fn saturating_shl_mask(offset: u32) -> Bitmap {
37 (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
38}
39
40#[inline(always)]
41const fn saturating_shr_mask(offset: u32) -> Bitmap {
42 !Bitmap::MAX.unbounded_shr(offset)
43}
44
45impl Chunk {
46 pub const MASK_BITS: usize = Bitmap::BITS as usize;
47
48 #[inline(always)]
49 pub fn new(text: &str) -> Self {
50 let text = ArrayString::from(text).unwrap();
51
52 const CHUNK_SIZE: usize = 8;
53
54 let mut chars_bytes = [0; MAX_BASE / CHUNK_SIZE];
55 let mut newlines_bytes = [0; MAX_BASE / CHUNK_SIZE];
56 let mut tabs_bytes = [0; MAX_BASE / CHUNK_SIZE];
57 let mut chars_utf16_bytes = [0; MAX_BASE / CHUNK_SIZE];
58
59 let mut chunk_ix = 0;
60
61 let mut bytes = text.as_bytes();
62 while !bytes.is_empty() {
63 let (chunk, rest) = bytes.split_at(bytes.len().min(CHUNK_SIZE));
64 bytes = rest;
65
66 let mut chars = 0;
67 let mut newlines = 0;
68 let mut tabs = 0;
69 let mut chars_utf16 = 0;
70
71 for (ix, &b) in chunk.iter().enumerate() {
72 chars |= (util::is_utf8_char_boundary(b) as u8) << ix;
73 newlines |= ((b == b'\n') as u8) << ix;
74 tabs |= ((b == b'\t') as u8) << ix;
75 // b >= 240 when we are at the first byte of the 4 byte encoded
76 // utf-8 code point (U+010000 or greater) it means that it would
77 // be encoded as two 16-bit code units in utf-16
78 chars_utf16 |= ((b >= 240) as u8) << ix;
79 }
80
81 chars_bytes[chunk_ix] = chars;
82 newlines_bytes[chunk_ix] = newlines;
83 tabs_bytes[chunk_ix] = tabs;
84 chars_utf16_bytes[chunk_ix] = chars_utf16;
85
86 chunk_ix += 1;
87 }
88
89 let chars = Bitmap::from_le_bytes(chars_bytes);
90
91 Chunk {
92 text,
93 chars,
94 chars_utf16: (Bitmap::from_le_bytes(chars_utf16_bytes) << 1) | chars,
95 newlines: Bitmap::from_le_bytes(newlines_bytes),
96 tabs: Bitmap::from_le_bytes(tabs_bytes),
97 }
98 }
99
100 #[inline(always)]
101 pub fn push_str(&mut self, text: &str) {
102 self.append(Chunk::new(text).as_slice());
103 }
104
105 #[inline(always)]
106 pub fn append(&mut self, slice: ChunkSlice) {
107 if slice.is_empty() {
108 return;
109 };
110
111 let base_ix = self.text.len();
112 self.chars |= slice.chars << base_ix;
113 self.chars_utf16 |= slice.chars_utf16 << base_ix;
114 self.newlines |= slice.newlines << base_ix;
115 self.tabs |= slice.tabs << base_ix;
116 self.text.push_str(slice.text);
117 }
118
119 #[inline(always)]
120 pub fn as_slice(&self) -> ChunkSlice<'_> {
121 ChunkSlice {
122 chars: self.chars,
123 chars_utf16: self.chars_utf16,
124 newlines: self.newlines,
125 tabs: self.tabs,
126 text: &self.text,
127 }
128 }
129
130 #[inline(always)]
131 pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
132 self.as_slice().slice(range)
133 }
134
135 #[inline(always)]
136 pub fn chars(&self) -> Bitmap {
137 self.chars
138 }
139
140 pub fn tabs(&self) -> Bitmap {
141 self.tabs
142 }
143
144 #[inline(always)]
145 pub fn is_char_boundary(&self, offset: usize) -> bool {
146 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
147 }
148
149 pub fn floor_char_boundary(&self, index: usize) -> usize {
150 if index >= self.text.len() {
151 self.text.len()
152 } else {
153 let mut i = index;
154 while i > 0 {
155 if util::is_utf8_char_boundary(self.text.as_bytes()[i]) {
156 break;
157 }
158 i -= 1;
159 }
160
161 i
162 }
163 }
164
165 #[track_caller]
166 #[inline(always)]
167 pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
168 if self.is_char_boundary(offset) {
169 return true;
170 }
171 if PANIC || cfg!(debug_assertions) {
172 panic_char_boundary(&self.text, offset);
173 } else {
174 log_err_char_boundary(&self.text, offset);
175 false
176 }
177 }
178}
179
180#[derive(Clone, Copy, Debug)]
181pub struct ChunkSlice<'a> {
182 chars: Bitmap,
183 chars_utf16: Bitmap,
184 newlines: Bitmap,
185 tabs: Bitmap,
186 text: &'a str,
187}
188
189impl Into<Chunk> for ChunkSlice<'_> {
190 fn into(self) -> Chunk {
191 Chunk {
192 chars: self.chars,
193 chars_utf16: self.chars_utf16,
194 newlines: self.newlines,
195 tabs: self.tabs,
196 text: self.text.try_into().unwrap(),
197 }
198 }
199}
200
201impl<'a> ChunkSlice<'a> {
202 #[inline(always)]
203 pub fn is_empty(&self) -> bool {
204 self.text.is_empty()
205 }
206
207 #[inline(always)]
208 pub fn is_char_boundary(&self, offset: usize) -> bool {
209 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
210 }
211
212 #[inline(always)]
213 pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
214 if mid == MAX_BASE {
215 let left = self;
216 let right = ChunkSlice {
217 chars: 0,
218 chars_utf16: 0,
219 newlines: 0,
220 tabs: 0,
221 text: "",
222 };
223 (left, right)
224 } else {
225 let mask = ((1 as Bitmap) << mid) - 1;
226 let (left_text, right_text) = self.text.split_at(mid);
227 let left = ChunkSlice {
228 chars: self.chars & mask,
229 chars_utf16: self.chars_utf16 & mask,
230 newlines: self.newlines & mask,
231 tabs: self.tabs & mask,
232 text: left_text,
233 };
234 let right = ChunkSlice {
235 chars: self.chars >> mid,
236 chars_utf16: self.chars_utf16 >> mid,
237 newlines: self.newlines >> mid,
238 tabs: self.tabs >> mid,
239 text: right_text,
240 };
241 (left, right)
242 }
243 }
244
245 #[inline(always)]
246 pub fn slice(self, mut range: Range<usize>) -> Self {
247 if range.start == MAX_BASE {
248 Self {
249 chars: 0,
250 chars_utf16: 0,
251 newlines: 0,
252 tabs: 0,
253 text: "",
254 }
255 } else {
256 if !self.assert_char_boundary::<false>(range.start) {
257 range.start = self.text.ceil_char_boundary(range.start);
258 }
259 if !self.assert_char_boundary::<false>(range.end) {
260 range.end = if range.end < range.start {
261 range.start
262 } else {
263 self.text.floor_char_boundary(range.end)
264 };
265 }
266 let mask = (1 as Bitmap)
267 .unbounded_shl(range.end as u32)
268 .wrapping_sub(1);
269 Self {
270 chars: (self.chars & mask) >> range.start,
271 chars_utf16: (self.chars_utf16 & mask) >> range.start,
272 newlines: (self.newlines & mask) >> range.start,
273 tabs: (self.tabs & mask) >> range.start,
274 text: &self.text[range],
275 }
276 }
277 }
278
279 #[inline(always)]
280 pub fn text_summary(&self) -> TextSummary {
281 let mut chars = 0;
282 let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
283 TextSummary {
284 len: self.len(),
285 chars,
286 len_utf16: self.len_utf16(),
287 lines: self.lines(),
288 first_line_chars: self.first_line_chars(),
289 last_line_chars: self.last_line_chars(),
290 last_line_len_utf16: self.last_line_len_utf16(),
291 longest_row,
292 longest_row_chars,
293 }
294 }
295
296 /// Get length in bytes
297 #[inline(always)]
298 pub fn len(&self) -> usize {
299 self.text.len()
300 }
301
302 /// Get length in UTF-16 code units
303 #[inline(always)]
304 pub fn len_utf16(&self) -> OffsetUtf16 {
305 OffsetUtf16(self.chars_utf16.count_ones() as usize)
306 }
307
308 /// Get point representing number of lines and length of last line
309 #[inline(always)]
310 pub fn lines(&self) -> Point {
311 let row = self.newlines.count_ones();
312 let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
313 Point::new(row, column)
314 }
315
316 /// Get number of chars in first line
317 #[inline(always)]
318 pub fn first_line_chars(&self) -> u32 {
319 (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
320 }
321
322 /// Get number of chars in last line
323 #[inline(always)]
324 pub fn last_line_chars(&self) -> u32 {
325 (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
326 }
327
328 /// Get number of UTF-16 code units in last line
329 #[inline(always)]
330 pub fn last_line_len_utf16(&self) -> u32 {
331 (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
332 }
333
334 /// Get the longest row in the chunk and its length in characters.
335 /// Calculate the total number of characters in the chunk along the way.
336 #[inline(always)]
337 pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
338 let mut chars = self.chars;
339 let mut newlines = self.newlines;
340 *total_chars = 0;
341 let mut row = 0;
342 let mut longest_row = 0;
343 let mut longest_row_chars = 0;
344 while newlines > 0 {
345 let newline_ix = newlines.trailing_zeros();
346 let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
347 *total_chars += usize::from(row_chars);
348 if row_chars > longest_row_chars {
349 longest_row = row;
350 longest_row_chars = row_chars;
351 }
352
353 newlines >>= newline_ix;
354 newlines >>= 1;
355 chars >>= newline_ix;
356 chars >>= 1;
357 row += 1;
358 *total_chars += 1;
359 }
360
361 let row_chars = chars.count_ones() as u8;
362 *total_chars += usize::from(row_chars);
363 if row_chars > longest_row_chars {
364 (row, row_chars as u32)
365 } else {
366 (longest_row, longest_row_chars as u32)
367 }
368 }
369
370 #[inline(always)]
371 pub fn offset_to_point(&self, offset: usize) -> Point {
372 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
373 let row = (self.newlines & mask).count_ones();
374 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
375 let column = (offset - newline_ix as usize) as u32;
376 Point::new(row, column)
377 }
378
379 #[inline(always)]
380 pub fn point_to_offset(&self, point: Point) -> usize {
381 if point.row > self.lines().row {
382 debug_panic!(
383 "point {:?} extends beyond rows for string {:?}",
384 point,
385 self.text
386 );
387 return self.len();
388 }
389
390 let row_offset_range = self.offset_range_for_row(point.row);
391 if point.column > row_offset_range.len() as u32 {
392 debug_panic!(
393 "point {:?} extends beyond row for string {:?}",
394 point,
395 self.text
396 );
397 row_offset_range.end
398 } else {
399 row_offset_range.start + point.column as usize
400 }
401 }
402
403 #[track_caller]
404 #[inline(always)]
405 pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
406 if self.is_char_boundary(offset) {
407 return true;
408 }
409 if PANIC {
410 panic_char_boundary(self.text, offset);
411 } else {
412 log_err_char_boundary(self.text, offset);
413 false
414 }
415 }
416
417 pub fn floor_char_boundary(&self, index: usize) -> usize {
418 self.text.floor_char_boundary(index)
419 }
420
421 #[inline(always)]
422 pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
423 if point.row > self.lines().row {
424 debug_panic!(
425 "point {:?} extends beyond rows for string {:?}",
426 point,
427 self.text
428 );
429 return self.len_utf16();
430 }
431 self.offset_to_offset_utf16(self.point_to_offset(point))
432 }
433
434 #[inline(always)]
435 pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
436 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
437 OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
438 }
439
440 #[inline(always)]
441 pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
442 if target.0 == 0 {
443 0
444 } else {
445 #[cfg(not(test))]
446 let chars_utf16 = self.chars_utf16;
447 #[cfg(test)]
448 let chars_utf16 = self.chars_utf16 as u128;
449 let ix = nth_set_bit(chars_utf16, target.0) + 1;
450 if ix == MAX_BASE {
451 MAX_BASE
452 } else {
453 let utf8_additional_len = cmp::min(
454 (self.chars_utf16 >> ix).trailing_zeros() as usize,
455 self.text.len() - ix,
456 );
457 ix + utf8_additional_len
458 }
459 }
460 }
461
462 #[inline(always)]
463 pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
464 let mask = saturating_shl_mask(offset as u32);
465 let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
466 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
467 let column = if newline_ix as usize == MAX_BASE {
468 0
469 } else {
470 ((self.chars_utf16 & mask) >> newline_ix).count_ones()
471 };
472 PointUtf16::new(row, column)
473 }
474
475 #[inline(always)]
476 pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
477 self.offset_to_point_utf16(self.point_to_offset(point))
478 }
479
480 #[inline(always)]
481 pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
482 let lines = self.lines();
483 if point.row > lines.row {
484 if !clip {
485 debug_panic!(
486 "point {:?} is beyond this chunk's extent {:?}",
487 point,
488 self.text
489 );
490 }
491 return self.len();
492 }
493
494 let row_offset_range = self.offset_range_for_row(point.row);
495 let line = self.slice(row_offset_range.clone());
496 if point.column > line.last_line_len_utf16() {
497 if !clip {
498 debug_panic!(
499 "point {:?} is beyond the end of the line in chunk {:?}",
500 point,
501 self.text
502 );
503 }
504 return line.len();
505 }
506
507 let mut offset = row_offset_range.start;
508 if point.column > 0 {
509 offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
510 if !self.text.is_char_boundary(offset) {
511 offset -= 1;
512 while !self.text.is_char_boundary(offset) {
513 offset -= 1;
514 }
515 if !clip {
516 debug_panic!(
517 "point {:?} is within character in chunk {:?}",
518 point,
519 self.text,
520 );
521 }
522 }
523 }
524 offset
525 }
526
527 #[inline(always)]
528 pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
529 let max_point = self.lines();
530 if point.0.row > max_point.row {
531 return max_point;
532 }
533
534 let row_offset_range = self.offset_range_for_row(point.0.row);
535 let line = self.slice(row_offset_range);
536 if point.0.column == 0 {
537 Point::new(point.0.row, 0)
538 } else if point.0.column >= line.len_utf16().0 as u32 {
539 Point::new(point.0.row, line.len() as u32)
540 } else {
541 let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
542 while !line.text.is_char_boundary(column) {
543 column -= 1;
544 }
545 Point::new(point.0.row, column as u32)
546 }
547 }
548
549 #[inline(always)]
550 pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
551 let max_point = self.lines();
552 if point.row > max_point.row {
553 return max_point;
554 }
555
556 let line = self.slice(self.offset_range_for_row(point.row));
557 if point.column == 0 {
558 point
559 } else if point.column >= line.len() as u32 {
560 Point::new(point.row, line.len() as u32)
561 } else {
562 let mut column = point.column as usize;
563 let bytes = line.text.as_bytes();
564 if bytes[column - 1] < 128 && bytes[column] < 128 {
565 return Point::new(point.row, column as u32);
566 }
567
568 let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
569 loop {
570 if line.is_char_boundary(column)
571 && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
572 {
573 break;
574 }
575
576 match bias {
577 Bias::Left => column -= 1,
578 Bias::Right => column += 1,
579 }
580 grapheme_cursor.set_cursor(column);
581 }
582 Point::new(point.row, column as u32)
583 }
584 }
585
586 #[inline(always)]
587 pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
588 let max_point = self.lines();
589 if point.0.row > max_point.row {
590 PointUtf16::new(max_point.row, self.last_line_len_utf16())
591 } else {
592 let line = self.slice(self.offset_range_for_row(point.0.row));
593 let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
594 PointUtf16::new(point.0.row, column.0 as u32)
595 }
596 }
597
598 #[inline(always)]
599 pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
600 if target == OffsetUtf16::default() {
601 OffsetUtf16::default()
602 } else if target >= self.len_utf16() {
603 self.len_utf16()
604 } else {
605 let mut offset = self.offset_utf16_to_offset(target);
606 while !self.text.is_char_boundary(offset) {
607 if bias == Bias::Left {
608 offset -= 1;
609 } else {
610 offset += 1;
611 }
612 }
613 self.offset_to_offset_utf16(offset)
614 }
615 }
616
617 #[inline(always)]
618 fn offset_range_for_row(&self, row: u32) -> Range<usize> {
619 let row_start = if row > 0 {
620 #[cfg(not(test))]
621 let newlines = self.newlines;
622 #[cfg(test)]
623 let newlines = self.newlines as u128;
624 nth_set_bit(newlines, row as usize) + 1
625 } else {
626 0
627 };
628 let row_len = if row_start == MAX_BASE {
629 0
630 } else {
631 cmp::min(
632 (self.newlines >> row_start).trailing_zeros(),
633 (self.text.len() - row_start) as u32,
634 )
635 };
636 row_start..row_start + row_len as usize
637 }
638
639 #[inline(always)]
640 pub fn tabs(&self) -> Tabs {
641 Tabs {
642 tabs: self.tabs,
643 chars: self.chars,
644 }
645 }
646}
647
648pub struct Tabs {
649 tabs: Bitmap,
650 chars: Bitmap,
651}
652
653#[derive(Debug, PartialEq, Eq)]
654pub struct TabPosition {
655 pub byte_offset: usize,
656 pub char_offset: usize,
657}
658
659impl Iterator for Tabs {
660 type Item = TabPosition;
661
662 fn next(&mut self) -> Option<Self::Item> {
663 if self.tabs == 0 {
664 return None;
665 }
666
667 let tab_offset = self.tabs.trailing_zeros() as usize;
668 let chars_mask = (1 << tab_offset) - 1;
669 let char_offset = (self.chars & chars_mask).count_ones() as usize;
670
671 // Since tabs are 1 byte the tab offset is the same as the byte offset
672 let position = TabPosition {
673 byte_offset: tab_offset,
674 char_offset,
675 };
676 // Remove the tab we've just seen
677 self.tabs ^= 1 << tab_offset;
678
679 Some(position)
680 }
681}
682
683/// Finds the n-th bit that is set to 1.
684#[inline(always)]
685fn nth_set_bit(v: u128, n: usize) -> usize {
686 let low = v as u64;
687 let high = (v >> 64) as u64;
688
689 let low_count = low.count_ones() as usize;
690 if n > low_count {
691 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
692 } else {
693 nth_set_bit_u64(low, n as u64) as usize
694 }
695}
696
697#[cold]
698#[inline(never)]
699#[track_caller]
700fn panic_char_boundary(text: &str, offset: usize) -> ! {
701 if offset > text.len() {
702 panic!(
703 "byte index {} is out of bounds of `{:?}` (length: {})",
704 offset,
705 text,
706 text.len()
707 );
708 }
709 // find the character
710 let char_start = text.floor_char_boundary(offset);
711 // `char_start` must be less than len and a char boundary
712 let ch = text.get(char_start..).unwrap().chars().next().unwrap();
713 let char_range = char_start..char_start + ch.len_utf8();
714 panic!(
715 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
716 offset, ch, char_range,
717 );
718}
719
720#[cold]
721#[inline(never)]
722#[track_caller]
723fn log_err_char_boundary(text: &str, offset: usize) {
724 if offset >= text.len() {
725 log::error!(
726 "byte index {} is out of bounds of `{:?}` (length: {})",
727 offset,
728 text,
729 text.len()
730 );
731 return;
732 }
733 // find the character
734 let char_start = text.floor_char_boundary(offset);
735 // `char_start` must be less than len and a char boundary
736 let ch = text.get(char_start..).unwrap().chars().next().unwrap();
737 let char_range = char_start..char_start + ch.len_utf8();
738 log::error!(
739 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
740 offset,
741 ch,
742 char_range,
743 );
744}
745
746#[inline(always)]
747fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
748 let v = v.reverse_bits();
749 let mut s: u64 = 64;
750
751 // Parallel bit count intermediates
752 let a = v - ((v >> 1) & (u64::MAX / 3));
753 let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
754 let c = (b + (b >> 4)) & (u64::MAX / 0x11);
755 let d = (c + (c >> 8)) & (u64::MAX / 0x101);
756
757 // Branchless select
758 let t = (d >> 32) + (d >> 48);
759 s -= (t.wrapping_sub(n) & 256) >> 3;
760 n -= t & (t.wrapping_sub(n) >> 8);
761
762 let t = (d >> (s - 16)) & 0xff;
763 s -= (t.wrapping_sub(n) & 256) >> 4;
764 n -= t & (t.wrapping_sub(n) >> 8);
765
766 let t = (c >> (s - 8)) & 0xf;
767 s -= (t.wrapping_sub(n) & 256) >> 5;
768 n -= t & (t.wrapping_sub(n) >> 8);
769
770 let t = (b >> (s - 4)) & 0x7;
771 s -= (t.wrapping_sub(n) & 256) >> 6;
772 n -= t & (t.wrapping_sub(n) >> 8);
773
774 let t = (a >> (s - 2)) & 0x3;
775 s -= (t.wrapping_sub(n) & 256) >> 7;
776 n -= t & (t.wrapping_sub(n) >> 8);
777
778 let t = (v >> (s - 1)) & 0x1;
779 s -= (t.wrapping_sub(n) & 256) >> 8;
780
781 65 - s - 1
782}
783
784#[cfg(test)]
785mod tests {
786 use super::*;
787 use rand::prelude::*;
788 use util::RandomCharIter;
789
790 #[gpui::test(iterations = 100)]
791 fn test_random_chunks(mut rng: StdRng) {
792 let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
793 log::info!("Chunk: {:?}", text);
794 let chunk = Chunk::new(&text);
795 verify_chunk(chunk.as_slice(), &text);
796
797 // Verify Chunk::chars() bitmap
798 let expected_chars = char_offsets(&text)
799 .into_iter()
800 .inspect(|i| assert!(*i < MAX_BASE))
801 .fold(0 as Bitmap, |acc, i| acc | (1 << i));
802 assert_eq!(chunk.chars(), expected_chars);
803
804 for _ in 0..10 {
805 let mut start = rng.random_range(0..=chunk.text.len());
806 let mut end = rng.random_range(start..=chunk.text.len());
807 while !chunk.text.is_char_boundary(start) {
808 start -= 1;
809 }
810 while !chunk.text.is_char_boundary(end) {
811 end -= 1;
812 }
813 let range = start..end;
814 log::info!("Range: {:?}", range);
815 let text_slice = &text[range.clone()];
816 let chunk_slice = chunk.slice(range);
817 verify_chunk(chunk_slice, text_slice);
818 }
819 }
820
821 #[gpui::test(iterations = 100)]
822 fn test_split_chunk_slice(mut rng: StdRng) {
823 let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
824 let chunk = Chunk::new(text);
825 let offset = char_offsets_with_end(text)
826 .into_iter()
827 .choose(&mut rng)
828 .unwrap();
829 let (a, b) = chunk.as_slice().split_at(offset);
830 let (a_str, b_str) = text.split_at(offset);
831 verify_chunk(a, a_str);
832 verify_chunk(b, b_str);
833 }
834
835 #[gpui::test(iterations = 1000)]
836 fn test_nth_set_bit_random(mut rng: StdRng) {
837 let set_count = rng.random_range(0..=128);
838 let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
839 set_bits.sort();
840 let mut n = 0;
841 for ix in set_bits.iter().copied() {
842 n |= 1 << ix;
843 }
844
845 for (mut ix, position) in set_bits.into_iter().enumerate() {
846 ix += 1;
847 assert_eq!(
848 nth_set_bit(n, ix),
849 position,
850 "nth_set_bit({:0128b}, {})",
851 n,
852 ix
853 );
854 }
855 }
856
857 /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
858 fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
859 let mut str = String::new();
860 let mut chars = RandomCharIter::new(rng);
861 loop {
862 let ch = chars.next().unwrap();
863 if str.len() + ch.len_utf8() > len {
864 break;
865 }
866 str.push(ch);
867 }
868 str
869 }
870
871 #[gpui::test(iterations = 1000)]
872 fn test_append_random_strings(mut rng: StdRng) {
873 let len1 = rng.random_range(0..=MAX_BASE);
874 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
875 let str1 = random_string_with_utf8_len(&mut rng, len1);
876 let str2 = random_string_with_utf8_len(&mut rng, len2);
877 let mut chunk1 = Chunk::new(&str1);
878 let chunk2 = Chunk::new(&str2);
879 let char_offsets = char_offsets_with_end(&str2);
880 let start_index = rng.random_range(0..char_offsets.len());
881 let start_offset = char_offsets[start_index];
882 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
883 chunk1.append(chunk2.slice(start_offset..end_offset));
884 verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
885 }
886
887 /// Return the byte offsets for each character in a string.
888 ///
889 /// These are valid offsets to split the string.
890 fn char_offsets(text: &str) -> Vec<usize> {
891 text.char_indices().map(|(i, _c)| i).collect()
892 }
893
894 /// Return the byte offsets for each character in a string, plus the offset
895 /// past the end of the string.
896 fn char_offsets_with_end(text: &str) -> Vec<usize> {
897 let mut v = char_offsets(text);
898 v.push(text.len());
899 v
900 }
901
902 fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
903 let mut offset = 0;
904 let mut offset_utf16 = OffsetUtf16(0);
905 let mut point = Point::zero();
906 let mut point_utf16 = PointUtf16::zero();
907
908 log::info!("Verifying chunk {:?}", text);
909 assert_eq!(chunk.offset_to_point(0), Point::zero());
910
911 let mut expected_tab_positions = Vec::new();
912
913 for (char_offset, c) in text.chars().enumerate() {
914 let expected_point = chunk.offset_to_point(offset);
915 assert_eq!(point, expected_point, "mismatch at offset {}", offset);
916 assert_eq!(
917 chunk.point_to_offset(point),
918 offset,
919 "mismatch at point {:?}",
920 point
921 );
922 assert_eq!(
923 chunk.offset_to_offset_utf16(offset),
924 offset_utf16,
925 "mismatch at offset {}",
926 offset
927 );
928 assert_eq!(
929 chunk.offset_utf16_to_offset(offset_utf16),
930 offset,
931 "mismatch at offset_utf16 {:?}",
932 offset_utf16
933 );
934 assert_eq!(
935 chunk.point_to_point_utf16(point),
936 point_utf16,
937 "mismatch at point {:?}",
938 point
939 );
940 assert_eq!(
941 chunk.point_utf16_to_offset(point_utf16, false),
942 offset,
943 "mismatch at point_utf16 {:?}",
944 point_utf16
945 );
946 assert_eq!(
947 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
948 point,
949 "mismatch for unclipped_point_utf16_to_point at {:?}",
950 point_utf16
951 );
952
953 assert_eq!(
954 chunk.clip_point(point, Bias::Left),
955 point,
956 "incorrect left clip at {:?}",
957 point
958 );
959 assert_eq!(
960 chunk.clip_point(point, Bias::Right),
961 point,
962 "incorrect right clip at {:?}",
963 point
964 );
965
966 for i in 1..c.len_utf8() {
967 let test_point = Point::new(point.row, point.column + i as u32);
968 assert_eq!(
969 chunk.clip_point(test_point, Bias::Left),
970 point,
971 "incorrect left clip within multi-byte char at {:?}",
972 test_point
973 );
974 assert_eq!(
975 chunk.clip_point(test_point, Bias::Right),
976 Point::new(point.row, point.column + c.len_utf8() as u32),
977 "incorrect right clip within multi-byte char at {:?}",
978 test_point
979 );
980 }
981
982 for i in 1..c.len_utf16() {
983 let test_point = Unclipped(PointUtf16::new(
984 point_utf16.row,
985 point_utf16.column + i as u32,
986 ));
987 assert_eq!(
988 chunk.unclipped_point_utf16_to_point(test_point),
989 point,
990 "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
991 test_point
992 );
993 assert_eq!(
994 chunk.clip_point_utf16(test_point, Bias::Left),
995 point_utf16,
996 "incorrect left clip_point_utf16 within multi-byte char at {:?}",
997 test_point
998 );
999 assert_eq!(
1000 chunk.clip_point_utf16(test_point, Bias::Right),
1001 PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
1002 "incorrect right clip_point_utf16 within multi-byte char at {:?}",
1003 test_point
1004 );
1005
1006 let test_offset = OffsetUtf16(offset_utf16.0 + i);
1007 assert_eq!(
1008 chunk.clip_offset_utf16(test_offset, Bias::Left),
1009 offset_utf16,
1010 "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
1011 test_offset
1012 );
1013 assert_eq!(
1014 chunk.clip_offset_utf16(test_offset, Bias::Right),
1015 OffsetUtf16(offset_utf16.0 + c.len_utf16()),
1016 "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
1017 test_offset
1018 );
1019 }
1020
1021 if c == '\n' {
1022 point.row += 1;
1023 point.column = 0;
1024 point_utf16.row += 1;
1025 point_utf16.column = 0;
1026 } else {
1027 point.column += c.len_utf8() as u32;
1028 point_utf16.column += c.len_utf16() as u32;
1029 }
1030
1031 if c == '\t' {
1032 expected_tab_positions.push(TabPosition {
1033 byte_offset: offset,
1034 char_offset,
1035 });
1036 }
1037
1038 offset += c.len_utf8();
1039 offset_utf16.0 += c.len_utf16();
1040 }
1041
1042 let final_point = chunk.offset_to_point(offset);
1043 assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1044 assert_eq!(
1045 chunk.point_to_offset(point),
1046 offset,
1047 "mismatch at point {:?}",
1048 point
1049 );
1050 assert_eq!(
1051 chunk.offset_to_offset_utf16(offset),
1052 offset_utf16,
1053 "mismatch at offset {}",
1054 offset
1055 );
1056 assert_eq!(
1057 chunk.offset_utf16_to_offset(offset_utf16),
1058 offset,
1059 "mismatch at offset_utf16 {:?}",
1060 offset_utf16
1061 );
1062 assert_eq!(
1063 chunk.point_to_point_utf16(point),
1064 point_utf16,
1065 "mismatch at final point {:?}",
1066 point
1067 );
1068 assert_eq!(
1069 chunk.point_utf16_to_offset(point_utf16, false),
1070 offset,
1071 "mismatch at final point_utf16 {:?}",
1072 point_utf16
1073 );
1074 assert_eq!(
1075 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1076 point,
1077 "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1078 point_utf16
1079 );
1080 assert_eq!(
1081 chunk.clip_point(point, Bias::Left),
1082 point,
1083 "incorrect left clip at final point {:?}",
1084 point
1085 );
1086 assert_eq!(
1087 chunk.clip_point(point, Bias::Right),
1088 point,
1089 "incorrect right clip at final point {:?}",
1090 point
1091 );
1092 assert_eq!(
1093 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1094 point_utf16,
1095 "incorrect left clip_point_utf16 at final point {:?}",
1096 point_utf16
1097 );
1098 assert_eq!(
1099 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1100 point_utf16,
1101 "incorrect right clip_point_utf16 at final point {:?}",
1102 point_utf16
1103 );
1104 assert_eq!(
1105 chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1106 offset_utf16,
1107 "incorrect left clip_offset_utf16 at final offset {:?}",
1108 offset_utf16
1109 );
1110 assert_eq!(
1111 chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1112 offset_utf16,
1113 "incorrect right clip_offset_utf16 at final offset {:?}",
1114 offset_utf16
1115 );
1116
1117 // Verify length methods
1118 assert_eq!(chunk.len(), text.len());
1119 assert_eq!(
1120 chunk.len_utf16().0,
1121 text.chars().map(|c| c.len_utf16()).sum::<usize>()
1122 );
1123
1124 // Verify line counting
1125 let lines = chunk.lines();
1126 let mut newline_count = 0;
1127 let mut last_line_len = 0;
1128 for c in text.chars() {
1129 if c == '\n' {
1130 newline_count += 1;
1131 last_line_len = 0;
1132 } else {
1133 last_line_len += c.len_utf8() as u32;
1134 }
1135 }
1136 assert_eq!(lines, Point::new(newline_count, last_line_len));
1137
1138 // Verify first/last line chars
1139 if !text.is_empty() {
1140 let first_line = text.split('\n').next().unwrap();
1141 assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1142
1143 let last_line = text.split('\n').next_back().unwrap();
1144 assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1145 assert_eq!(
1146 chunk.last_line_len_utf16(),
1147 last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1148 );
1149 }
1150
1151 // Verify longest row
1152 let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1153 let mut max_chars = 0;
1154 let mut current_row = 0;
1155 let mut current_chars = 0;
1156 let mut max_row = 0;
1157
1158 for c in text.chars() {
1159 if c == '\n' {
1160 if current_chars > max_chars {
1161 max_chars = current_chars;
1162 max_row = current_row;
1163 }
1164 current_row += 1;
1165 current_chars = 0;
1166 } else {
1167 current_chars += 1;
1168 }
1169 }
1170
1171 if current_chars > max_chars {
1172 max_chars = current_chars;
1173 max_row = current_row;
1174 }
1175
1176 assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1177 assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1178 }
1179}