1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
2use arrayvec::ArrayString;
3use std::{cmp, ops::Range};
4use sum_tree::Bias;
5use unicode_segmentation::GraphemeCursor;
6use util::debug_panic;
7
8#[cfg(not(all(test, not(rust_analyzer))))]
9pub(crate) type Bitmap = u128;
10#[cfg(all(test, not(rust_analyzer)))]
11pub(crate) type Bitmap = u16;
12
13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
15
16#[derive(Clone, Debug, Default)]
17pub struct Chunk {
18 /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
19 /// text.
20 chars: Bitmap,
21 /// The number of set bits is the number of UTF-16 code units it would take to represent the
22 /// text.
23 ///
24 /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
25 /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
26 /// than two UTF-16 code units.)
27 chars_utf16: Bitmap,
28 /// If bit[i] is set, then the character at index i is an ascii newline.
29 newlines: Bitmap,
30 /// If bit[i] is set, then the character at index i is an ascii tab.
31 tabs: Bitmap,
32 pub text: ArrayString<MAX_BASE>,
33}
34
35#[inline(always)]
36const fn saturating_shl_mask(offset: u32) -> Bitmap {
37 (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
38}
39
40#[inline(always)]
41const fn saturating_shr_mask(offset: u32) -> Bitmap {
42 !Bitmap::MAX.unbounded_shr(offset)
43}
44
45impl Chunk {
46 pub const MASK_BITS: usize = Bitmap::BITS as usize;
47
48 #[inline(always)]
49 pub fn new(text: &str) -> Self {
50 let text = ArrayString::from(text).unwrap();
51
52 const CHUNK_SIZE: usize = 8;
53
54 let mut chars_bytes = [0; MAX_BASE / CHUNK_SIZE];
55 let mut newlines_bytes = [0; MAX_BASE / CHUNK_SIZE];
56 let mut tabs_bytes = [0; MAX_BASE / CHUNK_SIZE];
57 let mut chars_utf16_bytes = [0; MAX_BASE / CHUNK_SIZE];
58
59 let mut chunk_ix = 0;
60
61 let mut bytes = text.as_bytes();
62 while !bytes.is_empty() {
63 let (chunk, rest) = bytes.split_at(bytes.len().min(CHUNK_SIZE));
64 bytes = rest;
65
66 let mut chars = 0;
67 let mut newlines = 0;
68 let mut tabs = 0;
69 let mut chars_utf16 = 0;
70
71 for (ix, &b) in chunk.iter().enumerate() {
72 chars |= (util::is_utf8_char_boundary(b) as u8) << ix;
73 newlines |= ((b == b'\n') as u8) << ix;
74 tabs |= ((b == b'\t') as u8) << ix;
75 // b >= 240 when we are at the first byte of the 4 byte encoded
76 // utf-8 code point (U+010000 or greater) it means that it would
77 // be encoded as two 16-bit code units in utf-16
78 chars_utf16 |= ((b >= 240) as u8) << ix;
79 }
80
81 chars_bytes[chunk_ix] = chars;
82 newlines_bytes[chunk_ix] = newlines;
83 tabs_bytes[chunk_ix] = tabs;
84 chars_utf16_bytes[chunk_ix] = chars_utf16;
85
86 chunk_ix += 1;
87 }
88
89 let chars = Bitmap::from_le_bytes(chars_bytes);
90
91 Chunk {
92 text,
93 chars,
94 chars_utf16: (Bitmap::from_le_bytes(chars_utf16_bytes) << 1) | chars,
95 newlines: Bitmap::from_le_bytes(newlines_bytes),
96 tabs: Bitmap::from_le_bytes(tabs_bytes),
97 }
98 }
99
100 #[inline(always)]
101 pub fn push_str(&mut self, text: &str) {
102 self.append(Chunk::new(text).as_slice());
103 }
104
105 #[inline(always)]
106 pub fn append(&mut self, slice: ChunkSlice) {
107 if slice.is_empty() {
108 return;
109 };
110
111 let base_ix = self.text.len();
112 self.chars |= slice.chars << base_ix;
113 self.chars_utf16 |= slice.chars_utf16 << base_ix;
114 self.newlines |= slice.newlines << base_ix;
115 self.tabs |= slice.tabs << base_ix;
116 self.text.push_str(slice.text);
117 }
118
119 #[inline(always)]
120 pub fn as_slice(&self) -> ChunkSlice<'_> {
121 ChunkSlice {
122 chars: self.chars,
123 chars_utf16: self.chars_utf16,
124 newlines: self.newlines,
125 tabs: self.tabs,
126 text: &self.text,
127 }
128 }
129
130 #[inline(always)]
131 pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
132 self.as_slice().slice(range)
133 }
134
135 #[inline(always)]
136 pub fn chars(&self) -> Bitmap {
137 self.chars
138 }
139
140 #[inline(always)]
141 pub fn tabs(&self) -> Bitmap {
142 self.tabs
143 }
144
145 #[inline(always)]
146 pub fn newlines(&self) -> Bitmap {
147 self.newlines
148 }
149
150 #[inline(always)]
151 pub fn is_char_boundary(&self, offset: usize) -> bool {
152 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
153 }
154
155 pub fn floor_char_boundary(&self, index: usize) -> usize {
156 if index >= self.text.len() {
157 self.text.len()
158 } else {
159 let mut i = index;
160 while i > 0 {
161 if util::is_utf8_char_boundary(self.text.as_bytes()[i]) {
162 break;
163 }
164 i -= 1;
165 }
166
167 i
168 }
169 }
170
171 #[track_caller]
172 #[inline(always)]
173 pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
174 if self.is_char_boundary(offset) {
175 return true;
176 }
177 if PANIC || cfg!(debug_assertions) {
178 panic_char_boundary(&self.text, offset);
179 } else {
180 log_err_char_boundary(&self.text, offset);
181 false
182 }
183 }
184}
185
186#[derive(Clone, Copy, Debug)]
187pub struct ChunkSlice<'a> {
188 chars: Bitmap,
189 chars_utf16: Bitmap,
190 newlines: Bitmap,
191 tabs: Bitmap,
192 text: &'a str,
193}
194
195impl Into<Chunk> for ChunkSlice<'_> {
196 fn into(self) -> Chunk {
197 Chunk {
198 chars: self.chars,
199 chars_utf16: self.chars_utf16,
200 newlines: self.newlines,
201 tabs: self.tabs,
202 text: self.text.try_into().unwrap(),
203 }
204 }
205}
206
207impl<'a> ChunkSlice<'a> {
208 #[inline(always)]
209 pub fn is_empty(&self) -> bool {
210 self.text.is_empty()
211 }
212
213 #[inline(always)]
214 pub fn is_char_boundary(&self, offset: usize) -> bool {
215 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
216 }
217
218 #[inline(always)]
219 pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
220 if mid == MAX_BASE {
221 let left = self;
222 let right = ChunkSlice {
223 chars: 0,
224 chars_utf16: 0,
225 newlines: 0,
226 tabs: 0,
227 text: "",
228 };
229 (left, right)
230 } else {
231 let mask = ((1 as Bitmap) << mid) - 1;
232 let (left_text, right_text) = self.text.split_at(mid);
233 let left = ChunkSlice {
234 chars: self.chars & mask,
235 chars_utf16: self.chars_utf16 & mask,
236 newlines: self.newlines & mask,
237 tabs: self.tabs & mask,
238 text: left_text,
239 };
240 let right = ChunkSlice {
241 chars: self.chars >> mid,
242 chars_utf16: self.chars_utf16 >> mid,
243 newlines: self.newlines >> mid,
244 tabs: self.tabs >> mid,
245 text: right_text,
246 };
247 (left, right)
248 }
249 }
250
251 #[inline(always)]
252 pub fn slice(self, mut range: Range<usize>) -> Self {
253 if range.start == MAX_BASE {
254 Self {
255 chars: 0,
256 chars_utf16: 0,
257 newlines: 0,
258 tabs: 0,
259 text: "",
260 }
261 } else {
262 if !self.assert_char_boundary::<false>(range.start) {
263 range.start = self.text.ceil_char_boundary(range.start);
264 }
265 if !self.assert_char_boundary::<false>(range.end) {
266 range.end = if range.end < range.start {
267 range.start
268 } else {
269 self.text.floor_char_boundary(range.end)
270 };
271 }
272 let mask = (1 as Bitmap)
273 .unbounded_shl(range.end as u32)
274 .wrapping_sub(1);
275 Self {
276 chars: (self.chars & mask) >> range.start,
277 chars_utf16: (self.chars_utf16 & mask) >> range.start,
278 newlines: (self.newlines & mask) >> range.start,
279 tabs: (self.tabs & mask) >> range.start,
280 text: &self.text[range],
281 }
282 }
283 }
284
285 #[inline(always)]
286 pub fn text_summary(&self) -> TextSummary {
287 let mut chars = 0;
288 let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
289 TextSummary {
290 len: self.len(),
291 chars,
292 len_utf16: self.len_utf16(),
293 lines: self.lines(),
294 first_line_chars: self.first_line_chars(),
295 last_line_chars: self.last_line_chars(),
296 last_line_len_utf16: self.last_line_len_utf16(),
297 longest_row,
298 longest_row_chars,
299 }
300 }
301
302 /// Get length in bytes
303 #[inline(always)]
304 pub fn len(&self) -> usize {
305 self.text.len()
306 }
307
308 /// Get length in UTF-16 code units
309 #[inline(always)]
310 pub fn len_utf16(&self) -> OffsetUtf16 {
311 OffsetUtf16(self.chars_utf16.count_ones() as usize)
312 }
313
314 /// Get point representing number of lines and length of last line
315 #[inline(always)]
316 pub fn lines(&self) -> Point {
317 let row = self.newlines.count_ones();
318 let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
319 Point::new(row, column)
320 }
321
322 /// Get number of chars in first line
323 #[inline(always)]
324 pub fn first_line_chars(&self) -> u32 {
325 (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
326 }
327
328 /// Get number of chars in last line
329 #[inline(always)]
330 pub fn last_line_chars(&self) -> u32 {
331 (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
332 }
333
334 /// Get number of UTF-16 code units in last line
335 #[inline(always)]
336 pub fn last_line_len_utf16(&self) -> u32 {
337 (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
338 }
339
340 /// Get the longest row in the chunk and its length in characters.
341 /// Calculate the total number of characters in the chunk along the way.
342 #[inline(always)]
343 pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
344 let mut chars = self.chars;
345 let mut newlines = self.newlines;
346 *total_chars = 0;
347 let mut row = 0;
348 let mut longest_row = 0;
349 let mut longest_row_chars = 0;
350 while newlines > 0 {
351 let newline_ix = newlines.trailing_zeros();
352 let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
353 *total_chars += usize::from(row_chars);
354 if row_chars > longest_row_chars {
355 longest_row = row;
356 longest_row_chars = row_chars;
357 }
358
359 newlines >>= newline_ix;
360 newlines >>= 1;
361 chars >>= newline_ix;
362 chars >>= 1;
363 row += 1;
364 *total_chars += 1;
365 }
366
367 let row_chars = chars.count_ones() as u8;
368 *total_chars += usize::from(row_chars);
369 if row_chars > longest_row_chars {
370 (row, row_chars as u32)
371 } else {
372 (longest_row, longest_row_chars as u32)
373 }
374 }
375
376 #[inline(always)]
377 pub fn offset_to_point(&self, offset: usize) -> Point {
378 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
379 let row = (self.newlines & mask).count_ones();
380 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
381 let column = (offset - newline_ix as usize) as u32;
382 Point::new(row, column)
383 }
384
385 #[inline(always)]
386 pub fn point_to_offset(&self, point: Point) -> usize {
387 if point.row > self.lines().row {
388 debug_panic!(
389 "point {:?} extends beyond rows for string {:?}",
390 point,
391 self.text
392 );
393 return self.len();
394 }
395
396 let row_offset_range = self.offset_range_for_row(point.row);
397 if point.column > row_offset_range.len() as u32 {
398 debug_panic!(
399 "point {:?} extends beyond row for string {:?}",
400 point,
401 self.text
402 );
403 row_offset_range.end
404 } else {
405 row_offset_range.start + point.column as usize
406 }
407 }
408
409 #[track_caller]
410 #[inline(always)]
411 pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
412 if self.is_char_boundary(offset) {
413 return true;
414 }
415 if PANIC {
416 panic_char_boundary(self.text, offset);
417 } else {
418 log_err_char_boundary(self.text, offset);
419 false
420 }
421 }
422
423 pub fn floor_char_boundary(&self, index: usize) -> usize {
424 self.text.floor_char_boundary(index)
425 }
426
427 #[inline(always)]
428 pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
429 if point.row > self.lines().row {
430 debug_panic!(
431 "point {:?} extends beyond rows for string {:?}",
432 point,
433 self.text
434 );
435 return self.len_utf16();
436 }
437 self.offset_to_offset_utf16(self.point_to_offset(point))
438 }
439
440 #[inline(always)]
441 pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
442 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
443 OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
444 }
445
446 #[inline(always)]
447 pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
448 if target.0 == 0 {
449 0
450 } else {
451 #[cfg(not(test))]
452 let chars_utf16 = self.chars_utf16;
453 #[cfg(test)]
454 let chars_utf16 = self.chars_utf16 as u128;
455 let ix = nth_set_bit(chars_utf16, target.0) + 1;
456 if ix == MAX_BASE {
457 MAX_BASE
458 } else {
459 let utf8_additional_len = cmp::min(
460 (self.chars_utf16 >> ix).trailing_zeros() as usize,
461 self.text.len() - ix,
462 );
463 ix + utf8_additional_len
464 }
465 }
466 }
467
468 #[inline(always)]
469 pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
470 let mask = saturating_shl_mask(offset as u32);
471 let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
472 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
473 let column = if newline_ix as usize == MAX_BASE {
474 0
475 } else {
476 ((self.chars_utf16 & mask) >> newline_ix).count_ones()
477 };
478 PointUtf16::new(row, column)
479 }
480
481 #[inline(always)]
482 pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
483 self.offset_to_point_utf16(self.point_to_offset(point))
484 }
485
486 #[inline(always)]
487 pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
488 let lines = self.lines();
489 if point.row > lines.row {
490 if !clip {
491 debug_panic!(
492 "point {:?} is beyond this chunk's extent {:?}",
493 point,
494 self.text
495 );
496 }
497 return self.len();
498 }
499
500 let row_offset_range = self.offset_range_for_row(point.row);
501 let line = self.slice(row_offset_range.clone());
502 if point.column > line.last_line_len_utf16() {
503 if !clip {
504 debug_panic!(
505 "point {:?} is beyond the end of the line in chunk {:?}",
506 point,
507 self.text
508 );
509 }
510 return row_offset_range.end;
511 }
512
513 let mut offset = row_offset_range.start;
514 if point.column > 0 {
515 offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
516 if !self.text.is_char_boundary(offset) {
517 offset -= 1;
518 while !self.text.is_char_boundary(offset) {
519 offset -= 1;
520 }
521 if !clip {
522 debug_panic!(
523 "point {:?} is within character in chunk {:?}",
524 point,
525 self.text,
526 );
527 }
528 }
529 }
530 offset
531 }
532
533 #[inline(always)]
534 pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
535 let max_point = self.lines();
536 if point.0.row > max_point.row {
537 return max_point;
538 }
539
540 let row_offset_range = self.offset_range_for_row(point.0.row);
541 let line = self.slice(row_offset_range);
542 if point.0.column == 0 {
543 Point::new(point.0.row, 0)
544 } else if point.0.column >= line.len_utf16().0 as u32 {
545 Point::new(point.0.row, line.len() as u32)
546 } else {
547 let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
548 while !line.text.is_char_boundary(column) {
549 column -= 1;
550 }
551 Point::new(point.0.row, column as u32)
552 }
553 }
554
555 #[inline(always)]
556 pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
557 let max_point = self.lines();
558 if point.row > max_point.row {
559 return max_point;
560 }
561
562 let line = self.slice(self.offset_range_for_row(point.row));
563 if point.column == 0 {
564 point
565 } else if point.column >= line.len() as u32 {
566 Point::new(point.row, line.len() as u32)
567 } else {
568 let mut column = point.column as usize;
569 let bytes = line.text.as_bytes();
570 if bytes[column - 1] < 128 && bytes[column] < 128 {
571 return Point::new(point.row, column as u32);
572 }
573
574 let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
575 loop {
576 if line.is_char_boundary(column)
577 && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
578 {
579 break;
580 }
581
582 match bias {
583 Bias::Left => column -= 1,
584 Bias::Right => column += 1,
585 }
586 grapheme_cursor.set_cursor(column);
587 }
588 Point::new(point.row, column as u32)
589 }
590 }
591
592 #[inline(always)]
593 pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
594 let max_point = self.lines();
595 if point.0.row > max_point.row {
596 PointUtf16::new(max_point.row, self.last_line_len_utf16())
597 } else {
598 let line = self.slice(self.offset_range_for_row(point.0.row));
599 let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
600 PointUtf16::new(point.0.row, column.0 as u32)
601 }
602 }
603
604 #[inline(always)]
605 pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
606 if target == OffsetUtf16::default() {
607 OffsetUtf16::default()
608 } else if target >= self.len_utf16() {
609 self.len_utf16()
610 } else {
611 let mut offset = self.offset_utf16_to_offset(target);
612 while !self.text.is_char_boundary(offset) {
613 if bias == Bias::Left {
614 offset -= 1;
615 } else {
616 offset += 1;
617 }
618 }
619 self.offset_to_offset_utf16(offset)
620 }
621 }
622
623 #[inline(always)]
624 fn offset_range_for_row(&self, row: u32) -> Range<usize> {
625 let row_start = if row > 0 {
626 #[cfg(not(test))]
627 let newlines = self.newlines;
628 #[cfg(test)]
629 let newlines = self.newlines as u128;
630 nth_set_bit(newlines, row as usize) + 1
631 } else {
632 0
633 };
634 let row_len = if row_start == MAX_BASE {
635 0
636 } else {
637 cmp::min(
638 (self.newlines >> row_start).trailing_zeros(),
639 (self.text.len() - row_start) as u32,
640 )
641 };
642 row_start..row_start + row_len as usize
643 }
644
645 #[inline(always)]
646 pub fn tabs(&self) -> Tabs {
647 Tabs {
648 tabs: self.tabs,
649 chars: self.chars,
650 }
651 }
652}
653
654pub struct Tabs {
655 tabs: Bitmap,
656 chars: Bitmap,
657}
658
659#[derive(Debug, PartialEq, Eq)]
660pub struct TabPosition {
661 pub byte_offset: usize,
662 pub char_offset: usize,
663}
664
665impl Iterator for Tabs {
666 type Item = TabPosition;
667
668 fn next(&mut self) -> Option<Self::Item> {
669 if self.tabs == 0 {
670 return None;
671 }
672
673 let tab_offset = self.tabs.trailing_zeros() as usize;
674 let chars_mask = (1 << tab_offset) - 1;
675 let char_offset = (self.chars & chars_mask).count_ones() as usize;
676
677 // Since tabs are 1 byte the tab offset is the same as the byte offset
678 let position = TabPosition {
679 byte_offset: tab_offset,
680 char_offset,
681 };
682 // Remove the tab we've just seen
683 self.tabs ^= 1 << tab_offset;
684
685 Some(position)
686 }
687}
688
689/// Finds the n-th bit that is set to 1.
690#[inline(always)]
691fn nth_set_bit(v: u128, n: usize) -> usize {
692 let low = v as u64;
693 let high = (v >> 64) as u64;
694
695 let low_count = low.count_ones() as usize;
696 if n > low_count {
697 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
698 } else {
699 nth_set_bit_u64(low, n as u64) as usize
700 }
701}
702
703#[cold]
704#[inline(never)]
705#[track_caller]
706fn panic_char_boundary(text: &str, offset: usize) -> ! {
707 if offset > text.len() {
708 panic!(
709 "byte index {} is out of bounds of `{:?}` (length: {})",
710 offset,
711 text,
712 text.len()
713 );
714 }
715 // find the character
716 let char_start = text.floor_char_boundary(offset);
717 // `char_start` must be less than len and a char boundary
718 let ch = text.get(char_start..).unwrap().chars().next().unwrap();
719 let char_range = char_start..char_start + ch.len_utf8();
720 panic!(
721 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
722 offset, ch, char_range,
723 );
724}
725
726#[cold]
727#[inline(never)]
728#[track_caller]
729fn log_err_char_boundary(text: &str, offset: usize) {
730 if offset >= text.len() {
731 log::error!(
732 "byte index {} is out of bounds of `{:?}` (length: {})",
733 offset,
734 text,
735 text.len()
736 );
737 return;
738 }
739 // find the character
740 let char_start = text.floor_char_boundary(offset);
741 // `char_start` must be less than len and a char boundary
742 let ch = text.get(char_start..).unwrap().chars().next().unwrap();
743 let char_range = char_start..char_start + ch.len_utf8();
744 log::error!(
745 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
746 offset,
747 ch,
748 char_range,
749 );
750}
751
752#[inline(always)]
753fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
754 let v = v.reverse_bits();
755 let mut s: u64 = 64;
756
757 // Parallel bit count intermediates
758 let a = v - ((v >> 1) & (u64::MAX / 3));
759 let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
760 let c = (b + (b >> 4)) & (u64::MAX / 0x11);
761 let d = (c + (c >> 8)) & (u64::MAX / 0x101);
762
763 // Branchless select
764 let t = (d >> 32) + (d >> 48);
765 s -= (t.wrapping_sub(n) & 256) >> 3;
766 n -= t & (t.wrapping_sub(n) >> 8);
767
768 let t = (d >> (s - 16)) & 0xff;
769 s -= (t.wrapping_sub(n) & 256) >> 4;
770 n -= t & (t.wrapping_sub(n) >> 8);
771
772 let t = (c >> (s - 8)) & 0xf;
773 s -= (t.wrapping_sub(n) & 256) >> 5;
774 n -= t & (t.wrapping_sub(n) >> 8);
775
776 let t = (b >> (s - 4)) & 0x7;
777 s -= (t.wrapping_sub(n) & 256) >> 6;
778 n -= t & (t.wrapping_sub(n) >> 8);
779
780 let t = (a >> (s - 2)) & 0x3;
781 s -= (t.wrapping_sub(n) & 256) >> 7;
782 n -= t & (t.wrapping_sub(n) >> 8);
783
784 let t = (v >> (s - 1)) & 0x1;
785 s -= (t.wrapping_sub(n) & 256) >> 8;
786
787 65 - s - 1
788}
789
790#[cfg(test)]
791mod tests {
792 use super::*;
793 use rand::prelude::*;
794 use util::RandomCharIter;
795
796 #[gpui::test(iterations = 100)]
797 fn test_random_chunks(mut rng: StdRng) {
798 let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
799 log::info!("Chunk: {:?}", text);
800 let chunk = Chunk::new(&text);
801 verify_chunk(chunk.as_slice(), &text);
802
803 // Verify Chunk::chars() bitmap
804 let expected_chars = char_offsets(&text)
805 .into_iter()
806 .inspect(|i| assert!(*i < MAX_BASE))
807 .fold(0 as Bitmap, |acc, i| acc | (1 << i));
808 assert_eq!(chunk.chars(), expected_chars);
809
810 for _ in 0..10 {
811 let mut start = rng.random_range(0..=chunk.text.len());
812 let mut end = rng.random_range(start..=chunk.text.len());
813 while !chunk.text.is_char_boundary(start) {
814 start -= 1;
815 }
816 while !chunk.text.is_char_boundary(end) {
817 end -= 1;
818 }
819 let range = start..end;
820 log::info!("Range: {:?}", range);
821 let text_slice = &text[range.clone()];
822 let chunk_slice = chunk.slice(range);
823 verify_chunk(chunk_slice, text_slice);
824 }
825 }
826
827 #[gpui::test(iterations = 100)]
828 fn test_split_chunk_slice(mut rng: StdRng) {
829 let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
830 let chunk = Chunk::new(text);
831 let offset = char_offsets_with_end(text)
832 .into_iter()
833 .choose(&mut rng)
834 .unwrap();
835 let (a, b) = chunk.as_slice().split_at(offset);
836 let (a_str, b_str) = text.split_at(offset);
837 verify_chunk(a, a_str);
838 verify_chunk(b, b_str);
839 }
840
841 #[gpui::test(iterations = 1000)]
842 fn test_nth_set_bit_random(mut rng: StdRng) {
843 let set_count = rng.random_range(0..=128);
844 let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
845 set_bits.sort();
846 let mut n = 0;
847 for ix in set_bits.iter().copied() {
848 n |= 1 << ix;
849 }
850
851 for (mut ix, position) in set_bits.into_iter().enumerate() {
852 ix += 1;
853 assert_eq!(
854 nth_set_bit(n, ix),
855 position,
856 "nth_set_bit({:0128b}, {})",
857 n,
858 ix
859 );
860 }
861 }
862
863 /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
864 fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
865 let mut str = String::new();
866 let mut chars = RandomCharIter::new(rng);
867 loop {
868 let ch = chars.next().unwrap();
869 if str.len() + ch.len_utf8() > len {
870 break;
871 }
872 str.push(ch);
873 }
874 str
875 }
876
877 #[gpui::test(iterations = 1000)]
878 fn test_append_random_strings(mut rng: StdRng) {
879 let len1 = rng.random_range(0..=MAX_BASE);
880 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
881 let str1 = random_string_with_utf8_len(&mut rng, len1);
882 let str2 = random_string_with_utf8_len(&mut rng, len2);
883 let mut chunk1 = Chunk::new(&str1);
884 let chunk2 = Chunk::new(&str2);
885 let char_offsets = char_offsets_with_end(&str2);
886 let start_index = rng.random_range(0..char_offsets.len());
887 let start_offset = char_offsets[start_index];
888 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
889 chunk1.append(chunk2.slice(start_offset..end_offset));
890 verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
891 }
892
893 /// Return the byte offsets for each character in a string.
894 ///
895 /// These are valid offsets to split the string.
896 fn char_offsets(text: &str) -> Vec<usize> {
897 text.char_indices().map(|(i, _c)| i).collect()
898 }
899
900 /// Return the byte offsets for each character in a string, plus the offset
901 /// past the end of the string.
902 fn char_offsets_with_end(text: &str) -> Vec<usize> {
903 let mut v = char_offsets(text);
904 v.push(text.len());
905 v
906 }
907
908 fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
909 let mut offset = 0;
910 let mut offset_utf16 = OffsetUtf16(0);
911 let mut point = Point::zero();
912 let mut point_utf16 = PointUtf16::zero();
913
914 log::info!("Verifying chunk {:?}", text);
915 assert_eq!(chunk.offset_to_point(0), Point::zero());
916
917 let mut expected_tab_positions = Vec::new();
918
919 for (char_offset, c) in text.chars().enumerate() {
920 let expected_point = chunk.offset_to_point(offset);
921 assert_eq!(point, expected_point, "mismatch at offset {}", offset);
922 assert_eq!(
923 chunk.point_to_offset(point),
924 offset,
925 "mismatch at point {:?}",
926 point
927 );
928 assert_eq!(
929 chunk.offset_to_offset_utf16(offset),
930 offset_utf16,
931 "mismatch at offset {}",
932 offset
933 );
934 assert_eq!(
935 chunk.offset_utf16_to_offset(offset_utf16),
936 offset,
937 "mismatch at offset_utf16 {:?}",
938 offset_utf16
939 );
940 assert_eq!(
941 chunk.point_to_point_utf16(point),
942 point_utf16,
943 "mismatch at point {:?}",
944 point
945 );
946 assert_eq!(
947 chunk.point_utf16_to_offset(point_utf16, false),
948 offset,
949 "mismatch at point_utf16 {:?}",
950 point_utf16
951 );
952 assert_eq!(
953 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
954 point,
955 "mismatch for unclipped_point_utf16_to_point at {:?}",
956 point_utf16
957 );
958
959 assert_eq!(
960 chunk.clip_point(point, Bias::Left),
961 point,
962 "incorrect left clip at {:?}",
963 point
964 );
965 assert_eq!(
966 chunk.clip_point(point, Bias::Right),
967 point,
968 "incorrect right clip at {:?}",
969 point
970 );
971
972 for i in 1..c.len_utf8() {
973 let test_point = Point::new(point.row, point.column + i as u32);
974 assert_eq!(
975 chunk.clip_point(test_point, Bias::Left),
976 point,
977 "incorrect left clip within multi-byte char at {:?}",
978 test_point
979 );
980 assert_eq!(
981 chunk.clip_point(test_point, Bias::Right),
982 Point::new(point.row, point.column + c.len_utf8() as u32),
983 "incorrect right clip within multi-byte char at {:?}",
984 test_point
985 );
986 }
987
988 for i in 1..c.len_utf16() {
989 let test_point = Unclipped(PointUtf16::new(
990 point_utf16.row,
991 point_utf16.column + i as u32,
992 ));
993 assert_eq!(
994 chunk.unclipped_point_utf16_to_point(test_point),
995 point,
996 "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
997 test_point
998 );
999 assert_eq!(
1000 chunk.clip_point_utf16(test_point, Bias::Left),
1001 point_utf16,
1002 "incorrect left clip_point_utf16 within multi-byte char at {:?}",
1003 test_point
1004 );
1005 assert_eq!(
1006 chunk.clip_point_utf16(test_point, Bias::Right),
1007 PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
1008 "incorrect right clip_point_utf16 within multi-byte char at {:?}",
1009 test_point
1010 );
1011
1012 let test_offset = OffsetUtf16(offset_utf16.0 + i);
1013 assert_eq!(
1014 chunk.clip_offset_utf16(test_offset, Bias::Left),
1015 offset_utf16,
1016 "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
1017 test_offset
1018 );
1019 assert_eq!(
1020 chunk.clip_offset_utf16(test_offset, Bias::Right),
1021 OffsetUtf16(offset_utf16.0 + c.len_utf16()),
1022 "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
1023 test_offset
1024 );
1025 }
1026
1027 if c == '\n' {
1028 point.row += 1;
1029 point.column = 0;
1030 point_utf16.row += 1;
1031 point_utf16.column = 0;
1032 } else {
1033 point.column += c.len_utf8() as u32;
1034 point_utf16.column += c.len_utf16() as u32;
1035 }
1036
1037 if c == '\t' {
1038 expected_tab_positions.push(TabPosition {
1039 byte_offset: offset,
1040 char_offset,
1041 });
1042 }
1043
1044 offset += c.len_utf8();
1045 offset_utf16.0 += c.len_utf16();
1046 }
1047
1048 let final_point = chunk.offset_to_point(offset);
1049 assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1050 assert_eq!(
1051 chunk.point_to_offset(point),
1052 offset,
1053 "mismatch at point {:?}",
1054 point
1055 );
1056 assert_eq!(
1057 chunk.offset_to_offset_utf16(offset),
1058 offset_utf16,
1059 "mismatch at offset {}",
1060 offset
1061 );
1062 assert_eq!(
1063 chunk.offset_utf16_to_offset(offset_utf16),
1064 offset,
1065 "mismatch at offset_utf16 {:?}",
1066 offset_utf16
1067 );
1068 assert_eq!(
1069 chunk.point_to_point_utf16(point),
1070 point_utf16,
1071 "mismatch at final point {:?}",
1072 point
1073 );
1074 assert_eq!(
1075 chunk.point_utf16_to_offset(point_utf16, false),
1076 offset,
1077 "mismatch at final point_utf16 {:?}",
1078 point_utf16
1079 );
1080 assert_eq!(
1081 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1082 point,
1083 "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1084 point_utf16
1085 );
1086 assert_eq!(
1087 chunk.clip_point(point, Bias::Left),
1088 point,
1089 "incorrect left clip at final point {:?}",
1090 point
1091 );
1092 assert_eq!(
1093 chunk.clip_point(point, Bias::Right),
1094 point,
1095 "incorrect right clip at final point {:?}",
1096 point
1097 );
1098 assert_eq!(
1099 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1100 point_utf16,
1101 "incorrect left clip_point_utf16 at final point {:?}",
1102 point_utf16
1103 );
1104 assert_eq!(
1105 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1106 point_utf16,
1107 "incorrect right clip_point_utf16 at final point {:?}",
1108 point_utf16
1109 );
1110 assert_eq!(
1111 chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1112 offset_utf16,
1113 "incorrect left clip_offset_utf16 at final offset {:?}",
1114 offset_utf16
1115 );
1116 assert_eq!(
1117 chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1118 offset_utf16,
1119 "incorrect right clip_offset_utf16 at final offset {:?}",
1120 offset_utf16
1121 );
1122
1123 // Verify length methods
1124 assert_eq!(chunk.len(), text.len());
1125 assert_eq!(
1126 chunk.len_utf16().0,
1127 text.chars().map(|c| c.len_utf16()).sum::<usize>()
1128 );
1129
1130 // Verify line counting
1131 let lines = chunk.lines();
1132 let mut newline_count = 0;
1133 let mut last_line_len = 0;
1134 for c in text.chars() {
1135 if c == '\n' {
1136 newline_count += 1;
1137 last_line_len = 0;
1138 } else {
1139 last_line_len += c.len_utf8() as u32;
1140 }
1141 }
1142 assert_eq!(lines, Point::new(newline_count, last_line_len));
1143
1144 // Verify first/last line chars
1145 if !text.is_empty() {
1146 let first_line = text.split('\n').next().unwrap();
1147 assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1148
1149 let last_line = text.split('\n').next_back().unwrap();
1150 assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1151 assert_eq!(
1152 chunk.last_line_len_utf16(),
1153 last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1154 );
1155 }
1156
1157 // Verify longest row
1158 let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1159 let mut max_chars = 0;
1160 let mut current_row = 0;
1161 let mut current_chars = 0;
1162 let mut max_row = 0;
1163
1164 for c in text.chars() {
1165 if c == '\n' {
1166 if current_chars > max_chars {
1167 max_chars = current_chars;
1168 max_row = current_row;
1169 }
1170 current_row += 1;
1171 current_chars = 0;
1172 } else {
1173 current_chars += 1;
1174 }
1175 }
1176
1177 if current_chars > max_chars {
1178 max_chars = current_chars;
1179 max_row = current_row;
1180 }
1181
1182 assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1183 assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1184 }
1185
1186 #[gpui::test]
1187 fn test_point_utf16_to_offset_clips_to_correct_absolute_offset() {
1188 let text = "abc\nde";
1189 let chunk = Chunk::new(text);
1190 let slice = chunk.as_slice();
1191
1192 // Clipping on row 0 (row_offset_range.start == 0, so relative == absolute)
1193 assert_eq!(slice.point_utf16_to_offset(PointUtf16::new(0, 99), true), 3,);
1194
1195 // Clipping on row 1 — this is the case that was buggy.
1196 // Row 1 starts at byte offset 4 ("de" is bytes 4..6), so the
1197 // clipped result must be 6, not 2.
1198 assert_eq!(slice.point_utf16_to_offset(PointUtf16::new(1, 99), true), 6,);
1199 }
1200}