1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
2use heapless::String as ArrayString;
3use std::{cmp, ops::Range};
4use sum_tree::Bias;
5use unicode_segmentation::GraphemeCursor;
6use util::debug_panic;
7
8#[cfg(not(all(test, not(rust_analyzer))))]
9pub(crate) type Bitmap = u128;
10#[cfg(all(test, not(rust_analyzer)))]
11pub(crate) type Bitmap = u16;
12
13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
15
16#[derive(Clone, Debug, Default)]
17pub struct Chunk {
18 /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
19 /// text.
20 chars: Bitmap,
21 /// The number of set bits is the number of UTF-16 code units it would take to represent the
22 /// text.
23 ///
24 /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
25 /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
26 /// than two UTF-16 code units.)
27 chars_utf16: Bitmap,
28 /// If bit[i] is set, then the character at index i is an ascii newline.
29 newlines: Bitmap,
30 /// If bit[i] is set, then the character at index i is an ascii tab.
31 tabs: Bitmap,
32 pub text: ArrayString<MAX_BASE, u8>,
33}
34
35#[inline(always)]
36const fn saturating_shl_mask(offset: u32) -> Bitmap {
37 (1 as Bitmap).unbounded_shl(offset).wrapping_sub(1)
38}
39
40#[inline(always)]
41const fn saturating_shr_mask(offset: u32) -> Bitmap {
42 !Bitmap::MAX.unbounded_shr(offset)
43}
44
45impl Chunk {
46 pub const MASK_BITS: usize = Bitmap::BITS as usize;
47
48 #[inline(always)]
49 pub fn new(text: &str) -> Self {
50 let text = {
51 let mut buf = ArrayString::new();
52 buf.push_str(text).unwrap();
53 buf
54 };
55
56 const CHUNK_SIZE: usize = 8;
57
58 let mut chars_bytes = [0; MAX_BASE / CHUNK_SIZE];
59 let mut newlines_bytes = [0; MAX_BASE / CHUNK_SIZE];
60 let mut tabs_bytes = [0; MAX_BASE / CHUNK_SIZE];
61 let mut chars_utf16_bytes = [0; MAX_BASE / CHUNK_SIZE];
62
63 let mut chunk_ix = 0;
64
65 let mut bytes = text.as_bytes();
66 while !bytes.is_empty() {
67 let (chunk, rest) = bytes.split_at(bytes.len().min(CHUNK_SIZE));
68 bytes = rest;
69
70 let mut chars = 0;
71 let mut newlines = 0;
72 let mut tabs = 0;
73 let mut chars_utf16 = 0;
74
75 for (ix, &b) in chunk.iter().enumerate() {
76 chars |= (util::is_utf8_char_boundary(b) as u8) << ix;
77 newlines |= ((b == b'\n') as u8) << ix;
78 tabs |= ((b == b'\t') as u8) << ix;
79 // b >= 240 when we are at the first byte of the 4 byte encoded
80 // utf-8 code point (U+010000 or greater) it means that it would
81 // be encoded as two 16-bit code units in utf-16
82 chars_utf16 |= ((b >= 240) as u8) << ix;
83 }
84
85 chars_bytes[chunk_ix] = chars;
86 newlines_bytes[chunk_ix] = newlines;
87 tabs_bytes[chunk_ix] = tabs;
88 chars_utf16_bytes[chunk_ix] = chars_utf16;
89
90 chunk_ix += 1;
91 }
92
93 let chars = Bitmap::from_le_bytes(chars_bytes);
94
95 Chunk {
96 text,
97 chars,
98 chars_utf16: (Bitmap::from_le_bytes(chars_utf16_bytes) << 1) | chars,
99 newlines: Bitmap::from_le_bytes(newlines_bytes),
100 tabs: Bitmap::from_le_bytes(tabs_bytes),
101 }
102 }
103
104 #[inline(always)]
105 pub fn push_str(&mut self, text: &str) {
106 self.append(Chunk::new(text).as_slice());
107 }
108
109 #[inline(always)]
110 pub fn prepend_str(&mut self, text: &str) {
111 self.prepend(Chunk::new(text).as_slice());
112 }
113
114 #[inline(always)]
115 pub fn append(&mut self, slice: ChunkSlice) {
116 if slice.is_empty() {
117 return;
118 };
119
120 let base_ix = self.text.len();
121 self.chars |= slice.chars << base_ix;
122 self.chars_utf16 |= slice.chars_utf16 << base_ix;
123 self.newlines |= slice.newlines << base_ix;
124 self.tabs |= slice.tabs << base_ix;
125 self.text.push_str(slice.text).unwrap();
126 }
127
128 #[inline(always)]
129 pub fn prepend(&mut self, slice: ChunkSlice) {
130 if slice.is_empty() {
131 return;
132 }
133 if self.text.is_empty() {
134 *self = Chunk::new(slice.text);
135 return;
136 }
137
138 let shift = slice.text.len();
139 self.chars = slice.chars | (self.chars << shift);
140 self.chars_utf16 = slice.chars_utf16 | (self.chars_utf16 << shift);
141 self.newlines = slice.newlines | (self.newlines << shift);
142 self.tabs = slice.tabs | (self.tabs << shift);
143
144 let mut new_text = ArrayString::<MAX_BASE, u8>::new();
145 new_text.push_str(slice.text).unwrap();
146 new_text.push_str(&self.text).unwrap();
147 self.text = new_text;
148 }
149
150 #[inline(always)]
151 pub fn as_slice(&self) -> ChunkSlice<'_> {
152 ChunkSlice {
153 chars: self.chars,
154 chars_utf16: self.chars_utf16,
155 newlines: self.newlines,
156 tabs: self.tabs,
157 text: &self.text,
158 }
159 }
160
161 #[inline(always)]
162 pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
163 self.as_slice().slice(range)
164 }
165
166 #[inline(always)]
167 pub fn chars(&self) -> Bitmap {
168 self.chars
169 }
170
171 #[inline(always)]
172 pub fn tabs(&self) -> Bitmap {
173 self.tabs
174 }
175
176 #[inline(always)]
177 pub fn newlines(&self) -> Bitmap {
178 self.newlines
179 }
180
181 #[inline(always)]
182 pub fn is_char_boundary(&self, offset: usize) -> bool {
183 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
184 }
185
186 pub fn floor_char_boundary(&self, index: usize) -> usize {
187 if index >= self.text.len() {
188 self.text.len()
189 } else {
190 let mut i = index;
191 while i > 0 {
192 if util::is_utf8_char_boundary(self.text.as_bytes()[i]) {
193 break;
194 }
195 i -= 1;
196 }
197
198 i
199 }
200 }
201
202 #[track_caller]
203 #[inline(always)]
204 pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
205 if self.is_char_boundary(offset) {
206 return true;
207 }
208 if PANIC || cfg!(debug_assertions) {
209 panic_char_boundary(&self.text, offset);
210 } else {
211 log_err_char_boundary(&self.text, offset);
212 false
213 }
214 }
215}
216
217#[derive(Clone, Copy, Debug)]
218pub struct ChunkSlice<'a> {
219 chars: Bitmap,
220 chars_utf16: Bitmap,
221 newlines: Bitmap,
222 tabs: Bitmap,
223 text: &'a str,
224}
225
226impl Into<Chunk> for ChunkSlice<'_> {
227 fn into(self) -> Chunk {
228 Chunk {
229 chars: self.chars,
230 chars_utf16: self.chars_utf16,
231 newlines: self.newlines,
232 tabs: self.tabs,
233 text: self.text.try_into().unwrap(),
234 }
235 }
236}
237
238impl<'a> ChunkSlice<'a> {
239 #[inline(always)]
240 pub fn is_empty(&self) -> bool {
241 self.text.is_empty()
242 }
243
244 #[inline(always)]
245 pub fn is_char_boundary(&self, offset: usize) -> bool {
246 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
247 }
248
249 #[inline(always)]
250 pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
251 if mid == MAX_BASE {
252 let left = self;
253 let right = ChunkSlice {
254 chars: 0,
255 chars_utf16: 0,
256 newlines: 0,
257 tabs: 0,
258 text: "",
259 };
260 (left, right)
261 } else {
262 let mask = ((1 as Bitmap) << mid) - 1;
263 let (left_text, right_text) = self.text.split_at(mid);
264 let left = ChunkSlice {
265 chars: self.chars & mask,
266 chars_utf16: self.chars_utf16 & mask,
267 newlines: self.newlines & mask,
268 tabs: self.tabs & mask,
269 text: left_text,
270 };
271 let right = ChunkSlice {
272 chars: self.chars >> mid,
273 chars_utf16: self.chars_utf16 >> mid,
274 newlines: self.newlines >> mid,
275 tabs: self.tabs >> mid,
276 text: right_text,
277 };
278 (left, right)
279 }
280 }
281
282 #[inline(always)]
283 pub fn slice(self, mut range: Range<usize>) -> Self {
284 if range.start == MAX_BASE {
285 Self {
286 chars: 0,
287 chars_utf16: 0,
288 newlines: 0,
289 tabs: 0,
290 text: "",
291 }
292 } else {
293 if !self.assert_char_boundary::<false>(range.start) {
294 range.start = self.text.ceil_char_boundary(range.start);
295 }
296 if !self.assert_char_boundary::<false>(range.end) {
297 range.end = if range.end < range.start {
298 range.start
299 } else {
300 self.text.floor_char_boundary(range.end)
301 };
302 }
303 let mask = (1 as Bitmap)
304 .unbounded_shl(range.end as u32)
305 .wrapping_sub(1);
306 Self {
307 chars: (self.chars & mask) >> range.start,
308 chars_utf16: (self.chars_utf16 & mask) >> range.start,
309 newlines: (self.newlines & mask) >> range.start,
310 tabs: (self.tabs & mask) >> range.start,
311 text: &self.text[range],
312 }
313 }
314 }
315
316 #[inline(always)]
317 pub fn text_summary(&self) -> TextSummary {
318 let mut chars = 0;
319 let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
320 TextSummary {
321 len: self.len(),
322 chars,
323 len_utf16: self.len_utf16(),
324 lines: self.lines(),
325 first_line_chars: self.first_line_chars(),
326 last_line_chars: self.last_line_chars(),
327 last_line_len_utf16: self.last_line_len_utf16(),
328 longest_row,
329 longest_row_chars,
330 }
331 }
332
333 /// Get length in bytes
334 #[inline(always)]
335 pub fn len(&self) -> usize {
336 self.text.len()
337 }
338
339 /// Get length in UTF-16 code units
340 #[inline(always)]
341 pub fn len_utf16(&self) -> OffsetUtf16 {
342 OffsetUtf16(self.chars_utf16.count_ones() as usize)
343 }
344
345 /// Get point representing number of lines and length of last line
346 #[inline(always)]
347 pub fn lines(&self) -> Point {
348 let row = self.newlines.count_ones();
349 let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
350 Point::new(row, column)
351 }
352
353 /// Get number of chars in first line
354 #[inline(always)]
355 pub fn first_line_chars(&self) -> u32 {
356 (self.chars & saturating_shl_mask(self.newlines.trailing_zeros())).count_ones()
357 }
358
359 /// Get number of chars in last line
360 #[inline(always)]
361 pub fn last_line_chars(&self) -> u32 {
362 (self.chars & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
363 }
364
365 /// Get number of UTF-16 code units in last line
366 #[inline(always)]
367 pub fn last_line_len_utf16(&self) -> u32 {
368 (self.chars_utf16 & saturating_shr_mask(self.newlines.leading_zeros())).count_ones()
369 }
370
371 /// Get the longest row in the chunk and its length in characters.
372 /// Calculate the total number of characters in the chunk along the way.
373 #[inline(always)]
374 pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
375 let mut chars = self.chars;
376 let mut newlines = self.newlines;
377 *total_chars = 0;
378 let mut row = 0;
379 let mut longest_row = 0;
380 let mut longest_row_chars = 0;
381 while newlines > 0 {
382 let newline_ix = newlines.trailing_zeros();
383 let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
384 *total_chars += usize::from(row_chars);
385 if row_chars > longest_row_chars {
386 longest_row = row;
387 longest_row_chars = row_chars;
388 }
389
390 newlines >>= newline_ix;
391 newlines >>= 1;
392 chars >>= newline_ix;
393 chars >>= 1;
394 row += 1;
395 *total_chars += 1;
396 }
397
398 let row_chars = chars.count_ones() as u8;
399 *total_chars += usize::from(row_chars);
400 if row_chars > longest_row_chars {
401 (row, row_chars as u32)
402 } else {
403 (longest_row, longest_row_chars as u32)
404 }
405 }
406
407 #[inline(always)]
408 pub fn offset_to_point(&self, offset: usize) -> Point {
409 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
410 let row = (self.newlines & mask).count_ones();
411 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
412 let column = (offset - newline_ix as usize) as u32;
413 Point::new(row, column)
414 }
415
416 #[inline(always)]
417 pub fn point_to_offset(&self, point: Point) -> usize {
418 if point.row > self.lines().row {
419 debug_panic!(
420 "point {:?} extends beyond rows for string {:?}",
421 point,
422 self.text
423 );
424 return self.len();
425 }
426
427 let row_offset_range = self.offset_range_for_row(point.row);
428 if point.column > row_offset_range.len() as u32 {
429 debug_panic!(
430 "point {:?} extends beyond row for string {:?}",
431 point,
432 self.text
433 );
434 row_offset_range.end
435 } else {
436 row_offset_range.start + point.column as usize
437 }
438 }
439
440 #[track_caller]
441 #[inline(always)]
442 pub fn assert_char_boundary<const PANIC: bool>(&self, offset: usize) -> bool {
443 if self.is_char_boundary(offset) {
444 return true;
445 }
446 if PANIC {
447 panic_char_boundary(self.text, offset);
448 } else {
449 log_err_char_boundary(self.text, offset);
450 false
451 }
452 }
453
454 pub fn floor_char_boundary(&self, index: usize) -> usize {
455 self.text.floor_char_boundary(index)
456 }
457
458 #[inline(always)]
459 pub fn point_to_offset_utf16(&self, point: Point) -> OffsetUtf16 {
460 if point.row > self.lines().row {
461 debug_panic!(
462 "point {:?} extends beyond rows for string {:?}",
463 point,
464 self.text
465 );
466 return self.len_utf16();
467 }
468 self.offset_to_offset_utf16(self.point_to_offset(point))
469 }
470
471 #[inline(always)]
472 pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
473 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
474 OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
475 }
476
477 #[inline(always)]
478 pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
479 if target.0 == 0 {
480 0
481 } else {
482 #[cfg(not(test))]
483 let chars_utf16 = self.chars_utf16;
484 #[cfg(test)]
485 let chars_utf16 = self.chars_utf16 as u128;
486 let ix = nth_set_bit(chars_utf16, target.0) + 1;
487 if ix == MAX_BASE {
488 MAX_BASE
489 } else {
490 let utf8_additional_len = cmp::min(
491 (self.chars_utf16 >> ix).trailing_zeros() as usize,
492 self.text.len() - ix,
493 );
494 ix + utf8_additional_len
495 }
496 }
497 }
498
499 #[inline(always)]
500 pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
501 let mask = saturating_shl_mask(offset as u32);
502 let row = (self.newlines & saturating_shl_mask(offset as u32)).count_ones();
503 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
504 let column = if newline_ix as usize == MAX_BASE {
505 0
506 } else {
507 ((self.chars_utf16 & mask) >> newline_ix).count_ones()
508 };
509 PointUtf16::new(row, column)
510 }
511
512 #[inline(always)]
513 pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
514 self.offset_to_point_utf16(self.point_to_offset(point))
515 }
516
517 #[inline(always)]
518 pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
519 let lines = self.lines();
520 if point.row > lines.row {
521 if !clip {
522 debug_panic!(
523 "point {:?} is beyond this chunk's extent {:?}",
524 point,
525 self.text
526 );
527 }
528 return self.len();
529 }
530
531 let row_offset_range = self.offset_range_for_row(point.row);
532 let line = self.slice(row_offset_range.clone());
533 if point.column > line.last_line_len_utf16() {
534 if !clip {
535 debug_panic!(
536 "point {:?} is beyond the end of the line in chunk {:?}",
537 point,
538 self.text
539 );
540 }
541 return row_offset_range.end;
542 }
543
544 let mut offset = row_offset_range.start;
545 if point.column > 0 {
546 offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
547 if !self.text.is_char_boundary(offset) {
548 offset -= 1;
549 while !self.text.is_char_boundary(offset) {
550 offset -= 1;
551 }
552 if !clip {
553 debug_panic!(
554 "point {:?} is within character in chunk {:?}",
555 point,
556 self.text,
557 );
558 }
559 }
560 }
561 offset
562 }
563
564 #[inline(always)]
565 pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
566 let max_point = self.lines();
567 if point.0.row > max_point.row {
568 return max_point;
569 }
570
571 let row_offset_range = self.offset_range_for_row(point.0.row);
572 let line = self.slice(row_offset_range);
573 if point.0.column == 0 {
574 Point::new(point.0.row, 0)
575 } else if point.0.column >= line.len_utf16().0 as u32 {
576 Point::new(point.0.row, line.len() as u32)
577 } else {
578 let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
579 while !line.text.is_char_boundary(column) {
580 column -= 1;
581 }
582 Point::new(point.0.row, column as u32)
583 }
584 }
585
586 #[inline(always)]
587 pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
588 let max_point = self.lines();
589 if point.row > max_point.row {
590 return max_point;
591 }
592
593 let line = self.slice(self.offset_range_for_row(point.row));
594 if point.column == 0 {
595 point
596 } else if point.column >= line.len() as u32 {
597 Point::new(point.row, line.len() as u32)
598 } else {
599 let mut column = point.column as usize;
600 let bytes = line.text.as_bytes();
601 if bytes[column - 1] < 128 && bytes[column] < 128 {
602 return Point::new(point.row, column as u32);
603 }
604
605 let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
606 loop {
607 if line.is_char_boundary(column)
608 && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
609 {
610 break;
611 }
612
613 match bias {
614 Bias::Left => column -= 1,
615 Bias::Right => column += 1,
616 }
617 grapheme_cursor.set_cursor(column);
618 }
619 Point::new(point.row, column as u32)
620 }
621 }
622
623 #[inline(always)]
624 pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
625 let max_point = self.lines();
626 if point.0.row > max_point.row {
627 PointUtf16::new(max_point.row, self.last_line_len_utf16())
628 } else {
629 let line = self.slice(self.offset_range_for_row(point.0.row));
630 let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
631 PointUtf16::new(point.0.row, column.0 as u32)
632 }
633 }
634
635 #[inline(always)]
636 pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
637 if target == OffsetUtf16::default() {
638 OffsetUtf16::default()
639 } else if target >= self.len_utf16() {
640 self.len_utf16()
641 } else {
642 let mut offset = self.offset_utf16_to_offset(target);
643 while !self.text.is_char_boundary(offset) {
644 if bias == Bias::Left {
645 offset -= 1;
646 } else {
647 offset += 1;
648 }
649 }
650 self.offset_to_offset_utf16(offset)
651 }
652 }
653
654 #[inline(always)]
655 fn offset_range_for_row(&self, row: u32) -> Range<usize> {
656 let row_start = if row > 0 {
657 #[cfg(not(test))]
658 let newlines = self.newlines;
659 #[cfg(test)]
660 let newlines = self.newlines as u128;
661 nth_set_bit(newlines, row as usize) + 1
662 } else {
663 0
664 };
665 let row_len = if row_start == MAX_BASE {
666 0
667 } else {
668 cmp::min(
669 (self.newlines >> row_start).trailing_zeros(),
670 (self.text.len() - row_start) as u32,
671 )
672 };
673 row_start..row_start + row_len as usize
674 }
675
676 #[inline(always)]
677 pub fn tabs(&self) -> Tabs {
678 Tabs {
679 tabs: self.tabs,
680 chars: self.chars,
681 }
682 }
683}
684
685pub struct Tabs {
686 tabs: Bitmap,
687 chars: Bitmap,
688}
689
690#[derive(Debug, PartialEq, Eq)]
691pub struct TabPosition {
692 pub byte_offset: usize,
693 pub char_offset: usize,
694}
695
696impl Iterator for Tabs {
697 type Item = TabPosition;
698
699 fn next(&mut self) -> Option<Self::Item> {
700 if self.tabs == 0 {
701 return None;
702 }
703
704 let tab_offset = self.tabs.trailing_zeros() as usize;
705 let chars_mask = (1 << tab_offset) - 1;
706 let char_offset = (self.chars & chars_mask).count_ones() as usize;
707
708 // Since tabs are 1 byte the tab offset is the same as the byte offset
709 let position = TabPosition {
710 byte_offset: tab_offset,
711 char_offset,
712 };
713 // Remove the tab we've just seen
714 self.tabs ^= 1 << tab_offset;
715
716 Some(position)
717 }
718}
719
720/// Finds the n-th bit that is set to 1.
721#[inline(always)]
722fn nth_set_bit(v: u128, n: usize) -> usize {
723 let low = v as u64;
724 let high = (v >> 64) as u64;
725
726 let low_count = low.count_ones() as usize;
727 if n > low_count {
728 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
729 } else {
730 nth_set_bit_u64(low, n as u64) as usize
731 }
732}
733
734#[cold]
735#[inline(never)]
736#[track_caller]
737fn panic_char_boundary(text: &str, offset: usize) -> ! {
738 if offset > text.len() {
739 panic!(
740 "byte index {} is out of bounds of `{:?}` (length: {})",
741 offset,
742 text,
743 text.len()
744 );
745 }
746 // find the character
747 let char_start = text.floor_char_boundary(offset);
748 // `char_start` must be less than len and a char boundary
749 let ch = text.get(char_start..).unwrap().chars().next().unwrap();
750 let char_range = char_start..char_start + ch.len_utf8();
751 panic!(
752 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
753 offset, ch, char_range,
754 );
755}
756
757#[cold]
758#[inline(never)]
759#[track_caller]
760fn log_err_char_boundary(text: &str, offset: usize) {
761 if offset >= text.len() {
762 log::error!(
763 "byte index {} is out of bounds of `{:?}` (length: {})",
764 offset,
765 text,
766 text.len()
767 );
768 return;
769 }
770 // find the character
771 let char_start = text.floor_char_boundary(offset);
772 // `char_start` must be less than len and a char boundary
773 let ch = text.get(char_start..).unwrap().chars().next().unwrap();
774 let char_range = char_start..char_start + ch.len_utf8();
775 log::error!(
776 "byte index {} is not a char boundary; it is inside {:?} (bytes {:?})",
777 offset,
778 ch,
779 char_range,
780 );
781}
782
783#[inline(always)]
784fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
785 let v = v.reverse_bits();
786 let mut s: u64 = 64;
787
788 // Parallel bit count intermediates
789 let a = v - ((v >> 1) & (u64::MAX / 3));
790 let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
791 let c = (b + (b >> 4)) & (u64::MAX / 0x11);
792 let d = (c + (c >> 8)) & (u64::MAX / 0x101);
793
794 // Branchless select
795 let t = (d >> 32) + (d >> 48);
796 s -= (t.wrapping_sub(n) & 256) >> 3;
797 n -= t & (t.wrapping_sub(n) >> 8);
798
799 let t = (d >> (s - 16)) & 0xff;
800 s -= (t.wrapping_sub(n) & 256) >> 4;
801 n -= t & (t.wrapping_sub(n) >> 8);
802
803 let t = (c >> (s - 8)) & 0xf;
804 s -= (t.wrapping_sub(n) & 256) >> 5;
805 n -= t & (t.wrapping_sub(n) >> 8);
806
807 let t = (b >> (s - 4)) & 0x7;
808 s -= (t.wrapping_sub(n) & 256) >> 6;
809 n -= t & (t.wrapping_sub(n) >> 8);
810
811 let t = (a >> (s - 2)) & 0x3;
812 s -= (t.wrapping_sub(n) & 256) >> 7;
813 n -= t & (t.wrapping_sub(n) >> 8);
814
815 let t = (v >> (s - 1)) & 0x1;
816 s -= (t.wrapping_sub(n) & 256) >> 8;
817
818 65 - s - 1
819}
820
821#[cfg(test)]
822mod tests {
823 use super::*;
824 use rand::prelude::*;
825 use util::RandomCharIter;
826
827 #[gpui::test(iterations = 100)]
828 fn test_random_chunks(mut rng: StdRng) {
829 let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
830 log::info!("Chunk: {:?}", text);
831 let chunk = Chunk::new(&text);
832 verify_chunk(chunk.as_slice(), &text);
833
834 // Verify Chunk::chars() bitmap
835 let expected_chars = char_offsets(&text)
836 .into_iter()
837 .inspect(|i| assert!(*i < MAX_BASE))
838 .fold(0 as Bitmap, |acc, i| acc | (1 << i));
839 assert_eq!(chunk.chars(), expected_chars);
840
841 for _ in 0..10 {
842 let mut start = rng.random_range(0..=chunk.text.len());
843 let mut end = rng.random_range(start..=chunk.text.len());
844 while !chunk.text.is_char_boundary(start) {
845 start -= 1;
846 }
847 while !chunk.text.is_char_boundary(end) {
848 end -= 1;
849 }
850 let range = start..end;
851 log::info!("Range: {:?}", range);
852 let text_slice = &text[range.clone()];
853 let chunk_slice = chunk.slice(range);
854 verify_chunk(chunk_slice, text_slice);
855 }
856 }
857
858 #[gpui::test(iterations = 100)]
859 fn test_split_chunk_slice(mut rng: StdRng) {
860 let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
861 let chunk = Chunk::new(text);
862 let offset = char_offsets_with_end(text)
863 .into_iter()
864 .choose(&mut rng)
865 .unwrap();
866 let (a, b) = chunk.as_slice().split_at(offset);
867 let (a_str, b_str) = text.split_at(offset);
868 verify_chunk(a, a_str);
869 verify_chunk(b, b_str);
870 }
871
872 #[gpui::test(iterations = 1000)]
873 fn test_nth_set_bit_random(mut rng: StdRng) {
874 let set_count = rng.random_range(0..=128);
875 let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
876 set_bits.sort();
877 let mut n = 0;
878 for ix in set_bits.iter().copied() {
879 n |= 1 << ix;
880 }
881
882 for (mut ix, position) in set_bits.into_iter().enumerate() {
883 ix += 1;
884 assert_eq!(
885 nth_set_bit(n, ix),
886 position,
887 "nth_set_bit({:0128b}, {})",
888 n,
889 ix
890 );
891 }
892 }
893
894 /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
895 fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
896 let mut str = String::new();
897 let mut chars = RandomCharIter::new(rng);
898 loop {
899 let ch = chars.next().unwrap();
900 if str.len() + ch.len_utf8() > len {
901 break;
902 }
903 str.push(ch);
904 }
905 str
906 }
907
908 #[gpui::test(iterations = 1000)]
909 fn test_append_random_strings(mut rng: StdRng) {
910 let len1 = rng.random_range(0..=MAX_BASE);
911 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
912 let str1 = random_string_with_utf8_len(&mut rng, len1);
913 let str2 = random_string_with_utf8_len(&mut rng, len2);
914 let mut chunk1 = Chunk::new(&str1);
915 let chunk2 = Chunk::new(&str2);
916 let char_offsets = char_offsets_with_end(&str2);
917 let start_index = rng.random_range(0..char_offsets.len());
918 let start_offset = char_offsets[start_index];
919 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
920 chunk1.append(chunk2.slice(start_offset..end_offset));
921 verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
922 }
923
924 #[gpui::test(iterations = 1000)]
925 fn test_prepend_random_strings(mut rng: StdRng) {
926 let len1 = rng.random_range(0..=MAX_BASE);
927 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
928 let str1 = random_string_with_utf8_len(&mut rng, len1);
929 let str2 = random_string_with_utf8_len(&mut rng, len2);
930 let mut chunk1 = Chunk::new(&str1);
931 let chunk2 = Chunk::new(&str2);
932 let char_offsets = char_offsets_with_end(&str2);
933 let start_index = rng.random_range(0..char_offsets.len());
934 let start_offset = char_offsets[start_index];
935 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
936 let slice = chunk2.slice(start_offset..end_offset);
937 let prefix_text = &str2[start_offset..end_offset];
938 chunk1.prepend(slice);
939 verify_chunk(chunk1.as_slice(), &(prefix_text.to_owned() + &str1));
940 }
941
942 /// Return the byte offsets for each character in a string.
943 ///
944 /// These are valid offsets to split the string.
945 fn char_offsets(text: &str) -> Vec<usize> {
946 text.char_indices().map(|(i, _c)| i).collect()
947 }
948
949 /// Return the byte offsets for each character in a string, plus the offset
950 /// past the end of the string.
951 fn char_offsets_with_end(text: &str) -> Vec<usize> {
952 let mut v = char_offsets(text);
953 v.push(text.len());
954 v
955 }
956
957 fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
958 let mut offset = 0;
959 let mut offset_utf16 = OffsetUtf16(0);
960 let mut point = Point::zero();
961 let mut point_utf16 = PointUtf16::zero();
962
963 log::info!("Verifying chunk {:?}", text);
964 assert_eq!(chunk.offset_to_point(0), Point::zero());
965
966 let mut expected_tab_positions = Vec::new();
967
968 for (char_offset, c) in text.chars().enumerate() {
969 let expected_point = chunk.offset_to_point(offset);
970 assert_eq!(point, expected_point, "mismatch at offset {}", offset);
971 assert_eq!(
972 chunk.point_to_offset(point),
973 offset,
974 "mismatch at point {:?}",
975 point
976 );
977 assert_eq!(
978 chunk.offset_to_offset_utf16(offset),
979 offset_utf16,
980 "mismatch at offset {}",
981 offset
982 );
983 assert_eq!(
984 chunk.offset_utf16_to_offset(offset_utf16),
985 offset,
986 "mismatch at offset_utf16 {:?}",
987 offset_utf16
988 );
989 assert_eq!(
990 chunk.point_to_point_utf16(point),
991 point_utf16,
992 "mismatch at point {:?}",
993 point
994 );
995 assert_eq!(
996 chunk.point_utf16_to_offset(point_utf16, false),
997 offset,
998 "mismatch at point_utf16 {:?}",
999 point_utf16
1000 );
1001 assert_eq!(
1002 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1003 point,
1004 "mismatch for unclipped_point_utf16_to_point at {:?}",
1005 point_utf16
1006 );
1007
1008 assert_eq!(
1009 chunk.clip_point(point, Bias::Left),
1010 point,
1011 "incorrect left clip at {:?}",
1012 point
1013 );
1014 assert_eq!(
1015 chunk.clip_point(point, Bias::Right),
1016 point,
1017 "incorrect right clip at {:?}",
1018 point
1019 );
1020
1021 for i in 1..c.len_utf8() {
1022 let test_point = Point::new(point.row, point.column + i as u32);
1023 assert_eq!(
1024 chunk.clip_point(test_point, Bias::Left),
1025 point,
1026 "incorrect left clip within multi-byte char at {:?}",
1027 test_point
1028 );
1029 assert_eq!(
1030 chunk.clip_point(test_point, Bias::Right),
1031 Point::new(point.row, point.column + c.len_utf8() as u32),
1032 "incorrect right clip within multi-byte char at {:?}",
1033 test_point
1034 );
1035 }
1036
1037 for i in 1..c.len_utf16() {
1038 let test_point = Unclipped(PointUtf16::new(
1039 point_utf16.row,
1040 point_utf16.column + i as u32,
1041 ));
1042 assert_eq!(
1043 chunk.unclipped_point_utf16_to_point(test_point),
1044 point,
1045 "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
1046 test_point
1047 );
1048 assert_eq!(
1049 chunk.clip_point_utf16(test_point, Bias::Left),
1050 point_utf16,
1051 "incorrect left clip_point_utf16 within multi-byte char at {:?}",
1052 test_point
1053 );
1054 assert_eq!(
1055 chunk.clip_point_utf16(test_point, Bias::Right),
1056 PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
1057 "incorrect right clip_point_utf16 within multi-byte char at {:?}",
1058 test_point
1059 );
1060
1061 let test_offset = OffsetUtf16(offset_utf16.0 + i);
1062 assert_eq!(
1063 chunk.clip_offset_utf16(test_offset, Bias::Left),
1064 offset_utf16,
1065 "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
1066 test_offset
1067 );
1068 assert_eq!(
1069 chunk.clip_offset_utf16(test_offset, Bias::Right),
1070 OffsetUtf16(offset_utf16.0 + c.len_utf16()),
1071 "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
1072 test_offset
1073 );
1074 }
1075
1076 if c == '\n' {
1077 point.row += 1;
1078 point.column = 0;
1079 point_utf16.row += 1;
1080 point_utf16.column = 0;
1081 } else {
1082 point.column += c.len_utf8() as u32;
1083 point_utf16.column += c.len_utf16() as u32;
1084 }
1085
1086 if c == '\t' {
1087 expected_tab_positions.push(TabPosition {
1088 byte_offset: offset,
1089 char_offset,
1090 });
1091 }
1092
1093 offset += c.len_utf8();
1094 offset_utf16.0 += c.len_utf16();
1095 }
1096
1097 let final_point = chunk.offset_to_point(offset);
1098 assert_eq!(point, final_point, "mismatch at final offset {}", offset);
1099 assert_eq!(
1100 chunk.point_to_offset(point),
1101 offset,
1102 "mismatch at point {:?}",
1103 point
1104 );
1105 assert_eq!(
1106 chunk.offset_to_offset_utf16(offset),
1107 offset_utf16,
1108 "mismatch at offset {}",
1109 offset
1110 );
1111 assert_eq!(
1112 chunk.offset_utf16_to_offset(offset_utf16),
1113 offset,
1114 "mismatch at offset_utf16 {:?}",
1115 offset_utf16
1116 );
1117 assert_eq!(
1118 chunk.point_to_point_utf16(point),
1119 point_utf16,
1120 "mismatch at final point {:?}",
1121 point
1122 );
1123 assert_eq!(
1124 chunk.point_utf16_to_offset(point_utf16, false),
1125 offset,
1126 "mismatch at final point_utf16 {:?}",
1127 point_utf16
1128 );
1129 assert_eq!(
1130 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
1131 point,
1132 "mismatch for unclipped_point_utf16_to_point at final point {:?}",
1133 point_utf16
1134 );
1135 assert_eq!(
1136 chunk.clip_point(point, Bias::Left),
1137 point,
1138 "incorrect left clip at final point {:?}",
1139 point
1140 );
1141 assert_eq!(
1142 chunk.clip_point(point, Bias::Right),
1143 point,
1144 "incorrect right clip at final point {:?}",
1145 point
1146 );
1147 assert_eq!(
1148 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
1149 point_utf16,
1150 "incorrect left clip_point_utf16 at final point {:?}",
1151 point_utf16
1152 );
1153 assert_eq!(
1154 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
1155 point_utf16,
1156 "incorrect right clip_point_utf16 at final point {:?}",
1157 point_utf16
1158 );
1159 assert_eq!(
1160 chunk.clip_offset_utf16(offset_utf16, Bias::Left),
1161 offset_utf16,
1162 "incorrect left clip_offset_utf16 at final offset {:?}",
1163 offset_utf16
1164 );
1165 assert_eq!(
1166 chunk.clip_offset_utf16(offset_utf16, Bias::Right),
1167 offset_utf16,
1168 "incorrect right clip_offset_utf16 at final offset {:?}",
1169 offset_utf16
1170 );
1171
1172 // Verify length methods
1173 assert_eq!(chunk.len(), text.len());
1174 assert_eq!(
1175 chunk.len_utf16().0,
1176 text.chars().map(|c| c.len_utf16()).sum::<usize>()
1177 );
1178
1179 // Verify line counting
1180 let lines = chunk.lines();
1181 let mut newline_count = 0;
1182 let mut last_line_len = 0;
1183 for c in text.chars() {
1184 if c == '\n' {
1185 newline_count += 1;
1186 last_line_len = 0;
1187 } else {
1188 last_line_len += c.len_utf8() as u32;
1189 }
1190 }
1191 assert_eq!(lines, Point::new(newline_count, last_line_len));
1192
1193 // Verify first/last line chars
1194 if !text.is_empty() {
1195 let first_line = text.split('\n').next().unwrap();
1196 assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1197
1198 let last_line = text.split('\n').next_back().unwrap();
1199 assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1200 assert_eq!(
1201 chunk.last_line_len_utf16(),
1202 last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1203 );
1204 }
1205
1206 // Verify longest row
1207 let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1208 let mut max_chars = 0;
1209 let mut current_row = 0;
1210 let mut current_chars = 0;
1211 let mut max_row = 0;
1212
1213 for c in text.chars() {
1214 if c == '\n' {
1215 if current_chars > max_chars {
1216 max_chars = current_chars;
1217 max_row = current_row;
1218 }
1219 current_row += 1;
1220 current_chars = 0;
1221 } else {
1222 current_chars += 1;
1223 }
1224 }
1225
1226 if current_chars > max_chars {
1227 max_chars = current_chars;
1228 max_row = current_row;
1229 }
1230
1231 assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1232 assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1233 }
1234
1235 #[gpui::test]
1236 fn test_point_utf16_to_offset_clips_to_correct_absolute_offset() {
1237 let text = "abc\nde";
1238 let chunk = Chunk::new(text);
1239 let slice = chunk.as_slice();
1240
1241 // Clipping on row 0 (row_offset_range.start == 0, so relative == absolute)
1242 assert_eq!(slice.point_utf16_to_offset(PointUtf16::new(0, 99), true), 3,);
1243
1244 // Clipping on row 1 — this is the case that was buggy.
1245 // Row 1 starts at byte offset 4 ("de" is bytes 4..6), so the
1246 // clipped result must be 6, not 2.
1247 assert_eq!(slice.point_utf16_to_offset(PointUtf16::new(1, 99), true), 6,);
1248 }
1249}