1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
2use arrayvec::ArrayString;
3use std::{cmp, ops::Range};
4use sum_tree::Bias;
5use unicode_segmentation::GraphemeCursor;
6use util::debug_panic;
7
8#[cfg(not(all(test, not(rust_analyzer))))]
9pub(crate) type Bitmap = u128;
10#[cfg(all(test, not(rust_analyzer)))]
11pub(crate) type Bitmap = u16;
12
13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
15
16#[derive(Clone, Debug, Default)]
17pub struct Chunk {
18 /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
19 /// text.
20 chars: Bitmap,
21 /// The number of set bits is the number of UTF-16 code units it would take to represent the
22 /// text.
23 ///
24 /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
25 /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
26 /// than two UTF-16 code units.)
27 chars_utf16: Bitmap,
28 /// If bit[i] is set, then the character at index i is an ascii newline.
29 newlines: Bitmap,
30 /// If bit[i] is set, then the character at index i is an ascii tab.
31 tabs: Bitmap,
32 pub text: ArrayString<MAX_BASE>,
33}
34
35impl Chunk {
36 pub const MASK_BITS: usize = Bitmap::BITS as usize;
37
38 #[inline(always)]
39 pub fn new(text: &str) -> Self {
40 let mut this = Chunk::default();
41 this.push_str(text);
42 this
43 }
44
45 #[inline(always)]
46 pub fn push_str(&mut self, text: &str) {
47 for (char_ix, c) in text.char_indices() {
48 let ix = self.text.len() + char_ix;
49 self.chars |= 1 << ix;
50 self.chars_utf16 |= 1 << ix;
51 self.chars_utf16 |= (c.len_utf16() as Bitmap) << ix;
52 self.newlines |= ((c == '\n') as Bitmap) << ix;
53 self.tabs |= ((c == '\t') as Bitmap) << ix;
54 }
55 self.text.push_str(text);
56 }
57
58 #[inline(always)]
59 pub fn append(&mut self, slice: ChunkSlice) {
60 if slice.is_empty() {
61 return;
62 };
63
64 let base_ix = self.text.len();
65 self.chars |= slice.chars << base_ix;
66 self.chars_utf16 |= slice.chars_utf16 << base_ix;
67 self.newlines |= slice.newlines << base_ix;
68 self.tabs |= slice.tabs << base_ix;
69 self.text.push_str(slice.text);
70 }
71
72 #[inline(always)]
73 pub fn as_slice(&self) -> ChunkSlice<'_> {
74 ChunkSlice {
75 chars: self.chars,
76 chars_utf16: self.chars_utf16,
77 newlines: self.newlines,
78 tabs: self.tabs,
79 text: &self.text,
80 }
81 }
82
83 #[inline(always)]
84 pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
85 self.as_slice().slice(range)
86 }
87
88 #[inline(always)]
89 pub fn chars(&self) -> Bitmap {
90 self.chars
91 }
92
93 pub fn tabs(&self) -> Bitmap {
94 self.tabs
95 }
96
97 #[inline(always)]
98 pub fn is_char_boundary(&self, offset: usize) -> bool {
99 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
100 }
101}
102
103#[derive(Clone, Copy, Debug)]
104pub struct ChunkSlice<'a> {
105 chars: Bitmap,
106 chars_utf16: Bitmap,
107 newlines: Bitmap,
108 tabs: Bitmap,
109 text: &'a str,
110}
111
112impl Into<Chunk> for ChunkSlice<'_> {
113 fn into(self) -> Chunk {
114 Chunk {
115 chars: self.chars,
116 chars_utf16: self.chars_utf16,
117 newlines: self.newlines,
118 tabs: self.tabs,
119 text: self.text.try_into().unwrap(),
120 }
121 }
122}
123
124impl<'a> ChunkSlice<'a> {
125 #[inline(always)]
126 pub fn is_empty(&self) -> bool {
127 self.text.is_empty()
128 }
129
130 #[inline(always)]
131 pub fn is_char_boundary(&self, offset: usize) -> bool {
132 (1 as Bitmap).unbounded_shl(offset as u32) & self.chars != 0 || offset == self.text.len()
133 }
134
135 #[inline(always)]
136 pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
137 if mid == MAX_BASE {
138 let left = self;
139 let right = ChunkSlice {
140 chars: 0,
141 chars_utf16: 0,
142 newlines: 0,
143 tabs: 0,
144 text: "",
145 };
146 (left, right)
147 } else {
148 let mask = ((1 as Bitmap) << mid) - 1;
149 let (left_text, right_text) = self.text.split_at(mid);
150 let left = ChunkSlice {
151 chars: self.chars & mask,
152 chars_utf16: self.chars_utf16 & mask,
153 newlines: self.newlines & mask,
154 tabs: self.tabs & mask,
155 text: left_text,
156 };
157 let right = ChunkSlice {
158 chars: self.chars >> mid,
159 chars_utf16: self.chars_utf16 >> mid,
160 newlines: self.newlines >> mid,
161 tabs: self.tabs >> mid,
162 text: right_text,
163 };
164 (left, right)
165 }
166 }
167
168 #[inline(always)]
169 pub fn slice(self, range: Range<usize>) -> Self {
170 debug_assert!(
171 self.is_char_boundary(range.end),
172 "Invalid range end {} in {:?}",
173 range.end,
174 self
175 );
176 let mask = (1 as Bitmap)
177 .unbounded_shl(range.end as u32)
178 .wrapping_sub(1);
179 if range.start == MAX_BASE {
180 Self {
181 chars: 0,
182 chars_utf16: 0,
183 newlines: 0,
184 tabs: 0,
185 text: "",
186 }
187 } else {
188 debug_assert!(
189 self.is_char_boundary(range.start),
190 "Invalid range start {} in {:?}",
191 range.start,
192 self
193 );
194 Self {
195 chars: (self.chars & mask) >> range.start,
196 chars_utf16: (self.chars_utf16 & mask) >> range.start,
197 newlines: (self.newlines & mask) >> range.start,
198 tabs: (self.tabs & mask) >> range.start,
199 text: &self.text[range],
200 }
201 }
202 }
203
204 #[inline(always)]
205 pub fn text_summary(&self) -> TextSummary {
206 let mut chars = 0;
207 let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
208 TextSummary {
209 len: self.len(),
210 chars,
211 len_utf16: self.len_utf16(),
212 lines: self.lines(),
213 first_line_chars: self.first_line_chars(),
214 last_line_chars: self.last_line_chars(),
215 last_line_len_utf16: self.last_line_len_utf16(),
216 longest_row,
217 longest_row_chars,
218 }
219 }
220
221 /// Get length in bytes
222 #[inline(always)]
223 pub fn len(&self) -> usize {
224 self.text.len()
225 }
226
227 /// Get length in UTF-16 code units
228 #[inline(always)]
229 pub fn len_utf16(&self) -> OffsetUtf16 {
230 OffsetUtf16(self.chars_utf16.count_ones() as usize)
231 }
232
233 /// Get point representing number of lines and length of last line
234 #[inline(always)]
235 pub fn lines(&self) -> Point {
236 let row = self.newlines.count_ones();
237 let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
238 Point::new(row, column)
239 }
240
241 /// Get number of chars in first line
242 #[inline(always)]
243 pub fn first_line_chars(&self) -> u32 {
244 if self.newlines == 0 {
245 self.chars.count_ones()
246 } else {
247 let mask = ((1 as Bitmap) << self.newlines.trailing_zeros()) - 1;
248 (self.chars & mask).count_ones()
249 }
250 }
251
252 /// Get number of chars in last line
253 #[inline(always)]
254 pub fn last_line_chars(&self) -> u32 {
255 if self.newlines == 0 {
256 self.chars.count_ones()
257 } else {
258 let mask = !(Bitmap::MAX >> self.newlines.leading_zeros());
259 (self.chars & mask).count_ones()
260 }
261 }
262
263 /// Get number of UTF-16 code units in last line
264 #[inline(always)]
265 pub fn last_line_len_utf16(&self) -> u32 {
266 if self.newlines == 0 {
267 self.chars_utf16.count_ones()
268 } else {
269 let mask = !(Bitmap::MAX >> self.newlines.leading_zeros());
270 (self.chars_utf16 & mask).count_ones()
271 }
272 }
273
274 /// Get the longest row in the chunk and its length in characters.
275 /// Calculate the total number of characters in the chunk along the way.
276 #[inline(always)]
277 pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
278 let mut chars = self.chars;
279 let mut newlines = self.newlines;
280 *total_chars = 0;
281 let mut row = 0;
282 let mut longest_row = 0;
283 let mut longest_row_chars = 0;
284 while newlines > 0 {
285 let newline_ix = newlines.trailing_zeros();
286 let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
287 *total_chars += usize::from(row_chars);
288 if row_chars > longest_row_chars {
289 longest_row = row;
290 longest_row_chars = row_chars;
291 }
292
293 newlines >>= newline_ix;
294 newlines >>= 1;
295 chars >>= newline_ix;
296 chars >>= 1;
297 row += 1;
298 *total_chars += 1;
299 }
300
301 let row_chars = chars.count_ones() as u8;
302 *total_chars += usize::from(row_chars);
303 if row_chars > longest_row_chars {
304 (row, row_chars as u32)
305 } else {
306 (longest_row, longest_row_chars as u32)
307 }
308 }
309
310 #[inline(always)]
311 pub fn offset_to_point(&self, offset: usize) -> Point {
312 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
313 let row = (self.newlines & mask).count_ones();
314 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
315 let column = (offset - newline_ix as usize) as u32;
316 Point::new(row, column)
317 }
318
319 #[inline(always)]
320 pub fn point_to_offset(&self, point: Point) -> usize {
321 if point.row > self.lines().row {
322 debug_panic!(
323 "point {:?} extends beyond rows for string {:?}",
324 point,
325 self.text
326 );
327 return self.len();
328 }
329
330 let row_offset_range = self.offset_range_for_row(point.row);
331 if point.column > row_offset_range.len() as u32 {
332 debug_panic!(
333 "point {:?} extends beyond row for string {:?}",
334 point,
335 self.text
336 );
337 row_offset_range.end
338 } else {
339 row_offset_range.start + point.column as usize
340 }
341 }
342
343 #[inline(always)]
344 pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
345 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
346 OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
347 }
348
349 #[inline(always)]
350 pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
351 if target.0 == 0 {
352 0
353 } else {
354 #[cfg(not(test))]
355 let chars_utf16 = self.chars_utf16;
356 #[cfg(test)]
357 let chars_utf16 = self.chars_utf16 as u128;
358 let ix = nth_set_bit(chars_utf16, target.0) + 1;
359 if ix == MAX_BASE {
360 MAX_BASE
361 } else {
362 let utf8_additional_len = cmp::min(
363 (self.chars_utf16 >> ix).trailing_zeros() as usize,
364 self.text.len() - ix,
365 );
366 ix + utf8_additional_len
367 }
368 }
369 }
370
371 #[inline(always)]
372 pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
373 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
374 let row = (self.newlines & mask).count_ones();
375 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
376 let column = if newline_ix as usize == MAX_BASE {
377 0
378 } else {
379 ((self.chars_utf16 & mask) >> newline_ix).count_ones()
380 };
381 PointUtf16::new(row, column)
382 }
383
384 #[inline(always)]
385 pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
386 self.offset_to_point_utf16(self.point_to_offset(point))
387 }
388
389 #[inline(always)]
390 pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
391 let lines = self.lines();
392 if point.row > lines.row {
393 if !clip {
394 debug_panic!(
395 "point {:?} is beyond this chunk's extent {:?}",
396 point,
397 self.text
398 );
399 }
400 return self.len();
401 }
402
403 let row_offset_range = self.offset_range_for_row(point.row);
404 let line = self.slice(row_offset_range.clone());
405 if point.column > line.last_line_len_utf16() {
406 if !clip {
407 debug_panic!(
408 "point {:?} is beyond the end of the line in chunk {:?}",
409 point,
410 self.text
411 );
412 }
413 return line.len();
414 }
415
416 let mut offset = row_offset_range.start;
417 if point.column > 0 {
418 offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
419 if !self.text.is_char_boundary(offset) {
420 offset -= 1;
421 while !self.text.is_char_boundary(offset) {
422 offset -= 1;
423 }
424 if !clip {
425 debug_panic!(
426 "point {:?} is within character in chunk {:?}",
427 point,
428 self.text,
429 );
430 }
431 }
432 }
433 offset
434 }
435
436 #[inline(always)]
437 pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
438 let max_point = self.lines();
439 if point.0.row > max_point.row {
440 return max_point;
441 }
442
443 let row_offset_range = self.offset_range_for_row(point.0.row);
444 let line = self.slice(row_offset_range);
445 if point.0.column == 0 {
446 Point::new(point.0.row, 0)
447 } else if point.0.column >= line.len_utf16().0 as u32 {
448 Point::new(point.0.row, line.len() as u32)
449 } else {
450 let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
451 while !line.text.is_char_boundary(column) {
452 column -= 1;
453 }
454 Point::new(point.0.row, column as u32)
455 }
456 }
457
458 #[inline(always)]
459 pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
460 let max_point = self.lines();
461 if point.row > max_point.row {
462 return max_point;
463 }
464
465 let line = self.slice(self.offset_range_for_row(point.row));
466 if point.column == 0 {
467 point
468 } else if point.column >= line.len() as u32 {
469 Point::new(point.row, line.len() as u32)
470 } else {
471 let mut column = point.column as usize;
472 let bytes = line.text.as_bytes();
473 if bytes[column - 1] < 128 && bytes[column] < 128 {
474 return Point::new(point.row, column as u32);
475 }
476
477 let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
478 loop {
479 if line.is_char_boundary(column)
480 && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
481 {
482 break;
483 }
484
485 match bias {
486 Bias::Left => column -= 1,
487 Bias::Right => column += 1,
488 }
489 grapheme_cursor.set_cursor(column);
490 }
491 Point::new(point.row, column as u32)
492 }
493 }
494
495 #[inline(always)]
496 pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
497 let max_point = self.lines();
498 if point.0.row > max_point.row {
499 PointUtf16::new(max_point.row, self.last_line_len_utf16())
500 } else {
501 let line = self.slice(self.offset_range_for_row(point.0.row));
502 let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
503 PointUtf16::new(point.0.row, column.0 as u32)
504 }
505 }
506
507 #[inline(always)]
508 pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
509 if target == OffsetUtf16::default() {
510 OffsetUtf16::default()
511 } else if target >= self.len_utf16() {
512 self.len_utf16()
513 } else {
514 let mut offset = self.offset_utf16_to_offset(target);
515 while !self.text.is_char_boundary(offset) {
516 if bias == Bias::Left {
517 offset -= 1;
518 } else {
519 offset += 1;
520 }
521 }
522 self.offset_to_offset_utf16(offset)
523 }
524 }
525
526 #[inline(always)]
527 fn offset_range_for_row(&self, row: u32) -> Range<usize> {
528 let row_start = if row > 0 {
529 #[cfg(not(test))]
530 let newlines = self.newlines;
531 #[cfg(test)]
532 let newlines = self.newlines as u128;
533 nth_set_bit(newlines, row as usize) + 1
534 } else {
535 0
536 };
537 let row_len = if row_start == MAX_BASE {
538 0
539 } else {
540 cmp::min(
541 (self.newlines >> row_start).trailing_zeros(),
542 (self.text.len() - row_start) as u32,
543 )
544 };
545 row_start..row_start + row_len as usize
546 }
547
548 #[inline(always)]
549 pub fn tabs(&self) -> Tabs {
550 Tabs {
551 tabs: self.tabs,
552 chars: self.chars,
553 }
554 }
555}
556
557pub struct Tabs {
558 tabs: Bitmap,
559 chars: Bitmap,
560}
561
562#[derive(Debug, PartialEq, Eq)]
563pub struct TabPosition {
564 pub byte_offset: usize,
565 pub char_offset: usize,
566}
567
568impl Iterator for Tabs {
569 type Item = TabPosition;
570
571 fn next(&mut self) -> Option<Self::Item> {
572 if self.tabs == 0 {
573 return None;
574 }
575
576 let tab_offset = self.tabs.trailing_zeros() as usize;
577 let chars_mask = (1 << tab_offset) - 1;
578 let char_offset = (self.chars & chars_mask).count_ones() as usize;
579
580 // Since tabs are 1 byte the tab offset is the same as the byte offset
581 let position = TabPosition {
582 byte_offset: tab_offset,
583 char_offset,
584 };
585 // Remove the tab we've just seen
586 self.tabs ^= 1 << tab_offset;
587
588 Some(position)
589 }
590}
591
592/// Finds the n-th bit that is set to 1.
593#[inline(always)]
594fn nth_set_bit(v: u128, n: usize) -> usize {
595 let low = v as u64;
596 let high = (v >> 64) as u64;
597
598 let low_count = low.count_ones() as usize;
599 if n > low_count {
600 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
601 } else {
602 nth_set_bit_u64(low, n as u64) as usize
603 }
604}
605
606#[inline(always)]
607fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
608 let v = v.reverse_bits();
609 let mut s: u64 = 64;
610
611 // Parallel bit count intermediates
612 let a = v - ((v >> 1) & (u64::MAX / 3));
613 let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
614 let c = (b + (b >> 4)) & (u64::MAX / 0x11);
615 let d = (c + (c >> 8)) & (u64::MAX / 0x101);
616
617 // Branchless select
618 let t = (d >> 32) + (d >> 48);
619 s -= (t.wrapping_sub(n) & 256) >> 3;
620 n -= t & (t.wrapping_sub(n) >> 8);
621
622 let t = (d >> (s - 16)) & 0xff;
623 s -= (t.wrapping_sub(n) & 256) >> 4;
624 n -= t & (t.wrapping_sub(n) >> 8);
625
626 let t = (c >> (s - 8)) & 0xf;
627 s -= (t.wrapping_sub(n) & 256) >> 5;
628 n -= t & (t.wrapping_sub(n) >> 8);
629
630 let t = (b >> (s - 4)) & 0x7;
631 s -= (t.wrapping_sub(n) & 256) >> 6;
632 n -= t & (t.wrapping_sub(n) >> 8);
633
634 let t = (a >> (s - 2)) & 0x3;
635 s -= (t.wrapping_sub(n) & 256) >> 7;
636 n -= t & (t.wrapping_sub(n) >> 8);
637
638 let t = (v >> (s - 1)) & 0x1;
639 s -= (t.wrapping_sub(n) & 256) >> 8;
640
641 65 - s - 1
642}
643
644#[cfg(test)]
645mod tests {
646 use super::*;
647 use rand::prelude::*;
648 use util::RandomCharIter;
649
650 #[gpui::test(iterations = 100)]
651 fn test_random_chunks(mut rng: StdRng) {
652 let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
653 log::info!("Chunk: {:?}", text);
654 let chunk = Chunk::new(&text);
655 verify_chunk(chunk.as_slice(), &text);
656
657 // Verify Chunk::chars() bitmap
658 let expected_chars = char_offsets(&text)
659 .into_iter()
660 .inspect(|i| assert!(*i < MAX_BASE))
661 .fold(0 as Bitmap, |acc, i| acc | (1 << i));
662 assert_eq!(chunk.chars(), expected_chars);
663
664 for _ in 0..10 {
665 let mut start = rng.random_range(0..=chunk.text.len());
666 let mut end = rng.random_range(start..=chunk.text.len());
667 while !chunk.text.is_char_boundary(start) {
668 start -= 1;
669 }
670 while !chunk.text.is_char_boundary(end) {
671 end -= 1;
672 }
673 let range = start..end;
674 log::info!("Range: {:?}", range);
675 let text_slice = &text[range.clone()];
676 let chunk_slice = chunk.slice(range);
677 verify_chunk(chunk_slice, text_slice);
678 }
679 }
680
681 #[gpui::test(iterations = 100)]
682 fn test_split_chunk_slice(mut rng: StdRng) {
683 let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
684 let chunk = Chunk::new(text);
685 let offset = char_offsets_with_end(text)
686 .into_iter()
687 .choose(&mut rng)
688 .unwrap();
689 let (a, b) = chunk.as_slice().split_at(offset);
690 let (a_str, b_str) = text.split_at(offset);
691 verify_chunk(a, a_str);
692 verify_chunk(b, b_str);
693 }
694
695 #[gpui::test(iterations = 1000)]
696 fn test_nth_set_bit_random(mut rng: StdRng) {
697 let set_count = rng.random_range(0..=128);
698 let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
699 set_bits.sort();
700 let mut n = 0;
701 for ix in set_bits.iter().copied() {
702 n |= 1 << ix;
703 }
704
705 for (mut ix, position) in set_bits.into_iter().enumerate() {
706 ix += 1;
707 assert_eq!(
708 nth_set_bit(n, ix),
709 position,
710 "nth_set_bit({:0128b}, {})",
711 n,
712 ix
713 );
714 }
715 }
716
717 /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
718 fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
719 let mut str = String::new();
720 let mut chars = RandomCharIter::new(rng);
721 loop {
722 let ch = chars.next().unwrap();
723 if str.len() + ch.len_utf8() > len {
724 break;
725 }
726 str.push(ch);
727 }
728 str
729 }
730
731 #[gpui::test(iterations = 1000)]
732 fn test_append_random_strings(mut rng: StdRng) {
733 let len1 = rng.random_range(0..=MAX_BASE);
734 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
735 let str1 = random_string_with_utf8_len(&mut rng, len1);
736 let str2 = random_string_with_utf8_len(&mut rng, len2);
737 let mut chunk1 = Chunk::new(&str1);
738 let chunk2 = Chunk::new(&str2);
739 let char_offsets = char_offsets_with_end(&str2);
740 let start_index = rng.random_range(0..char_offsets.len());
741 let start_offset = char_offsets[start_index];
742 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
743 chunk1.append(chunk2.slice(start_offset..end_offset));
744 verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
745 }
746
747 /// Return the byte offsets for each character in a string.
748 ///
749 /// These are valid offsets to split the string.
750 fn char_offsets(text: &str) -> Vec<usize> {
751 text.char_indices().map(|(i, _c)| i).collect()
752 }
753
754 /// Return the byte offsets for each character in a string, plus the offset
755 /// past the end of the string.
756 fn char_offsets_with_end(text: &str) -> Vec<usize> {
757 let mut v = char_offsets(text);
758 v.push(text.len());
759 v
760 }
761
762 fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
763 let mut offset = 0;
764 let mut offset_utf16 = OffsetUtf16(0);
765 let mut point = Point::zero();
766 let mut point_utf16 = PointUtf16::zero();
767
768 log::info!("Verifying chunk {:?}", text);
769 assert_eq!(chunk.offset_to_point(0), Point::zero());
770
771 let mut expected_tab_positions = Vec::new();
772
773 for (char_offset, c) in text.chars().enumerate() {
774 let expected_point = chunk.offset_to_point(offset);
775 assert_eq!(point, expected_point, "mismatch at offset {}", offset);
776 assert_eq!(
777 chunk.point_to_offset(point),
778 offset,
779 "mismatch at point {:?}",
780 point
781 );
782 assert_eq!(
783 chunk.offset_to_offset_utf16(offset),
784 offset_utf16,
785 "mismatch at offset {}",
786 offset
787 );
788 assert_eq!(
789 chunk.offset_utf16_to_offset(offset_utf16),
790 offset,
791 "mismatch at offset_utf16 {:?}",
792 offset_utf16
793 );
794 assert_eq!(
795 chunk.point_to_point_utf16(point),
796 point_utf16,
797 "mismatch at point {:?}",
798 point
799 );
800 assert_eq!(
801 chunk.point_utf16_to_offset(point_utf16, false),
802 offset,
803 "mismatch at point_utf16 {:?}",
804 point_utf16
805 );
806 assert_eq!(
807 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
808 point,
809 "mismatch for unclipped_point_utf16_to_point at {:?}",
810 point_utf16
811 );
812
813 assert_eq!(
814 chunk.clip_point(point, Bias::Left),
815 point,
816 "incorrect left clip at {:?}",
817 point
818 );
819 assert_eq!(
820 chunk.clip_point(point, Bias::Right),
821 point,
822 "incorrect right clip at {:?}",
823 point
824 );
825
826 for i in 1..c.len_utf8() {
827 let test_point = Point::new(point.row, point.column + i as u32);
828 assert_eq!(
829 chunk.clip_point(test_point, Bias::Left),
830 point,
831 "incorrect left clip within multi-byte char at {:?}",
832 test_point
833 );
834 assert_eq!(
835 chunk.clip_point(test_point, Bias::Right),
836 Point::new(point.row, point.column + c.len_utf8() as u32),
837 "incorrect right clip within multi-byte char at {:?}",
838 test_point
839 );
840 }
841
842 for i in 1..c.len_utf16() {
843 let test_point = Unclipped(PointUtf16::new(
844 point_utf16.row,
845 point_utf16.column + i as u32,
846 ));
847 assert_eq!(
848 chunk.unclipped_point_utf16_to_point(test_point),
849 point,
850 "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
851 test_point
852 );
853 assert_eq!(
854 chunk.clip_point_utf16(test_point, Bias::Left),
855 point_utf16,
856 "incorrect left clip_point_utf16 within multi-byte char at {:?}",
857 test_point
858 );
859 assert_eq!(
860 chunk.clip_point_utf16(test_point, Bias::Right),
861 PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
862 "incorrect right clip_point_utf16 within multi-byte char at {:?}",
863 test_point
864 );
865
866 let test_offset = OffsetUtf16(offset_utf16.0 + i);
867 assert_eq!(
868 chunk.clip_offset_utf16(test_offset, Bias::Left),
869 offset_utf16,
870 "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
871 test_offset
872 );
873 assert_eq!(
874 chunk.clip_offset_utf16(test_offset, Bias::Right),
875 OffsetUtf16(offset_utf16.0 + c.len_utf16()),
876 "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
877 test_offset
878 );
879 }
880
881 if c == '\n' {
882 point.row += 1;
883 point.column = 0;
884 point_utf16.row += 1;
885 point_utf16.column = 0;
886 } else {
887 point.column += c.len_utf8() as u32;
888 point_utf16.column += c.len_utf16() as u32;
889 }
890
891 if c == '\t' {
892 expected_tab_positions.push(TabPosition {
893 byte_offset: offset,
894 char_offset,
895 });
896 }
897
898 offset += c.len_utf8();
899 offset_utf16.0 += c.len_utf16();
900 }
901
902 let final_point = chunk.offset_to_point(offset);
903 assert_eq!(point, final_point, "mismatch at final offset {}", offset);
904 assert_eq!(
905 chunk.point_to_offset(point),
906 offset,
907 "mismatch at point {:?}",
908 point
909 );
910 assert_eq!(
911 chunk.offset_to_offset_utf16(offset),
912 offset_utf16,
913 "mismatch at offset {}",
914 offset
915 );
916 assert_eq!(
917 chunk.offset_utf16_to_offset(offset_utf16),
918 offset,
919 "mismatch at offset_utf16 {:?}",
920 offset_utf16
921 );
922 assert_eq!(
923 chunk.point_to_point_utf16(point),
924 point_utf16,
925 "mismatch at final point {:?}",
926 point
927 );
928 assert_eq!(
929 chunk.point_utf16_to_offset(point_utf16, false),
930 offset,
931 "mismatch at final point_utf16 {:?}",
932 point_utf16
933 );
934 assert_eq!(
935 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
936 point,
937 "mismatch for unclipped_point_utf16_to_point at final point {:?}",
938 point_utf16
939 );
940 assert_eq!(
941 chunk.clip_point(point, Bias::Left),
942 point,
943 "incorrect left clip at final point {:?}",
944 point
945 );
946 assert_eq!(
947 chunk.clip_point(point, Bias::Right),
948 point,
949 "incorrect right clip at final point {:?}",
950 point
951 );
952 assert_eq!(
953 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
954 point_utf16,
955 "incorrect left clip_point_utf16 at final point {:?}",
956 point_utf16
957 );
958 assert_eq!(
959 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
960 point_utf16,
961 "incorrect right clip_point_utf16 at final point {:?}",
962 point_utf16
963 );
964 assert_eq!(
965 chunk.clip_offset_utf16(offset_utf16, Bias::Left),
966 offset_utf16,
967 "incorrect left clip_offset_utf16 at final offset {:?}",
968 offset_utf16
969 );
970 assert_eq!(
971 chunk.clip_offset_utf16(offset_utf16, Bias::Right),
972 offset_utf16,
973 "incorrect right clip_offset_utf16 at final offset {:?}",
974 offset_utf16
975 );
976
977 // Verify length methods
978 assert_eq!(chunk.len(), text.len());
979 assert_eq!(
980 chunk.len_utf16().0,
981 text.chars().map(|c| c.len_utf16()).sum::<usize>()
982 );
983
984 // Verify line counting
985 let lines = chunk.lines();
986 let mut newline_count = 0;
987 let mut last_line_len = 0;
988 for c in text.chars() {
989 if c == '\n' {
990 newline_count += 1;
991 last_line_len = 0;
992 } else {
993 last_line_len += c.len_utf8() as u32;
994 }
995 }
996 assert_eq!(lines, Point::new(newline_count, last_line_len));
997
998 // Verify first/last line chars
999 if !text.is_empty() {
1000 let first_line = text.split('\n').next().unwrap();
1001 assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
1002
1003 let last_line = text.split('\n').next_back().unwrap();
1004 assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1005 assert_eq!(
1006 chunk.last_line_len_utf16(),
1007 last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1008 );
1009 }
1010
1011 // Verify longest row
1012 let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1013 let mut max_chars = 0;
1014 let mut current_row = 0;
1015 let mut current_chars = 0;
1016 let mut max_row = 0;
1017
1018 for c in text.chars() {
1019 if c == '\n' {
1020 if current_chars > max_chars {
1021 max_chars = current_chars;
1022 max_row = current_row;
1023 }
1024 current_row += 1;
1025 current_chars = 0;
1026 } else {
1027 current_chars += 1;
1028 }
1029 }
1030
1031 if current_chars > max_chars {
1032 max_chars = current_chars;
1033 max_row = current_row;
1034 }
1035
1036 assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1037 assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1038 }
1039}