1use crate::{OffsetUtf16, Point, PointUtf16, TextSummary, Unclipped};
2use arrayvec::ArrayString;
3use std::{cmp, ops::Range};
4use sum_tree::Bias;
5use unicode_segmentation::GraphemeCursor;
6use util::debug_panic;
7
8#[cfg(not(all(test, not(rust_analyzer))))]
9pub(crate) type Bitmap = u128;
10#[cfg(all(test, not(rust_analyzer)))]
11pub(crate) type Bitmap = u16;
12
13pub(crate) const MIN_BASE: usize = MAX_BASE / 2;
14pub(crate) const MAX_BASE: usize = Bitmap::BITS as usize;
15
16#[derive(Clone, Debug, Default)]
17pub struct Chunk {
18 /// If bit[i] is set, then the character at index i is the start of a UTF-8 character in the
19 /// text.
20 chars: Bitmap,
21 /// The number of set bits is the number of UTF-16 code units it would take to represent the
22 /// text.
23 ///
24 /// Bit[i] is set if text[i] is the start of a UTF-8 character. If the character would
25 /// take two UTF-16 code units, then bit[i+1] is also set. (Rust chars never take more
26 /// than two UTF-16 code units.)
27 chars_utf16: Bitmap,
28 /// If bit[i] is set, then the character at index i is an ascii newline.
29 newlines: Bitmap,
30 /// If bit[i] is set, then the character at index i is an ascii tab.
31 tabs: Bitmap,
32 pub text: ArrayString<MAX_BASE>,
33}
34
35impl Chunk {
36 pub const MASK_BITS: usize = Bitmap::BITS as usize;
37
38 #[inline(always)]
39 pub fn new(text: &str) -> Self {
40 let mut this = Chunk::default();
41 this.push_str(text);
42 this
43 }
44
45 #[inline(always)]
46 pub fn push_str(&mut self, text: &str) {
47 for (char_ix, c) in text.char_indices() {
48 let ix = self.text.len() + char_ix;
49 self.chars |= 1 << ix;
50 self.chars_utf16 |= 1 << ix;
51 self.chars_utf16 |= (c.len_utf16() as Bitmap) << ix;
52 self.newlines |= ((c == '\n') as Bitmap) << ix;
53 self.tabs |= ((c == '\t') as Bitmap) << ix;
54 }
55 self.text.push_str(text);
56 }
57
58 #[inline(always)]
59 pub fn append(&mut self, slice: ChunkSlice) {
60 if slice.is_empty() {
61 return;
62 };
63
64 let base_ix = self.text.len();
65 self.chars |= slice.chars << base_ix;
66 self.chars_utf16 |= slice.chars_utf16 << base_ix;
67 self.newlines |= slice.newlines << base_ix;
68 self.tabs |= slice.tabs << base_ix;
69 self.text.push_str(slice.text);
70 }
71
72 #[inline(always)]
73 pub fn as_slice(&self) -> ChunkSlice<'_> {
74 ChunkSlice {
75 chars: self.chars,
76 chars_utf16: self.chars_utf16,
77 newlines: self.newlines,
78 tabs: self.tabs,
79 text: &self.text,
80 }
81 }
82
83 #[inline(always)]
84 pub fn slice(&self, range: Range<usize>) -> ChunkSlice<'_> {
85 self.as_slice().slice(range)
86 }
87
88 #[inline(always)]
89 pub fn chars(&self) -> Bitmap {
90 self.chars
91 }
92
93 pub fn tabs(&self) -> Bitmap {
94 self.tabs
95 }
96}
97
98#[derive(Clone, Copy, Debug)]
99pub struct ChunkSlice<'a> {
100 chars: Bitmap,
101 chars_utf16: Bitmap,
102 newlines: Bitmap,
103 tabs: Bitmap,
104 text: &'a str,
105}
106
107impl Into<Chunk> for ChunkSlice<'_> {
108 fn into(self) -> Chunk {
109 Chunk {
110 chars: self.chars,
111 chars_utf16: self.chars_utf16,
112 newlines: self.newlines,
113 tabs: self.tabs,
114 text: self.text.try_into().unwrap(),
115 }
116 }
117}
118
119impl<'a> ChunkSlice<'a> {
120 #[inline(always)]
121 pub fn is_empty(&self) -> bool {
122 self.text.is_empty()
123 }
124
125 #[inline(always)]
126 pub fn is_char_boundary(self, offset: usize) -> bool {
127 self.text.is_char_boundary(offset)
128 }
129
130 #[inline(always)]
131 pub fn split_at(self, mid: usize) -> (ChunkSlice<'a>, ChunkSlice<'a>) {
132 if mid == MAX_BASE {
133 let left = self;
134 let right = ChunkSlice {
135 chars: 0,
136 chars_utf16: 0,
137 newlines: 0,
138 tabs: 0,
139 text: "",
140 };
141 (left, right)
142 } else {
143 let mask = ((1 as Bitmap) << mid) - 1;
144 let (left_text, right_text) = self.text.split_at(mid);
145 let left = ChunkSlice {
146 chars: self.chars & mask,
147 chars_utf16: self.chars_utf16 & mask,
148 newlines: self.newlines & mask,
149 tabs: self.tabs & mask,
150 text: left_text,
151 };
152 let right = ChunkSlice {
153 chars: self.chars >> mid,
154 chars_utf16: self.chars_utf16 >> mid,
155 newlines: self.newlines >> mid,
156 tabs: self.tabs >> mid,
157 text: right_text,
158 };
159 (left, right)
160 }
161 }
162
163 #[inline(always)]
164 pub fn slice(self, range: Range<usize>) -> Self {
165 debug_assert!(
166 self.is_char_boundary(range.end),
167 "Invalid range end {} in {:?}",
168 range.end,
169 self
170 );
171 let mask = (1 as Bitmap)
172 .unbounded_shl(range.end as u32)
173 .wrapping_sub(1);
174 if range.start == MAX_BASE {
175 Self {
176 chars: 0,
177 chars_utf16: 0,
178 newlines: 0,
179 tabs: 0,
180 text: "",
181 }
182 } else {
183 debug_assert!(
184 self.is_char_boundary(range.start),
185 "Invalid range start {} in {:?}",
186 range.start,
187 self
188 );
189 Self {
190 chars: (self.chars & mask) >> range.start,
191 chars_utf16: (self.chars_utf16 & mask) >> range.start,
192 newlines: (self.newlines & mask) >> range.start,
193 tabs: (self.tabs & mask) >> range.start,
194 text: &self.text[range],
195 }
196 }
197 }
198
199 #[inline(always)]
200 pub fn text_summary(&self) -> TextSummary {
201 let mut chars = 0;
202 let (longest_row, longest_row_chars) = self.longest_row(&mut chars);
203 TextSummary {
204 len: self.len(),
205 chars,
206 len_utf16: self.len_utf16(),
207 lines: self.lines(),
208 first_line_chars: self.first_line_chars(),
209 last_line_chars: self.last_line_chars(),
210 last_line_len_utf16: self.last_line_len_utf16(),
211 longest_row,
212 longest_row_chars,
213 }
214 }
215
216 /// Get length in bytes
217 #[inline(always)]
218 pub fn len(&self) -> usize {
219 self.text.len()
220 }
221
222 /// Get length in UTF-16 code units
223 #[inline(always)]
224 pub fn len_utf16(&self) -> OffsetUtf16 {
225 OffsetUtf16(self.chars_utf16.count_ones() as usize)
226 }
227
228 /// Get point representing number of lines and length of last line
229 #[inline(always)]
230 pub fn lines(&self) -> Point {
231 let row = self.newlines.count_ones();
232 let column = self.newlines.leading_zeros() - (Bitmap::BITS - self.text.len() as u32);
233 Point::new(row, column)
234 }
235
236 /// Get number of chars in first line
237 #[inline(always)]
238 pub fn first_line_chars(&self) -> u32 {
239 if self.newlines == 0 {
240 self.chars.count_ones()
241 } else {
242 let mask = ((1 as Bitmap) << self.newlines.trailing_zeros()) - 1;
243 (self.chars & mask).count_ones()
244 }
245 }
246
247 /// Get number of chars in last line
248 #[inline(always)]
249 pub fn last_line_chars(&self) -> u32 {
250 if self.newlines == 0 {
251 self.chars.count_ones()
252 } else {
253 let mask = !(Bitmap::MAX >> self.newlines.leading_zeros());
254 (self.chars & mask).count_ones()
255 }
256 }
257
258 /// Get number of UTF-16 code units in last line
259 #[inline(always)]
260 pub fn last_line_len_utf16(&self) -> u32 {
261 if self.newlines == 0 {
262 self.chars_utf16.count_ones()
263 } else {
264 let mask = !(Bitmap::MAX >> self.newlines.leading_zeros());
265 (self.chars_utf16 & mask).count_ones()
266 }
267 }
268
269 /// Get the longest row in the chunk and its length in characters.
270 /// Calculate the total number of characters in the chunk along the way.
271 #[inline(always)]
272 pub fn longest_row(&self, total_chars: &mut usize) -> (u32, u32) {
273 let mut chars = self.chars;
274 let mut newlines = self.newlines;
275 *total_chars = 0;
276 let mut row = 0;
277 let mut longest_row = 0;
278 let mut longest_row_chars = 0;
279 while newlines > 0 {
280 let newline_ix = newlines.trailing_zeros();
281 let row_chars = (chars & ((1 << newline_ix) - 1)).count_ones() as u8;
282 *total_chars += usize::from(row_chars);
283 if row_chars > longest_row_chars {
284 longest_row = row;
285 longest_row_chars = row_chars;
286 }
287
288 newlines >>= newline_ix;
289 newlines >>= 1;
290 chars >>= newline_ix;
291 chars >>= 1;
292 row += 1;
293 *total_chars += 1;
294 }
295
296 let row_chars = chars.count_ones() as u8;
297 *total_chars += usize::from(row_chars);
298 if row_chars > longest_row_chars {
299 (row, row_chars as u32)
300 } else {
301 (longest_row, longest_row_chars as u32)
302 }
303 }
304
305 #[inline(always)]
306 pub fn offset_to_point(&self, offset: usize) -> Point {
307 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
308 let row = (self.newlines & mask).count_ones();
309 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
310 let column = (offset - newline_ix as usize) as u32;
311 Point::new(row, column)
312 }
313
314 #[inline(always)]
315 pub fn point_to_offset(&self, point: Point) -> usize {
316 if point.row > self.lines().row {
317 debug_panic!(
318 "point {:?} extends beyond rows for string {:?}",
319 point,
320 self.text
321 );
322 return self.len();
323 }
324
325 let row_offset_range = self.offset_range_for_row(point.row);
326 if point.column > row_offset_range.len() as u32 {
327 debug_panic!(
328 "point {:?} extends beyond row for string {:?}",
329 point,
330 self.text
331 );
332 row_offset_range.end
333 } else {
334 row_offset_range.start + point.column as usize
335 }
336 }
337
338 #[inline(always)]
339 pub fn offset_to_offset_utf16(&self, offset: usize) -> OffsetUtf16 {
340 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
341 OffsetUtf16((self.chars_utf16 & mask).count_ones() as usize)
342 }
343
344 #[inline(always)]
345 pub fn offset_utf16_to_offset(&self, target: OffsetUtf16) -> usize {
346 if target.0 == 0 {
347 0
348 } else {
349 #[cfg(not(test))]
350 let chars_utf16 = self.chars_utf16;
351 #[cfg(test)]
352 let chars_utf16 = self.chars_utf16 as u128;
353 let ix = nth_set_bit(chars_utf16, target.0) + 1;
354 if ix == MAX_BASE {
355 MAX_BASE
356 } else {
357 let utf8_additional_len = cmp::min(
358 (self.chars_utf16 >> ix).trailing_zeros() as usize,
359 self.text.len() - ix,
360 );
361 ix + utf8_additional_len
362 }
363 }
364 }
365
366 #[inline(always)]
367 pub fn offset_to_point_utf16(&self, offset: usize) -> PointUtf16 {
368 let mask = (1 as Bitmap).unbounded_shl(offset as u32).wrapping_sub(1);
369 let row = (self.newlines & mask).count_ones();
370 let newline_ix = Bitmap::BITS - (self.newlines & mask).leading_zeros();
371 let column = if newline_ix as usize == MAX_BASE {
372 0
373 } else {
374 ((self.chars_utf16 & mask) >> newline_ix).count_ones()
375 };
376 PointUtf16::new(row, column)
377 }
378
379 #[inline(always)]
380 pub fn point_to_point_utf16(&self, point: Point) -> PointUtf16 {
381 self.offset_to_point_utf16(self.point_to_offset(point))
382 }
383
384 #[inline(always)]
385 pub fn point_utf16_to_offset(&self, point: PointUtf16, clip: bool) -> usize {
386 let lines = self.lines();
387 if point.row > lines.row {
388 if !clip {
389 debug_panic!(
390 "point {:?} is beyond this chunk's extent {:?}",
391 point,
392 self.text
393 );
394 }
395 return self.len();
396 }
397
398 let row_offset_range = self.offset_range_for_row(point.row);
399 let line = self.slice(row_offset_range.clone());
400 if point.column > line.last_line_len_utf16() {
401 if !clip {
402 debug_panic!(
403 "point {:?} is beyond the end of the line in chunk {:?}",
404 point,
405 self.text
406 );
407 }
408 return line.len();
409 }
410
411 let mut offset = row_offset_range.start;
412 if point.column > 0 {
413 offset += line.offset_utf16_to_offset(OffsetUtf16(point.column as usize));
414 if !self.text.is_char_boundary(offset) {
415 offset -= 1;
416 while !self.text.is_char_boundary(offset) {
417 offset -= 1;
418 }
419 if !clip {
420 debug_panic!(
421 "point {:?} is within character in chunk {:?}",
422 point,
423 self.text,
424 );
425 }
426 }
427 }
428 offset
429 }
430
431 #[inline(always)]
432 pub fn unclipped_point_utf16_to_point(&self, point: Unclipped<PointUtf16>) -> Point {
433 let max_point = self.lines();
434 if point.0.row > max_point.row {
435 return max_point;
436 }
437
438 let row_offset_range = self.offset_range_for_row(point.0.row);
439 let line = self.slice(row_offset_range);
440 if point.0.column == 0 {
441 Point::new(point.0.row, 0)
442 } else if point.0.column >= line.len_utf16().0 as u32 {
443 Point::new(point.0.row, line.len() as u32)
444 } else {
445 let mut column = line.offset_utf16_to_offset(OffsetUtf16(point.0.column as usize));
446 while !line.text.is_char_boundary(column) {
447 column -= 1;
448 }
449 Point::new(point.0.row, column as u32)
450 }
451 }
452
453 #[inline(always)]
454 pub fn clip_point(&self, point: Point, bias: Bias) -> Point {
455 let max_point = self.lines();
456 if point.row > max_point.row {
457 return max_point;
458 }
459
460 let line = self.slice(self.offset_range_for_row(point.row));
461 if point.column == 0 {
462 point
463 } else if point.column >= line.len() as u32 {
464 Point::new(point.row, line.len() as u32)
465 } else {
466 let mut column = point.column as usize;
467 let bytes = line.text.as_bytes();
468 if bytes[column - 1] < 128 && bytes[column] < 128 {
469 return Point::new(point.row, column as u32);
470 }
471
472 let mut grapheme_cursor = GraphemeCursor::new(column, bytes.len(), true);
473 loop {
474 if line.is_char_boundary(column)
475 && grapheme_cursor.is_boundary(line.text, 0).unwrap_or(false)
476 {
477 break;
478 }
479
480 match bias {
481 Bias::Left => column -= 1,
482 Bias::Right => column += 1,
483 }
484 grapheme_cursor.set_cursor(column);
485 }
486 Point::new(point.row, column as u32)
487 }
488 }
489
490 #[inline(always)]
491 pub fn clip_point_utf16(&self, point: Unclipped<PointUtf16>, bias: Bias) -> PointUtf16 {
492 let max_point = self.lines();
493 if point.0.row > max_point.row {
494 PointUtf16::new(max_point.row, self.last_line_len_utf16())
495 } else {
496 let line = self.slice(self.offset_range_for_row(point.0.row));
497 let column = line.clip_offset_utf16(OffsetUtf16(point.0.column as usize), bias);
498 PointUtf16::new(point.0.row, column.0 as u32)
499 }
500 }
501
502 #[inline(always)]
503 pub fn clip_offset_utf16(&self, target: OffsetUtf16, bias: Bias) -> OffsetUtf16 {
504 if target == OffsetUtf16::default() {
505 OffsetUtf16::default()
506 } else if target >= self.len_utf16() {
507 self.len_utf16()
508 } else {
509 let mut offset = self.offset_utf16_to_offset(target);
510 while !self.text.is_char_boundary(offset) {
511 if bias == Bias::Left {
512 offset -= 1;
513 } else {
514 offset += 1;
515 }
516 }
517 self.offset_to_offset_utf16(offset)
518 }
519 }
520
521 #[inline(always)]
522 fn offset_range_for_row(&self, row: u32) -> Range<usize> {
523 let row_start = if row > 0 {
524 #[cfg(not(test))]
525 let newlines = self.newlines;
526 #[cfg(test)]
527 let newlines = self.newlines as u128;
528 nth_set_bit(newlines, row as usize) + 1
529 } else {
530 0
531 };
532 let row_len = if row_start == MAX_BASE {
533 0
534 } else {
535 cmp::min(
536 (self.newlines >> row_start).trailing_zeros(),
537 (self.text.len() - row_start) as u32,
538 )
539 };
540 row_start..row_start + row_len as usize
541 }
542
543 #[inline(always)]
544 pub fn tabs(&self) -> Tabs {
545 Tabs {
546 tabs: self.tabs,
547 chars: self.chars,
548 }
549 }
550}
551
552pub struct Tabs {
553 tabs: Bitmap,
554 chars: Bitmap,
555}
556
557#[derive(Debug, PartialEq, Eq)]
558pub struct TabPosition {
559 pub byte_offset: usize,
560 pub char_offset: usize,
561}
562
563impl Iterator for Tabs {
564 type Item = TabPosition;
565
566 fn next(&mut self) -> Option<Self::Item> {
567 if self.tabs == 0 {
568 return None;
569 }
570
571 let tab_offset = self.tabs.trailing_zeros() as usize;
572 let chars_mask = (1 << tab_offset) - 1;
573 let char_offset = (self.chars & chars_mask).count_ones() as usize;
574
575 // Since tabs are 1 byte the tab offset is the same as the byte offset
576 let position = TabPosition {
577 byte_offset: tab_offset,
578 char_offset,
579 };
580 // Remove the tab we've just seen
581 self.tabs ^= 1 << tab_offset;
582
583 Some(position)
584 }
585}
586
587/// Finds the n-th bit that is set to 1.
588#[inline(always)]
589fn nth_set_bit(v: u128, n: usize) -> usize {
590 let low = v as u64;
591 let high = (v >> 64) as u64;
592
593 let low_count = low.count_ones() as usize;
594 if n > low_count {
595 64 + nth_set_bit_u64(high, (n - low_count) as u64) as usize
596 } else {
597 nth_set_bit_u64(low, n as u64) as usize
598 }
599}
600
601#[inline(always)]
602fn nth_set_bit_u64(v: u64, mut n: u64) -> u64 {
603 let v = v.reverse_bits();
604 let mut s: u64 = 64;
605
606 // Parallel bit count intermediates
607 let a = v - ((v >> 1) & (u64::MAX / 3));
608 let b = (a & (u64::MAX / 5)) + ((a >> 2) & (u64::MAX / 5));
609 let c = (b + (b >> 4)) & (u64::MAX / 0x11);
610 let d = (c + (c >> 8)) & (u64::MAX / 0x101);
611
612 // Branchless select
613 let t = (d >> 32) + (d >> 48);
614 s -= (t.wrapping_sub(n) & 256) >> 3;
615 n -= t & (t.wrapping_sub(n) >> 8);
616
617 let t = (d >> (s - 16)) & 0xff;
618 s -= (t.wrapping_sub(n) & 256) >> 4;
619 n -= t & (t.wrapping_sub(n) >> 8);
620
621 let t = (c >> (s - 8)) & 0xf;
622 s -= (t.wrapping_sub(n) & 256) >> 5;
623 n -= t & (t.wrapping_sub(n) >> 8);
624
625 let t = (b >> (s - 4)) & 0x7;
626 s -= (t.wrapping_sub(n) & 256) >> 6;
627 n -= t & (t.wrapping_sub(n) >> 8);
628
629 let t = (a >> (s - 2)) & 0x3;
630 s -= (t.wrapping_sub(n) & 256) >> 7;
631 n -= t & (t.wrapping_sub(n) >> 8);
632
633 let t = (v >> (s - 1)) & 0x1;
634 s -= (t.wrapping_sub(n) & 256) >> 8;
635
636 65 - s - 1
637}
638
639#[cfg(test)]
640mod tests {
641 use super::*;
642 use rand::prelude::*;
643 use util::RandomCharIter;
644
645 #[gpui::test(iterations = 100)]
646 fn test_random_chunks(mut rng: StdRng) {
647 let text = random_string_with_utf8_len(&mut rng, MAX_BASE);
648 log::info!("Chunk: {:?}", text);
649 let chunk = Chunk::new(&text);
650 verify_chunk(chunk.as_slice(), &text);
651
652 // Verify Chunk::chars() bitmap
653 let expected_chars = char_offsets(&text)
654 .into_iter()
655 .inspect(|i| assert!(*i < MAX_BASE))
656 .fold(0 as Bitmap, |acc, i| acc | (1 << i));
657 assert_eq!(chunk.chars(), expected_chars);
658
659 for _ in 0..10 {
660 let mut start = rng.random_range(0..=chunk.text.len());
661 let mut end = rng.random_range(start..=chunk.text.len());
662 while !chunk.text.is_char_boundary(start) {
663 start -= 1;
664 }
665 while !chunk.text.is_char_boundary(end) {
666 end -= 1;
667 }
668 let range = start..end;
669 log::info!("Range: {:?}", range);
670 let text_slice = &text[range.clone()];
671 let chunk_slice = chunk.slice(range);
672 verify_chunk(chunk_slice, text_slice);
673 }
674 }
675
676 #[gpui::test(iterations = 100)]
677 fn test_split_chunk_slice(mut rng: StdRng) {
678 let text = &random_string_with_utf8_len(&mut rng, MAX_BASE);
679 let chunk = Chunk::new(text);
680 let offset = char_offsets_with_end(text)
681 .into_iter()
682 .choose(&mut rng)
683 .unwrap();
684 let (a, b) = chunk.as_slice().split_at(offset);
685 let (a_str, b_str) = text.split_at(offset);
686 verify_chunk(a, a_str);
687 verify_chunk(b, b_str);
688 }
689
690 #[gpui::test(iterations = 1000)]
691 fn test_nth_set_bit_random(mut rng: StdRng) {
692 let set_count = rng.random_range(0..=128);
693 let mut set_bits = (0..128).choose_multiple(&mut rng, set_count);
694 set_bits.sort();
695 let mut n = 0;
696 for ix in set_bits.iter().copied() {
697 n |= 1 << ix;
698 }
699
700 for (mut ix, position) in set_bits.into_iter().enumerate() {
701 ix += 1;
702 assert_eq!(
703 nth_set_bit(n, ix),
704 position,
705 "nth_set_bit({:0128b}, {})",
706 n,
707 ix
708 );
709 }
710 }
711
712 /// Returns a (biased) random string whose UTF-8 length is no more than `len`.
713 fn random_string_with_utf8_len(rng: &mut StdRng, len: usize) -> String {
714 let mut str = String::new();
715 let mut chars = RandomCharIter::new(rng);
716 loop {
717 let ch = chars.next().unwrap();
718 if str.len() + ch.len_utf8() > len {
719 break;
720 }
721 str.push(ch);
722 }
723 str
724 }
725
726 #[gpui::test(iterations = 1000)]
727 fn test_append_random_strings(mut rng: StdRng) {
728 let len1 = rng.random_range(0..=MAX_BASE);
729 let len2 = rng.random_range(0..=MAX_BASE).saturating_sub(len1);
730 let str1 = random_string_with_utf8_len(&mut rng, len1);
731 let str2 = random_string_with_utf8_len(&mut rng, len2);
732 let mut chunk1 = Chunk::new(&str1);
733 let chunk2 = Chunk::new(&str2);
734 let char_offsets = char_offsets_with_end(&str2);
735 let start_index = rng.random_range(0..char_offsets.len());
736 let start_offset = char_offsets[start_index];
737 let end_offset = char_offsets[rng.random_range(start_index..char_offsets.len())];
738 chunk1.append(chunk2.slice(start_offset..end_offset));
739 verify_chunk(chunk1.as_slice(), &(str1 + &str2[start_offset..end_offset]));
740 }
741
742 /// Return the byte offsets for each character in a string.
743 ///
744 /// These are valid offsets to split the string.
745 fn char_offsets(text: &str) -> Vec<usize> {
746 text.char_indices().map(|(i, _c)| i).collect()
747 }
748
749 /// Return the byte offsets for each character in a string, plus the offset
750 /// past the end of the string.
751 fn char_offsets_with_end(text: &str) -> Vec<usize> {
752 let mut v = char_offsets(text);
753 v.push(text.len());
754 v
755 }
756
757 fn verify_chunk(chunk: ChunkSlice<'_>, text: &str) {
758 let mut offset = 0;
759 let mut offset_utf16 = OffsetUtf16(0);
760 let mut point = Point::zero();
761 let mut point_utf16 = PointUtf16::zero();
762
763 log::info!("Verifying chunk {:?}", text);
764 assert_eq!(chunk.offset_to_point(0), Point::zero());
765
766 let mut expected_tab_positions = Vec::new();
767
768 for (char_offset, c) in text.chars().enumerate() {
769 let expected_point = chunk.offset_to_point(offset);
770 assert_eq!(point, expected_point, "mismatch at offset {}", offset);
771 assert_eq!(
772 chunk.point_to_offset(point),
773 offset,
774 "mismatch at point {:?}",
775 point
776 );
777 assert_eq!(
778 chunk.offset_to_offset_utf16(offset),
779 offset_utf16,
780 "mismatch at offset {}",
781 offset
782 );
783 assert_eq!(
784 chunk.offset_utf16_to_offset(offset_utf16),
785 offset,
786 "mismatch at offset_utf16 {:?}",
787 offset_utf16
788 );
789 assert_eq!(
790 chunk.point_to_point_utf16(point),
791 point_utf16,
792 "mismatch at point {:?}",
793 point
794 );
795 assert_eq!(
796 chunk.point_utf16_to_offset(point_utf16, false),
797 offset,
798 "mismatch at point_utf16 {:?}",
799 point_utf16
800 );
801 assert_eq!(
802 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
803 point,
804 "mismatch for unclipped_point_utf16_to_point at {:?}",
805 point_utf16
806 );
807
808 assert_eq!(
809 chunk.clip_point(point, Bias::Left),
810 point,
811 "incorrect left clip at {:?}",
812 point
813 );
814 assert_eq!(
815 chunk.clip_point(point, Bias::Right),
816 point,
817 "incorrect right clip at {:?}",
818 point
819 );
820
821 for i in 1..c.len_utf8() {
822 let test_point = Point::new(point.row, point.column + i as u32);
823 assert_eq!(
824 chunk.clip_point(test_point, Bias::Left),
825 point,
826 "incorrect left clip within multi-byte char at {:?}",
827 test_point
828 );
829 assert_eq!(
830 chunk.clip_point(test_point, Bias::Right),
831 Point::new(point.row, point.column + c.len_utf8() as u32),
832 "incorrect right clip within multi-byte char at {:?}",
833 test_point
834 );
835 }
836
837 for i in 1..c.len_utf16() {
838 let test_point = Unclipped(PointUtf16::new(
839 point_utf16.row,
840 point_utf16.column + i as u32,
841 ));
842 assert_eq!(
843 chunk.unclipped_point_utf16_to_point(test_point),
844 point,
845 "incorrect unclipped_point_utf16_to_point within multi-byte char at {:?}",
846 test_point
847 );
848 assert_eq!(
849 chunk.clip_point_utf16(test_point, Bias::Left),
850 point_utf16,
851 "incorrect left clip_point_utf16 within multi-byte char at {:?}",
852 test_point
853 );
854 assert_eq!(
855 chunk.clip_point_utf16(test_point, Bias::Right),
856 PointUtf16::new(point_utf16.row, point_utf16.column + c.len_utf16() as u32),
857 "incorrect right clip_point_utf16 within multi-byte char at {:?}",
858 test_point
859 );
860
861 let test_offset = OffsetUtf16(offset_utf16.0 + i);
862 assert_eq!(
863 chunk.clip_offset_utf16(test_offset, Bias::Left),
864 offset_utf16,
865 "incorrect left clip_offset_utf16 within multi-byte char at {:?}",
866 test_offset
867 );
868 assert_eq!(
869 chunk.clip_offset_utf16(test_offset, Bias::Right),
870 OffsetUtf16(offset_utf16.0 + c.len_utf16()),
871 "incorrect right clip_offset_utf16 within multi-byte char at {:?}",
872 test_offset
873 );
874 }
875
876 if c == '\n' {
877 point.row += 1;
878 point.column = 0;
879 point_utf16.row += 1;
880 point_utf16.column = 0;
881 } else {
882 point.column += c.len_utf8() as u32;
883 point_utf16.column += c.len_utf16() as u32;
884 }
885
886 if c == '\t' {
887 expected_tab_positions.push(TabPosition {
888 byte_offset: offset,
889 char_offset,
890 });
891 }
892
893 offset += c.len_utf8();
894 offset_utf16.0 += c.len_utf16();
895 }
896
897 let final_point = chunk.offset_to_point(offset);
898 assert_eq!(point, final_point, "mismatch at final offset {}", offset);
899 assert_eq!(
900 chunk.point_to_offset(point),
901 offset,
902 "mismatch at point {:?}",
903 point
904 );
905 assert_eq!(
906 chunk.offset_to_offset_utf16(offset),
907 offset_utf16,
908 "mismatch at offset {}",
909 offset
910 );
911 assert_eq!(
912 chunk.offset_utf16_to_offset(offset_utf16),
913 offset,
914 "mismatch at offset_utf16 {:?}",
915 offset_utf16
916 );
917 assert_eq!(
918 chunk.point_to_point_utf16(point),
919 point_utf16,
920 "mismatch at final point {:?}",
921 point
922 );
923 assert_eq!(
924 chunk.point_utf16_to_offset(point_utf16, false),
925 offset,
926 "mismatch at final point_utf16 {:?}",
927 point_utf16
928 );
929 assert_eq!(
930 chunk.unclipped_point_utf16_to_point(Unclipped(point_utf16)),
931 point,
932 "mismatch for unclipped_point_utf16_to_point at final point {:?}",
933 point_utf16
934 );
935 assert_eq!(
936 chunk.clip_point(point, Bias::Left),
937 point,
938 "incorrect left clip at final point {:?}",
939 point
940 );
941 assert_eq!(
942 chunk.clip_point(point, Bias::Right),
943 point,
944 "incorrect right clip at final point {:?}",
945 point
946 );
947 assert_eq!(
948 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Left),
949 point_utf16,
950 "incorrect left clip_point_utf16 at final point {:?}",
951 point_utf16
952 );
953 assert_eq!(
954 chunk.clip_point_utf16(Unclipped(point_utf16), Bias::Right),
955 point_utf16,
956 "incorrect right clip_point_utf16 at final point {:?}",
957 point_utf16
958 );
959 assert_eq!(
960 chunk.clip_offset_utf16(offset_utf16, Bias::Left),
961 offset_utf16,
962 "incorrect left clip_offset_utf16 at final offset {:?}",
963 offset_utf16
964 );
965 assert_eq!(
966 chunk.clip_offset_utf16(offset_utf16, Bias::Right),
967 offset_utf16,
968 "incorrect right clip_offset_utf16 at final offset {:?}",
969 offset_utf16
970 );
971
972 // Verify length methods
973 assert_eq!(chunk.len(), text.len());
974 assert_eq!(
975 chunk.len_utf16().0,
976 text.chars().map(|c| c.len_utf16()).sum::<usize>()
977 );
978
979 // Verify line counting
980 let lines = chunk.lines();
981 let mut newline_count = 0;
982 let mut last_line_len = 0;
983 for c in text.chars() {
984 if c == '\n' {
985 newline_count += 1;
986 last_line_len = 0;
987 } else {
988 last_line_len += c.len_utf8() as u32;
989 }
990 }
991 assert_eq!(lines, Point::new(newline_count, last_line_len));
992
993 // Verify first/last line chars
994 if !text.is_empty() {
995 let first_line = text.split('\n').next().unwrap();
996 assert_eq!(chunk.first_line_chars(), first_line.chars().count() as u32);
997
998 let last_line = text.split('\n').next_back().unwrap();
999 assert_eq!(chunk.last_line_chars(), last_line.chars().count() as u32);
1000 assert_eq!(
1001 chunk.last_line_len_utf16(),
1002 last_line.chars().map(|c| c.len_utf16() as u32).sum::<u32>()
1003 );
1004 }
1005
1006 // Verify longest row
1007 let (longest_row, longest_chars) = chunk.longest_row(&mut 0);
1008 let mut max_chars = 0;
1009 let mut current_row = 0;
1010 let mut current_chars = 0;
1011 let mut max_row = 0;
1012
1013 for c in text.chars() {
1014 if c == '\n' {
1015 if current_chars > max_chars {
1016 max_chars = current_chars;
1017 max_row = current_row;
1018 }
1019 current_row += 1;
1020 current_chars = 0;
1021 } else {
1022 current_chars += 1;
1023 }
1024 }
1025
1026 if current_chars > max_chars {
1027 max_chars = current_chars;
1028 max_row = current_row;
1029 }
1030
1031 assert_eq!((max_row, max_chars as u32), (longest_row, longest_chars));
1032 assert_eq!(chunk.tabs().collect::<Vec<_>>(), expected_tab_positions);
1033 }
1034}