From 6067436e9b52ba68b811e163ab21513be8869496 Mon Sep 17 00:00:00 2001
From: Vasyl Protsiv
Date: Mon, 15 Dec 2025 09:25:50 +0200
Subject: [PATCH] rope: Optimize rope construction (#44345)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I noticed that you care about `SumTree` (and `Rope`) construction performance, hence the use of rayon for parallelism and the careful `Chunk` splitting that avoids reallocation in `Rope::push`. It seemed strange to me that multi-threading is that beneficial there, so I investigated why the serial version (`SumTree::from_iter`) is slow in the first place.

From my analysis I believe there are two main factors:

1. `SumTree::from_iter` stores temporary `Node` values in a vector instead of heap-allocating them immediately and storing `SumTree` values directly, as `SumTree::from_par_iter` does.
2. `Chunk::new` is quite slow: for some reason the compiler does not vectorize it and seems to struggle to optimize u128 shifts (at least on x86_64).

For (1) the solution is simple: allocate each `Node` immediately after construction, just like `SumTree::from_par_iter` does. For (2) I was able to get better codegen by rewriting it as a simpler per-byte loop and splitting the computation into smaller chunks to avoid slow u128 shifts.

There was a similar effort recently in #43193 using portable_simd (currently nightly-only) to optimize `Chunk::push_str`. From what I understand from that discussion, you seem okay with hand-rolled SIMD for specific architectures. If so, I also provide an SSE2 implementation for x86_64. Feel free to remove it if you think it is unnecessary.

To test performance I used a big CSV file (~1GB, mostly ASCII) and measured `Rope::from` with this program:

```rust
fn main() {
    let text = std::fs::read_to_string("big.csv").unwrap();

    let start = std::time::Instant::now();
    let rope = rope::Rope::from(text);
    println!("{}ms, {}", start.elapsed().as_millis(), rope.len());
}
```

Here are the results on my machine (Ryzen 7 4800H):

|              | Parallel | Serial |
| ------------ | -------- | ------ |
| Before       | 1123ms   | 9154ms |
| After        | 497ms    | 2081ms |
| After (sse2) | 480ms    | 1454ms |

Since serial performance is now much closer to parallel, I also increased `PARALLEL_THRESHOLD` to roughly 1000. In my tests the parallel version starts to beat the serial one at around 150 KB strings. This constant might require more tweaking and testing, though, especially on ARM64.
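For readers who want a feel for the SSE2 approach without reading the patch: the sketch below is not the code from this PR, just a minimal illustration of how per-byte classification can be collapsed into bitmaps with `_mm_movemask_epi8`. It assumes `Bitmap` is `u128`, that a chunk is at most 128 bytes, and the helper name is made up.

```rust
// Illustrative sketch only; not the implementation in this patch.
#[cfg(target_arch = "x86_64")]
fn newline_and_tab_bitmaps(text: &str) -> (u128, u128) {
    use std::arch::x86_64::{_mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_set1_epi8};

    debug_assert!(text.len() <= 128);
    // Copy into a zero-padded buffer so every 16-byte block is fully initialized;
    // the padding bytes are 0 and can never match '\n' or '\t'.
    let mut buf = [0u8; 128];
    buf[..text.len()].copy_from_slice(text.as_bytes());

    let (mut newlines, mut tabs) = (0u128, 0u128);
    for (block_ix, block) in buf.chunks_exact(16).enumerate() {
        // SSE2 is part of the x86_64 baseline, so these intrinsics are always available there.
        unsafe {
            let bytes = _mm_loadu_si128(block.as_ptr() as *const _);
            // Byte-wise compare, then collapse the 16 lane results into a 16-bit mask.
            let nl = _mm_movemask_epi8(_mm_cmpeq_epi8(bytes, _mm_set1_epi8(b'\n' as i8))) as u16;
            let tab = _mm_movemask_epi8(_mm_cmpeq_epi8(bytes, _mm_set1_epi8(b'\t' as i8))) as u16;
            newlines |= (nl as u128) << (block_ix * 16);
            tabs |= (tab as u128) << (block_ix * 16);
        }
    }
    (newlines, tabs)
}
```

A full version would build the `chars` and `chars_utf16` bitmaps the same way and keep the scalar per-byte loop as the fallback on other architectures.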
cargo bench (SSE2 vs before):

```
     Running benches\rope_benchmark.rs (D:\zed\target\release\deps\rope_benchmark-3f8476f7dfb79154.exe)
Gnuplot not found, using plotters backend
push/4096               time:   [43.592 µs 43.658 µs 43.733 µs]
                        thrpt:  [89.320 MiB/s 89.473 MiB/s 89.610 MiB/s]
                 change:
                        time:   [-78.523% -78.222% -77.854%] (p = 0.00 < 0.05)
                        thrpt:  [+351.56% +359.19% +365.61%]
                        Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
  1 (1.00%) high mild
  1 (1.00%) high severe
push/65536              time:   [632.36 µs 634.03 µs 635.76 µs]
                        thrpt:  [98.308 MiB/s 98.576 MiB/s 98.836 MiB/s]
                 change:
                        time:   [-51.521% -50.850% -50.325%] (p = 0.00 < 0.05)
                        thrpt:  [+101.31% +103.46% +106.28%]
                        Performance has improved.
Found 18 outliers among 100 measurements (18.00%)
  11 (11.00%) low mild
  6 (6.00%) high mild
  1 (1.00%) high severe
append/4096             time:   [11.635 µs 11.664 µs 11.698 µs]
                        thrpt:  [333.92 MiB/s 334.89 MiB/s 335.72 MiB/s]
                 change:
                        time:   [-24.543% -23.925% -22.660%] (p = 0.00 < 0.05)
                        thrpt:  [+29.298% +31.450% +32.525%]
                        Performance has improved.
Found 12 outliers among 100 measurements (12.00%)
  2 (2.00%) low mild
  2 (2.00%) high mild
  8 (8.00%) high severe
append/65536            time:   [1.1287 µs 1.1324 µs 1.1360 µs]
                        thrpt:  [53.727 GiB/s 53.900 GiB/s 54.075 GiB/s]
                 change:
                        time:   [-44.153% -37.614% -29.834%] (p = 0.00 < 0.05)
                        thrpt:  [+42.518% +60.292% +79.061%]
                        Performance has improved.
slice/4096              time:   [28.340 µs 28.372 µs 28.406 µs]
                        thrpt:  [137.52 MiB/s 137.68 MiB/s 137.83 MiB/s]
                 change:
                        time:   [-8.0798% -6.3955% -4.4109%] (p = 0.00 < 0.05)
                        thrpt:  [+4.6145% +6.8325% +8.7900%]
                        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) low mild
  1 (1.00%) high mild
  1 (1.00%) high severe
slice/65536             time:   [527.51 µs 528.17 µs 528.90 µs]
                        thrpt:  [118.17 MiB/s 118.33 MiB/s 118.48 MiB/s]
                 change:
                        time:   [-53.819% -45.431% -34.578%] (p = 0.00 < 0.05)
                        thrpt:  [+52.853% +83.256% +116.54%]
                        Performance has improved.
Found 5 outliers among 100 measurements (5.00%)
  1 (1.00%) low severe
  3 (3.00%) low mild
  1 (1.00%) high mild
bytes_in_range/4096     time:   [3.2545 µs 3.2646 µs 3.2797 µs]
                        thrpt:  [1.1631 GiB/s 1.1685 GiB/s 1.1721 GiB/s]
                 change:
                        time:   [-3.4829% -2.4391% -1.7166%] (p = 0.00 < 0.05)
                        thrpt:  [+1.7466% +2.5001% +3.6085%]
                        Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
  6 (6.00%) high mild
  2 (2.00%) high severe
bytes_in_range/65536    time:   [80.770 µs 80.832 µs 80.904 µs]
                        thrpt:  [772.52 MiB/s 773.21 MiB/s 773.80 MiB/s]
                 change:
                        time:   [-1.8710% -1.3843% -0.9044%] (p = 0.00 < 0.05)
                        thrpt:  [+0.9126% +1.4037% +1.9067%]
                        Change within noise threshold.
Found 8 outliers among 100 measurements (8.00%)
  5 (5.00%) high mild
  3 (3.00%) high severe
chars/4096              time:   [790.50 ns 791.10 ns 791.88 ns]
                        thrpt:  [4.8173 GiB/s 4.8220 GiB/s 4.8257 GiB/s]
                 change:
                        time:   [+0.4318% +1.4558% +2.0256%] (p = 0.00 < 0.05)
                        thrpt:  [-1.9854% -1.4349% -0.4299%]
                        Change within noise threshold.
Found 6 outliers among 100 measurements (6.00%)
  1 (1.00%) low severe
  1 (1.00%) low mild
  2 (2.00%) high mild
  2 (2.00%) high severe
chars/65536             time:   [12.672 µs 12.688 µs 12.703 µs]
                        thrpt:  [4.8046 GiB/s 4.8106 GiB/s 4.8164 GiB/s]
                 change:
                        time:   [-2.7794% -1.2987% -0.2020%] (p = 0.04 < 0.05)
                        thrpt:  [+0.2025% +1.3158% +2.8588%]
                        Change within noise threshold.
Found 15 outliers among 100 measurements (15.00%)
  1 (1.00%) low mild
  12 (12.00%) high mild
  2 (2.00%) high severe
clip_point/4096         time:   [63.009 µs 63.126 µs 63.225 µs]
                        thrpt:  [61.783 MiB/s 61.880 MiB/s 61.995 MiB/s]
                 change:
                        time:   [+2.0484% +3.2218% +5.2181%] (p = 0.00 < 0.05)
                        thrpt:  [-4.9593% -3.1213% -2.0073%]
                        Performance has regressed.
Found 13 outliers among 100 measurements (13.00%)
  12 (12.00%) low mild
  1 (1.00%) high severe
Benchmarking clip_point/65536: Warming up for 3.0000 s
Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 7.7s, enable flat sampling, or reduce sample count to 50.
clip_point/65536        time:   [1.2420 ms 1.2430 ms 1.2439 ms]
                        thrpt:  [50.246 MiB/s 50.283 MiB/s 50.322 MiB/s]
                 change:
                        time:   [-0.3495% -0.0401% +0.1990%] (p = 0.80 > 0.05)
                        thrpt:  [-0.1986% +0.0401% +0.3507%]
                        No change in performance detected.
Found 7 outliers among 100 measurements (7.00%)
  6 (6.00%) high mild
  1 (1.00%) high severe
point_to_offset/4096    time:   [16.104 µs 16.119 µs 16.134 µs]
                        thrpt:  [242.11 MiB/s 242.33 MiB/s 242.56 MiB/s]
                 change:
                        time:   [-1.3816% -0.2497% +2.2181%] (p = 0.84 > 0.05)
                        thrpt:  [-2.1699% +0.2503% +1.4009%]
                        No change in performance detected.
Found 6 outliers among 100 measurements (6.00%)
  3 (3.00%) low mild
  1 (1.00%) high mild
  2 (2.00%) high severe
point_to_offset/65536   time:   [356.28 µs 356.57 µs 356.86 µs]
                        thrpt:  [175.14 MiB/s 175.28 MiB/s 175.42 MiB/s]
                 change:
                        time:   [-3.7072% -2.3338% -1.4742%] (p = 0.00 < 0.05)
                        thrpt:  [+1.4962% +2.3896% +3.8499%]
                        Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) low mild
cursor/4096             time:   [18.893 µs 18.934 µs 18.974 µs]
                        thrpt:  [205.87 MiB/s 206.31 MiB/s 206.76 MiB/s]
                 change:
                        time:   [-2.3645% -2.0729% -1.7931%] (p = 0.00 < 0.05)
                        thrpt:  [+1.8259% +2.1168% +2.4218%]
                        Performance has improved.
Found 12 outliers among 100 measurements (12.00%)
  12 (12.00%) high mild
cursor/65536            time:   [459.97 µs 460.40 µs 461.04 µs]
                        thrpt:  [135.56 MiB/s 135.75 MiB/s 135.88 MiB/s]
                 change:
                        time:   [-5.7445% -4.2758% -3.1344%] (p = 0.00 < 0.05)
                        thrpt:  [+3.2358% +4.4668% +6.0946%]
                        Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
  1 (1.00%) high mild
  1 (1.00%) high severe
append many/small to large
                        time:   [38.364 ms 38.620 ms 38.907 ms]
                        thrpt:  [313.75 MiB/s 316.08 MiB/s 318.19 MiB/s]
                 change:
                        time:   [-0.2042% +1.0954% +2.3334%] (p = 0.10 > 0.05)
                        thrpt:  [-2.2802% -1.0836% +0.2046%]
                        No change in performance detected.
Found 21 outliers among 100 measurements (21.00%)
  9 (9.00%) high mild
  12 (12.00%) high severe
append many/large to small
                        time:   [48.045 ms 48.322 ms 48.648 ms]
                        thrpt:  [250.92 MiB/s 252.62 MiB/s 254.07 MiB/s]
                 change:
                        time:   [-6.5298% -5.6919% -4.8532%] (p = 0.00 < 0.05)
                        thrpt:  [+5.1007% +6.0354% +6.9859%]
                        Performance has improved.
Found 11 outliers among 100 measurements (11.00%)
  2 (2.00%) high mild
  9 (9.00%) high severe
```
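The suite above does not seem to have a case for building a rope from one large string in a single call, which is what the standalone program earlier measures. If that is worth tracking over time, a criterion benchmark along the following lines could cover it. This is only a sketch and not part of the patch: the group name, input sizes, and synthetic input are my own choices, and it assumes a `From<&str>` impl on `Rope`.

```rust
use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};

// Sketch of a "construct a Rope from one big string" benchmark; the sizes and
// input shape are illustrative, not taken from the existing rope_benchmark suite.
fn bench_from_str(c: &mut Criterion) {
    let mut group = c.benchmark_group("from_str");
    for len in [4096usize, 65536, 1 << 20] {
        // Mostly-ASCII input, similar in spirit to the CSV file used above.
        let text: String = "abcdefg,123456\n".chars().cycle().take(len).collect();
        group.throughput(Throughput::Bytes(len as u64));
        group.bench_with_input(BenchmarkId::from_parameter(len), &text, |b, text| {
            b.iter(|| rope::Rope::from(text.as_str()));
        });
    }
    group.finish();
}

criterion_group!(benches, bench_from_str);
criterion_main!(benches);
```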
Release Notes:

- N/A *or* Added/Fixed/Improved ...
---
 crates/rope/src/chunk.rs        | 61 ++++++++++++++++++++++++++-------
 crates/rope/src/rope.rs         |  2 +-
 crates/sum_tree/src/sum_tree.rs | 22 ++++++------
 3 files changed, 62 insertions(+), 23 deletions(-)

diff --git a/crates/rope/src/chunk.rs b/crates/rope/src/chunk.rs
index a2a8e8d58df2d5ddc3336e8e56dd8446f4dcf118..c1916768c1f8a0980fb4d5aa1b718483b08c6087 100644
--- a/crates/rope/src/chunk.rs
+++ b/crates/rope/src/chunk.rs
@@ -47,22 +47,59 @@ impl Chunk {
 
     #[inline(always)]
     pub fn new(text: &str) -> Self {
-        let mut this = Chunk::default();
-        this.push_str(text);
-        this
+        let text = ArrayString::from(text).unwrap();
+
+        const CHUNK_SIZE: usize = 8;
+
+        let mut chars_bytes = [0; MAX_BASE / CHUNK_SIZE];
+        let mut newlines_bytes = [0; MAX_BASE / CHUNK_SIZE];
+        let mut tabs_bytes = [0; MAX_BASE / CHUNK_SIZE];
+        let mut chars_utf16_bytes = [0; MAX_BASE / CHUNK_SIZE];
+
+        let mut chunk_ix = 0;
+
+        let mut bytes = text.as_bytes();
+        while !bytes.is_empty() {
+            let (chunk, rest) = bytes.split_at(bytes.len().min(CHUNK_SIZE));
+            bytes = rest;
+
+            let mut chars = 0;
+            let mut newlines = 0;
+            let mut tabs = 0;
+            let mut chars_utf16 = 0;
+
+            for (ix, &b) in chunk.iter().enumerate() {
+                chars |= (util::is_utf8_char_boundary(b) as u8) << ix;
+                newlines |= ((b == b'\n') as u8) << ix;
+                tabs |= ((b == b'\t') as u8) << ix;
+                // b >= 240 when we are at the first byte of the 4 byte encoded
+                // utf-8 code point (U+010000 or greater) it means that it would
+                // be encoded as two 16-bit code units in utf-16
+                chars_utf16 |= ((b >= 240) as u8) << ix;
+            }
+
+            chars_bytes[chunk_ix] = chars;
+            newlines_bytes[chunk_ix] = newlines;
+            tabs_bytes[chunk_ix] = tabs;
+            chars_utf16_bytes[chunk_ix] = chars_utf16;
+
+            chunk_ix += 1;
+        }
+
+        let chars = Bitmap::from_le_bytes(chars_bytes);
+
+        Chunk {
+            text,
+            chars,
+            chars_utf16: (Bitmap::from_le_bytes(chars_utf16_bytes) << 1) | chars,
+            newlines: Bitmap::from_le_bytes(newlines_bytes),
+            tabs: Bitmap::from_le_bytes(tabs_bytes),
+        }
     }
 
     #[inline(always)]
     pub fn push_str(&mut self, text: &str) {
-        for (char_ix, c) in text.char_indices() {
-            let ix = self.text.len() + char_ix;
-            self.chars |= 1 << ix;
-            self.chars_utf16 |= 1 << ix;
-            self.chars_utf16 |= (c.len_utf16() as Bitmap) << ix;
-            self.newlines |= ((c == '\n') as Bitmap) << ix;
-            self.tabs |= ((c == '\t') as Bitmap) << ix;
-        }
-        self.text.push_str(text);
+        self.append(Chunk::new(text).as_slice());
     }
 
     #[inline(always)]
diff --git a/crates/rope/src/rope.rs b/crates/rope/src/rope.rs
index 50f9ba044d90072aa9c6fc2fc4abfd6d0e6b98cb..fba7b96aca83fa05c0d6f3e7992ad7443ec7958a 100644
--- a/crates/rope/src/rope.rs
+++ b/crates/rope/src/rope.rs
@@ -227,7 +227,7 @@ impl Rope {
         #[cfg(all(test, not(rust_analyzer)))]
         const PARALLEL_THRESHOLD: usize = 4;
         #[cfg(not(all(test, not(rust_analyzer))))]
-        const PARALLEL_THRESHOLD: usize = 4 * (2 * sum_tree::TREE_BASE);
+        const PARALLEL_THRESHOLD: usize = 84 * (2 * sum_tree::TREE_BASE);
 
         if new_chunks.len() >= PARALLEL_THRESHOLD {
             self.chunks
diff --git a/crates/sum_tree/src/sum_tree.rs b/crates/sum_tree/src/sum_tree.rs
index bfc4587969ec67bbda2fb90d34550c7d464317c9..6a76b73c3bbfb922e1b46fc1e228209ddf05b4a5 100644
--- a/crates/sum_tree/src/sum_tree.rs
+++ b/crates/sum_tree/src/sum_tree.rs
@@ -250,11 +250,11 @@ impl<T: Item> SumTree<T> {
                 <T::Summary as Summary>::add_summary(&mut summary, item_summary, cx);
             }
 
-            nodes.push(Node::Leaf {
+            nodes.push(SumTree(Arc::new(Node::Leaf {
                 summary,
                 items,
                 item_summaries,
-            });
+            })));
         }
 
         let mut parent_nodes = Vec::new();
@@ -263,25 +263,27 @@ impl<T: Item> SumTree<T> {
             height += 1;
             let mut current_parent_node = None;
             for child_node in nodes.drain(..) {
-                let parent_node = current_parent_node.get_or_insert_with(|| Node::Internal {
-                    summary: <T::Summary as Summary>::zero(cx),
-                    height,
-                    child_summaries: ArrayVec::new(),
-                    child_trees: ArrayVec::new(),
+                let parent_node = current_parent_node.get_or_insert_with(|| {
+                    SumTree(Arc::new(Node::Internal {
+                        summary: <T::Summary as Summary>::zero(cx),
+                        height,
+                        child_summaries: ArrayVec::new(),
+                        child_trees: ArrayVec::new(),
+                    }))
                 });
                 let Node::Internal {
                     summary,
                     child_summaries,
                     child_trees,
                     ..
-                } = parent_node
+                } = Arc::get_mut(&mut parent_node.0).unwrap()
                 else {
                     unreachable!()
                 };
                 let child_summary = child_node.summary();
                 <T::Summary as Summary>::add_summary(summary, child_summary, cx);
                 child_summaries.push(child_summary.clone());
-                child_trees.push(Self(Arc::new(child_node)));
+                child_trees.push(child_node);
 
                 if child_trees.len() == 2 * TREE_BASE {
                     parent_nodes.extend(current_parent_node.take());
@@ -295,7 +297,7 @@ impl<T: Item> SumTree<T> {
             Self::new(cx)
         } else {
             debug_assert_eq!(nodes.len(), 1);
-            Self(Arc::new(nodes.pop().unwrap()))
+            nodes.pop().unwrap()
         }
     }
 