From 138286f3b138cc653456376860dacddda6b18093 Mon Sep 17 00:00:00 2001 From: Vasyl Protsiv Date: Mon, 24 Nov 2025 15:49:00 +0200 Subject: [PATCH] sum_tree: Make SumTree::append run in logarithmic time (#43349) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `SumTree::append` method is slow when appending large trees to small trees. The reason is this code here: https://github.com/zed-industries/zed/blob/f57f4cd3607e8298ef5f1b29929df2db0185d826/crates/sum_tree/src/sum_tree.rs#L628-L630 `append` is called recursively until `self` and `other` have the same height, effectively making this code `O(log^2 n)` in the number of leaves of the `other` tree in the worst case. There are no algorithmic reasons why appending large trees must be this much slower. This PR proves it by providing an implementation of `append` that works in logarithmic time regardless of whether `self` is smaller or larger than `other`. The helper method `append_large` has logic symmetric to `push_tree_recursive` but moves the (unlikely) case of merging an underflowing node into a separate helper function to reduce stack usage. I am a bit unsure about some implementation choices made in `push_tree_recursive` and would like to discuss some of these later, but at the moment I didn't change anything there and tried to follow the same logic in `append_large`. We might also consider adding `push_front`/`prepend` methods to `SumTree`. I did not find a good benchmark that covers this case, so I added a new one to the rope benchmarks.
cargo bench (compared to current main) ``` Running benches\rope_benchmark.rs (D:\zed\target\release\deps\rope_benchmark-59c669d2895cd2c4.exe) Gnuplot not found, using plotters backend push/4096 time: [195.67 µs 195.75 µs 195.86 µs] thrpt: [19.944 MiB/s 19.955 MiB/s 19.964 MiB/s] change: time: [+0.2162% +0.3040% +0.4057%] (p = 0.00 < 0.05) thrpt: [-0.4040% -0.3030% -0.2157%] Change within noise threshold. Found 14 outliers among 100 measurements (14.00%) 2 (2.00%) low mild 6 (6.00%) high mild 6 (6.00%) high severe Benchmarking push/65536: Warming up for 3.0000 s Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 7.8s, enable flat sampling, or reduce sample count to 50. push/65536 time: [1.4431 ms 1.4485 ms 1.4546 ms] thrpt: [42.966 MiB/s 43.147 MiB/s 43.310 MiB/s] change: time: [-3.2257% -1.2013% +0.6431%] (p = 0.27 > 0.05) thrpt: [-0.6390% +1.2159% +3.3332%] No change in performance detected. Found 11 outliers among 100 measurements (11.00%) 1 (1.00%) low mild 5 (5.00%) high mild 5 (5.00%) high severe append/4096 time: [15.107 µs 15.128 µs 15.149 µs] thrpt: [257.86 MiB/s 258.22 MiB/s 258.58 MiB/s] change: time: [+0.9650% +1.5256% +1.9057%] (p = 0.00 < 0.05) thrpt: [-1.8701% -1.5026% -0.9557%] Change within noise threshold. Found 2 outliers among 100 measurements (2.00%) 1 (1.00%) low mild 1 (1.00%) high severe append/65536 time: [1.2870 µs 1.4496 µs 1.6484 µs] thrpt: [37.028 GiB/s 42.106 GiB/s 47.425 GiB/s] change: time: [-28.699% -16.073% -0.3133%] (p = 0.04 < 0.05) thrpt: [+0.3142% +19.151% +40.250%] Change within noise threshold. Found 17 outliers among 100 measurements (17.00%) 1 (1.00%) high mild 16 (16.00%) high severe slice/4096 time: [30.580 µs 30.611 µs 30.639 µs] thrpt: [127.49 MiB/s 127.61 MiB/s 127.74 MiB/s] change: time: [-2.2958% -0.9674% -0.1835%] (p = 0.08 > 0.05) thrpt: [+0.1838% +0.9769% +2.3498%] No change in performance detected. 
slice/65536 time: [614.86 µs 795.04 µs 1.0293 ms] thrpt: [60.723 MiB/s 78.613 MiB/s 101.65 MiB/s] change: time: [-12.714% +7.2092% +30.676%] (p = 0.52 > 0.05) thrpt: [-23.475% -6.7244% +14.566%] No change in performance detected. Found 14 outliers among 100 measurements (14.00%) 14 (14.00%) high severe bytes_in_range/4096 time: [3.3298 µs 3.3416 µs 3.3563 µs] thrpt: [1.1366 GiB/s 1.1416 GiB/s 1.1456 GiB/s] change: time: [+2.0652% +3.0667% +4.3765%] (p = 0.00 < 0.05) thrpt: [-4.1930% -2.9754% -2.0234%] Performance has regressed. Found 2 outliers among 100 measurements (2.00%) 2 (2.00%) high severe bytes_in_range/65536 time: [80.640 µs 80.825 µs 81.024 µs] thrpt: [771.38 MiB/s 773.28 MiB/s 775.05 MiB/s] change: time: [-0.6566% +1.0994% +2.9691%] (p = 0.27 > 0.05) thrpt: [-2.8835% -1.0875% +0.6609%] No change in performance detected. Found 10 outliers among 100 measurements (10.00%) 2 (2.00%) high mild 8 (8.00%) high severe chars/4096 time: [763.17 ns 763.68 ns 764.36 ns] thrpt: [4.9907 GiB/s 4.9952 GiB/s 4.9985 GiB/s] change: time: [-2.1138% -0.7973% +0.1096%] (p = 0.18 > 0.05) thrpt: [-0.1095% +0.8037% +2.1595%] No change in performance detected. Found 10 outliers among 100 measurements (10.00%) 1 (1.00%) low severe 6 (6.00%) low mild 3 (3.00%) high severe chars/65536 time: [12.479 µs 12.503 µs 12.529 µs] thrpt: [4.8714 GiB/s 4.8817 GiB/s 4.8910 GiB/s] change: time: [-2.4451% -1.0638% +0.6633%] (p = 0.16 > 0.05) thrpt: [-0.6589% +1.0753% +2.5063%] No change in performance detected. Found 11 outliers among 100 measurements (11.00%) 4 (4.00%) high mild 7 (7.00%) high severe clip_point/4096 time: [63.148 µs 63.182 µs 63.229 µs] thrpt: [61.779 MiB/s 61.825 MiB/s 61.859 MiB/s] change: time: [+1.0107% +2.1329% +4.2849%] (p = 0.02 < 0.05) thrpt: [-4.1088% -2.0883% -1.0006%] Performance has regressed. 
Found 5 outliers among 100 measurements (5.00%) 4 (4.00%) high mild 1 (1.00%) high severe Benchmarking clip_point/65536: Warming up for 3.0000 s Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 7.8s, enable flat sampling, or reduce sample count to 50. clip_point/65536 time: [1.2578 ms 1.2593 ms 1.2608 ms] thrpt: [49.573 MiB/s 49.631 MiB/s 49.690 MiB/s] change: time: [+0.4881% +0.8942% +1.3488%] (p = 0.00 < 0.05) thrpt: [-1.3308% -0.8863% -0.4857%] Change within noise threshold. Found 15 outliers among 100 measurements (15.00%) 1 (1.00%) high mild 14 (14.00%) high severe point_to_offset/4096 time: [16.211 µs 16.235 µs 16.257 µs] thrpt: [240.28 MiB/s 240.61 MiB/s 240.97 MiB/s] change: time: [-1.4913% +0.1685% +2.2662%] (p = 0.89 > 0.05) thrpt: [-2.2159% -0.1682% +1.5139%] No change in performance detected. Found 2 outliers among 100 measurements (2.00%) 1 (1.00%) high mild 1 (1.00%) high severe point_to_offset/65536 time: [360.06 µs 360.58 µs 361.16 µs] thrpt: [173.05 MiB/s 173.33 MiB/s 173.58 MiB/s] change: time: [+0.0939% +0.8792% +1.8751%] (p = 0.06 > 0.05) thrpt: [-1.8406% -0.8715% -0.0938%] No change in performance detected. Found 10 outliers among 100 measurements (10.00%) 3 (3.00%) high mild 7 (7.00%) high severe cursor/4096 time: [19.266 µs 19.282 µs 19.302 µs] thrpt: [202.38 MiB/s 202.58 MiB/s 202.75 MiB/s] change: time: [+1.2457% +2.2477% +2.8702%] (p = 0.00 < 0.05) thrpt: [-2.7901% -2.1983% -1.2304%] Performance has regressed. Found 4 outliers among 100 measurements (4.00%) 2 (2.00%) high mild 2 (2.00%) high severe cursor/65536 time: [467.63 µs 468.36 µs 469.14 µs] thrpt: [133.22 MiB/s 133.44 MiB/s 133.65 MiB/s] change: time: [-0.2019% +1.3419% +2.8915%] (p = 0.10 > 0.05) thrpt: [-2.8103% -1.3241% +0.2023%] No change in performance detected. 
Found 12 outliers among 100 measurements (12.00%) 3 (3.00%) high mild 9 (9.00%) high severe append many/small to large time: [37.419 ms 37.656 ms 37.929 ms] thrpt: [321.84 MiB/s 324.17 MiB/s 326.22 MiB/s] change: time: [+0.8113% +1.7361% +2.6538%] (p = 0.00 < 0.05) thrpt: [-2.5852% -1.7065% -0.8047%] Change within noise threshold. Found 9 outliers among 100 measurements (9.00%) 9 (9.00%) high severe append many/large to small time: [51.289 ms 51.437 ms 51.614 ms] thrpt: [236.50 MiB/s 237.32 MiB/s 238.00 MiB/s] change: time: [-87.518% -87.479% -87.438%] (p = 0.00 < 0.05) thrpt: [+696.08% +698.66% +701.13%] Performance has improved. Found 13 outliers among 100 measurements (13.00%) 4 (4.00%) high mild 9 (9.00%) high severe ```
Release Notes: - sum_tree: Make SumTree::append run in logarithmic time --- crates/rope/benches/rope_benchmark.rs | 29 ++++ crates/sum_tree/src/sum_tree.rs | 188 +++++++++++++++++++++++++- 2 files changed, 214 insertions(+), 3 deletions(-) diff --git a/crates/rope/benches/rope_benchmark.rs b/crates/rope/benches/rope_benchmark.rs index 030bec01df4d223cd5288842ba0f9c1386dac31b..8599328aacf73a9b846795ee19791f4b0c4c5c2c 100644 --- a/crates/rope/benches/rope_benchmark.rs +++ b/crates/rope/benches/rope_benchmark.rs @@ -238,6 +238,35 @@ fn rope_benchmarks(c: &mut Criterion) { }); } group.finish(); + + let mut group = c.benchmark_group("append many"); + group.throughput(Throughput::Bytes(128 * 100_000)); + + group.bench_function("small to large", |b| { + b.iter(|| { + let mut rope = Rope::new(); + let small = Rope::from("A".repeat(128)); + for _ in 0..100_000 { + rope.append(small.clone()); + } + assert_eq!(rope.len(), 128 * 100_000); + }); + }); + + group.bench_function("large to small", |b| { + b.iter(|| { + let mut rope = Rope::new(); + let small = Rope::from("A".repeat(128)); + for _ in 0..100_000 { + let large = rope; + rope = small.clone(); + rope.append(large); + } + assert_eq!(rope.len(), 128 * 100_000); + }); + }); + + group.finish(); } criterion_group!(benches, rope_benchmarks); diff --git a/crates/sum_tree/src/sum_tree.rs b/crates/sum_tree/src/sum_tree.rs index 95fbd5ed0d5f5700d0c894cda68ed15ce6590ced..da700201f558a0b29ed4dc45bd3d3d3e7474a297 100644 --- a/crates/sum_tree/src/sum_tree.rs +++ b/crates/sum_tree/src/sum_tree.rs @@ -620,13 +620,15 @@ impl SumTree { ); } - pub fn append(&mut self, other: Self, cx: ::Context<'_>) { + pub fn append(&mut self, mut other: Self, cx: ::Context<'_>) { if self.is_empty() { *self = other; } else if !other.0.is_leaf() || !other.0.items().is_empty() { if self.0.height() < other.0.height() { - for tree in other.0.child_trees() { - self.append(tree.clone(), cx); + if let Some(tree) = Self::append_large(self.clone(), &mut other, 
cx) { + *self = Self::from_child_trees(tree, other, cx); + } else { + *self = other; } } else if let Some(split_tree) = self.push_tree_recursive(other, cx) { *self = Self::from_child_trees(self.clone(), split_tree, cx); @@ -754,6 +756,186 @@ impl SumTree { } } + // appends the `large` tree to a `small` tree, assumes small.height() <= large.height() + fn append_large( + small: Self, + large: &mut Self, + cx: ::Context<'_>, + ) -> Option { + if small.0.height() == large.0.height() { + if !small.0.is_underflowing() { + Some(small) + } else { + Self::merge_into_right(small, large, cx) + } + } else { + debug_assert!(small.0.height() < large.0.height()); + let Node::Internal { + height, + summary, + child_summaries, + child_trees, + } = Arc::make_mut(&mut large.0) + else { + unreachable!(); + }; + let mut full_summary = small.summary().clone(); + Summary::add_summary(&mut full_summary, summary, cx); + *summary = full_summary; + + let first = child_trees.first_mut().unwrap(); + let res = Self::append_large(small, first, cx); + *child_summaries.first_mut().unwrap() = first.summary().clone(); + if let Some(tree) = res { + if child_trees.len() < 2 * TREE_BASE { + child_summaries.insert(0, tree.summary().clone()); + child_trees.insert(0, tree); + None + } else { + let new_child_summaries = { + let mut res = ArrayVec::from_iter([tree.summary().clone()]); + res.extend(child_summaries.drain(..TREE_BASE)); + res + }; + let tree = SumTree(Arc::new(Node::Internal { + height: *height, + summary: sum(new_child_summaries.iter(), cx), + child_summaries: new_child_summaries, + child_trees: { + let mut res = ArrayVec::from_iter([tree]); + res.extend(child_trees.drain(..TREE_BASE)); + res + }, + })); + + *summary = sum(child_summaries.iter(), cx); + Some(tree) + } + } else { + None + } + } + } + + // Merge two nodes into `large`. + // + // `large` will contain the contents of `small` followed by its own data. 
+ // If the combined data exceed the node capacity, returns a new node that + // holds the first half of the merged items and `large` is left with the + // second half + // + // The nodes must be on the same height + // It only makes sense to call this when `small` is underflowing + fn merge_into_right( + small: Self, + large: &mut Self, + cx: <::Summary as Summary>::Context<'_>, + ) -> Option> { + debug_assert_eq!(small.0.height(), large.0.height()); + match (small.0.as_ref(), Arc::make_mut(&mut large.0)) { + ( + Node::Internal { + summary: small_summary, + child_summaries: small_child_summaries, + child_trees: small_child_trees, + .. + }, + Node::Internal { + summary, + child_summaries, + child_trees, + height, + }, + ) => { + let total_child_count = child_trees.len() + small_child_trees.len(); + if total_child_count <= 2 * TREE_BASE { + let mut all_trees = small_child_trees.clone(); + all_trees.extend(child_trees.drain(..)); + *child_trees = all_trees; + + let mut all_summaries = small_child_summaries.clone(); + all_summaries.extend(child_summaries.drain(..)); + *child_summaries = all_summaries; + + let mut full_summary = small_summary.clone(); + Summary::add_summary(&mut full_summary, summary, cx); + *summary = full_summary; + None + } else { + let midpoint = total_child_count.div_ceil(2); + let mut all_trees = small_child_trees.iter().chain(child_trees.iter()).cloned(); + let left_trees = all_trees.by_ref().take(midpoint).collect(); + *child_trees = all_trees.collect(); + + let mut all_summaries = small_child_summaries + .iter() + .chain(child_summaries.iter()) + .cloned(); + let left_summaries: ArrayVec<_, { 2 * TREE_BASE }> = + all_summaries.by_ref().take(midpoint).collect(); + *child_summaries = all_summaries.collect(); + + *summary = sum(child_summaries.iter(), cx); + Some(SumTree(Arc::new(Node::Internal { + height: *height, + summary: sum(left_summaries.iter(), cx), + child_summaries: left_summaries, + child_trees: left_trees, + }))) + } + } + ( + 
Node::Leaf { + summary: small_summary, + items: small_items, + item_summaries: small_item_summaries, + }, + Node::Leaf { + summary, + items, + item_summaries, + }, + ) => { + let total_child_count = small_items.len() + items.len(); + if total_child_count <= 2 * TREE_BASE { + let mut all_items = small_items.clone(); + all_items.extend(items.drain(..)); + *items = all_items; + + let mut all_summaries = small_item_summaries.clone(); + all_summaries.extend(item_summaries.drain(..)); + *item_summaries = all_summaries; + + let mut full_summary = small_summary.clone(); + Summary::add_summary(&mut full_summary, summary, cx); + *summary = full_summary; + None + } else { + let midpoint = total_child_count.div_ceil(2); + let mut all_items = small_items.iter().chain(items.iter()).cloned(); + let left_items = all_items.by_ref().take(midpoint).collect(); + *items = all_items.collect(); + + let mut all_summaries = small_item_summaries + .iter() + .chain(item_summaries.iter()) + .cloned(); + let left_summaries: ArrayVec<_, { 2 * TREE_BASE }> = + all_summaries.by_ref().take(midpoint).collect(); + *item_summaries = all_summaries.collect(); + + *summary = sum(item_summaries.iter(), cx); + Some(SumTree(Arc::new(Node::Leaf { + items: left_items, + summary: sum(left_summaries.iter(), cx), + item_summaries: left_summaries, + }))) + } + } + _ => unreachable!(), + } + } + fn from_child_trees( left: SumTree, right: SumTree,