Previously, the GPUI profiler allocated one `CircularBuffer` per thread,
and `CircularBuffer<N>` always preallocates space for N entries. As a
result it allocated ~20MB per thread, and on my machine about 33 threads are
created at startup, for a total of about 600MB used.
In this PR I change it to use a `VecDeque` that grows gradually as data is
written, up to a fixed cap (16MiB). At least in my experiments, this keeps
overall usage at about 21MB, perhaps because only one thread records much
profiling data.
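To make the arithmetic concrete, here is a rough back-of-the-envelope sketch. The `TaskTiming` struct below is a hypothetical stand-in for the real one in gpui's profiler (it uses `std::time::Instant` rather than `scheduler::Instant`, and the 33-thread count is just what I observed on my machine):

```rust
use std::collections::VecDeque;
use std::time::Instant;

// Hypothetical stand-in for gpui's TaskTiming; only its size matters here.
#[allow(dead_code)]
struct TaskTiming {
    location: &'static core::panic::Location<'static>,
    start: Instant,
    end: Option<Instant>,
}

fn main() {
    let entry_size = std::mem::size_of::<TaskTiming>();

    // Before: each thread eagerly allocated a ~20MB CircularBuffer.
    let per_thread = 20 * 1024 * 1024;
    println!(
        "before: {} entries (~{} MiB) preallocated per thread, ~{} MiB for 33 threads",
        per_thread / entry_size,
        per_thread / (1024 * 1024),
        per_thread * 33 / (1024 * 1024),
    );

    // After: an empty VecDeque allocates nothing until timings are pushed.
    let timings: VecDeque<TaskTiming> = VecDeque::new();
    println!(
        "after: {} bytes allocated for a thread that records no timings",
        timings.capacity() * entry_size
    );
}
```

The exact numbers depend on the size of `TaskTiming`, but the point is that idle threads no longer pay for a full buffer up front.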
Since this is fixed overhead for everyone running Zed, it seems like a
worthwhile gain.
This also folds duplicated code across platforms into the common gpui
profiler.
Before:
<img width="4804" height="2192" alt="Image"
src="https://github.com/user-attachments/assets/7060ee5b-ef80-49cb-b7be-de33e9a2e7a5"
/>
After:
<img width="5052" height="1858" alt="image"
src="https://github.com/user-attachments/assets/513494df-0974-4604-9796-15a12ef1c134"
/>
I got here from #35780, but I don't think this is tree-size related; it
seems to be fixed overhead.
Release Notes:
- Improved: Significantly less memory used to record internal profiling
information.
---------
Co-authored-by: MrSubidubi <finn@zed.dev>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Cargo.lock | 1 -
crates/gpui/Cargo.toml | 1 -
crates/gpui/src/profiler.rs | 65 ++++++++++++++++--------
crates/gpui_linux/src/linux/dispatcher.rs | 22 -------
crates/gpui_macos/src/dispatcher.rs | 49 ++----------------
crates/gpui_windows/src/dispatcher.rs | 22 -------
6 files changed, 55 insertions(+), 105 deletions(-)
Cargo.lock
@@ -7605,7 +7605,6 @@ dependencies = [
"block",
"cbindgen",
"chrono",
- "circular-buffer",
"cocoa 0.26.0",
"cocoa-foundation 0.2.0",
"collections",
crates/gpui/Cargo.toml
@@ -98,7 +98,6 @@ gpui_util.workspace = true
waker-fn = "1.2.0"
lyon = "1.0"
pin-project = "1.1.10"
-circular-buffer.workspace = true
spin = "0.10.0"
pollster.workspace = true
url.workspace = true
crates/gpui/src/profiler.rs
@@ -1,9 +1,8 @@
use scheduler::Instant;
use std::{
cell::LazyCell,
- collections::HashMap,
- hash::Hasher,
- hash::{DefaultHasher, Hash},
+ collections::{HashMap, VecDeque},
+ hash::{DefaultHasher, Hash, Hasher},
sync::Arc,
thread::ThreadId,
};
@@ -45,7 +44,6 @@ impl ThreadTaskTimings {
let timings = &timings.timings;
let mut vec = Vec::with_capacity(timings.len());
-
let (s1, s2) = timings.as_slices();
vec.extend_from_slice(s1);
vec.extend_from_slice(s2);
@@ -243,11 +241,14 @@ impl ProfilingCollector {
}
}
-// Allow 20mb of task timing entries
-const MAX_TASK_TIMINGS: usize = (20 * 1024 * 1024) / core::mem::size_of::<TaskTiming>();
+// Allow 16MiB of task timing entries.
+// VecDeque grows by doubling its capacity when full, so keep this a power of 2 to avoid wasting
+// memory.
+const MAX_TASK_TIMINGS: usize = (16 * 1024 * 1024) / core::mem::size_of::<TaskTiming>();
#[doc(hidden)]
-pub type TaskTimings = circular_buffer::CircularBuffer<MAX_TASK_TIMINGS, TaskTiming>;
+pub(crate) type TaskTimings = VecDeque<TaskTiming>;
+
#[doc(hidden)]
pub type GuardedTaskTimings = spin::Mutex<ThreadTimings>;
@@ -287,7 +288,7 @@ thread_local! {
pub struct ThreadTimings {
pub thread_name: Option<String>,
pub thread_id: ThreadId,
- pub timings: Box<TaskTimings>,
+ pub timings: TaskTimings,
pub total_pushed: u64,
}
@@ -296,10 +297,38 @@ impl ThreadTimings {
ThreadTimings {
thread_name,
thread_id,
- timings: TaskTimings::boxed(),
+ timings: TaskTimings::new(),
total_pushed: 0,
}
}
+
+ /// If this task is the same as the last task, update the end time of the last task.
+ ///
+ /// Otherwise, add the new task timing to the list.
+ pub fn add_task_timing(&mut self, timing: TaskTiming) {
+ if let Some(last_timing) = self.timings.back_mut()
+ && last_timing.location == timing.location
+ && last_timing.start == timing.start
+ {
+ last_timing.end = timing.end;
+ } else {
+ while self.timings.len() + 1 > MAX_TASK_TIMINGS {
+ // This should only ever pop one element because it matches the insertion below.
+ self.timings.pop_front();
+ }
+ self.timings.push_back(timing);
+ self.total_pushed += 1;
+ }
+ }
+
+ pub fn get_thread_task_timings(&self) -> ThreadTaskTimings {
+ ThreadTaskTimings {
+ thread_name: self.thread_name.clone(),
+ thread_id: self.thread_id,
+ timings: self.timings.iter().cloned().collect(),
+ total_pushed: self.total_pushed,
+ }
+ }
}
impl Drop for ThreadTimings {
@@ -318,19 +347,13 @@ impl Drop for ThreadTimings {
}
#[doc(hidden)]
-#[allow(dead_code)] // Used by Linux and Windows dispatchers, not macOS
pub fn add_task_timing(timing: TaskTiming) {
THREAD_TIMINGS.with(|timings| {
- let mut timings = timings.lock();
-
- if let Some(last_timing) = timings.timings.back_mut() {
- if last_timing.location == timing.location && last_timing.start == timing.start {
- last_timing.end = timing.end;
- return;
- }
- }
-
- timings.timings.push_back(timing);
- timings.total_pushed += 1;
+ timings.lock().add_task_timing(timing);
});
}
+
+#[doc(hidden)]
+pub fn get_current_thread_task_timings() -> ThreadTaskTimings {
+ THREAD_TIMINGS.with(|timings| timings.lock().get_thread_task_timings())
+}
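As an aside on the new comment above `MAX_TASK_TIMINGS`: the rationale is that `VecDeque` grows its backing allocation in doubling steps, so a power-of-two byte budget wastes less of the final allocation. Here is a small self-contained probe of that growth (placeholder element type; the printed capacities are a standard-library implementation detail, so treat them as indicative only):

```rust
use std::collections::VecDeque;

fn main() {
    // Roughly TaskTiming-sized placeholder elements.
    let mut buf: VecDeque<[u8; 40]> = VecDeque::new();
    let mut last_cap = buf.capacity();
    println!("initial capacity: {last_cap}");
    for i in 0..4096 {
        buf.push_back([0u8; 40]);
        if buf.capacity() != last_cap {
            last_cap = buf.capacity();
            println!("after {} pushes, capacity grew to {}", i + 1, last_cap);
        }
    }
}
```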
crates/gpui_linux/src/linux/dispatcher.rs
@@ -13,7 +13,7 @@ use std::{
use gpui::{
GLOBAL_THREAD_TIMINGS, PlatformDispatcher, Priority, PriorityQueueReceiver,
- PriorityQueueSender, RunnableVariant, THREAD_TIMINGS, TaskTiming, ThreadTaskTimings, profiler,
+ PriorityQueueSender, RunnableVariant, TaskTiming, ThreadTaskTimings, profiler,
};
struct TimerAfter {
@@ -135,25 +135,7 @@ impl PlatformDispatcher for LinuxDispatcher {
}
fn get_current_thread_timings(&self) -> gpui::ThreadTaskTimings {
- THREAD_TIMINGS.with(|timings| {
- let timings = timings.lock();
- let thread_name = timings.thread_name.clone();
- let total_pushed = timings.total_pushed;
- let timings = &timings.timings;
-
- let mut vec = Vec::with_capacity(timings.len());
-
- let (s1, s2) = timings.as_slices();
- vec.extend_from_slice(s1);
- vec.extend_from_slice(s2);
-
- gpui::ThreadTaskTimings {
- thread_name,
- thread_id: std::thread::current().id(),
- timings: vec,
- total_pushed,
- }
- })
+ gpui::profiler::get_current_thread_task_timings()
}
fn is_main_thread(&self) -> bool {
crates/gpui_macos/src/dispatcher.rs
@@ -1,7 +1,7 @@
use dispatch2::{DispatchQueue, DispatchQueueGlobalPriority, DispatchTime, GlobalQueueIdentifier};
use gpui::{
- GLOBAL_THREAD_TIMINGS, PlatformDispatcher, Priority, RunnableMeta, RunnableVariant,
- THREAD_TIMINGS, TaskTiming, ThreadTaskTimings,
+ GLOBAL_THREAD_TIMINGS, PlatformDispatcher, Priority, RunnableMeta, RunnableVariant, TaskTiming,
+ ThreadTaskTimings, add_task_timing,
};
use mach2::{
kern_return::KERN_SUCCESS,
@@ -42,25 +42,7 @@ impl PlatformDispatcher for MacDispatcher {
}
fn get_current_thread_timings(&self) -> ThreadTaskTimings {
- THREAD_TIMINGS.with(|timings| {
- let timings = timings.lock();
- let thread_name = timings.thread_name.clone();
- let total_pushed = timings.total_pushed;
- let timings = &timings.timings;
-
- let mut vec = Vec::with_capacity(timings.len());
-
- let (s1, s2) = timings.as_slices();
- vec.extend_from_slice(s1);
- vec.extend_from_slice(s2);
-
- ThreadTaskTimings {
- thread_name,
- thread_id: std::thread::current().id(),
- timings: vec,
- total_pushed,
- }
- })
+ gpui::profiler::get_current_thread_task_timings()
}
fn is_main_thread(&self) -> bool {
@@ -204,33 +186,16 @@ extern "C" fn trampoline(context: *mut c_void) {
let location = runnable.metadata().location;
let start = Instant::now();
- let timing = TaskTiming {
+ let mut timing = TaskTiming {
location,
start,
end: None,
};
- THREAD_TIMINGS.with(|timings| {
- let mut timings = timings.lock();
- let timings = &mut timings.timings;
- if let Some(last_timing) = timings.iter_mut().rev().next() {
- if last_timing.location == timing.location {
- return;
- }
- }
-
- timings.push_back(timing);
- });
+ add_task_timing(timing);
runnable.run();
- let end = Instant::now();
- THREAD_TIMINGS.with(|timings| {
- let mut timings = timings.lock();
- let timings = &mut timings.timings;
- let Some(last_timing) = timings.iter_mut().rev().next() else {
- return;
- };
- last_timing.end = Some(end);
- });
+ timing.end = Some(Instant::now());
+ add_task_timing(timing);
}
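For context on the macOS change: the trampoline now records the same `TaskTiming` twice, once before `runnable.run()` with `end: None` and once after with the end filled in, and relies on `add_task_timing` merging an entry whose `(location, start)` matches the previous one. A minimal sketch of that merge, using placeholder types rather than the real gpui ones:

```rust
use std::collections::VecDeque;
use std::time::Instant;

// Placeholder for the real code-location type.
#[derive(Clone, PartialEq, Eq, Debug)]
struct Loc(&'static str);

#[derive(Clone, Debug)]
struct TaskTiming {
    location: Loc,
    start: Instant,
    end: Option<Instant>,
}

// Mirrors the merge logic in ThreadTimings::add_task_timing above.
fn add_task_timing(timings: &mut VecDeque<TaskTiming>, timing: TaskTiming) {
    if let Some(last) = timings.back_mut() {
        if last.location == timing.location && last.start == timing.start {
            // Same task as the last entry: just record its end time.
            last.end = timing.end;
            return;
        }
    }
    timings.push_back(timing);
}

fn main() {
    let mut timings = VecDeque::new();
    let start = Instant::now();
    let mut timing = TaskTiming { location: Loc("task_a"), start, end: None };

    add_task_timing(&mut timings, timing.clone()); // before the task runs
    // ... the runnable would execute here ...
    timing.end = Some(Instant::now());
    add_task_timing(&mut timings, timing); // after: merged into the first entry

    assert_eq!(timings.len(), 1);
    println!("{:?}", timings[0]);
}
```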
crates/gpui_windows/src/dispatcher.rs
@@ -24,7 +24,7 @@ use windows::{
use crate::{HWND, SafeHwnd, WM_GPUI_TASK_DISPATCHED_ON_MAIN_THREAD};
use gpui::{
GLOBAL_THREAD_TIMINGS, PlatformDispatcher, Priority, PriorityQueueSender, RunnableVariant,
- THREAD_TIMINGS, TaskTiming, ThreadTaskTimings, TimerResolutionGuard,
+ TaskTiming, ThreadTaskTimings, TimerResolutionGuard,
};
pub(crate) struct WindowsDispatcher {
@@ -106,25 +106,7 @@ impl PlatformDispatcher for WindowsDispatcher {
}
fn get_current_thread_timings(&self) -> gpui::ThreadTaskTimings {
- THREAD_TIMINGS.with(|timings| {
- let timings = timings.lock();
- let thread_name = timings.thread_name.clone();
- let total_pushed = timings.total_pushed;
- let timings = &timings.timings;
-
- let mut vec = Vec::with_capacity(timings.len());
-
- let (s1, s2) = timings.as_slices();
- vec.extend_from_slice(s1);
- vec.extend_from_slice(s2);
-
- gpui::ThreadTaskTimings {
- thread_name,
- thread_id: std::thread::current().id(),
- timings: vec,
- total_pushed,
- }
- })
+ gpui::profiler::get_current_thread_task_timings()
}
fn is_main_thread(&self) -> bool {