From a92283111b1e7ee1a1381c2307cddf8387347c8c Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Fri, 27 Mar 2026 04:20:21 -0600 Subject: [PATCH] Don't preallocate 600MB for GPUI profiler (#45197) Previously, the GPUI profiler allocated one CircularBuffer per thread, and `CircularBuffer` always preallocates space for N entries. As a result it allocated ~20MB/thread, and on my machine about 33 threads were created at startup, for a total of 600MB used. In this PR I change it to use a VecDeque that can gradually grow up to 20MB as data is written. At least in my experiments this seems to cap overall usage at about 21MB, perhaps because only one thread writes very much profiling data. Since this is fixed overhead for everyone running Zed, it seems like a worthwhile gain. This also folds duplicated code across platforms into the common gpui profiler. Before: Image After: image I got here from #35780, but I don't think this is tree-size related; it seems to be fixed overhead. Release Notes: - Improved: Significantly less memory used to record internal profiling information. 
--------- Co-authored-by: MrSubidubi Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- Cargo.lock | 1 - crates/gpui/Cargo.toml | 1 - crates/gpui/src/profiler.rs | 65 +++++++++++++++-------- crates/gpui_linux/src/linux/dispatcher.rs | 22 +------- crates/gpui_macos/src/dispatcher.rs | 49 +++-------------- crates/gpui_windows/src/dispatcher.rs | 22 +------- 6 files changed, 55 insertions(+), 105 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e6967770a5e1a6e09a2bd7a4f7e77a5307f9bfeb..dd421f8059245978ce45c0821c38e6105532b22b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7605,7 +7605,6 @@ dependencies = [ "block", "cbindgen", "chrono", - "circular-buffer", "cocoa 0.26.0", "cocoa-foundation 0.2.0", "collections", diff --git a/crates/gpui/Cargo.toml b/crates/gpui/Cargo.toml index cb4a48f63103118aafe78398d4842634c976ef9d..915f0fc03e2cc5beaf40c810654724295c41cde8 100644 --- a/crates/gpui/Cargo.toml +++ b/crates/gpui/Cargo.toml @@ -98,7 +98,6 @@ gpui_util.workspace = true waker-fn = "1.2.0" lyon = "1.0" pin-project = "1.1.10" -circular-buffer.workspace = true spin = "0.10.0" pollster.workspace = true url.workspace = true diff --git a/crates/gpui/src/profiler.rs b/crates/gpui/src/profiler.rs index dc6e9a6600f5c172050fd30cfed181ac7ed81ec4..1405b4d04964f5497bb4d7f865d6c4405507b43d 100644 --- a/crates/gpui/src/profiler.rs +++ b/crates/gpui/src/profiler.rs @@ -1,9 +1,8 @@ use scheduler::Instant; use std::{ cell::LazyCell, - collections::HashMap, - hash::Hasher, - hash::{DefaultHasher, Hash}, + collections::{HashMap, VecDeque}, + hash::{DefaultHasher, Hash, Hasher}, sync::Arc, thread::ThreadId, }; @@ -45,7 +44,6 @@ impl ThreadTaskTimings { let timings = &timings.timings; let mut vec = Vec::with_capacity(timings.len()); - let (s1, s2) = timings.as_slices(); vec.extend_from_slice(s1); vec.extend_from_slice(s2); @@ -243,11 +241,14 @@ impl ProfilingCollector { } } -// Allow 20mb of task timing entries -const MAX_TASK_TIMINGS: usize = (20 * 1024 * 1024) / 
core::mem::size_of::<TaskTiming>(); +// Allow 16MiB of task timing entries. +// VecDeque grows by doubling its capacity when full, so keep this a power of 2 to avoid wasting +// memory. +const MAX_TASK_TIMINGS: usize = (16 * 1024 * 1024) / core::mem::size_of::<TaskTiming>(); #[doc(hidden)] -pub type TaskTimings = circular_buffer::CircularBuffer<MAX_TASK_TIMINGS, TaskTiming>; +pub(crate) type TaskTimings = VecDeque<TaskTiming>; + #[doc(hidden)] pub type GuardedTaskTimings = spin::Mutex<ThreadTimings>; @@ -287,7 +288,7 @@ thread_local! { pub struct ThreadTimings { pub thread_name: Option<String>, pub thread_id: ThreadId, - pub timings: Box<TaskTimings>, + pub timings: TaskTimings, pub total_pushed: u64, } @@ -296,10 +297,38 @@ impl ThreadTimings { ThreadTimings { thread_name, thread_id, - timings: TaskTimings::boxed(), + timings: TaskTimings::new(), total_pushed: 0, } } + + /// If this task is the same as the last task, update the end time of the last task. + /// + /// Otherwise, add the new task timing to the list. + pub fn add_task_timing(&mut self, timing: TaskTiming) { + if let Some(last_timing) = self.timings.back_mut() + && last_timing.location == timing.location + && last_timing.start == timing.start + { + last_timing.end = timing.end; + } else { + while self.timings.len() + 1 > MAX_TASK_TIMINGS { + // This should only ever pop one element because it matches the insertion below. 
+ self.timings.pop_front(); + } + self.timings.push_back(timing); + self.total_pushed += 1; + } + } + + pub fn get_thread_task_timings(&self) -> ThreadTaskTimings { + ThreadTaskTimings { + thread_name: self.thread_name.clone(), + thread_id: self.thread_id, + timings: self.timings.iter().cloned().collect(), + total_pushed: self.total_pushed, + } + } } impl Drop for ThreadTimings { @@ -318,19 +347,13 @@ impl Drop for ThreadTimings { } #[doc(hidden)] -#[allow(dead_code)] // Used by Linux and Windows dispatchers, not macOS pub fn add_task_timing(timing: TaskTiming) { THREAD_TIMINGS.with(|timings| { - let mut timings = timings.lock(); - - if let Some(last_timing) = timings.timings.back_mut() { - if last_timing.location == timing.location && last_timing.start == timing.start { - last_timing.end = timing.end; - return; - } - } - - timings.timings.push_back(timing); - timings.total_pushed += 1; + timings.lock().add_task_timing(timing); }); } + +#[doc(hidden)] +pub fn get_current_thread_task_timings() -> ThreadTaskTimings { + THREAD_TIMINGS.with(|timings| timings.lock().get_thread_task_timings()) +} diff --git a/crates/gpui_linux/src/linux/dispatcher.rs b/crates/gpui_linux/src/linux/dispatcher.rs index a72276cc7658a399505fa62bd2d5fe7b41e43e14..22df5799ddf9c77bfdbc7b09accbea117de6d130 100644 --- a/crates/gpui_linux/src/linux/dispatcher.rs +++ b/crates/gpui_linux/src/linux/dispatcher.rs @@ -13,7 +13,7 @@ use std::{ use gpui::{ GLOBAL_THREAD_TIMINGS, PlatformDispatcher, Priority, PriorityQueueReceiver, - PriorityQueueSender, RunnableVariant, THREAD_TIMINGS, TaskTiming, ThreadTaskTimings, profiler, + PriorityQueueSender, RunnableVariant, TaskTiming, ThreadTaskTimings, profiler, }; struct TimerAfter { @@ -135,25 +135,7 @@ impl PlatformDispatcher for LinuxDispatcher { } fn get_current_thread_timings(&self) -> gpui::ThreadTaskTimings { - THREAD_TIMINGS.with(|timings| { - let timings = timings.lock(); - let thread_name = timings.thread_name.clone(); - let total_pushed = 
timings.total_pushed; - let timings = &timings.timings; - - let mut vec = Vec::with_capacity(timings.len()); - - let (s1, s2) = timings.as_slices(); - vec.extend_from_slice(s1); - vec.extend_from_slice(s2); - - gpui::ThreadTaskTimings { - thread_name, - thread_id: std::thread::current().id(), - timings: vec, - total_pushed, - } - }) + gpui::profiler::get_current_thread_task_timings() } fn is_main_thread(&self) -> bool { diff --git a/crates/gpui_macos/src/dispatcher.rs b/crates/gpui_macos/src/dispatcher.rs index dd6f546f68b88efe6babc13e2d923d634eff5825..f4b80ec7cbaf6deeebad1f7b6448463c9e132afe 100644 --- a/crates/gpui_macos/src/dispatcher.rs +++ b/crates/gpui_macos/src/dispatcher.rs @@ -1,7 +1,7 @@ use dispatch2::{DispatchQueue, DispatchQueueGlobalPriority, DispatchTime, GlobalQueueIdentifier}; use gpui::{ - GLOBAL_THREAD_TIMINGS, PlatformDispatcher, Priority, RunnableMeta, RunnableVariant, - THREAD_TIMINGS, TaskTiming, ThreadTaskTimings, + GLOBAL_THREAD_TIMINGS, PlatformDispatcher, Priority, RunnableMeta, RunnableVariant, TaskTiming, + ThreadTaskTimings, add_task_timing, }; use mach2::{ kern_return::KERN_SUCCESS, @@ -42,25 +42,7 @@ impl PlatformDispatcher for MacDispatcher { } fn get_current_thread_timings(&self) -> ThreadTaskTimings { - THREAD_TIMINGS.with(|timings| { - let timings = timings.lock(); - let thread_name = timings.thread_name.clone(); - let total_pushed = timings.total_pushed; - let timings = &timings.timings; - - let mut vec = Vec::with_capacity(timings.len()); - - let (s1, s2) = timings.as_slices(); - vec.extend_from_slice(s1); - vec.extend_from_slice(s2); - - ThreadTaskTimings { - thread_name, - thread_id: std::thread::current().id(), - timings: vec, - total_pushed, - } - }) + gpui::profiler::get_current_thread_task_timings() } fn is_main_thread(&self) -> bool { @@ -204,33 +186,16 @@ extern "C" fn trampoline(context: *mut c_void) { let location = runnable.metadata().location; let start = Instant::now(); - let timing = TaskTiming { + let mut timing 
= TaskTiming { location, start, end: None, }; - THREAD_TIMINGS.with(|timings| { - let mut timings = timings.lock(); - let timings = &mut timings.timings; - if let Some(last_timing) = timings.iter_mut().rev().next() { - if last_timing.location == timing.location { - return; - } - } - - timings.push_back(timing); - }); + add_task_timing(timing); runnable.run(); - let end = Instant::now(); - THREAD_TIMINGS.with(|timings| { - let mut timings = timings.lock(); - let timings = &mut timings.timings; - let Some(last_timing) = timings.iter_mut().rev().next() else { - return; - }; - last_timing.end = Some(end); - }); + timing.end = Some(Instant::now()); + add_task_timing(timing); } diff --git a/crates/gpui_windows/src/dispatcher.rs b/crates/gpui_windows/src/dispatcher.rs index a5cfd9dc10d9afcce9580565943c28cb83dc9dab..60b9898cef3076fa64898ebcb7223616150bf01b 100644 --- a/crates/gpui_windows/src/dispatcher.rs +++ b/crates/gpui_windows/src/dispatcher.rs @@ -24,7 +24,7 @@ use windows::{ use crate::{HWND, SafeHwnd, WM_GPUI_TASK_DISPATCHED_ON_MAIN_THREAD}; use gpui::{ GLOBAL_THREAD_TIMINGS, PlatformDispatcher, Priority, PriorityQueueSender, RunnableVariant, - THREAD_TIMINGS, TaskTiming, ThreadTaskTimings, TimerResolutionGuard, + TaskTiming, ThreadTaskTimings, TimerResolutionGuard, }; pub(crate) struct WindowsDispatcher { @@ -106,25 +106,7 @@ impl PlatformDispatcher for WindowsDispatcher { } fn get_current_thread_timings(&self) -> gpui::ThreadTaskTimings { - THREAD_TIMINGS.with(|timings| { - let timings = timings.lock(); - let thread_name = timings.thread_name.clone(); - let total_pushed = timings.total_pushed; - let timings = &timings.timings; - - let mut vec = Vec::with_capacity(timings.len()); - - let (s1, s2) = timings.as_slices(); - vec.extend_from_slice(s1); - vec.extend_from_slice(s2); - - gpui::ThreadTaskTimings { - thread_name, - thread_id: std::thread::current().id(), - timings: vec, - total_pushed, - } - }) + gpui::profiler::get_current_thread_task_timings() } fn 
is_main_thread(&self) -> bool {