1use crash_handler::{CrashEventResult, CrashHandler};
2use futures::future::BoxFuture;
3use log::info;
4use minidumper::{Client, LoopAction, MinidumpBinary};
5use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
6use serde::{Deserialize, Serialize};
7use std::cell::Cell;
8use std::mem;
9
10#[cfg(not(target_os = "windows"))]
11use smol::process::Command;
12
13#[cfg(target_os = "macos")]
14use std::sync::atomic::AtomicU32;
15use std::{
16 env,
17 fs::{self, File},
18 io,
19 panic::{self, AssertUnwindSafe, PanicHookInfo},
20 path::{Path, PathBuf},
21 process::{self},
22 sync::{
23 Arc, OnceLock,
24 atomic::{AtomicBool, Ordering},
25 },
26 thread,
27 time::Duration,
28};
29
thread_local! {
    // Per-thread flag read by the panic hooks in this file. While it is `true`,
    // a panic on this thread is allowed to unwind instead of being treated as a
    // crash. It starts out `false`; within this file only `recoverable_panic`
    // changes it.
    static ALLOW_UNWIND: Cell<bool> = const { Cell::new(false) };
}
33
34/// Catch a panic as an error instead of aborting the process. Unlike plain
35/// `catch_unwind`, this bypasses the crash-reporting panic hook which would
36/// normally abort before unwinding can occur.
37///
38/// **Use sparingly.** Prefer this only for isolating third-party code
39/// that is known to panic, where you want to handle the failure gracefully
40/// instead of crashing.
41pub fn recoverable_panic<T>(closure: impl FnOnce() -> T) -> anyhow::Result<T> {
42 ALLOW_UNWIND.with(|flag| flag.set(true));
43 let result = panic::catch_unwind(AssertUnwindSafe(closure));
44 ALLOW_UNWIND.with(|flag| flag.set(false));
45 result.map_err(|payload| {
46 let message = payload
47 .downcast_ref::<&str>()
48 .map(|s| s.to_string())
49 .or_else(|| payload.downcast_ref::<String>().cloned())
50 .unwrap_or_else(|| "unknown panic".to_string());
51 anyhow::anyhow!("panic: {message}")
52 })
53}
54
/// IPC client for the crash-handler subprocess. Set once the crash handler has
/// initialized and the client has connected to it; read by the crash signal
/// callback and by the panic hook.
pub static CRASH_HANDLER: OnceLock<Arc<Client>> = OnceLock::new();
/// Set when the first minidump request is made to avoid generating duplicate crash reports.
pub static REQUESTED_MINIDUMP: AtomicBool = AtomicBool::new(false);
// Passed as the timeout argument to the crash server's `run` call.
// NOTE(review): presumably the period after which a client that stopped
// pinging is considered gone — confirm against the minidumper docs.
const CRASH_HANDLER_PING_TIMEOUT: Duration = Duration::from_secs(60);
// How long the crash server's timeout thread waits for a first client
// connection before it raises the shutdown flag.
const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

// Mach thread port of the panicking thread, stored by the panic hook so that
// `suspend_all_other_threads` leaves that thread running. Stays 0 until a
// panic has been recorded.
#[cfg(target_os = "macos")]
static PANIC_THREAD_ID: AtomicU32 = AtomicU32::new(0);
64
65fn should_install_crash_handler() -> bool {
66 if let Ok(value) = env::var("ZED_GENERATE_MINIDUMPS") {
67 return value == "true" || value == "1";
68 }
69
70 if *RELEASE_CHANNEL == ReleaseChannel::Dev {
71 return false;
72 }
73
74 true
75}
76
77/// Install crash signal handlers and spawn the crash-handler subprocess.
78///
79/// The synchronous portion (signal handlers, panic hook) runs inline.
80/// The async keepalive task is passed to `spawn` so the caller decides
81/// which executor to schedule it on.
82pub fn init(crash_init: InitCrashHandler, spawn: impl FnOnce(BoxFuture<'static, ()>)) {
83 if !should_install_crash_handler() {
84 let old_hook = panic::take_hook();
85 panic::set_hook(Box::new(move |info| {
86 if ALLOW_UNWIND.with(|flag| flag.get()) {
87 return;
88 }
89 unsafe { env::set_var("RUST_BACKTRACE", "1") };
90 old_hook(info);
91 // prevent the macOS crash dialog from popping up
92 if cfg!(target_os = "macos") {
93 std::process::exit(1);
94 }
95 }));
96 return;
97 }
98
99 panic::set_hook(Box::new(panic_hook));
100
101 let handler = CrashHandler::attach(unsafe {
102 crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
103 let Some(client) = CRASH_HANDLER.get() else {
104 return CrashEventResult::Handled(false);
105 };
106
107 // only request a minidump once
108 let res = if REQUESTED_MINIDUMP
109 .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
110 .is_ok()
111 {
112 #[cfg(target_os = "macos")]
113 suspend_all_other_threads();
114
115 // on macos this "ping" is needed to ensure that all our
116 // `client.send_message` calls have been processed before we trigger the
117 // minidump request.
118 client.ping().ok();
119 client.request_dump(crash_context).is_ok()
120 } else {
121 true
122 };
123 CrashEventResult::Handled(res)
124 })
125 })
126 .expect("failed to attach signal handler");
127
128 info!("crash signal handlers installed");
129
130 spawn(Box::pin(connect_and_keepalive(crash_init, handler)));
131}
132
133/// Spawn the crash-handler subprocess, connect the IPC client, and run the
134/// keepalive ping loop. Called on a background executor by [`init`].
135async fn connect_and_keepalive(crash_init: InitCrashHandler, handler: CrashHandler) {
136 let exe = env::current_exe().expect("unable to find ourselves");
137 let zed_pid = process::id();
138 let socket_name = paths::temp_dir().join(format!("zed-crash-handler-{zed_pid}"));
139 #[cfg(not(target_os = "windows"))]
140 let _crash_handler = Command::new(exe)
141 .arg("--crash-handler")
142 .arg(&socket_name)
143 .spawn()
144 .expect("unable to spawn server process");
145
146 #[cfg(target_os = "windows")]
147 spawn_crash_handler_windows(&exe, &socket_name);
148
149 info!("spawning crash handler process");
150
151 let mut elapsed = Duration::ZERO;
152 let retry_frequency = Duration::from_millis(100);
153 let mut maybe_client = None;
154 while maybe_client.is_none() {
155 if let Ok(client) = Client::with_name(socket_name.as_path()) {
156 maybe_client = Some(client);
157 info!("connected to crash handler process after {elapsed:?}");
158 break;
159 }
160 elapsed += retry_frequency;
161 // Crash reporting is called outside of gpui in the remote server right now
162 #[allow(clippy::disallowed_methods)]
163 smol::Timer::after(retry_frequency).await;
164 }
165 let client = maybe_client.unwrap();
166 client
167 .send_message(1, serde_json::to_vec(&crash_init).unwrap())
168 .unwrap();
169
170 let client = Arc::new(client);
171
172 #[cfg(target_os = "linux")]
173 handler.set_ptracer(Some(_crash_handler.id()));
174
175 // Publishing the client to the OnceLock makes it visible to the signal
176 // handler callback installed earlier.
177 CRASH_HANDLER.set(client.clone()).ok();
178 // mem::forget so that the drop is not called
179 mem::forget(handler);
180 info!("crash handler registered");
181
182 loop {
183 client.ping().ok();
184 // Crash reporting is called outside of gpui in the remote server right now
185 #[allow(clippy::disallowed_methods)]
186 smol::Timer::after(Duration::from_secs(10)).await;
187 }
188}
189
/// Suspend every thread of the current task except the calling thread and the
/// thread recorded in `PANIC_THREAD_ID`.
///
/// The result of `task_threads` is not checked; the loop simply walks however
/// many entries it reported.
///
/// # Safety
///
/// NOTE(review): the exact caller contract is not spelled out in this file.
/// The function issues raw Mach calls and suspends threads without any way to
/// resume them, so it presumably must only be called while the process is
/// already crashing — confirm.
#[cfg(target_os = "macos")]
unsafe fn suspend_all_other_threads() {
    let mut thread_list: mach2::mach_types::thread_act_array_t = std::ptr::null_mut();
    let mut thread_count = 0;
    unsafe {
        let this_task = mach2::traps::current_task();
        mach2::task::task_threads(this_task, &raw mut thread_list, &raw mut thread_count);
    }

    let this_thread = unsafe { mach2::mach_init::mach_thread_self() };
    let panicking_thread = PANIC_THREAD_ID.load(Ordering::SeqCst);

    for index in 0..thread_count {
        let candidate = unsafe { *thread_list.add(index as usize) };
        if candidate == this_thread || candidate == panicking_thread {
            continue;
        }
        unsafe { mach2::thread_act::thread_suspend(candidate) };
    }
}
207
/// State kept by the crash-handler process while it serves a client. It is
/// the `minidumper::ServerHandler` implementation that `crash_server` runs.
pub struct CrashServer {
    /// Session metadata received in a kind-1 message. Must be present before
    /// a minidump file or crash report can be written.
    initialization_params: OnceLock<InitCrashHandler>,
    /// Panic details received in a kind-2 message, if the crash was a panic.
    panic_info: OnceLock<CrashPanic>,
    /// GPU specs received in a kind-3 message; only the first one is kept.
    active_gpu: OnceLock<system_specs::GpuSpecs>,
    /// Set to `true` when a client connects. Shared with the timeout thread
    /// started by `crash_server`, which reads it to decide whether to shut down.
    has_connection: Arc<AtomicBool>,
}
214
/// Crash report metadata that the crash server serializes as JSON into the
/// logs directory, in a file named after the session id with a `.json`
/// extension.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct CrashInfo {
    /// Session metadata the client sent when it connected.
    pub init: InitCrashHandler,
    /// Panic details, when the client reported a panic before the dump.
    pub panic: Option<CrashPanic>,
    /// Error text recorded when producing the minidump failed; `None` on success.
    pub minidump_error: Option<String>,
    /// GPU information read from `/sys/class/drm` on Linux and FreeBSD; empty
    /// on other platforms or when that read fails.
    pub gpus: Vec<system_specs::GpuInfo>,
    /// GPU specs reported by the client, if any were received.
    pub active_gpu: Option<system_specs::GpuSpecs>,
}
223
/// Session metadata the client sends to the crash server as a JSON-encoded
/// kind-1 message right after connecting. It is copied verbatim into
/// [`CrashInfo::init`].
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct InitCrashHandler {
    /// Used as the file stem of the `.dmp` and `.json` files written to the
    /// logs directory.
    pub session_id: String,
    // NOTE(review): the fields below are only carried through into the crash
    // report by this file; their exact format is defined by the caller of
    // `init` — confirm there.
    pub zed_version: String,
    pub binary: String,
    pub release_channel: String,
    pub commit_sha: String,
}
232
/// Panic details sent from the panic hook to the crash server as a
/// JSON-encoded kind-2 message.
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct CrashPanic {
    /// Panic payload as text, or `Box<Any>` when the payload is not a string.
    pub message: String,
    /// Panic location formatted as `file:line`; empty when no location is available.
    pub span: String,
}
238
239impl minidumper::ServerHandler for CrashServer {
240 fn create_minidump_file(&self) -> Result<(File, PathBuf), io::Error> {
241 let err_message = "Missing initialization data";
242 let dump_path = paths::logs_dir()
243 .join(
244 &self
245 .initialization_params
246 .get()
247 .expect(err_message)
248 .session_id,
249 )
250 .with_extension("dmp");
251 let file = File::create(&dump_path)?;
252 Ok((file, dump_path))
253 }
254
255 fn on_minidump_created(&self, result: Result<MinidumpBinary, minidumper::Error>) -> LoopAction {
256 let minidump_error = match result {
257 Ok(MinidumpBinary { mut file, path, .. }) => {
258 use io::Write;
259 file.flush().ok();
260 // TODO: clean this up once https://github.com/EmbarkStudios/crash-handling/issues/101 is addressed
261 drop(file);
262 let original_file = File::open(&path).unwrap();
263 let compressed_path = path.with_extension("zstd");
264 let compressed_file = File::create(&compressed_path).unwrap();
265 zstd::stream::copy_encode(original_file, compressed_file, 0).ok();
266 fs::rename(&compressed_path, path).unwrap();
267 None
268 }
269 Err(e) => Some(format!("{e:?}")),
270 };
271
272 #[cfg(not(any(target_os = "linux", target_os = "freebsd")))]
273 let gpus = vec![];
274
275 #[cfg(any(target_os = "linux", target_os = "freebsd"))]
276 let gpus = match system_specs::read_gpu_info_from_sys_class_drm() {
277 Ok(gpus) => gpus,
278 Err(err) => {
279 log::warn!("Failed to collect GPU information for crash report: {err}");
280 vec![]
281 }
282 };
283
284 let crash_info = CrashInfo {
285 init: self
286 .initialization_params
287 .get()
288 .expect("not initialized")
289 .clone(),
290 panic: self.panic_info.get().cloned(),
291 minidump_error,
292 active_gpu: self.active_gpu.get().cloned(),
293 gpus,
294 };
295
296 let crash_data_path = paths::logs_dir()
297 .join(&crash_info.init.session_id)
298 .with_extension("json");
299
300 fs::write(crash_data_path, serde_json::to_vec(&crash_info).unwrap()).ok();
301
302 LoopAction::Exit
303 }
304
305 fn on_message(&self, kind: u32, buffer: Vec<u8>) {
306 match kind {
307 1 => {
308 let init_data =
309 serde_json::from_slice::<InitCrashHandler>(&buffer).expect("invalid init data");
310 self.initialization_params
311 .set(init_data)
312 .expect("already initialized");
313 }
314 2 => {
315 let panic_data =
316 serde_json::from_slice::<CrashPanic>(&buffer).expect("invalid panic data");
317 self.panic_info.set(panic_data).expect("already panicked");
318 }
319 3 => {
320 let gpu_specs: system_specs::GpuSpecs =
321 bincode::deserialize(&buffer).expect("gpu specs");
322 // we ignore the case where it was already set because this message is sent
323 // on each new window. in theory all zed windows should be using the same
324 // GPU so this is fine.
325 self.active_gpu.set(gpu_specs).ok();
326 }
327 _ => {
328 panic!("invalid message kind");
329 }
330 }
331 }
332
333 fn on_client_disconnected(&self, _clients: usize) -> LoopAction {
334 LoopAction::Exit
335 }
336
337 fn on_client_connected(&self, _clients: usize) -> LoopAction {
338 self.has_connection.store(true, Ordering::SeqCst);
339 LoopAction::Continue
340 }
341}
342
343pub fn panic_hook(info: &PanicHookInfo) {
344 let message = info.payload_as_str().unwrap_or("Box<Any>").to_owned();
345
346 let span = info
347 .location()
348 .map(|loc| format!("{}:{}", loc.file(), loc.line()))
349 .unwrap_or_default();
350
351 let current_thread = std::thread::current();
352 let thread_name = current_thread.name().unwrap_or("<unnamed>");
353
354 if ALLOW_UNWIND.with(|flag| flag.get()) {
355 log::error!("thread '{thread_name}' panicked at {span} (allowing unwind):\n{message}");
356 return;
357 }
358
359 // wait 500ms for the crash handler process to start up
360 // if it's still not there just write panic info and no minidump
361 let retry_frequency = Duration::from_millis(100);
362 for _ in 0..5 {
363 if let Some(client) = CRASH_HANDLER.get() {
364 let location = info
365 .location()
366 .map_or_else(|| "<unknown>".to_owned(), |location| location.to_string());
367 log::error!("thread '{thread_name}' panicked at {location}:\n{message}...");
368 client
369 .send_message(
370 2,
371 serde_json::to_vec(&CrashPanic { message, span }).unwrap(),
372 )
373 .ok();
374 log::error!("triggering a crash to generate a minidump...");
375
376 #[cfg(target_os = "macos")]
377 PANIC_THREAD_ID.store(
378 unsafe { mach2::mach_init::mach_thread_self() },
379 Ordering::SeqCst,
380 );
381
382 cfg_if::cfg_if! {
383 if #[cfg(target_os = "windows")] {
384 // https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-
385 CrashHandler.simulate_exception(Some(234)); // (MORE_DATA_AVAILABLE)
386 break;
387 } else {
388 std::process::abort();
389 }
390 }
391 }
392 thread::sleep(retry_frequency);
393 }
394}
395
/// Launch the crash-handler subprocess on Windows via `CreateProcessW`.
///
/// The command line is `"<exe>" --crash-handler "<socket_name>"`. It is built
/// directly from the paths' wide-character encoding, so paths that are not
/// valid Unicode are passed through unchanged instead of being altered by a
/// lossy string conversion.
///
/// The returned process and thread handles are closed immediately; the child
/// keeps running on its own.
///
/// # Panics
///
/// Panics if the process cannot be created.
#[cfg(target_os = "windows")]
fn spawn_crash_handler_windows(exe: &Path, socket_name: &Path) {
    use std::ffi::OsStr;
    use std::iter::once;
    use std::os::windows::ffi::OsStrExt;
    use windows::Win32::System::Threading::{
        CreateProcessW, PROCESS_CREATION_FLAGS, PROCESS_INFORMATION, STARTF_FORCEOFFFEEDBACK,
        STARTUPINFOW,
    };
    use windows::core::PWSTR;

    // NUL-terminated UTF-16 command line. `CreateProcessW` may modify the
    // buffer, hence the mutable `Vec`.
    let quote = OsStr::new("\"");
    let mut command_line: Vec<u16> = quote
        .encode_wide()
        .chain(exe.as_os_str().encode_wide())
        .chain(OsStr::new("\" --crash-handler \"").encode_wide())
        .chain(socket_name.as_os_str().encode_wide())
        .chain(quote.encode_wide())
        .chain(once(0))
        .collect();

    let mut startup_info = STARTUPINFOW::default();
    startup_info.cb = std::mem::size_of::<STARTUPINFOW>() as u32;

    // By default, Windows enables a "busy" cursor when a GUI application is launched.
    // This cursor is disabled once the application starts processing window messages.
    // Since the crash handler process doesn't process messages, this "busy" cursor stays enabled for a long time.
    // Disable the cursor feedback to prevent this from happening.
    startup_info.dwFlags = STARTF_FORCEOFFFEEDBACK;

    let mut process_info = PROCESS_INFORMATION::default();

    // SAFETY: `command_line` is a NUL-terminated buffer that outlives the
    // call, and `startup_info` / `process_info` are valid, initialized
    // structures owned by this function.
    unsafe {
        CreateProcessW(
            None,
            Some(PWSTR(command_line.as_mut_ptr())),
            None,
            None,
            false,
            PROCESS_CREATION_FLAGS(0),
            None,
            None,
            &startup_info,
            &mut process_info,
        )
        .expect("unable to spawn server process");

        windows::Win32::Foundation::CloseHandle(process_info.hProcess).ok();
        windows::Win32::Foundation::CloseHandle(process_info.hThread).ok();
    }
}
446
447pub fn crash_server(socket: &Path) {
448 let Ok(mut server) = minidumper::Server::with_name(socket) else {
449 log::info!("Couldn't create socket, there may already be a running crash server");
450 return;
451 };
452
453 let shutdown = Arc::new(AtomicBool::new(false));
454 let has_connection = Arc::new(AtomicBool::new(false));
455
456 thread::Builder::new()
457 .name("CrashServerTimeout".to_owned())
458 .spawn({
459 let shutdown = shutdown.clone();
460 let has_connection = has_connection.clone();
461 move || {
462 std::thread::sleep(CRASH_HANDLER_CONNECT_TIMEOUT);
463 if !has_connection.load(Ordering::SeqCst) {
464 shutdown.store(true, Ordering::SeqCst);
465 }
466 }
467 })
468 .unwrap();
469
470 server
471 .run(
472 Box::new(CrashServer {
473 initialization_params: OnceLock::new(),
474 panic_info: OnceLock::new(),
475 has_connection,
476 active_gpu: OnceLock::new(),
477 }),
478 &shutdown,
479 Some(CRASH_HANDLER_PING_TIMEOUT),
480 )
481 .expect("failed to run server");
482}