crashes.rs

  1use crash_handler::{CrashEventResult, CrashHandler};
  2use futures::future::BoxFuture;
  3use log::info;
  4use minidumper::{Client, LoopAction, MinidumpBinary};
  5use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
  6use serde::{Deserialize, Serialize};
  7use std::cell::Cell;
  8use std::mem;
  9
 10#[cfg(not(target_os = "windows"))]
 11use smol::process::Command;
 12
 13#[cfg(target_os = "macos")]
 14use std::sync::atomic::AtomicU32;
 15use std::{
 16    env,
 17    fs::{self, File},
 18    io,
 19    panic::{self, AssertUnwindSafe, PanicHookInfo},
 20    path::{Path, PathBuf},
 21    process::{self},
 22    sync::{
 23        Arc, OnceLock,
 24        atomic::{AtomicBool, Ordering},
 25    },
 26    thread,
 27    time::Duration,
 28};
 29
// Per-thread flag consulted by the panic hooks in this file: while it is
// `true`, the hooks return early instead of exiting/aborting the process, so
// the panic can unwind to the `catch_unwind` in `recoverable_panic`.
thread_local! {
    static ALLOW_UNWIND: Cell<bool> = const { Cell::new(false) };
}
 33
 34/// Catch a panic as an error instead of aborting the process. Unlike plain
 35/// `catch_unwind`, this bypasses the crash-reporting panic hook which would
 36/// normally abort before unwinding can occur.
 37///
 38/// **Use sparingly.** Prefer this only for isolating third-party code
 39/// that is known to panic, where you want to handle the failure gracefully
 40/// instead of crashing.
 41pub fn recoverable_panic<T>(closure: impl FnOnce() -> T) -> anyhow::Result<T> {
 42    ALLOW_UNWIND.with(|flag| flag.set(true));
 43    let result = panic::catch_unwind(AssertUnwindSafe(closure));
 44    ALLOW_UNWIND.with(|flag| flag.set(false));
 45    result.map_err(|payload| {
 46        let message = payload
 47            .downcast_ref::<&str>()
 48            .map(|s| s.to_string())
 49            .or_else(|| payload.downcast_ref::<String>().cloned())
 50            .unwrap_or_else(|| "unknown panic".to_string());
 51        anyhow::anyhow!("panic: {message}")
 52    })
 53}
 54
/// IPC client connected to the crash-handler subprocess. Set once the crash
/// handler has initialized and the client has connected to it; until then the
/// crash callback and the panic hook cannot request a minidump.
pub static CRASH_HANDLER: OnceLock<Arc<Client>> = OnceLock::new();
/// Set when the first minidump request is made, to avoid generating duplicate
/// crash reports.
pub static REQUESTED_MINIDUMP: AtomicBool = AtomicBool::new(false);
// Passed to the minidumper server's `run` call in `crash_server`.
// NOTE(review): presumably the period after which a client that stopped
// pinging is treated as gone — confirm against the minidumper docs.
const CRASH_HANDLER_PING_TIMEOUT: Duration = Duration::from_secs(60);
// How long `crash_server` waits for a first client connection before it
// raises its shutdown flag.
const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

// Mach thread port of the thread that ran the panic hook; that thread is left
// running when the other threads are suspended before a dump is requested.
#[cfg(target_os = "macos")]
static PANIC_THREAD_ID: AtomicU32 = AtomicU32::new(0);
 64
 65fn should_install_crash_handler() -> bool {
 66    if let Ok(value) = env::var("ZED_GENERATE_MINIDUMPS") {
 67        return value == "true" || value == "1";
 68    }
 69
 70    if *RELEASE_CHANNEL == ReleaseChannel::Dev {
 71        return false;
 72    }
 73
 74    true
 75}
 76
 77/// Install crash signal handlers and spawn the crash-handler subprocess.
 78///
 79/// The synchronous portion (signal handlers, panic hook) runs inline.
 80/// The async keepalive task is passed to `spawn` so the caller decides
 81/// which executor to schedule it on.
 82pub fn init(crash_init: InitCrashHandler, spawn: impl FnOnce(BoxFuture<'static, ()>)) {
 83    if !should_install_crash_handler() {
 84        let old_hook = panic::take_hook();
 85        panic::set_hook(Box::new(move |info| {
 86            if ALLOW_UNWIND.with(|flag| flag.get()) {
 87                return;
 88            }
 89            unsafe { env::set_var("RUST_BACKTRACE", "1") };
 90            old_hook(info);
 91            // prevent the macOS crash dialog from popping up
 92            if cfg!(target_os = "macos") {
 93                std::process::exit(1);
 94            }
 95        }));
 96        return;
 97    }
 98
 99    panic::set_hook(Box::new(panic_hook));
100
101    let handler = CrashHandler::attach(unsafe {
102        crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
103            let Some(client) = CRASH_HANDLER.get() else {
104                return CrashEventResult::Handled(false);
105            };
106
107            // only request a minidump once
108            let res = if REQUESTED_MINIDUMP
109                .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
110                .is_ok()
111            {
112                #[cfg(target_os = "macos")]
113                suspend_all_other_threads();
114
115                // on macos this "ping" is needed to ensure that all our
116                // `client.send_message` calls have been processed before we trigger the
117                // minidump request.
118                client.ping().ok();
119                client.request_dump(crash_context).is_ok()
120            } else {
121                true
122            };
123            CrashEventResult::Handled(res)
124        })
125    })
126    .expect("failed to attach signal handler");
127
128    info!("crash signal handlers installed");
129
130    spawn(Box::pin(connect_and_keepalive(crash_init, handler)));
131}
132
133/// Spawn the crash-handler subprocess, connect the IPC client, and run the
134/// keepalive ping loop. Called on a background executor by [`init`].
135async fn connect_and_keepalive(crash_init: InitCrashHandler, handler: CrashHandler) {
136    let exe = env::current_exe().expect("unable to find ourselves");
137    let zed_pid = process::id();
138    let socket_name = paths::temp_dir().join(format!("zed-crash-handler-{zed_pid}"));
139    #[cfg(not(target_os = "windows"))]
140    let _crash_handler = Command::new(exe)
141        .arg("--crash-handler")
142        .arg(&socket_name)
143        .spawn()
144        .expect("unable to spawn server process");
145
146    #[cfg(target_os = "windows")]
147    spawn_crash_handler_windows(&exe, &socket_name);
148
149    info!("spawning crash handler process");
150
151    let mut elapsed = Duration::ZERO;
152    let retry_frequency = Duration::from_millis(100);
153    let mut maybe_client = None;
154    while maybe_client.is_none() {
155        if let Ok(client) = Client::with_name(socket_name.as_path()) {
156            maybe_client = Some(client);
157            info!("connected to crash handler process after {elapsed:?}");
158            break;
159        }
160        elapsed += retry_frequency;
161        // Crash reporting is called outside of gpui in the remote server right now
162        #[allow(clippy::disallowed_methods)]
163        smol::Timer::after(retry_frequency).await;
164    }
165    let client = maybe_client.unwrap();
166    client
167        .send_message(1, serde_json::to_vec(&crash_init).unwrap())
168        .unwrap();
169
170    let client = Arc::new(client);
171
172    #[cfg(target_os = "linux")]
173    handler.set_ptracer(Some(_crash_handler.id()));
174
175    // Publishing the client to the OnceLock makes it visible to the signal
176    // handler callback installed earlier.
177    CRASH_HANDLER.set(client.clone()).ok();
178    // mem::forget so that the drop is not called
179    mem::forget(handler);
180    info!("crash handler registered");
181
182    loop {
183        client.ping().ok();
184        // Crash reporting is called outside of gpui in the remote server right now
185        #[allow(clippy::disallowed_methods)]
186        smol::Timer::after(Duration::from_secs(10)).await;
187    }
188}
189
/// Suspends every thread of the current task except the calling thread and
/// the thread recorded in `PANIC_THREAD_ID`.
///
/// The result of `task_threads` is not checked; the thread list starts out
/// null with a count of zero.
///
/// # Safety
///
/// This calls raw Mach APIs and dereferences the thread array they return.
/// Suspended threads are never resumed by this function.
/// NOTE(review): presumably only sound to call when the process is about to
/// be dumped and terminated — confirm with the call site.
#[cfg(target_os = "macos")]
unsafe fn suspend_all_other_threads() {
    let mut thread_list: mach2::mach_types::thread_act_array_t = std::ptr::null_mut();
    let mut thread_count = 0;
    unsafe {
        let task = mach2::traps::current_task();
        mach2::task::task_threads(task, &raw mut thread_list, &raw mut thread_count);

        let this_thread = mach2::mach_init::mach_thread_self();
        let panic_thread = PANIC_THREAD_ID.load(Ordering::SeqCst);
        for index in 0..thread_count {
            let thread = *thread_list.add(index as usize);
            if thread == this_thread || thread == panic_thread {
                continue;
            }
            mach2::thread_act::thread_suspend(thread);
        }
    }
}
207
/// State of the crash-handler server process, filled in from messages sent by
/// the client and read back when a minidump has been written.
pub struct CrashServer {
    /// Session metadata received in message kind 1.
    initialization_params: OnceLock<InitCrashHandler>,
    /// Panic details received in message kind 2, if the crash was a panic.
    panic_info: OnceLock<CrashPanic>,
    /// GPU specs received in message kind 3; only the first value is kept.
    active_gpu: OnceLock<system_specs::GpuSpecs>,
    /// Set to `true` when a client connects; shared with the timeout thread
    /// started by `crash_server`.
    has_connection: Arc<AtomicBool>,
}
214
/// Crash report metadata, serialized as JSON to `<logs_dir>/<session_id>.json`
/// after a minidump attempt.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct CrashInfo {
    /// Session metadata sent by the client when it connected.
    pub init: InitCrashHandler,
    /// Panic details, present when the client reported a panic.
    pub panic: Option<CrashPanic>,
    /// Description of the failure if producing the minidump failed.
    pub minidump_error: Option<String>,
    /// GPU list read from the system on Linux/FreeBSD; empty on other targets.
    pub gpus: Vec<system_specs::GpuInfo>,
    /// GPU specs reported by the client, if any were received.
    pub active_gpu: Option<system_specs::GpuSpecs>,
}
223
/// Session metadata sent from the client to the crash-handler server as JSON
/// in message kind 1.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct InitCrashHandler {
    /// Used as the file stem of the `.dmp` and `.json` files in the logs dir.
    pub session_id: String,
    pub zed_version: String,
    pub binary: String,
    pub release_channel: String,
    pub commit_sha: String,
}
232
/// Panic details sent from the panic hook to the crash-handler server as JSON
/// in message kind 2.
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct CrashPanic {
    /// The panic payload as a string (`Box<Any>` when it is not a string).
    pub message: String,
    /// Panic location formatted as `file:line`, or empty when unknown.
    pub span: String,
}
238
239impl minidumper::ServerHandler for CrashServer {
240    fn create_minidump_file(&self) -> Result<(File, PathBuf), io::Error> {
241        let err_message = "Missing initialization data";
242        let dump_path = paths::logs_dir()
243            .join(
244                &self
245                    .initialization_params
246                    .get()
247                    .expect(err_message)
248                    .session_id,
249            )
250            .with_extension("dmp");
251        let file = File::create(&dump_path)?;
252        Ok((file, dump_path))
253    }
254
255    fn on_minidump_created(&self, result: Result<MinidumpBinary, minidumper::Error>) -> LoopAction {
256        let minidump_error = match result {
257            Ok(MinidumpBinary { mut file, path, .. }) => {
258                use io::Write;
259                file.flush().ok();
260                // TODO: clean this up once https://github.com/EmbarkStudios/crash-handling/issues/101 is addressed
261                drop(file);
262                let original_file = File::open(&path).unwrap();
263                let compressed_path = path.with_extension("zstd");
264                let compressed_file = File::create(&compressed_path).unwrap();
265                zstd::stream::copy_encode(original_file, compressed_file, 0).ok();
266                fs::rename(&compressed_path, path).unwrap();
267                None
268            }
269            Err(e) => Some(format!("{e:?}")),
270        };
271
272        #[cfg(not(any(target_os = "linux", target_os = "freebsd")))]
273        let gpus = vec![];
274
275        #[cfg(any(target_os = "linux", target_os = "freebsd"))]
276        let gpus = match system_specs::read_gpu_info_from_sys_class_drm() {
277            Ok(gpus) => gpus,
278            Err(err) => {
279                log::warn!("Failed to collect GPU information for crash report: {err}");
280                vec![]
281            }
282        };
283
284        let crash_info = CrashInfo {
285            init: self
286                .initialization_params
287                .get()
288                .expect("not initialized")
289                .clone(),
290            panic: self.panic_info.get().cloned(),
291            minidump_error,
292            active_gpu: self.active_gpu.get().cloned(),
293            gpus,
294        };
295
296        let crash_data_path = paths::logs_dir()
297            .join(&crash_info.init.session_id)
298            .with_extension("json");
299
300        fs::write(crash_data_path, serde_json::to_vec(&crash_info).unwrap()).ok();
301
302        LoopAction::Exit
303    }
304
305    fn on_message(&self, kind: u32, buffer: Vec<u8>) {
306        match kind {
307            1 => {
308                let init_data =
309                    serde_json::from_slice::<InitCrashHandler>(&buffer).expect("invalid init data");
310                self.initialization_params
311                    .set(init_data)
312                    .expect("already initialized");
313            }
314            2 => {
315                let panic_data =
316                    serde_json::from_slice::<CrashPanic>(&buffer).expect("invalid panic data");
317                self.panic_info.set(panic_data).expect("already panicked");
318            }
319            3 => {
320                let gpu_specs: system_specs::GpuSpecs =
321                    bincode::deserialize(&buffer).expect("gpu specs");
322                // we ignore the case where it was already set because this message is sent
323                // on each new window. in theory all zed windows should be using the same
324                // GPU so this is fine.
325                self.active_gpu.set(gpu_specs).ok();
326            }
327            _ => {
328                panic!("invalid message kind");
329            }
330        }
331    }
332
333    fn on_client_disconnected(&self, _clients: usize) -> LoopAction {
334        LoopAction::Exit
335    }
336
337    fn on_client_connected(&self, _clients: usize) -> LoopAction {
338        self.has_connection.store(true, Ordering::SeqCst);
339        LoopAction::Continue
340    }
341}
342
343pub fn panic_hook(info: &PanicHookInfo) {
344    let message = info.payload_as_str().unwrap_or("Box<Any>").to_owned();
345
346    let span = info
347        .location()
348        .map(|loc| format!("{}:{}", loc.file(), loc.line()))
349        .unwrap_or_default();
350
351    let current_thread = std::thread::current();
352    let thread_name = current_thread.name().unwrap_or("<unnamed>");
353
354    if ALLOW_UNWIND.with(|flag| flag.get()) {
355        log::error!("thread '{thread_name}' panicked at {span} (allowing unwind):\n{message}");
356        return;
357    }
358
359    // wait 500ms for the crash handler process to start up
360    // if it's still not there just write panic info and no minidump
361    let retry_frequency = Duration::from_millis(100);
362    for _ in 0..5 {
363        if let Some(client) = CRASH_HANDLER.get() {
364            let location = info
365                .location()
366                .map_or_else(|| "<unknown>".to_owned(), |location| location.to_string());
367            log::error!("thread '{thread_name}' panicked at {location}:\n{message}...");
368            client
369                .send_message(
370                    2,
371                    serde_json::to_vec(&CrashPanic { message, span }).unwrap(),
372                )
373                .ok();
374            log::error!("triggering a crash to generate a minidump...");
375
376            #[cfg(target_os = "macos")]
377            PANIC_THREAD_ID.store(
378                unsafe { mach2::mach_init::mach_thread_self() },
379                Ordering::SeqCst,
380            );
381
382            cfg_if::cfg_if! {
383                if #[cfg(target_os = "windows")] {
384                    // https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-
385                    CrashHandler.simulate_exception(Some(234)); // (MORE_DATA_AVAILABLE)
386                    break;
387                } else {
388                    std::process::abort();
389                }
390            }
391        }
392        thread::sleep(retry_frequency);
393    }
394}
395
/// Starts `exe --crash-handler <socket_name>` as a detached process through
/// `CreateProcessW`.
///
/// The command line is assembled from the paths' native wide encoding rather
/// than from `Path::display()`, so paths that are not valid Unicode are passed
/// through unchanged instead of being lossily converted.
///
/// # Panics
///
/// Panics if the process cannot be created.
#[cfg(target_os = "windows")]
fn spawn_crash_handler_windows(exe: &Path, socket_name: &Path) {
    use std::ffi::OsStr;
    use std::os::windows::ffi::OsStrExt;
    use windows::Win32::System::Threading::{
        CreateProcessW, PROCESS_CREATION_FLAGS, PROCESS_INFORMATION, STARTF_FORCEOFFFEEDBACK,
        STARTUPINFOW,
    };
    use windows::core::PWSTR;

    // Appends `"value"` to the UTF-16 command line.
    let push_quoted = |buffer: &mut Vec<u16>, value: &OsStr| {
        buffer.push(u16::from(b'"'));
        buffer.extend(value.encode_wide());
        buffer.push(u16::from(b'"'));
    };

    // "<exe>" --crash-handler "<socket_name>", NUL-terminated.
    let mut command_line: Vec<u16> = Vec::new();
    push_quoted(&mut command_line, exe.as_os_str());
    command_line.extend(OsStr::new(" --crash-handler ").encode_wide());
    push_quoted(&mut command_line, socket_name.as_os_str());
    command_line.push(0);

    let mut startup_info = STARTUPINFOW::default();
    startup_info.cb = std::mem::size_of::<STARTUPINFOW>() as u32;

    // By default, Windows enables a "busy" cursor when a GUI application is launched.
    // This cursor is disabled once the application starts processing window messages.
    // Since the crash handler process doesn't process messages, this "busy" cursor stays enabled for a long time.
    // Disable the cursor feedback to prevent this from happening.
    startup_info.dwFlags = STARTF_FORCEOFFFEEDBACK;

    let mut process_info = PROCESS_INFORMATION::default();

    unsafe {
        CreateProcessW(
            None,
            Some(PWSTR(command_line.as_mut_ptr())),
            None,
            None,
            false,
            PROCESS_CREATION_FLAGS(0),
            None,
            None,
            &startup_info,
            &mut process_info,
        )
        .expect("unable to spawn server process");

        // The handles are not needed; the child keeps running without them.
        windows::Win32::Foundation::CloseHandle(process_info.hProcess).ok();
        windows::Win32::Foundation::CloseHandle(process_info.hThread).ok();
    }
}
446
447pub fn crash_server(socket: &Path) {
448    let Ok(mut server) = minidumper::Server::with_name(socket) else {
449        log::info!("Couldn't create socket, there may already be a running crash server");
450        return;
451    };
452
453    let shutdown = Arc::new(AtomicBool::new(false));
454    let has_connection = Arc::new(AtomicBool::new(false));
455
456    thread::Builder::new()
457        .name("CrashServerTimeout".to_owned())
458        .spawn({
459            let shutdown = shutdown.clone();
460            let has_connection = has_connection.clone();
461            move || {
462                std::thread::sleep(CRASH_HANDLER_CONNECT_TIMEOUT);
463                if !has_connection.load(Ordering::SeqCst) {
464                    shutdown.store(true, Ordering::SeqCst);
465                }
466            }
467        })
468        .unwrap();
469
470    server
471        .run(
472            Box::new(CrashServer {
473                initialization_params: OnceLock::new(),
474                panic_info: OnceLock::new(),
475                has_connection,
476                active_gpu: OnceLock::new(),
477            }),
478            &shutdown,
479            Some(CRASH_HANDLER_PING_TIMEOUT),
480        )
481        .expect("failed to run server");
482}