crashes.rs

  1use crash_handler::{CrashEventResult, CrashHandler};
  2use futures::future::BoxFuture;
  3use log::info;
  4use minidumper::{Client, LoopAction, MinidumpBinary};
  5use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
  6use serde::{Deserialize, Serialize};
  7use std::mem;
  8
  9#[cfg(not(target_os = "windows"))]
 10use smol::process::Command;
 11
 12#[cfg(target_os = "macos")]
 13use std::sync::atomic::AtomicU32;
 14use std::{
 15    env,
 16    fs::{self, File},
 17    io,
 18    panic::{self, PanicHookInfo},
 19    path::{Path, PathBuf},
 20    process::{self},
 21    sync::{
 22        Arc, OnceLock,
 23        atomic::{AtomicBool, Ordering},
 24    },
 25    thread,
 26    time::Duration,
 27};
 28
/// IPC client connected to the crash-handler subprocess.
///
/// Set once the crash handler has initialized and the client has connected to it;
/// until then both the crash callback and the panic hook find it empty.
pub static CRASH_HANDLER: OnceLock<Arc<Client>> = OnceLock::new();
/// Set when the first minidump request is made, to avoid generating duplicate crash reports.
pub static REQUESTED_MINIDUMP: AtomicBool = AtomicBool::new(false);
/// Timeout handed to the crash server's `run` call.
// NOTE(review): presumably how long the server tolerates a client that stopped pinging — confirm against minidumper.
const CRASH_HANDLER_PING_TIMEOUT: Duration = Duration::from_secs(60);
/// How long the crash server waits for its first client connection before it is told to shut down.
const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

/// Mach thread port of the panicking thread, recorded by `panic_hook` so that
/// `suspend_all_other_threads` does not suspend it. Zero until a panic is recorded.
#[cfg(target_os = "macos")]
static PANIC_THREAD_ID: AtomicU32 = AtomicU32::new(0);
 38
 39fn should_install_crash_handler() -> bool {
 40    if let Ok(value) = env::var("ZED_GENERATE_MINIDUMPS") {
 41        return value == "true" || value == "1";
 42    }
 43
 44    if *RELEASE_CHANNEL == ReleaseChannel::Dev {
 45        return false;
 46    }
 47
 48    true
 49}
 50
 51/// Install crash signal handlers and spawn the crash-handler subprocess.
 52///
 53/// The synchronous portion (signal handlers, panic hook) runs inline.
 54/// The async keepalive task is passed to `spawn` so the caller decides
 55/// which executor to schedule it on.
 56pub fn init(crash_init: InitCrashHandler, spawn: impl FnOnce(BoxFuture<'static, ()>)) {
 57    if !should_install_crash_handler() {
 58        let old_hook = panic::take_hook();
 59        panic::set_hook(Box::new(move |info| {
 60            unsafe { env::set_var("RUST_BACKTRACE", "1") };
 61            old_hook(info);
 62            // prevent the macOS crash dialog from popping up
 63            if cfg!(target_os = "macos") {
 64                std::process::exit(1);
 65            }
 66        }));
 67        return;
 68    }
 69
 70    panic::set_hook(Box::new(panic_hook));
 71
 72    let handler = CrashHandler::attach(unsafe {
 73        crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
 74            let Some(client) = CRASH_HANDLER.get() else {
 75                return CrashEventResult::Handled(false);
 76            };
 77
 78            // only request a minidump once
 79            let res = if REQUESTED_MINIDUMP
 80                .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
 81                .is_ok()
 82            {
 83                #[cfg(target_os = "macos")]
 84                suspend_all_other_threads();
 85
 86                // on macos this "ping" is needed to ensure that all our
 87                // `client.send_message` calls have been processed before we trigger the
 88                // minidump request.
 89                client.ping().ok();
 90                client.request_dump(crash_context).is_ok()
 91            } else {
 92                true
 93            };
 94            CrashEventResult::Handled(res)
 95        })
 96    })
 97    .expect("failed to attach signal handler");
 98
 99    info!("crash signal handlers installed");
100
101    spawn(Box::pin(connect_and_keepalive(crash_init, handler)));
102}
103
104/// Spawn the crash-handler subprocess, connect the IPC client, and run the
105/// keepalive ping loop. Called on a background executor by [`init`].
106async fn connect_and_keepalive(crash_init: InitCrashHandler, handler: CrashHandler) {
107    let exe = env::current_exe().expect("unable to find ourselves");
108    let zed_pid = process::id();
109    let socket_name = paths::temp_dir().join(format!("zed-crash-handler-{zed_pid}"));
110    #[cfg(not(target_os = "windows"))]
111    let _crash_handler = Command::new(exe)
112        .arg("--crash-handler")
113        .arg(&socket_name)
114        .spawn()
115        .expect("unable to spawn server process");
116
117    #[cfg(target_os = "windows")]
118    spawn_crash_handler_windows(&exe, &socket_name);
119
120    info!("spawning crash handler process");
121
122    let mut elapsed = Duration::ZERO;
123    let retry_frequency = Duration::from_millis(100);
124    let mut maybe_client = None;
125    while maybe_client.is_none() {
126        if let Ok(client) = Client::with_name(socket_name.as_path()) {
127            maybe_client = Some(client);
128            info!("connected to crash handler process after {elapsed:?}");
129            break;
130        }
131        elapsed += retry_frequency;
132        // Crash reporting is called outside of gpui in the remote server right now
133        #[allow(clippy::disallowed_methods)]
134        smol::Timer::after(retry_frequency).await;
135    }
136    let client = maybe_client.unwrap();
137    client
138        .send_message(1, serde_json::to_vec(&crash_init).unwrap())
139        .unwrap();
140
141    let client = Arc::new(client);
142
143    #[cfg(target_os = "linux")]
144    handler.set_ptracer(Some(_crash_handler.id()));
145
146    // Publishing the client to the OnceLock makes it visible to the signal
147    // handler callback installed earlier.
148    CRASH_HANDLER.set(client.clone()).ok();
149    // mem::forget so that the drop is not called
150    mem::forget(handler);
151    info!("crash handler registered");
152
153    loop {
154        client.ping().ok();
155        // Crash reporting is called outside of gpui in the remote server right now
156        #[allow(clippy::disallowed_methods)]
157        smol::Timer::after(Duration::from_secs(10)).await;
158    }
159}
160
/// Suspends every thread of the current task except the calling thread and
/// the thread recorded in `PANIC_THREAD_ID`.
///
/// Called from the crash event callback right before the minidump is
/// requested. The results of `task_threads` and `thread_suspend` are ignored,
/// the suspended threads are never resumed here, and the thread list returned
/// by `task_threads` is not released by this function.
///
/// # Safety
///
/// NOTE(review): the function has no parameters, so the contract is about
/// *when* it is called — presumably only while the process is already
/// crashing, since other threads are left suspended. Confirm with the caller.
#[cfg(target_os = "macos")]
unsafe fn suspend_all_other_threads() {
    let task = unsafe { mach2::traps::current_task() };
    // Out-parameters filled in by `task_threads`: the thread list and its length.
    let mut threads: mach2::mach_types::thread_act_array_t = std::ptr::null_mut();
    let mut count = 0;
    unsafe {
        mach2::task::task_threads(task, &raw mut threads, &raw mut count);
    }
    let current = unsafe { mach2::mach_init::mach_thread_self() };
    // Thread that ran the panic hook (0 if no panic was recorded).
    let panic_thread = PANIC_THREAD_ID.load(Ordering::SeqCst);
    for i in 0..count {
        let t = unsafe { *threads.add(i as usize) };
        // Keep the crashing thread and the panicking thread running.
        if t != current && t != panic_thread {
            unsafe { mach2::thread_act::thread_suspend(t) };
        }
    }
}
178
/// State of the crash-handler server process; implements
/// `minidumper::ServerHandler` to react to messages from the crashing client.
pub struct CrashServer {
    /// Init payload received as message kind `1`; its `session_id` names the output files.
    initialization_params: OnceLock<InitCrashHandler>,
    /// Panic details received as message kind `2`, if the crash was a Rust panic.
    panic_info: OnceLock<CrashPanic>,
    /// GPU specs received as message kind `3`; only the first value received is kept.
    active_gpu: OnceLock<system_specs::GpuSpecs>,
    /// Set to `true` when a client connects; shared with the connect-timeout thread in `crash_server`.
    has_connection: Arc<AtomicBool>,
}
185
/// Crash metadata written as JSON next to the minidump, at
/// `<logs_dir>/<session_id>.json`.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct CrashInfo {
    /// Init payload the client sent when it connected.
    pub init: InitCrashHandler,
    /// Panic details, when the client reported a panic before the dump.
    pub panic: Option<CrashPanic>,
    /// Description of the error if producing the minidump failed; `None` on success.
    pub minidump_error: Option<String>,
    /// GPUs found on the system; only collected on Linux and FreeBSD, empty elsewhere.
    pub gpus: Vec<system_specs::GpuInfo>,
    /// GPU specs the client reported, if any.
    pub active_gpu: Option<system_specs::GpuSpecs>,
}
194
/// Payload the client sends to the crash server (JSON, message kind `1`)
/// right after connecting. It is copied into [`CrashInfo`] when a crash is recorded.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct InitCrashHandler {
    /// Used as the file stem of the `.dmp` and `.json` files written to the logs directory.
    pub session_id: String,
    // NOTE(review): the remaining fields are only carried through into the crash
    // report here; their exact format is defined by the caller of `init`.
    pub zed_version: String,
    pub binary: String,
    pub release_channel: String,
    pub commit_sha: String,
}
203
/// Panic details sent by `panic_hook` to the crash server (JSON, message kind `2`).
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct CrashPanic {
    /// Panic payload as a string, or `"Box<Any>"` when the payload is not a string.
    pub message: String,
    /// Panic location formatted as `file:line`; empty when the location is unknown.
    pub span: String,
}
209
210impl minidumper::ServerHandler for CrashServer {
211    fn create_minidump_file(&self) -> Result<(File, PathBuf), io::Error> {
212        let err_message = "Missing initialization data";
213        let dump_path = paths::logs_dir()
214            .join(
215                &self
216                    .initialization_params
217                    .get()
218                    .expect(err_message)
219                    .session_id,
220            )
221            .with_extension("dmp");
222        let file = File::create(&dump_path)?;
223        Ok((file, dump_path))
224    }
225
226    fn on_minidump_created(&self, result: Result<MinidumpBinary, minidumper::Error>) -> LoopAction {
227        let minidump_error = match result {
228            Ok(MinidumpBinary { mut file, path, .. }) => {
229                use io::Write;
230                file.flush().ok();
231                // TODO: clean this up once https://github.com/EmbarkStudios/crash-handling/issues/101 is addressed
232                drop(file);
233                let original_file = File::open(&path).unwrap();
234                let compressed_path = path.with_extension("zstd");
235                let compressed_file = File::create(&compressed_path).unwrap();
236                zstd::stream::copy_encode(original_file, compressed_file, 0).ok();
237                fs::rename(&compressed_path, path).unwrap();
238                None
239            }
240            Err(e) => Some(format!("{e:?}")),
241        };
242
243        #[cfg(not(any(target_os = "linux", target_os = "freebsd")))]
244        let gpus = vec![];
245
246        #[cfg(any(target_os = "linux", target_os = "freebsd"))]
247        let gpus = match system_specs::read_gpu_info_from_sys_class_drm() {
248            Ok(gpus) => gpus,
249            Err(err) => {
250                log::warn!("Failed to collect GPU information for crash report: {err}");
251                vec![]
252            }
253        };
254
255        let crash_info = CrashInfo {
256            init: self
257                .initialization_params
258                .get()
259                .expect("not initialized")
260                .clone(),
261            panic: self.panic_info.get().cloned(),
262            minidump_error,
263            active_gpu: self.active_gpu.get().cloned(),
264            gpus,
265        };
266
267        let crash_data_path = paths::logs_dir()
268            .join(&crash_info.init.session_id)
269            .with_extension("json");
270
271        fs::write(crash_data_path, serde_json::to_vec(&crash_info).unwrap()).ok();
272
273        LoopAction::Exit
274    }
275
276    fn on_message(&self, kind: u32, buffer: Vec<u8>) {
277        match kind {
278            1 => {
279                let init_data =
280                    serde_json::from_slice::<InitCrashHandler>(&buffer).expect("invalid init data");
281                self.initialization_params
282                    .set(init_data)
283                    .expect("already initialized");
284            }
285            2 => {
286                let panic_data =
287                    serde_json::from_slice::<CrashPanic>(&buffer).expect("invalid panic data");
288                self.panic_info.set(panic_data).expect("already panicked");
289            }
290            3 => {
291                let gpu_specs: system_specs::GpuSpecs =
292                    bincode::deserialize(&buffer).expect("gpu specs");
293                // we ignore the case where it was already set because this message is sent
294                // on each new window. in theory all zed windows should be using the same
295                // GPU so this is fine.
296                self.active_gpu.set(gpu_specs).ok();
297            }
298            _ => {
299                panic!("invalid message kind");
300            }
301        }
302    }
303
304    fn on_client_disconnected(&self, _clients: usize) -> LoopAction {
305        LoopAction::Exit
306    }
307
308    fn on_client_connected(&self, _clients: usize) -> LoopAction {
309        self.has_connection.store(true, Ordering::SeqCst);
310        LoopAction::Continue
311    }
312}
313
314pub fn panic_hook(info: &PanicHookInfo) {
315    let message = info.payload_as_str().unwrap_or("Box<Any>").to_owned();
316
317    let span = info
318        .location()
319        .map(|loc| format!("{}:{}", loc.file(), loc.line()))
320        .unwrap_or_default();
321
322    let current_thread = std::thread::current();
323    let thread_name = current_thread.name().unwrap_or("<unnamed>");
324
325    // wait 500ms for the crash handler process to start up
326    // if it's still not there just write panic info and no minidump
327    let retry_frequency = Duration::from_millis(100);
328    for _ in 0..5 {
329        if let Some(client) = CRASH_HANDLER.get() {
330            let location = info
331                .location()
332                .map_or_else(|| "<unknown>".to_owned(), |location| location.to_string());
333            log::error!("thread '{thread_name}' panicked at {location}:\n{message}...");
334            client
335                .send_message(
336                    2,
337                    serde_json::to_vec(&CrashPanic { message, span }).unwrap(),
338                )
339                .ok();
340            log::error!("triggering a crash to generate a minidump...");
341
342            #[cfg(target_os = "macos")]
343            PANIC_THREAD_ID.store(
344                unsafe { mach2::mach_init::mach_thread_self() },
345                Ordering::SeqCst,
346            );
347
348            cfg_if::cfg_if! {
349                if #[cfg(target_os = "windows")] {
350                    // https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-
351                    CrashHandler.simulate_exception(Some(234)); // (MORE_DATA_AVAILABLE)
352                    break;
353                } else {
354                    std::process::abort();
355                }
356            }
357        }
358        thread::sleep(retry_frequency);
359    }
360}
361
/// Starts the crash-handler subprocess on Windows via `CreateProcessW`.
///
/// Runs `"<exe>" --crash-handler "<socket_name>"` with cursor feedback
/// disabled, then closes the returned process and thread handles; the child
/// is not tracked any further.
///
/// # Panics
///
/// Panics if the process cannot be created.
#[cfg(target_os = "windows")]
fn spawn_crash_handler_windows(exe: &Path, socket_name: &Path) {
    use std::ffi::OsStr;
    use std::iter::once;
    use std::os::windows::ffi::OsStrExt;
    use windows::Win32::System::Threading::{
        CreateProcessW, PROCESS_CREATION_FLAGS, PROCESS_INFORMATION, STARTF_FORCEOFFFEEDBACK,
        STARTUPINFOW,
    };
    use windows::core::PWSTR;

    // NUL-terminated UTF-16 command line; `CreateProcessW` takes it as a
    // mutable buffer.
    let command = format!(
        "\"{}\" --crash-handler \"{}\"",
        exe.display(),
        socket_name.display()
    );
    let mut command_line: Vec<u16> = OsStr::new(&command)
        .encode_wide()
        .chain(once(0))
        .collect();

    // By default, Windows enables a "busy" cursor when a GUI application is launched.
    // This cursor is disabled once the application starts processing window messages.
    // Since the crash handler process doesn't process messages, this "busy" cursor stays enabled for a long time.
    // Disable the cursor feedback to prevent this from happening.
    let startup_info = STARTUPINFOW {
        cb: std::mem::size_of::<STARTUPINFOW>() as u32,
        dwFlags: STARTF_FORCEOFFFEEDBACK,
        ..Default::default()
    };

    let mut process_info = PROCESS_INFORMATION::default();

    unsafe {
        CreateProcessW(
            None,
            Some(PWSTR(command_line.as_mut_ptr())),
            None,
            None,
            false,
            PROCESS_CREATION_FLAGS(0),
            None,
            None,
            &startup_info,
            &mut process_info,
        )
    }
    .expect("unable to spawn server process");

    // The handles are not needed once the child is running.
    unsafe {
        windows::Win32::Foundation::CloseHandle(process_info.hProcess).ok();
        windows::Win32::Foundation::CloseHandle(process_info.hThread).ok();
    }
}
412
413pub fn crash_server(socket: &Path) {
414    let Ok(mut server) = minidumper::Server::with_name(socket) else {
415        log::info!("Couldn't create socket, there may already be a running crash server");
416        return;
417    };
418
419    let shutdown = Arc::new(AtomicBool::new(false));
420    let has_connection = Arc::new(AtomicBool::new(false));
421
422    thread::Builder::new()
423        .name("CrashServerTimeout".to_owned())
424        .spawn({
425            let shutdown = shutdown.clone();
426            let has_connection = has_connection.clone();
427            move || {
428                std::thread::sleep(CRASH_HANDLER_CONNECT_TIMEOUT);
429                if !has_connection.load(Ordering::SeqCst) {
430                    shutdown.store(true, Ordering::SeqCst);
431                }
432            }
433        })
434        .unwrap();
435
436    server
437        .run(
438            Box::new(CrashServer {
439                initialization_params: OnceLock::new(),
440                panic_info: OnceLock::new(),
441                has_connection,
442                active_gpu: OnceLock::new(),
443            }),
444            &shutdown,
445            Some(CRASH_HANDLER_PING_TIMEOUT),
446        )
447        .expect("failed to run server");
448}