crashes.rs

  1use crash_handler::{CrashEventResult, CrashHandler};
  2use log::{error, info, warn};
  3use minidumper::{Client, LoopAction, MinidumpBinary};
  4use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
  5use serde::{Deserialize, Serialize};
  6use smol::{lock::Mutex, process::Command};
  7
  8#[cfg(target_os = "macos")]
  9use std::sync::atomic::AtomicU32;
 10use std::{
 11    env,
 12    fs::{self, File},
 13    io,
 14    panic::{self, PanicHookInfo},
 15    path::{Path, PathBuf},
 16    process::{self},
 17    sync::{
 18        Arc, OnceLock,
 19        atomic::{AtomicBool, Ordering},
 20    },
 21    thread,
 22    time::Duration,
 23};
 24
 25// set once the crash handler has initialized and the client has connected to it
 26pub static CRASH_HANDLER: OnceLock<Mutex<Client>> = OnceLock::new();
 27// set when the first minidump request is made to avoid generating duplicate crash reports
 28pub static REQUESTED_MINIDUMP: AtomicBool = AtomicBool::new(false);
 29const CRASH_HANDLER_PING_TIMEOUT: Duration = Duration::from_secs(60);
 30const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
 31
 32#[cfg(target_os = "macos")]
 33static PANIC_THREAD_ID: AtomicU32 = AtomicU32::new(0);
 34
 35pub async fn spawn_sidecar(crash_init: InitCrashHandler) -> Client {
 36    let exe = env::current_exe().expect("unable to find ourselves");
 37    let zed_pid = process::id();
 38    // TODO: we should be able to get away with using 1 crash-handler process per machine,
 39    // but for now we append the PID of the current process which makes it unique per remote
 40    // server or interactive zed instance. This solves an issue where occasionally the socket
 41    // used by the crash handler isn't destroyed correctly which causes it to stay on the file
 42    // system and block further attempts to initialize crash handlers with that socket path.
 43    let socket_name = paths::temp_dir().join(format!("zed-crash-handler-{zed_pid}"));
 44    let crash_handler = Command::new(exe)
 45        .arg("--crash-handler")
 46        .arg(&socket_name)
 47        .spawn()
 48        .expect("unable to spawn server process");
 49
 50    let server_pid = crash_handler.id();
 51    info!("spawned crash handler process with pid: {server_pid}");
 52
 53    let mut elapsed = Duration::ZERO;
 54    let retry_frequency = Duration::from_millis(100);
 55    let mut maybe_client = None;
 56    while maybe_client.is_none() {
 57        if let Ok(client) = Client::with_name(socket_name.as_path()) {
 58            maybe_client = Some(client);
 59            info!("connected to crash handler process after {elapsed:?}");
 60            break;
 61        }
 62        elapsed += retry_frequency;
 63        smol::Timer::after(retry_frequency).await;
 64    }
 65    let client = maybe_client.unwrap();
 66    client
 67        .send_message(1, serde_json::to_vec(&crash_init).unwrap())
 68        .unwrap();
 69    client
 70}
 71
 72pub async fn init(crash_init: InitCrashHandler) {
 73    let gen_var = match env::var("ZED_GENERATE_MINIDUMPS") {
 74        Ok(v) => {
 75            if v == "false" || v == "0" {
 76                Some(false)
 77            } else {
 78                Some(true)
 79            }
 80        }
 81        Err(_) => None,
 82    };
 83
 84    match (gen_var, *RELEASE_CHANNEL) {
 85        (Some(false), _) | (None, ReleaseChannel::Dev) => {
 86            let old_hook = panic::take_hook();
 87            panic::set_hook(Box::new(move |info| {
 88                unsafe { env::set_var("RUST_BACKTRACE", "1") };
 89                old_hook(info);
 90                // prevent the macOS crash dialog from popping up
 91                std::process::exit(1);
 92            }));
 93            return;
 94        }
 95        (Some(true), _) | (None, _) => {
 96            panic::set_hook(Box::new(panic_hook));
 97        }
 98    }
 99
100    CRASH_HANDLER
101        .set(Mutex::new(spawn_sidecar(crash_init.clone()).await))
102        .ok();
103
104    let handler = CrashHandler::attach(unsafe {
105        crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
106            // only request a minidump once
107            let res = if REQUESTED_MINIDUMP
108                .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
109                .is_ok()
110            {
111                let Some(mutex) = CRASH_HANDLER.get() else {
112                    return false.into();
113                };
114                let Some(client) = mutex.try_lock() else {
115                    return false.into();
116                };
117
118                #[cfg(target_os = "macos")]
119                suspend_all_other_threads();
120
121                // on macos this "ping" is needed to ensure that all our
122                // `client.send_message` calls have been processed before we trigger the
123                // minidump request.
124                client.ping().ok();
125                client.request_dump(crash_context).is_ok()
126            } else {
127                true
128            };
129            CrashEventResult::Handled(res)
130        })
131    })
132    .expect("failed to attach signal handler");
133
134    #[cfg(target_os = "linux")]
135    {
136        handler.set_ptracer(Some(server_pid));
137    }
138    std::mem::forget(handler);
139    info!("crash handler registered");
140
141    // This loop keeps the crash handler process alive by repeatedly messaging it, if the
142    // ping ever fails we assume the crash handler has somehow been killed and attempt to
143    // restart it.
144    loop {
145        if let Some(client) = CRASH_HANDLER.get() {
146            let mut client = client.lock().await;
147            if client.ping().is_err() {
148                warn!("failed to ping crash handler process, relaunching it now.");
149                *client = spawn_sidecar(crash_init.clone()).await;
150            }
151        }
152        smol::Timer::after(Duration::from_secs(10)).await;
153    }
154}
155
/// Suspends every thread of the current task except the calling thread and
/// the thread recorded in `PANIC_THREAD_ID`.
///
/// Return codes of the Mach calls are not inspected. If `task_threads` leaves
/// the out-parameters untouched, the count stays `0` and nothing is suspended.
///
/// # Safety
///
/// Calls raw Mach APIs and dereferences the thread array written by
/// `task_threads`. Suspended threads are never resumed here, so this is only
/// appropriate when the process is about to terminate.
#[cfg(target_os = "macos")]
unsafe fn suspend_all_other_threads() {
    let this_task = unsafe { mach2::traps::current_task() };
    let mut thread_list: mach2::mach_types::thread_act_array_t = std::ptr::null_mut();
    let mut thread_count = 0;
    unsafe {
        mach2::task::task_threads(this_task, &raw mut thread_list, &raw mut thread_count);
    }

    let this_thread = unsafe { mach2::mach_init::mach_thread_self() };
    let panicking_thread = PANIC_THREAD_ID.load(Ordering::SeqCst);

    (0..thread_count)
        // Only indices below the count reported by `task_threads` are read.
        .map(|index| unsafe { *thread_list.add(index as usize) })
        .filter(|&thread| thread != this_thread && thread != panicking_thread)
        .for_each(|thread| {
            unsafe { mach2::thread_act::thread_suspend(thread) };
        });
}
173
174pub struct CrashServer {
175    initialization_params: OnceLock<InitCrashHandler>,
176    panic_info: OnceLock<CrashPanic>,
177    active_gpu: OnceLock<system_specs::GpuSpecs>,
178    has_connection: Arc<AtomicBool>,
179}
180
181#[derive(Debug, Deserialize, Serialize, Clone)]
182pub struct CrashInfo {
183    pub init: InitCrashHandler,
184    pub panic: Option<CrashPanic>,
185    pub minidump_error: Option<String>,
186    pub gpus: Vec<system_specs::GpuInfo>,
187    pub active_gpu: Option<system_specs::GpuSpecs>,
188}
189
190#[derive(Debug, Deserialize, Serialize, Clone)]
191pub struct InitCrashHandler {
192    pub session_id: String,
193    pub zed_version: String,
194    pub binary: String,
195    pub release_channel: String,
196    pub commit_sha: String,
197}
198
199#[derive(Deserialize, Serialize, Debug, Clone)]
200pub struct CrashPanic {
201    pub message: String,
202    pub span: String,
203}
204
205impl minidumper::ServerHandler for CrashServer {
206    fn create_minidump_file(&self) -> Result<(File, PathBuf), io::Error> {
207        let err_message = "Missing initialization data";
208        let dump_path = paths::logs_dir()
209            .join(
210                &self
211                    .initialization_params
212                    .get()
213                    .expect(err_message)
214                    .session_id,
215            )
216            .with_extension("dmp");
217        let file = File::create(&dump_path)?;
218        Ok((file, dump_path))
219    }
220
221    fn on_minidump_created(&self, result: Result<MinidumpBinary, minidumper::Error>) -> LoopAction {
222        let minidump_error = match result {
223            Ok(MinidumpBinary { mut file, path, .. }) => {
224                use io::Write;
225                file.flush().ok();
226                // TODO: clean this up once https://github.com/EmbarkStudios/crash-handling/issues/101 is addressed
227                drop(file);
228                let original_file = File::open(&path).unwrap();
229                let compressed_path = path.with_extension("zstd");
230                let compressed_file = File::create(&compressed_path).unwrap();
231                zstd::stream::copy_encode(original_file, compressed_file, 0).ok();
232                fs::rename(&compressed_path, path).unwrap();
233                None
234            }
235            Err(e) => Some(format!("{e:?}")),
236        };
237
238        #[cfg(not(any(target_os = "linux", target_os = "freebsd")))]
239        let gpus = vec![];
240
241        #[cfg(any(target_os = "linux", target_os = "freebsd"))]
242        let gpus = match system_specs::read_gpu_info_from_sys_class_drm() {
243            Ok(gpus) => gpus,
244            Err(err) => {
245                warn!("Failed to collect GPU information for crash report: {err}");
246                vec![]
247            }
248        };
249
250        let crash_info = CrashInfo {
251            init: self
252                .initialization_params
253                .get()
254                .expect("not initialized")
255                .clone(),
256            panic: self.panic_info.get().cloned(),
257            minidump_error,
258            active_gpu: self.active_gpu.get().cloned(),
259            gpus,
260        };
261
262        let crash_data_path = paths::logs_dir()
263            .join(&crash_info.init.session_id)
264            .with_extension("json");
265
266        fs::write(crash_data_path, serde_json::to_vec(&crash_info).unwrap()).ok();
267
268        LoopAction::Exit
269    }
270
271    fn on_message(&self, kind: u32, buffer: Vec<u8>) {
272        match kind {
273            1 => {
274                let init_data =
275                    serde_json::from_slice::<InitCrashHandler>(&buffer).expect("invalid init data");
276                self.initialization_params
277                    .set(init_data)
278                    .expect("already initialized");
279            }
280            2 => {
281                let panic_data =
282                    serde_json::from_slice::<CrashPanic>(&buffer).expect("invalid panic data");
283                self.panic_info.set(panic_data).expect("already panicked");
284            }
285            3 => {
286                let gpu_specs: system_specs::GpuSpecs =
287                    bincode::deserialize(&buffer).expect("gpu specs");
288                self.active_gpu
289                    .set(gpu_specs)
290                    .expect("already set active gpu");
291            }
292            _ => {
293                panic!("invalid message kind");
294            }
295        }
296    }
297
298    fn on_client_disconnected(&self, _clients: usize) -> LoopAction {
299        LoopAction::Exit
300    }
301
302    fn on_client_connected(&self, _clients: usize) -> LoopAction {
303        self.has_connection.store(true, Ordering::SeqCst);
304        LoopAction::Continue
305    }
306}
307
308pub fn panic_hook(info: &PanicHookInfo) {
309    // Don't handle a panic on threads that are not relevant to the main execution.
310    if extension_host::wasm_host::IS_WASM_THREAD.with(|v| v.load(Ordering::Acquire)) {
311        return;
312    }
313
314    let message = info
315        .payload()
316        .downcast_ref::<&str>()
317        .map(|s| s.to_string())
318        .or_else(|| info.payload().downcast_ref::<String>().cloned())
319        .unwrap_or_else(|| "Box<Any>".to_string());
320
321    let span = info
322        .location()
323        .map(|loc| format!("{}:{}", loc.file(), loc.line()))
324        .unwrap_or_default();
325
326    // wait 500ms for the crash handler process to start up
327    // if it's still not there just write panic info and no minidump
328    let retry_frequency = Duration::from_millis(100);
329    for _ in 0..5 {
330        if let Some(client) = CRASH_HANDLER.get().map(|c| c.try_lock()).flatten() {
331            client
332                .send_message(
333                    2,
334                    serde_json::to_vec(&CrashPanic { message, span }).unwrap(),
335                )
336                .ok();
337            error!("triggering a crash to generate a minidump...");
338
339            #[cfg(target_os = "macos")]
340            PANIC_THREAD_ID.store(
341                unsafe { mach2::mach_init::mach_thread_self() },
342                Ordering::SeqCst,
343            );
344
345            cfg_if::cfg_if! {
346                if #[cfg(target_os = "windows")] {
347                    // https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-
348                    CrashHandler.simulate_exception(Some(234)); // (MORE_DATA_AVAILABLE)
349                    break;
350                } else {
351                    std::process::abort();
352                }
353            }
354        }
355        thread::sleep(retry_frequency);
356    }
357}
358
359pub fn crash_server(socket: &Path) {
360    let Ok(mut server) = minidumper::Server::with_name(socket) else {
361        info!("couldn't create socket, there may already be a running crash server");
362        return;
363    };
364
365    let shutdown = Arc::new(AtomicBool::new(false));
366    let has_connection = Arc::new(AtomicBool::new(false));
367
368    thread::Builder::new()
369        .name("CrashServerTimeout".to_owned())
370        .spawn({
371            let shutdown = shutdown.clone();
372            let has_connection = has_connection.clone();
373            move || {
374                std::thread::sleep(CRASH_HANDLER_CONNECT_TIMEOUT);
375                if !has_connection.load(Ordering::SeqCst) {
376                    shutdown.store(true, Ordering::SeqCst);
377                }
378            }
379        })
380        .unwrap();
381
382    server
383        .run(
384            Box::new(CrashServer {
385                initialization_params: OnceLock::new(),
386                panic_info: OnceLock::new(),
387                has_connection,
388                active_gpu: OnceLock::new(),
389            }),
390            &shutdown,
391            Some(CRASH_HANDLER_PING_TIMEOUT),
392        )
393        .expect("failed to run server");
394}