crashes.rs

  1use crash_handler::{CrashEventResult, CrashHandler};
  2use log::info;
  3use minidumper::{Client, LoopAction, MinidumpBinary};
  4use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
  5use serde::{Deserialize, Serialize};
  6
  7#[cfg(not(target_os = "windows"))]
  8use smol::process::Command;
  9
 10#[cfg(target_os = "macos")]
 11use std::sync::atomic::AtomicU32;
 12use std::{
 13    env,
 14    fs::{self, File},
 15    io,
 16    panic::{self, PanicHookInfo},
 17    path::{Path, PathBuf},
 18    process::{self},
 19    sync::{
 20        Arc, OnceLock,
 21        atomic::{AtomicBool, Ordering},
 22    },
 23    thread,
 24    time::Duration,
 25};
 26
/// Client connection to the out-of-process crash handler.
///
/// Set by [`init`] once the crash handler has initialized and the client has connected to
/// it; [`panic_hook`] polls this before triggering a minidump.
pub static CRASH_HANDLER: OnceLock<Arc<Client>> = OnceLock::new();
/// Set when the first minidump request is made to avoid generating duplicate crash reports.
pub static REQUESTED_MINIDUMP: AtomicBool = AtomicBool::new(false);
/// Timeout handed to the crash server's run loop in [`crash_server`].
// NOTE(review): presumably how long the server tolerates a client that stopped pinging —
// confirm against the `minidumper::Server::run` documentation.
const CRASH_HANDLER_PING_TIMEOUT: Duration = Duration::from_secs(60);
/// How long a freshly started crash server waits for its first client connection before
/// its shutdown flag is raised.
const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

/// Mach thread port of the thread that ran [`panic_hook`], so that thread is left running
/// when every other thread is suspended ahead of a minidump request.
#[cfg(target_os = "macos")]
static PANIC_THREAD_ID: AtomicU32 = AtomicU32::new(0);
 36
 37pub async fn init(crash_init: InitCrashHandler) {
 38    let gen_var = match env::var("ZED_GENERATE_MINIDUMPS") {
 39        Ok(v) => {
 40            if v == "false" || v == "0" {
 41                Some(false)
 42            } else {
 43                Some(true)
 44            }
 45        }
 46        Err(_) => None,
 47    };
 48
 49    match (gen_var, *RELEASE_CHANNEL) {
 50        (Some(false), _) | (None, ReleaseChannel::Dev) => {
 51            let old_hook = panic::take_hook();
 52            panic::set_hook(Box::new(move |info| {
 53                unsafe { env::set_var("RUST_BACKTRACE", "1") };
 54                old_hook(info);
 55                // prevent the macOS crash dialog from popping up
 56                if cfg!(target_os = "macos") {
 57                    std::process::exit(1);
 58                }
 59            }));
 60            return;
 61        }
 62        _ => {
 63            panic::set_hook(Box::new(panic_hook));
 64        }
 65    }
 66
 67    let exe = env::current_exe().expect("unable to find ourselves");
 68    let zed_pid = process::id();
 69    // TODO: we should be able to get away with using 1 crash-handler process per machine,
 70    // but for now we append the PID of the current process which makes it unique per remote
 71    // server or interactive zed instance. This solves an issue where occasionally the socket
 72    // used by the crash handler isn't destroyed correctly which causes it to stay on the file
 73    // system and block further attempts to initialize crash handlers with that socket path.
 74    let socket_name = paths::temp_dir().join(format!("zed-crash-handler-{zed_pid}"));
 75    #[cfg(not(target_os = "windows"))]
 76    let _crash_handler = Command::new(exe)
 77        .arg("--crash-handler")
 78        .arg(&socket_name)
 79        .spawn()
 80        .expect("unable to spawn server process");
 81
 82    #[cfg(target_os = "windows")]
 83    spawn_crash_handler_windows(&exe, &socket_name);
 84
 85    #[cfg(target_os = "linux")]
 86    let server_pid = _crash_handler.id();
 87    info!("spawning crash handler process");
 88
 89    let mut elapsed = Duration::ZERO;
 90    let retry_frequency = Duration::from_millis(100);
 91    let mut maybe_client = None;
 92    while maybe_client.is_none() {
 93        if let Ok(client) = Client::with_name(socket_name.as_path()) {
 94            maybe_client = Some(client);
 95            info!("connected to crash handler process after {elapsed:?}");
 96            break;
 97        }
 98        elapsed += retry_frequency;
 99        // Crash reporting is called outside of gpui in the remote server right now
100        #[allow(clippy::disallowed_methods)]
101        smol::Timer::after(retry_frequency).await;
102    }
103    let client = maybe_client.unwrap();
104    client
105        .send_message(1, serde_json::to_vec(&crash_init).unwrap())
106        .unwrap();
107
108    let client = Arc::new(client);
109    let handler = CrashHandler::attach(unsafe {
110        let client = client.clone();
111        crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
112            // only request a minidump once
113            let res = if REQUESTED_MINIDUMP
114                .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
115                .is_ok()
116            {
117                #[cfg(target_os = "macos")]
118                suspend_all_other_threads();
119
120                // on macos this "ping" is needed to ensure that all our
121                // `client.send_message` calls have been processed before we trigger the
122                // minidump request.
123                client.ping().ok();
124                client.request_dump(crash_context).is_ok()
125            } else {
126                true
127            };
128            CrashEventResult::Handled(res)
129        })
130    })
131    .expect("failed to attach signal handler");
132
133    #[cfg(target_os = "linux")]
134    {
135        handler.set_ptracer(Some(server_pid));
136    }
137    CRASH_HANDLER.set(client.clone()).ok();
138    std::mem::forget(handler);
139    info!("crash handler registered");
140
141    loop {
142        client.ping().ok();
143        // Crash reporting is called outside of gpui in the remote server right now
144        #[allow(clippy::disallowed_methods)]
145        smol::Timer::after(Duration::from_secs(10)).await;
146    }
147}
148
/// Suspends every thread of the current task except the calling thread and the thread
/// recorded in `PANIC_THREAD_ID`.
///
/// # Safety
///
/// Calls raw Mach APIs and dereferences the thread array they return. Suspended threads are
/// never resumed here, so this is only meant to be called while the process is crashing.
#[cfg(target_os = "macos")]
unsafe fn suspend_all_other_threads() {
    let mut thread_list: mach2::mach_types::thread_act_array_t = std::ptr::null_mut();
    let mut thread_count = 0;
    let this_thread = unsafe {
        let task = mach2::traps::current_task();
        // The return code is ignored; on failure `thread_count` stays 0 and nothing below runs.
        mach2::task::task_threads(task, &raw mut thread_list, &raw mut thread_count);
        mach2::mach_init::mach_thread_self()
    };
    let panic_thread = PANIC_THREAD_ID.load(Ordering::SeqCst);

    (0..thread_count)
        .map(|index| unsafe { *thread_list.add(index as usize) })
        .filter(|&thread| thread != this_thread && thread != panic_thread)
        .for_each(|thread| {
            unsafe { mach2::thread_act::thread_suspend(thread) };
        });
}
166
167pub struct CrashServer {
168    initialization_params: OnceLock<InitCrashHandler>,
169    panic_info: OnceLock<CrashPanic>,
170    active_gpu: OnceLock<system_specs::GpuSpecs>,
171    has_connection: Arc<AtomicBool>,
172}
173
174#[derive(Debug, Deserialize, Serialize, Clone)]
175pub struct CrashInfo {
176    pub init: InitCrashHandler,
177    pub panic: Option<CrashPanic>,
178    pub minidump_error: Option<String>,
179    pub gpus: Vec<system_specs::GpuInfo>,
180    pub active_gpu: Option<system_specs::GpuSpecs>,
181}
182
183#[derive(Debug, Deserialize, Serialize, Clone)]
184pub struct InitCrashHandler {
185    pub session_id: String,
186    pub zed_version: String,
187    pub binary: String,
188    pub release_channel: String,
189    pub commit_sha: String,
190}
191
192#[derive(Deserialize, Serialize, Debug, Clone)]
193pub struct CrashPanic {
194    pub message: String,
195    pub span: String,
196}
197
198impl minidumper::ServerHandler for CrashServer {
199    fn create_minidump_file(&self) -> Result<(File, PathBuf), io::Error> {
200        let err_message = "Missing initialization data";
201        let dump_path = paths::logs_dir()
202            .join(
203                &self
204                    .initialization_params
205                    .get()
206                    .expect(err_message)
207                    .session_id,
208            )
209            .with_extension("dmp");
210        let file = File::create(&dump_path)?;
211        Ok((file, dump_path))
212    }
213
214    fn on_minidump_created(&self, result: Result<MinidumpBinary, minidumper::Error>) -> LoopAction {
215        let minidump_error = match result {
216            Ok(MinidumpBinary { mut file, path, .. }) => {
217                use io::Write;
218                file.flush().ok();
219                // TODO: clean this up once https://github.com/EmbarkStudios/crash-handling/issues/101 is addressed
220                drop(file);
221                let original_file = File::open(&path).unwrap();
222                let compressed_path = path.with_extension("zstd");
223                let compressed_file = File::create(&compressed_path).unwrap();
224                zstd::stream::copy_encode(original_file, compressed_file, 0).ok();
225                fs::rename(&compressed_path, path).unwrap();
226                None
227            }
228            Err(e) => Some(format!("{e:?}")),
229        };
230
231        #[cfg(not(any(target_os = "linux", target_os = "freebsd")))]
232        let gpus = vec![];
233
234        #[cfg(any(target_os = "linux", target_os = "freebsd"))]
235        let gpus = match system_specs::read_gpu_info_from_sys_class_drm() {
236            Ok(gpus) => gpus,
237            Err(err) => {
238                log::warn!("Failed to collect GPU information for crash report: {err}");
239                vec![]
240            }
241        };
242
243        let crash_info = CrashInfo {
244            init: self
245                .initialization_params
246                .get()
247                .expect("not initialized")
248                .clone(),
249            panic: self.panic_info.get().cloned(),
250            minidump_error,
251            active_gpu: self.active_gpu.get().cloned(),
252            gpus,
253        };
254
255        let crash_data_path = paths::logs_dir()
256            .join(&crash_info.init.session_id)
257            .with_extension("json");
258
259        fs::write(crash_data_path, serde_json::to_vec(&crash_info).unwrap()).ok();
260
261        LoopAction::Exit
262    }
263
264    fn on_message(&self, kind: u32, buffer: Vec<u8>) {
265        match kind {
266            1 => {
267                let init_data =
268                    serde_json::from_slice::<InitCrashHandler>(&buffer).expect("invalid init data");
269                self.initialization_params
270                    .set(init_data)
271                    .expect("already initialized");
272            }
273            2 => {
274                let panic_data =
275                    serde_json::from_slice::<CrashPanic>(&buffer).expect("invalid panic data");
276                self.panic_info.set(panic_data).expect("already panicked");
277            }
278            3 => {
279                let gpu_specs: system_specs::GpuSpecs =
280                    bincode::deserialize(&buffer).expect("gpu specs");
281                // we ignore the case where it was already set because this message is sent
282                // on each new window. in theory all zed windows should be using the same
283                // GPU so this is fine.
284                self.active_gpu.set(gpu_specs).ok();
285            }
286            _ => {
287                panic!("invalid message kind");
288            }
289        }
290    }
291
292    fn on_client_disconnected(&self, _clients: usize) -> LoopAction {
293        LoopAction::Exit
294    }
295
296    fn on_client_connected(&self, _clients: usize) -> LoopAction {
297        self.has_connection.store(true, Ordering::SeqCst);
298        LoopAction::Continue
299    }
300}
301
302pub fn panic_hook(info: &PanicHookInfo) {
303    // Don't handle a panic on threads that are not relevant to the main execution.
304    if extension_host::wasm_host::IS_WASM_THREAD.with(|v| v.load(Ordering::Acquire)) {
305        log::error!("wasm thread panicked!");
306        return;
307    }
308
309    let message = info.payload_as_str().unwrap_or("Box<Any>").to_owned();
310
311    let span = info
312        .location()
313        .map(|loc| format!("{}:{}", loc.file(), loc.line()))
314        .unwrap_or_default();
315
316    let current_thread = std::thread::current();
317    let thread_name = current_thread.name().unwrap_or("<unnamed>");
318
319    // wait 500ms for the crash handler process to start up
320    // if it's still not there just write panic info and no minidump
321    let retry_frequency = Duration::from_millis(100);
322    for _ in 0..5 {
323        if let Some(client) = CRASH_HANDLER.get() {
324            let location = info
325                .location()
326                .map_or_else(|| "<unknown>".to_owned(), |location| location.to_string());
327            log::error!("thread '{thread_name}' panicked at {location}:\n{message}...");
328            client
329                .send_message(
330                    2,
331                    serde_json::to_vec(&CrashPanic { message, span }).unwrap(),
332                )
333                .ok();
334            log::error!("triggering a crash to generate a minidump...");
335
336            #[cfg(target_os = "macos")]
337            PANIC_THREAD_ID.store(
338                unsafe { mach2::mach_init::mach_thread_self() },
339                Ordering::SeqCst,
340            );
341
342            cfg_if::cfg_if! {
343                if #[cfg(target_os = "windows")] {
344                    // https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-
345                    CrashHandler.simulate_exception(Some(234)); // (MORE_DATA_AVAILABLE)
346                    break;
347                } else {
348                    std::process::abort();
349                }
350            }
351        }
352        thread::sleep(retry_frequency);
353    }
354}
355
/// Spawns `exe --crash-handler <socket_name>` as a detached process on Windows.
///
/// `CreateProcessW` is used directly (rather than a higher-level process API) so that the
/// startup flags can be set to suppress the "busy" cursor.
///
/// # Panics
///
/// Panics if the process cannot be created.
#[cfg(target_os = "windows")]
fn spawn_crash_handler_windows(exe: &Path, socket_name: &Path) {
    use std::ffi::OsString;
    use std::iter::once;
    use std::os::windows::ffi::OsStrExt;
    use windows::Win32::Foundation::CloseHandle;
    use windows::Win32::System::Threading::{
        CreateProcessW, PROCESS_CREATION_FLAGS, PROCESS_INFORMATION, STARTF_FORCEOFFFEEDBACK,
        STARTUPINFOW,
    };
    use windows::core::PWSTR;

    // Assemble the command line from `OsStr` pieces instead of `Path::display()`, which is
    // lossy for paths that are not valid Unicode; `encode_wide` then reproduces the paths'
    // original UTF-16 code units exactly.
    let mut command_line = OsString::from("\"");
    command_line.push(exe);
    command_line.push("\" --crash-handler \"");
    command_line.push(socket_name);
    command_line.push("\"");
    // `CreateProcessW` may modify the buffer, so it must be mutable and NUL-terminated.
    let mut command_line: Vec<u16> = command_line.encode_wide().chain(once(0)).collect();

    // By default, Windows enables a "busy" cursor when a GUI application is launched.
    // This cursor is disabled once the application starts processing window messages.
    // Since the crash handler process doesn't process messages, this "busy" cursor stays enabled for a long time.
    // Disable the cursor feedback to prevent this from happening.
    let startup_info = STARTUPINFOW {
        cb: std::mem::size_of::<STARTUPINFOW>() as u32,
        dwFlags: STARTF_FORCEOFFFEEDBACK,
        ..Default::default()
    };

    let mut process_info = PROCESS_INFORMATION::default();

    // SAFETY: `command_line` is a live, NUL-terminated, mutable UTF-16 buffer and both
    // info structs outlive the call; the returned handles are closed exactly once.
    unsafe {
        CreateProcessW(
            None,
            Some(PWSTR(command_line.as_mut_ptr())),
            None,
            None,
            false,
            PROCESS_CREATION_FLAGS(0),
            None,
            None,
            &startup_info,
            &mut process_info,
        )
        .expect("unable to spawn server process");

        // The child keeps running; we only release our references to it.
        CloseHandle(process_info.hProcess).ok();
        CloseHandle(process_info.hThread).ok();
    }
}
406
407pub fn crash_server(socket: &Path) {
408    let Ok(mut server) = minidumper::Server::with_name(socket) else {
409        log::info!("Couldn't create socket, there may already be a running crash server");
410        return;
411    };
412
413    let shutdown = Arc::new(AtomicBool::new(false));
414    let has_connection = Arc::new(AtomicBool::new(false));
415
416    thread::Builder::new()
417        .name("CrashServerTimeout".to_owned())
418        .spawn({
419            let shutdown = shutdown.clone();
420            let has_connection = has_connection.clone();
421            move || {
422                std::thread::sleep(CRASH_HANDLER_CONNECT_TIMEOUT);
423                if !has_connection.load(Ordering::SeqCst) {
424                    shutdown.store(true, Ordering::SeqCst);
425                }
426            }
427        })
428        .unwrap();
429
430    server
431        .run(
432            Box::new(CrashServer {
433                initialization_params: OnceLock::new(),
434                panic_info: OnceLock::new(),
435                has_connection,
436                active_gpu: OnceLock::new(),
437            }),
438            &shutdown,
439            Some(CRASH_HANDLER_PING_TIMEOUT),
440        )
441        .expect("failed to run server");
442}