crashes.rs

  1use crash_handler::{CrashEventResult, CrashHandler};
  2use log::info;
  3use minidumper::{Client, LoopAction, MinidumpBinary};
  4use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
  5use serde::{Deserialize, Serialize};
  6use smol::process::Command;
  7
  8#[cfg(target_os = "macos")]
  9use std::sync::atomic::AtomicU32;
 10use std::{
 11    env,
 12    fs::{self, File},
 13    io,
 14    panic::{self, PanicHookInfo},
 15    path::{Path, PathBuf},
 16    process::{self},
 17    sync::{
 18        Arc, OnceLock,
 19        atomic::{AtomicBool, Ordering},
 20    },
 21    thread,
 22    time::Duration,
 23};
 24
/// Client connection to the crash-handler process.
///
/// Set by `init` once the crash handler has been attached and the client has connected to it;
/// read by `panic_hook` to forward panic details to the server.
pub static CRASH_HANDLER: OnceLock<Arc<Client>> = OnceLock::new();
/// Flipped to `true` when the first minidump request is made, so that later crash events do
/// not generate duplicate crash reports.
pub static REQUESTED_MINIDUMP: AtomicBool = AtomicBool::new(false);
/// Timeout handed to the crash server's `run` call in `crash_server`.
const CRASH_HANDLER_PING_TIMEOUT: Duration = Duration::from_secs(60);
/// How long the crash server waits for a first client connection before it requests shutdown.
const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

/// Mach thread port recorded by `panic_hook` for the panicking thread;
/// `suspend_all_other_threads` leaves that thread running. Stays `0` until a panic is recorded.
#[cfg(target_os = "macos")]
static PANIC_THREAD_ID: AtomicU32 = AtomicU32::new(0);
 34
 35pub async fn init(crash_init: InitCrashHandler) {
 36    if *RELEASE_CHANNEL == ReleaseChannel::Dev && env::var("ZED_GENERATE_MINIDUMPS").is_err() {
 37        let old_hook = panic::take_hook();
 38        panic::set_hook(Box::new(move |info| {
 39            unsafe { env::set_var("RUST_BACKTRACE", "1") };
 40            old_hook(info);
 41            // prevent the macOS crash dialog from popping up
 42            std::process::exit(1);
 43        }));
 44        return;
 45    } else {
 46        panic::set_hook(Box::new(panic_hook));
 47    }
 48
 49    let exe = env::current_exe().expect("unable to find ourselves");
 50    let zed_pid = process::id();
 51    // TODO: we should be able to get away with using 1 crash-handler process per machine,
 52    // but for now we append the PID of the current process which makes it unique per remote
 53    // server or interactive zed instance. This solves an issue where occasionally the socket
 54    // used by the crash handler isn't destroyed correctly which causes it to stay on the file
 55    // system and block further attempts to initialize crash handlers with that socket path.
 56    let socket_name = paths::temp_dir().join(format!("zed-crash-handler-{zed_pid}"));
 57    let _crash_handler = Command::new(exe)
 58        .arg("--crash-handler")
 59        .arg(&socket_name)
 60        .spawn()
 61        .expect("unable to spawn server process");
 62    #[cfg(target_os = "linux")]
 63    let server_pid = _crash_handler.id();
 64    info!("spawning crash handler process");
 65
 66    let mut elapsed = Duration::ZERO;
 67    let retry_frequency = Duration::from_millis(100);
 68    let mut maybe_client = None;
 69    while maybe_client.is_none() {
 70        if let Ok(client) = Client::with_name(socket_name.as_path()) {
 71            maybe_client = Some(client);
 72            info!("connected to crash handler process after {elapsed:?}");
 73            break;
 74        }
 75        elapsed += retry_frequency;
 76        smol::Timer::after(retry_frequency).await;
 77    }
 78    let client = maybe_client.unwrap();
 79    client
 80        .send_message(1, serde_json::to_vec(&crash_init).unwrap())
 81        .unwrap();
 82
 83    let client = Arc::new(client);
 84    let handler = CrashHandler::attach(unsafe {
 85        let client = client.clone();
 86        crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
 87            // only request a minidump once
 88            let res = if REQUESTED_MINIDUMP
 89                .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
 90                .is_ok()
 91            {
 92                #[cfg(target_os = "macos")]
 93                suspend_all_other_threads();
 94
 95                client.ping().unwrap();
 96                client.request_dump(crash_context).is_ok()
 97            } else {
 98                true
 99            };
100            CrashEventResult::Handled(res)
101        })
102    })
103    .expect("failed to attach signal handler");
104
105    #[cfg(target_os = "linux")]
106    {
107        handler.set_ptracer(Some(server_pid));
108    }
109    CRASH_HANDLER.set(client.clone()).ok();
110    std::mem::forget(handler);
111    info!("crash handler registered");
112
113    loop {
114        client.ping().ok();
115        smol::Timer::after(Duration::from_secs(10)).await;
116    }
117}
118
/// Suspends every thread of the current task except the calling thread and the thread
/// recorded in `PANIC_THREAD_ID`.
///
/// The result of `task_threads` is not checked; the loop simply walks the `count` entries it
/// reported.
///
/// # Safety
///
/// Calls raw Mach APIs and dereferences the thread array they return. The suspended threads
/// are never resumed here, so this is only meant to be called while the process is crashing.
#[cfg(target_os = "macos")]
unsafe fn suspend_all_other_threads() {
    let mut thread_list: mach2::mach_types::thread_act_array_t = std::ptr::null_mut();
    let mut thread_count = 0;
    let this_task = unsafe { mach2::traps::current_task() };
    unsafe {
        mach2::task::task_threads(this_task, &raw mut thread_list, &raw mut thread_count);
    }

    let this_thread = unsafe { mach2::mach_init::mach_thread_self() };
    let panicking_thread = PANIC_THREAD_ID.load(Ordering::SeqCst);
    for index in 0..thread_count {
        let thread = unsafe { *thread_list.add(index as usize) };
        // Leave the crashing thread and the thread that ran the panic hook alone.
        if thread == this_thread || thread == panicking_thread {
            continue;
        }
        unsafe { mach2::thread_act::thread_suspend(thread) };
    }
}
136
/// State held by the crash-handler server process; implements `minidumper::ServerHandler`.
///
/// The `OnceLock` fields are filled in by messages received from the client.
pub struct CrashServer {
    // Set by message kind 1; required before a minidump file can be created.
    initialization_params: OnceLock<InitCrashHandler>,
    // Set by message kind 2 when the client reports a panic.
    panic_info: OnceLock<CrashPanic>,
    // Set by message kind 3.
    active_gpu: OnceLock<system_specs::GpuSpecs>,
    // Set to true when a client connects; checked by the timeout thread in `crash_server`.
    has_connection: Arc<AtomicBool>,
}
143
/// Crash metadata that the crash server writes as JSON to `<logs_dir>/<session_id>.json`
/// after a minidump attempt.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct CrashInfo {
    /// Initialization data the client sent when it connected.
    pub init: InitCrashHandler,
    /// Panic details, if the client reported a panic before the dump was taken.
    pub panic: Option<CrashPanic>,
    /// Description of what went wrong while producing the minidump, if anything did.
    pub minidump_error: Option<String>,
    /// GPUs read from `/sys/class/drm` on Linux and FreeBSD; empty on other platforms or when
    /// the lookup fails.
    pub gpus: Vec<system_specs::GpuInfo>,
    /// GPU specs sent by the client (message kind 3), if any were received.
    pub active_gpu: Option<system_specs::GpuSpecs>,
}
152
/// Initialization payload the client sends to the crash server as JSON (message kind 1).
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct InitCrashHandler {
    /// Used as the file stem of the `.dmp` and `.json` files written to the logs directory.
    pub session_id: String,
    pub zed_version: String,
    pub binary: String,
    pub release_channel: String,
    pub commit_sha: String,
}
161
/// Panic details the client sends to the crash server as JSON (message kind 2).
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct CrashPanic {
    /// Panic payload as text, or `"Box<Any>"` when the payload is not a string.
    pub message: String,
    /// Panic location formatted as `file:line`; empty when no location is available.
    pub span: String,
}
167
168impl minidumper::ServerHandler for CrashServer {
169    fn create_minidump_file(&self) -> Result<(File, PathBuf), io::Error> {
170        let err_message = "Missing initialization data";
171        let dump_path = paths::logs_dir()
172            .join(
173                &self
174                    .initialization_params
175                    .get()
176                    .expect(err_message)
177                    .session_id,
178            )
179            .with_extension("dmp");
180        let file = File::create(&dump_path)?;
181        Ok((file, dump_path))
182    }
183
184    fn on_minidump_created(&self, result: Result<MinidumpBinary, minidumper::Error>) -> LoopAction {
185        let minidump_error = match result {
186            Ok(MinidumpBinary { mut file, path, .. }) => {
187                use io::Write;
188                file.flush().ok();
189                // TODO: clean this up once https://github.com/EmbarkStudios/crash-handling/issues/101 is addressed
190                drop(file);
191                let original_file = File::open(&path).unwrap();
192                let compressed_path = path.with_extension("zstd");
193                let compressed_file = File::create(&compressed_path).unwrap();
194                zstd::stream::copy_encode(original_file, compressed_file, 0).ok();
195                fs::rename(&compressed_path, path).unwrap();
196                None
197            }
198            Err(e) => Some(format!("{e:?}")),
199        };
200
201        #[cfg(not(any(target_os = "linux", target_os = "freebsd")))]
202        let gpus = vec![];
203
204        #[cfg(any(target_os = "linux", target_os = "freebsd"))]
205        let gpus = match system_specs::read_gpu_info_from_sys_class_drm() {
206            Ok(gpus) => gpus,
207            Err(err) => {
208                log::warn!("Failed to collect GPU information for crash report: {err}");
209                vec![]
210            }
211        };
212
213        let crash_info = CrashInfo {
214            init: self
215                .initialization_params
216                .get()
217                .expect("not initialized")
218                .clone(),
219            panic: self.panic_info.get().cloned(),
220            minidump_error,
221            active_gpu: self.active_gpu.get().cloned(),
222            gpus,
223        };
224
225        let crash_data_path = paths::logs_dir()
226            .join(&crash_info.init.session_id)
227            .with_extension("json");
228
229        fs::write(crash_data_path, serde_json::to_vec(&crash_info).unwrap()).ok();
230
231        LoopAction::Exit
232    }
233
234    fn on_message(&self, kind: u32, buffer: Vec<u8>) {
235        match kind {
236            1 => {
237                let init_data =
238                    serde_json::from_slice::<InitCrashHandler>(&buffer).expect("invalid init data");
239                self.initialization_params
240                    .set(init_data)
241                    .expect("already initialized");
242            }
243            2 => {
244                let panic_data =
245                    serde_json::from_slice::<CrashPanic>(&buffer).expect("invalid panic data");
246                self.panic_info.set(panic_data).expect("already panicked");
247            }
248            3 => {
249                let gpu_specs: system_specs::GpuSpecs =
250                    bincode::deserialize(&buffer).expect("gpu specs");
251                self.active_gpu
252                    .set(gpu_specs)
253                    .expect("already set active gpu");
254            }
255            _ => {
256                panic!("invalid message kind");
257            }
258        }
259    }
260
261    fn on_client_disconnected(&self, _clients: usize) -> LoopAction {
262        LoopAction::Exit
263    }
264
265    fn on_client_connected(&self, _clients: usize) -> LoopAction {
266        self.has_connection.store(true, Ordering::SeqCst);
267        LoopAction::Continue
268    }
269}
270
271pub fn panic_hook(info: &PanicHookInfo) {
272    let message = info
273        .payload()
274        .downcast_ref::<&str>()
275        .map(|s| s.to_string())
276        .or_else(|| info.payload().downcast_ref::<String>().cloned())
277        .unwrap_or_else(|| "Box<Any>".to_string());
278
279    let span = info
280        .location()
281        .map(|loc| format!("{}:{}", loc.file(), loc.line()))
282        .unwrap_or_default();
283
284    // wait 500ms for the crash handler process to start up
285    // if it's still not there just write panic info and no minidump
286    let retry_frequency = Duration::from_millis(100);
287    for _ in 0..5 {
288        if let Some(client) = CRASH_HANDLER.get() {
289            client
290                .send_message(
291                    2,
292                    serde_json::to_vec(&CrashPanic { message, span }).unwrap(),
293                )
294                .ok();
295            log::error!("triggering a crash to generate a minidump...");
296
297            #[cfg(target_os = "macos")]
298            PANIC_THREAD_ID.store(
299                unsafe { mach2::mach_init::mach_thread_self() },
300                Ordering::SeqCst,
301            );
302
303            cfg_if::cfg_if! {
304                if #[cfg(target_os = "windows")] {
305                    // https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-
306                    CrashHandler.simulate_exception(Some(234)); // (MORE_DATA_AVAILABLE)
307                    break;
308                } else {
309                    std::process::abort();
310                }
311            }
312        }
313        thread::sleep(retry_frequency);
314    }
315}
316
317pub fn crash_server(socket: &Path) {
318    let Ok(mut server) = minidumper::Server::with_name(socket) else {
319        log::info!("Couldn't create socket, there may already be a running crash server");
320        return;
321    };
322
323    let shutdown = Arc::new(AtomicBool::new(false));
324    let has_connection = Arc::new(AtomicBool::new(false));
325
326    thread::Builder::new()
327        .name("CrashServerTimeout".to_owned())
328        .spawn({
329            let shutdown = shutdown.clone();
330            let has_connection = has_connection.clone();
331            move || {
332                std::thread::sleep(CRASH_HANDLER_CONNECT_TIMEOUT);
333                if !has_connection.load(Ordering::SeqCst) {
334                    shutdown.store(true, Ordering::SeqCst);
335                }
336            }
337        })
338        .unwrap();
339
340    server
341        .run(
342            Box::new(CrashServer {
343                initialization_params: OnceLock::new(),
344                panic_info: OnceLock::new(),
345                has_connection,
346                active_gpu: OnceLock::new(),
347            }),
348            &shutdown,
349            Some(CRASH_HANDLER_PING_TIMEOUT),
350        )
351        .expect("failed to run server");
352}