crashes.rs

  1use crash_handler::{CrashEventResult, CrashHandler};
  2use log::info;
  3use minidumper::{Client, LoopAction, MinidumpBinary};
  4use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
  5use serde::{Deserialize, Serialize};
  6use smol::process::Command;
  7
  8#[cfg(target_os = "macos")]
  9use std::sync::atomic::AtomicU32;
 10use std::{
 11    env,
 12    fs::{self, File},
 13    io,
 14    panic::{self, PanicHookInfo},
 15    path::{Path, PathBuf},
 16    process::{self},
 17    sync::{
 18        Arc, OnceLock,
 19        atomic::{AtomicBool, Ordering},
 20    },
 21    thread,
 22    time::Duration,
 23};
 24
 25// set once the crash handler has initialized and the client has connected to it
 26pub static CRASH_HANDLER: OnceLock<Arc<Client>> = OnceLock::new();
 27// set when the first minidump request is made to avoid generating duplicate crash reports
 28pub static REQUESTED_MINIDUMP: AtomicBool = AtomicBool::new(false);
 29const CRASH_HANDLER_PING_TIMEOUT: Duration = Duration::from_secs(60);
 30const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
 31
 32#[cfg(target_os = "macos")]
 33static PANIC_THREAD_ID: AtomicU32 = AtomicU32::new(0);
 34
 35pub async fn init(crash_init: InitCrashHandler) {
 36    let gen_var = match env::var("ZED_GENERATE_MINIDUMPS") {
 37        Ok(v) => {
 38            if v == "false" || v == "0" {
 39                Some(false)
 40            } else {
 41                Some(true)
 42            }
 43        }
 44        Err(_) => None,
 45    };
 46
 47    match (gen_var, *RELEASE_CHANNEL) {
 48        (Some(false), _) | (None, ReleaseChannel::Dev) => {
 49            let old_hook = panic::take_hook();
 50            panic::set_hook(Box::new(move |info| {
 51                unsafe { env::set_var("RUST_BACKTRACE", "1") };
 52                old_hook(info);
 53                // prevent the macOS crash dialog from popping up
 54                if cfg!(target_os = "macos") {
 55                    std::process::exit(1);
 56                }
 57            }));
 58            return;
 59        }
 60        _ => {
 61            panic::set_hook(Box::new(panic_hook));
 62        }
 63    }
 64
 65    let exe = env::current_exe().expect("unable to find ourselves");
 66    let zed_pid = process::id();
 67    // TODO: we should be able to get away with using 1 crash-handler process per machine,
 68    // but for now we append the PID of the current process which makes it unique per remote
 69    // server or interactive zed instance. This solves an issue where occasionally the socket
 70    // used by the crash handler isn't destroyed correctly which causes it to stay on the file
 71    // system and block further attempts to initialize crash handlers with that socket path.
 72    let socket_name = paths::temp_dir().join(format!("zed-crash-handler-{zed_pid}"));
 73    let _crash_handler = Command::new(exe)
 74        .arg("--crash-handler")
 75        .arg(&socket_name)
 76        .spawn()
 77        .expect("unable to spawn server process");
 78    #[cfg(target_os = "linux")]
 79    let server_pid = _crash_handler.id();
 80    info!("spawning crash handler process");
 81
 82    let mut elapsed = Duration::ZERO;
 83    let retry_frequency = Duration::from_millis(100);
 84    let mut maybe_client = None;
 85    while maybe_client.is_none() {
 86        if let Ok(client) = Client::with_name(socket_name.as_path()) {
 87            maybe_client = Some(client);
 88            info!("connected to crash handler process after {elapsed:?}");
 89            break;
 90        }
 91        elapsed += retry_frequency;
 92        smol::Timer::after(retry_frequency).await;
 93    }
 94    let client = maybe_client.unwrap();
 95    client
 96        .send_message(1, serde_json::to_vec(&crash_init).unwrap())
 97        .unwrap();
 98
 99    let client = Arc::new(client);
100    let handler = CrashHandler::attach(unsafe {
101        let client = client.clone();
102        crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
103            // only request a minidump once
104            let res = if REQUESTED_MINIDUMP
105                .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
106                .is_ok()
107            {
108                #[cfg(target_os = "macos")]
109                suspend_all_other_threads();
110
111                // on macos this "ping" is needed to ensure that all our
112                // `client.send_message` calls have been processed before we trigger the
113                // minidump request.
114                client.ping().ok();
115                client.request_dump(crash_context).is_ok()
116            } else {
117                true
118            };
119            CrashEventResult::Handled(res)
120        })
121    })
122    .expect("failed to attach signal handler");
123
124    #[cfg(target_os = "linux")]
125    {
126        handler.set_ptracer(Some(server_pid));
127    }
128    CRASH_HANDLER.set(client.clone()).ok();
129    std::mem::forget(handler);
130    info!("crash handler registered");
131
132    loop {
133        client.ping().ok();
134        smol::Timer::after(Duration::from_secs(10)).await;
135    }
136}
137
/// Suspends every thread of the current Mach task except the calling thread and
/// the thread recorded in `PANIC_THREAD_ID`.
///
/// The threads are not resumed by this function, and the result of
/// `task_threads` is not inspected.
///
/// # Safety
///
/// NOTE(review): the exact caller contract is not spelled out in this file; the
/// visible call site is the crash-event callback, immediately before the
/// minidump is requested.
#[cfg(target_os = "macos")]
unsafe fn suspend_all_other_threads() {
    let this_task = unsafe { mach2::traps::current_task() };
    let mut thread_list: mach2::mach_types::thread_act_array_t = std::ptr::null_mut();
    let mut thread_count = 0;
    unsafe {
        mach2::task::task_threads(this_task, &raw mut thread_list, &raw mut thread_count);
    }
    let this_thread = unsafe { mach2::mach_init::mach_thread_self() };
    let panicking_thread = PANIC_THREAD_ID.load(Ordering::SeqCst);
    for index in 0..thread_count {
        let thread = unsafe { *thread_list.add(index as usize) };
        // Leave the crashing thread and the thread that panicked running.
        if thread == this_thread || thread == panicking_thread {
            continue;
        }
        unsafe { mach2::thread_act::thread_suspend(thread) };
    }
}
155
156pub struct CrashServer {
157    initialization_params: OnceLock<InitCrashHandler>,
158    panic_info: OnceLock<CrashPanic>,
159    active_gpu: OnceLock<system_specs::GpuSpecs>,
160    has_connection: Arc<AtomicBool>,
161}
162
163#[derive(Debug, Deserialize, Serialize, Clone)]
164pub struct CrashInfo {
165    pub init: InitCrashHandler,
166    pub panic: Option<CrashPanic>,
167    pub minidump_error: Option<String>,
168    pub gpus: Vec<system_specs::GpuInfo>,
169    pub active_gpu: Option<system_specs::GpuSpecs>,
170}
171
172#[derive(Debug, Deserialize, Serialize, Clone)]
173pub struct InitCrashHandler {
174    pub session_id: String,
175    pub zed_version: String,
176    pub binary: String,
177    pub release_channel: String,
178    pub commit_sha: String,
179}
180
181#[derive(Deserialize, Serialize, Debug, Clone)]
182pub struct CrashPanic {
183    pub message: String,
184    pub span: String,
185}
186
187impl minidumper::ServerHandler for CrashServer {
188    fn create_minidump_file(&self) -> Result<(File, PathBuf), io::Error> {
189        let err_message = "Missing initialization data";
190        let dump_path = paths::logs_dir()
191            .join(
192                &self
193                    .initialization_params
194                    .get()
195                    .expect(err_message)
196                    .session_id,
197            )
198            .with_extension("dmp");
199        let file = File::create(&dump_path)?;
200        Ok((file, dump_path))
201    }
202
203    fn on_minidump_created(&self, result: Result<MinidumpBinary, minidumper::Error>) -> LoopAction {
204        let minidump_error = match result {
205            Ok(MinidumpBinary { mut file, path, .. }) => {
206                use io::Write;
207                file.flush().ok();
208                // TODO: clean this up once https://github.com/EmbarkStudios/crash-handling/issues/101 is addressed
209                drop(file);
210                let original_file = File::open(&path).unwrap();
211                let compressed_path = path.with_extension("zstd");
212                let compressed_file = File::create(&compressed_path).unwrap();
213                zstd::stream::copy_encode(original_file, compressed_file, 0).ok();
214                fs::rename(&compressed_path, path).unwrap();
215                None
216            }
217            Err(e) => Some(format!("{e:?}")),
218        };
219
220        #[cfg(not(any(target_os = "linux", target_os = "freebsd")))]
221        let gpus = vec![];
222
223        #[cfg(any(target_os = "linux", target_os = "freebsd"))]
224        let gpus = match system_specs::read_gpu_info_from_sys_class_drm() {
225            Ok(gpus) => gpus,
226            Err(err) => {
227                log::warn!("Failed to collect GPU information for crash report: {err}");
228                vec![]
229            }
230        };
231
232        let crash_info = CrashInfo {
233            init: self
234                .initialization_params
235                .get()
236                .expect("not initialized")
237                .clone(),
238            panic: self.panic_info.get().cloned(),
239            minidump_error,
240            active_gpu: self.active_gpu.get().cloned(),
241            gpus,
242        };
243
244        let crash_data_path = paths::logs_dir()
245            .join(&crash_info.init.session_id)
246            .with_extension("json");
247
248        fs::write(crash_data_path, serde_json::to_vec(&crash_info).unwrap()).ok();
249
250        LoopAction::Exit
251    }
252
253    fn on_message(&self, kind: u32, buffer: Vec<u8>) {
254        match kind {
255            1 => {
256                let init_data =
257                    serde_json::from_slice::<InitCrashHandler>(&buffer).expect("invalid init data");
258                self.initialization_params
259                    .set(init_data)
260                    .expect("already initialized");
261            }
262            2 => {
263                let panic_data =
264                    serde_json::from_slice::<CrashPanic>(&buffer).expect("invalid panic data");
265                self.panic_info.set(panic_data).expect("already panicked");
266            }
267            3 => {
268                let gpu_specs: system_specs::GpuSpecs =
269                    bincode::deserialize(&buffer).expect("gpu specs");
270                // we ignore the case where it was already set because this message is sent
271                // on each new window. in theory all zed windows should be using the same
272                // GPU so this is fine.
273                self.active_gpu.set(gpu_specs).ok();
274            }
275            _ => {
276                panic!("invalid message kind");
277            }
278        }
279    }
280
281    fn on_client_disconnected(&self, _clients: usize) -> LoopAction {
282        LoopAction::Exit
283    }
284
285    fn on_client_connected(&self, _clients: usize) -> LoopAction {
286        self.has_connection.store(true, Ordering::SeqCst);
287        LoopAction::Continue
288    }
289}
290
291pub fn panic_hook(info: &PanicHookInfo) {
292    // Don't handle a panic on threads that are not relevant to the main execution.
293    if extension_host::wasm_host::IS_WASM_THREAD.with(|v| v.load(Ordering::Acquire)) {
294        log::error!("wasm thread panicked!");
295        return;
296    }
297
298    let message = info.payload_as_str().unwrap_or("Box<Any>").to_owned();
299
300    let span = info
301        .location()
302        .map(|loc| format!("{}:{}", loc.file(), loc.line()))
303        .unwrap_or_default();
304
305    let current_thread = std::thread::current();
306    let thread_name = current_thread.name().unwrap_or("<unnamed>");
307
308    // wait 500ms for the crash handler process to start up
309    // if it's still not there just write panic info and no minidump
310    let retry_frequency = Duration::from_millis(100);
311    for _ in 0..5 {
312        if let Some(client) = CRASH_HANDLER.get() {
313            let location = info
314                .location()
315                .map_or_else(|| "<unknown>".to_owned(), |location| location.to_string());
316            log::error!("thread '{thread_name}' panicked at {location}:\n{message}...");
317            client
318                .send_message(
319                    2,
320                    serde_json::to_vec(&CrashPanic { message, span }).unwrap(),
321                )
322                .ok();
323            log::error!("triggering a crash to generate a minidump...");
324
325            #[cfg(target_os = "macos")]
326            PANIC_THREAD_ID.store(
327                unsafe { mach2::mach_init::mach_thread_self() },
328                Ordering::SeqCst,
329            );
330
331            cfg_if::cfg_if! {
332                if #[cfg(target_os = "windows")] {
333                    // https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-
334                    CrashHandler.simulate_exception(Some(234)); // (MORE_DATA_AVAILABLE)
335                    break;
336                } else {
337                    std::process::abort();
338                }
339            }
340        }
341        thread::sleep(retry_frequency);
342    }
343}
344
345pub fn crash_server(socket: &Path) {
346    let Ok(mut server) = minidumper::Server::with_name(socket) else {
347        log::info!("Couldn't create socket, there may already be a running crash server");
348        return;
349    };
350
351    let shutdown = Arc::new(AtomicBool::new(false));
352    let has_connection = Arc::new(AtomicBool::new(false));
353
354    thread::Builder::new()
355        .name("CrashServerTimeout".to_owned())
356        .spawn({
357            let shutdown = shutdown.clone();
358            let has_connection = has_connection.clone();
359            move || {
360                std::thread::sleep(CRASH_HANDLER_CONNECT_TIMEOUT);
361                if !has_connection.load(Ordering::SeqCst) {
362                    shutdown.store(true, Ordering::SeqCst);
363                }
364            }
365        })
366        .unwrap();
367
368    server
369        .run(
370            Box::new(CrashServer {
371                initialization_params: OnceLock::new(),
372                panic_info: OnceLock::new(),
373                has_connection,
374                active_gpu: OnceLock::new(),
375            }),
376            &shutdown,
377            Some(CRASH_HANDLER_PING_TIMEOUT),
378        )
379        .expect("failed to run server");
380}