crashes.rs

  1use crash_handler::{CrashEventResult, CrashHandler};
  2use log::info;
  3use minidumper::{Client, LoopAction, MinidumpBinary};
  4use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
  5use serde::{Deserialize, Serialize};
  6
  7#[cfg(target_os = "macos")]
  8use std::sync::atomic::AtomicU32;
  9use std::{
 10    env,
 11    fs::{self, File},
 12    io,
 13    panic::{self, PanicHookInfo},
 14    path::{Path, PathBuf},
 15    process::{self, Command},
 16    sync::{
 17        Arc, OnceLock,
 18        atomic::{AtomicBool, Ordering},
 19    },
 20    thread,
 21    time::Duration,
 22};
 23
/// Connection to the crash handler process.
///
/// Set by [`init`] once the client has connected and the crash handler has been attached;
/// [`panic_hook`] polls it to forward panic details before triggering a crash.
pub static CRASH_HANDLER: OnceLock<Arc<Client>> = OnceLock::new();
/// Set when the first minidump request is made, to avoid generating duplicate crash reports.
pub static REQUESTED_MINIDUMP: AtomicBool = AtomicBool::new(false);
/// Timeout handed to the crash server's run loop in `crash_server`.
// NOTE(review): presumably the longest the server tolerates a silent client before treating it
// as stale (`init` pings every 10 seconds) — confirm against the minidumper `Server::run` docs.
const CRASH_HANDLER_PING_TIMEOUT: Duration = Duration::from_secs(60);
/// How long `crash_server` waits for a first client connection before raising its shutdown flag.
const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

/// Mach thread port recorded by [`panic_hook`] for the panicking thread (0 until a panic is
/// reported); `suspend_all_other_threads` leaves that thread running.
#[cfg(target_os = "macos")]
static PANIC_THREAD_ID: AtomicU32 = AtomicU32::new(0);
 33
 34pub async fn init(crash_init: InitCrashHandler) {
 35    if *RELEASE_CHANNEL == ReleaseChannel::Dev && env::var("ZED_GENERATE_MINIDUMPS").is_err() {
 36        let old_hook = panic::take_hook();
 37        panic::set_hook(Box::new(move |info| {
 38            unsafe { env::set_var("RUST_BACKTRACE", "1") };
 39            old_hook(info);
 40            // prevent the macOS crash dialog from popping up
 41            std::process::exit(1);
 42        }));
 43        return;
 44    } else {
 45        panic::set_hook(Box::new(panic_hook));
 46    }
 47
 48    let exe = env::current_exe().expect("unable to find ourselves");
 49    let zed_pid = process::id();
 50    // TODO: we should be able to get away with using 1 crash-handler process per machine,
 51    // but for now we append the PID of the current process which makes it unique per remote
 52    // server or interactive zed instance. This solves an issue where occasionally the socket
 53    // used by the crash handler isn't destroyed correctly which causes it to stay on the file
 54    // system and block further attempts to initialize crash handlers with that socket path.
 55    let socket_name = paths::temp_dir().join(format!("zed-crash-handler-{zed_pid}"));
 56    #[allow(unused)]
 57    let server_pid = Command::new(exe)
 58        .arg("--crash-handler")
 59        .arg(&socket_name)
 60        .spawn()
 61        .expect("unable to spawn server process")
 62        .id();
 63    info!("spawning crash handler process");
 64
 65    let mut elapsed = Duration::ZERO;
 66    let retry_frequency = Duration::from_millis(100);
 67    let mut maybe_client = None;
 68    while maybe_client.is_none() {
 69        if let Ok(client) = Client::with_name(socket_name.as_path()) {
 70            maybe_client = Some(client);
 71            info!("connected to crash handler process after {elapsed:?}");
 72            break;
 73        }
 74        elapsed += retry_frequency;
 75        smol::Timer::after(retry_frequency).await;
 76    }
 77    let client = maybe_client.unwrap();
 78    client
 79        .send_message(1, serde_json::to_vec(&crash_init).unwrap())
 80        .unwrap();
 81
 82    let client = Arc::new(client);
 83    let handler = CrashHandler::attach(unsafe {
 84        let client = client.clone();
 85        crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
 86            // only request a minidump once
 87            let res = if REQUESTED_MINIDUMP
 88                .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
 89                .is_ok()
 90            {
 91                #[cfg(target_os = "macos")]
 92                suspend_all_other_threads();
 93
 94                client.ping().unwrap();
 95                client.request_dump(crash_context).is_ok()
 96            } else {
 97                true
 98            };
 99            CrashEventResult::Handled(res)
100        })
101    })
102    .expect("failed to attach signal handler");
103
104    #[cfg(target_os = "linux")]
105    {
106        handler.set_ptracer(Some(server_pid));
107    }
108    CRASH_HANDLER.set(client.clone()).ok();
109    std::mem::forget(handler);
110    info!("crash handler registered");
111
112    loop {
113        client.ping().ok();
114        smol::Timer::after(Duration::from_secs(10)).await;
115    }
116}
117
/// Suspends every thread of the current task except the calling thread and the thread
/// recorded in `PANIC_THREAD_ID`.
///
/// The return codes of the Mach calls are ignored.
///
/// # Safety
///
/// Makes raw Mach calls and reads the thread array filled in by `task_threads`.
/// NOTE(review): the precise caller contract is not stated in this file; the only visible
/// caller is the crash-event closure registered in `init`.
#[cfg(target_os = "macos")]
unsafe fn suspend_all_other_threads() {
    let mut thread_list: mach2::mach_types::thread_act_array_t = std::ptr::null_mut();
    let mut thread_count = 0;
    unsafe {
        let own_task = mach2::traps::current_task();
        mach2::task::task_threads(own_task, &raw mut thread_list, &raw mut thread_count);
    }
    let this_thread = unsafe { mach2::mach_init::mach_thread_self() };
    let panicking_thread = PANIC_THREAD_ID.load(Ordering::SeqCst);

    // `thread_count` stays 0 if `task_threads` did not fill it in, so the list is never read then
    (0..thread_count)
        .map(|index| unsafe { *thread_list.add(index as usize) })
        .filter(|&thread| thread != this_thread && thread != panicking_thread)
        .for_each(|thread| {
            unsafe { mach2::thread_act::thread_suspend(thread) };
        });
}
135
/// State held by the crash handler process; implements `minidumper::ServerHandler`.
pub struct CrashServer {
    /// Session metadata received from the client as message kind 1.
    initialization_params: OnceLock<InitCrashHandler>,
    /// Panic details received as message kind 2, if the client reported a panic.
    panic_info: OnceLock<CrashPanic>,
    /// GPU specs received as message kind 3, if the client sent them.
    active_gpu: OnceLock<system_specs::GpuSpecs>,
    /// Set to `true` when a client connects; shared with the connect-timeout thread spawned
    /// in `crash_server`.
    has_connection: Arc<AtomicBool>,
}
142
/// Crash report metadata that the crash server serializes to `<session_id>.json` in the logs
/// directory after a minidump attempt.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct CrashInfo {
    /// Session metadata sent by the client when it connected.
    pub init: InitCrashHandler,
    /// Panic details, when the crash was preceded by a reported panic.
    pub panic: Option<CrashPanic>,
    /// Debug-formatted error when the minidump could not be produced; `None` on success.
    pub minidump_error: Option<String>,
    /// GPU information read from the system on Linux/FreeBSD; empty on other platforms or
    /// when collection fails.
    pub gpus: Vec<system_specs::GpuInfo>,
    /// GPU specs reported by the client, if any were received.
    pub active_gpu: Option<system_specs::GpuSpecs>,
}
151
/// Session metadata the client sends to the crash server (message kind 1) right after
/// connecting.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct InitCrashHandler {
    /// Used as the file stem of the `.dmp` and `.json` files written to the logs directory.
    pub session_id: String,
    /// Version string supplied by the caller of `init`.
    pub zed_version: String,
    /// Release channel name supplied by the caller of `init`.
    pub release_channel: String,
    /// Commit SHA supplied by the caller of `init`.
    pub commit_sha: String,
}
159
/// Panic details sent from `panic_hook` to the crash server (message kind 2).
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct CrashPanic {
    /// Panic payload as text, or `"Box<Any>"` when the payload is neither `&str` nor `String`.
    pub message: String,
    /// Panic location formatted as `file:line`; empty when the location is unavailable.
    pub span: String,
}
165
166impl minidumper::ServerHandler for CrashServer {
167    fn create_minidump_file(&self) -> Result<(File, PathBuf), io::Error> {
168        let err_message = "Missing initialization data";
169        let dump_path = paths::logs_dir()
170            .join(
171                &self
172                    .initialization_params
173                    .get()
174                    .expect(err_message)
175                    .session_id,
176            )
177            .with_extension("dmp");
178        let file = File::create(&dump_path)?;
179        Ok((file, dump_path))
180    }
181
182    fn on_minidump_created(&self, result: Result<MinidumpBinary, minidumper::Error>) -> LoopAction {
183        let minidump_error = match result {
184            Ok(MinidumpBinary { mut file, path, .. }) => {
185                use io::Write;
186                file.flush().ok();
187                // TODO: clean this up once https://github.com/EmbarkStudios/crash-handling/issues/101 is addressed
188                drop(file);
189                let original_file = File::open(&path).unwrap();
190                let compressed_path = path.with_extension("zstd");
191                let compressed_file = File::create(&compressed_path).unwrap();
192                zstd::stream::copy_encode(original_file, compressed_file, 0).ok();
193                fs::rename(&compressed_path, path).unwrap();
194                None
195            }
196            Err(e) => Some(format!("{e:?}")),
197        };
198
199        #[cfg(not(any(target_os = "linux", target_os = "freebsd")))]
200        let gpus = vec![];
201
202        #[cfg(any(target_os = "linux", target_os = "freebsd"))]
203        let gpus = match system_specs::read_gpu_info_from_sys_class_drm() {
204            Ok(gpus) => gpus,
205            Err(err) => {
206                log::warn!("Failed to collect GPU information for crash report: {err}");
207                vec![]
208            }
209        };
210
211        let crash_info = CrashInfo {
212            init: self
213                .initialization_params
214                .get()
215                .expect("not initialized")
216                .clone(),
217            panic: self.panic_info.get().cloned(),
218            minidump_error,
219            active_gpu: self.active_gpu.get().cloned(),
220            gpus,
221        };
222
223        let crash_data_path = paths::logs_dir()
224            .join(&crash_info.init.session_id)
225            .with_extension("json");
226
227        fs::write(crash_data_path, serde_json::to_vec(&crash_info).unwrap()).ok();
228
229        LoopAction::Exit
230    }
231
232    fn on_message(&self, kind: u32, buffer: Vec<u8>) {
233        match kind {
234            1 => {
235                let init_data =
236                    serde_json::from_slice::<InitCrashHandler>(&buffer).expect("invalid init data");
237                self.initialization_params
238                    .set(init_data)
239                    .expect("already initialized");
240            }
241            2 => {
242                let panic_data =
243                    serde_json::from_slice::<CrashPanic>(&buffer).expect("invalid panic data");
244                self.panic_info.set(panic_data).expect("already panicked");
245            }
246            3 => {
247                let gpu_specs: system_specs::GpuSpecs =
248                    bincode::deserialize(&buffer).expect("gpu specs");
249                self.active_gpu
250                    .set(gpu_specs)
251                    .expect("already set active gpu");
252            }
253            _ => {
254                panic!("invalid message kind");
255            }
256        }
257    }
258
259    fn on_client_disconnected(&self, _clients: usize) -> LoopAction {
260        LoopAction::Exit
261    }
262
263    fn on_client_connected(&self, _clients: usize) -> LoopAction {
264        self.has_connection.store(true, Ordering::SeqCst);
265        LoopAction::Continue
266    }
267}
268
269pub fn panic_hook(info: &PanicHookInfo) {
270    let message = info
271        .payload()
272        .downcast_ref::<&str>()
273        .map(|s| s.to_string())
274        .or_else(|| info.payload().downcast_ref::<String>().cloned())
275        .unwrap_or_else(|| "Box<Any>".to_string());
276
277    let span = info
278        .location()
279        .map(|loc| format!("{}:{}", loc.file(), loc.line()))
280        .unwrap_or_default();
281
282    // wait 500ms for the crash handler process to start up
283    // if it's still not there just write panic info and no minidump
284    let retry_frequency = Duration::from_millis(100);
285    for _ in 0..5 {
286        if let Some(client) = CRASH_HANDLER.get() {
287            client
288                .send_message(
289                    2,
290                    serde_json::to_vec(&CrashPanic { message, span }).unwrap(),
291                )
292                .ok();
293            log::error!("triggering a crash to generate a minidump...");
294
295            #[cfg(target_os = "macos")]
296            PANIC_THREAD_ID.store(
297                unsafe { mach2::mach_init::mach_thread_self() },
298                Ordering::SeqCst,
299            );
300
301            cfg_if::cfg_if! {
302                if #[cfg(target_os = "windows")] {
303                    // https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-
304                    CrashHandler.simulate_exception(Some(234)); // (MORE_DATA_AVAILABLE)
305                    break;
306                } else {
307                    std::process::abort();
308                }
309            }
310        }
311        thread::sleep(retry_frequency);
312    }
313}
314
315pub fn crash_server(socket: &Path) {
316    let Ok(mut server) = minidumper::Server::with_name(socket) else {
317        log::info!("Couldn't create socket, there may already be a running crash server");
318        return;
319    };
320
321    let shutdown = Arc::new(AtomicBool::new(false));
322    let has_connection = Arc::new(AtomicBool::new(false));
323
324    std::thread::spawn({
325        let shutdown = shutdown.clone();
326        let has_connection = has_connection.clone();
327        move || {
328            std::thread::sleep(CRASH_HANDLER_CONNECT_TIMEOUT);
329            if !has_connection.load(Ordering::SeqCst) {
330                shutdown.store(true, Ordering::SeqCst);
331            }
332        }
333    });
334
335    server
336        .run(
337            Box::new(CrashServer {
338                initialization_params: OnceLock::new(),
339                panic_info: OnceLock::new(),
340                has_connection,
341                active_gpu: OnceLock::new(),
342            }),
343            &shutdown,
344            Some(CRASH_HANDLER_PING_TIMEOUT),
345        )
346        .expect("failed to run server");
347}