crashes.rs

  1use crash_handler::CrashHandler;
  2use log::info;
  3use minidumper::{Client, LoopAction, MinidumpBinary};
  4use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
  5use serde::{Deserialize, Serialize};
  6
  7#[cfg(target_os = "macos")]
  8use std::sync::atomic::AtomicU32;
  9use std::{
 10    env,
 11    fs::{self, File},
 12    io,
 13    panic::Location,
 14    path::{Path, PathBuf},
 15    process::{self, Command},
 16    sync::{
 17        Arc, OnceLock,
 18        atomic::{AtomicBool, Ordering},
 19    },
 20    thread,
 21    time::Duration,
 22};
 23
/// Client connection to the crash handler process.
///
/// Set once the crash handler has initialized and the client has connected to it
/// (see `init`); read by `handle_panic` to decide whether a minidump can be requested.
pub static CRASH_HANDLER: OnceLock<Arc<Client>> = OnceLock::new();
/// Set when the first minidump request is made to avoid generating duplicate crash reports.
pub static REQUESTED_MINIDUMP: AtomicBool = AtomicBool::new(false);
/// Timeout handed to the crash server's `run` call in `crash_server`.
// NOTE(review): presumably the time a client may stay silent (no ping) before the server
// treats it as gone — confirm against the minidumper documentation.
const CRASH_HANDLER_PING_TIMEOUT: Duration = Duration::from_secs(60);
/// How long the crash server waits for its first client connection before it is told to
/// shut down.
const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

/// Mach thread port of the thread that called `handle_panic`; `0` until a panic is handled.
/// `suspend_all_other_threads` leaves this thread running.
#[cfg(target_os = "macos")]
static PANIC_THREAD_ID: AtomicU32 = AtomicU32::new(0);
 33
 34pub async fn init(crash_init: InitCrashHandler) {
 35    if *RELEASE_CHANNEL == ReleaseChannel::Dev && env::var("ZED_GENERATE_MINIDUMPS").is_err() {
 36        return;
 37    }
 38
 39    let exe = env::current_exe().expect("unable to find ourselves");
 40    let zed_pid = process::id();
 41    // TODO: we should be able to get away with using 1 crash-handler process per machine,
 42    // but for now we append the PID of the current process which makes it unique per remote
 43    // server or interactive zed instance. This solves an issue where occasionally the socket
 44    // used by the crash handler isn't destroyed correctly which causes it to stay on the file
 45    // system and block further attempts to initialize crash handlers with that socket path.
 46    let socket_name = paths::temp_dir().join(format!("zed-crash-handler-{zed_pid}"));
 47    #[allow(unused)]
 48    let server_pid = Command::new(exe)
 49        .arg("--crash-handler")
 50        .arg(&socket_name)
 51        .spawn()
 52        .expect("unable to spawn server process")
 53        .id();
 54    info!("spawning crash handler process");
 55
 56    let mut elapsed = Duration::ZERO;
 57    let retry_frequency = Duration::from_millis(100);
 58    let mut maybe_client = None;
 59    while maybe_client.is_none() {
 60        if let Ok(client) = Client::with_name(socket_name.as_path()) {
 61            maybe_client = Some(client);
 62            info!("connected to crash handler process after {elapsed:?}");
 63            break;
 64        }
 65        elapsed += retry_frequency;
 66        smol::Timer::after(retry_frequency).await;
 67    }
 68    let client = maybe_client.unwrap();
 69    client
 70        .send_message(1, serde_json::to_vec(&crash_init).unwrap())
 71        .unwrap();
 72
 73    let client = Arc::new(client);
 74    let handler = crash_handler::CrashHandler::attach(unsafe {
 75        let client = client.clone();
 76        crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
 77            // only request a minidump once
 78            let res = if REQUESTED_MINIDUMP
 79                .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
 80                .is_ok()
 81            {
 82                #[cfg(target_os = "macos")]
 83                suspend_all_other_threads();
 84
 85                client.ping().unwrap();
 86                client.request_dump(crash_context).is_ok()
 87            } else {
 88                true
 89            };
 90            crash_handler::CrashEventResult::Handled(res)
 91        })
 92    })
 93    .expect("failed to attach signal handler");
 94
 95    #[cfg(target_os = "linux")]
 96    {
 97        handler.set_ptracer(Some(server_pid));
 98    }
 99    CRASH_HANDLER.set(client.clone()).ok();
100    std::mem::forget(handler);
101    info!("crash handler registered");
102
103    loop {
104        client.ping().ok();
105        smol::Timer::after(Duration::from_secs(10)).await;
106    }
107}
108
/// Suspends every thread of the current task except the calling thread and the thread
/// recorded in `PANIC_THREAD_ID`.
///
/// The results of the Mach calls are not checked; if listing the threads yields nothing,
/// no thread is suspended.
///
/// # Safety
///
/// Calls raw Mach APIs and dereferences the thread array they return. Suspended threads
/// are never resumed here, so this is only meant to be called while the process is
/// already crashing.
#[cfg(target_os = "macos")]
unsafe fn suspend_all_other_threads() {
    let mut thread_list: mach2::mach_types::thread_act_array_t = std::ptr::null_mut();
    let mut thread_count = 0;
    unsafe {
        let this_task = mach2::traps::current_task();
        mach2::task::task_threads(this_task, &raw mut thread_list, &raw mut thread_count);
    }

    let this_thread = unsafe { mach2::mach_init::mach_thread_self() };
    let panicking_thread = PANIC_THREAD_ID.load(Ordering::SeqCst);

    (0..thread_count)
        .map(|index| unsafe { *thread_list.add(index as usize) })
        .filter(|&port| port != this_thread && port != panicking_thread)
        .for_each(|port| unsafe {
            mach2::thread_act::thread_suspend(port);
        });
}
126
/// State of the crash handler server process; implements `minidumper::ServerHandler`.
pub struct CrashServer {
    /// Initialization data received from the client as message kind `1`.
    /// Its `session_id` names the minidump and crash metadata files.
    initialization_params: OnceLock<InitCrashHandler>,
    /// Panic details received from the client as message kind `2`, if any.
    panic_info: OnceLock<CrashPanic>,
    /// Active GPU specs received from the client as message kind `3`, if any.
    active_gpu: OnceLock<system_specs::GpuSpecs>,
    /// Set to `true` when a client connects; shared with the watchdog thread spawned in
    /// `crash_server`, which requests shutdown if no client has connected in time.
    has_connection: Arc<AtomicBool>,
}
133
/// Crash metadata that the crash server serializes as JSON to
/// `<logs_dir>/<session_id>.json` after a minidump request has been handled.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct CrashInfo {
    /// Initialization data the crashing process sent to the crash server.
    pub init: InitCrashHandler,
    /// Panic message and location, when the client reported a panic before the crash.
    pub panic: Option<CrashPanic>,
    /// Description of what went wrong while producing the minidump, or `None` if no
    /// error was recorded.
    pub minidump_error: Option<String>,
    /// GPU information collected by the crash server on Linux and FreeBSD;
    /// always empty on other platforms.
    pub gpus: Vec<system_specs::GpuInfo>,
    /// Specs of the active GPU, if the client reported them.
    pub active_gpu: Option<system_specs::GpuSpecs>,
}
142
/// Initialization payload that `init` sends to the crash server as message kind `1`
/// (JSON-encoded).
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct InitCrashHandler {
    /// Used by the crash server as the file name (before the extension) of the `.dmp`
    /// and `.json` files it writes into the logs directory.
    pub session_id: String,
    // NOTE(review): the three fields below are only stored and echoed back in `CrashInfo`
    // here; presumably they identify the running build — confirm with the caller of `init`.
    pub zed_version: String,
    pub release_channel: String,
    pub commit_sha: String,
}
150
/// Panic details that `handle_panic` sends to the crash server as message kind `2`
/// (JSON-encoded).
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct CrashPanic {
    /// The panic message.
    pub message: String,
    /// Source location of the panic formatted as `file:line`, or an empty string when no
    /// location was available.
    pub span: String,
}
156
157impl minidumper::ServerHandler for CrashServer {
158    fn create_minidump_file(&self) -> Result<(File, PathBuf), io::Error> {
159        let err_message = "Missing initialization data";
160        let dump_path = paths::logs_dir()
161            .join(
162                &self
163                    .initialization_params
164                    .get()
165                    .expect(err_message)
166                    .session_id,
167            )
168            .with_extension("dmp");
169        let file = File::create(&dump_path)?;
170        Ok((file, dump_path))
171    }
172
173    fn on_minidump_created(&self, result: Result<MinidumpBinary, minidumper::Error>) -> LoopAction {
174        let minidump_error = match result {
175            Ok(MinidumpBinary { mut file, path, .. }) => {
176                use io::Write;
177                file.flush().ok();
178                // TODO: clean this up once https://github.com/EmbarkStudios/crash-handling/issues/101 is addressed
179                drop(file);
180                let original_file = File::open(&path).unwrap();
181                let compressed_path = path.with_extension("zstd");
182                let compressed_file = File::create(&compressed_path).unwrap();
183                zstd::stream::copy_encode(original_file, compressed_file, 0).ok();
184                fs::rename(&compressed_path, path).unwrap();
185                None
186            }
187            Err(e) => Some(format!("{e:?}")),
188        };
189
190        #[cfg(not(any(target_os = "linux", target_os = "freebsd")))]
191        let gpus = vec![];
192
193        #[cfg(any(target_os = "linux", target_os = "freebsd"))]
194        let gpus = match system_specs::read_gpu_info_from_sys_class_drm() {
195            Ok(gpus) => gpus,
196            Err(err) => {
197                log::warn!("Failed to collect GPU information for crash report: {err}");
198                vec![]
199            }
200        };
201
202        let crash_info = CrashInfo {
203            init: self
204                .initialization_params
205                .get()
206                .expect("not initialized")
207                .clone(),
208            panic: self.panic_info.get().cloned(),
209            minidump_error,
210            active_gpu: self.active_gpu.get().cloned(),
211            gpus,
212        };
213
214        let crash_data_path = paths::logs_dir()
215            .join(&crash_info.init.session_id)
216            .with_extension("json");
217
218        fs::write(crash_data_path, serde_json::to_vec(&crash_info).unwrap()).ok();
219
220        LoopAction::Exit
221    }
222
223    fn on_message(&self, kind: u32, buffer: Vec<u8>) {
224        match kind {
225            1 => {
226                let init_data =
227                    serde_json::from_slice::<InitCrashHandler>(&buffer).expect("invalid init data");
228                self.initialization_params
229                    .set(init_data)
230                    .expect("already initialized");
231            }
232            2 => {
233                let panic_data =
234                    serde_json::from_slice::<CrashPanic>(&buffer).expect("invalid panic data");
235                self.panic_info.set(panic_data).expect("already panicked");
236            }
237            3 => {
238                let gpu_specs: system_specs::GpuSpecs =
239                    bincode::deserialize(&buffer).expect("gpu specs");
240                self.active_gpu
241                    .set(gpu_specs)
242                    .expect("already set active gpu");
243            }
244            _ => {
245                panic!("invalid message kind");
246            }
247        }
248    }
249
250    fn on_client_disconnected(&self, _clients: usize) -> LoopAction {
251        LoopAction::Exit
252    }
253
254    fn on_client_connected(&self, _clients: usize) -> LoopAction {
255        self.has_connection.store(true, Ordering::SeqCst);
256        LoopAction::Continue
257    }
258}
259
260pub fn handle_panic(message: String, span: Option<&Location>) {
261    let span = span
262        .map(|loc| format!("{}:{}", loc.file(), loc.line()))
263        .unwrap_or_default();
264
265    // wait 500ms for the crash handler process to start up
266    // if it's still not there just write panic info and no minidump
267    let retry_frequency = Duration::from_millis(100);
268    for _ in 0..5 {
269        if let Some(client) = CRASH_HANDLER.get() {
270            client
271                .send_message(
272                    2,
273                    serde_json::to_vec(&CrashPanic { message, span }).unwrap(),
274                )
275                .ok();
276            log::error!("triggering a crash to generate a minidump...");
277
278            #[cfg(target_os = "macos")]
279            PANIC_THREAD_ID.store(
280                unsafe { mach2::mach_init::mach_thread_self() },
281                Ordering::SeqCst,
282            );
283
284            #[cfg(target_os = "linux")]
285            CrashHandler.simulate_signal(crash_handler::Signal::Trap as u32);
286            #[cfg(not(target_os = "linux"))]
287            CrashHandler.simulate_exception(None);
288            break;
289        }
290        thread::sleep(retry_frequency);
291    }
292}
293
294pub fn crash_server(socket: &Path) {
295    let Ok(mut server) = minidumper::Server::with_name(socket) else {
296        log::info!("Couldn't create socket, there may already be a running crash server");
297        return;
298    };
299
300    let shutdown = Arc::new(AtomicBool::new(false));
301    let has_connection = Arc::new(AtomicBool::new(false));
302
303    std::thread::spawn({
304        let shutdown = shutdown.clone();
305        let has_connection = has_connection.clone();
306        move || {
307            std::thread::sleep(CRASH_HANDLER_CONNECT_TIMEOUT);
308            if !has_connection.load(Ordering::SeqCst) {
309                shutdown.store(true, Ordering::SeqCst);
310            }
311        }
312    });
313
314    server
315        .run(
316            Box::new(CrashServer {
317                initialization_params: OnceLock::new(),
318                panic_info: OnceLock::new(),
319                has_connection,
320                active_gpu: OnceLock::new(),
321            }),
322            &shutdown,
323            Some(CRASH_HANDLER_PING_TIMEOUT),
324        )
325        .expect("failed to run server");
326}