1use crash_handler::{CrashEventResult, CrashHandler};
2use log::{error, info, warn};
3use minidumper::{Client, LoopAction, MinidumpBinary};
4use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
5use serde::{Deserialize, Serialize};
6use smol::{lock::Mutex, process::Command};
7
8#[cfg(target_os = "macos")]
9use std::sync::atomic::AtomicU32;
10use std::{
11 env,
12 fs::{self, File},
13 io,
14 panic::{self, PanicHookInfo},
15 path::{Path, PathBuf},
16 process::{self},
17 sync::{
18 Arc, OnceLock,
19 atomic::{AtomicBool, Ordering},
20 },
21 thread,
22 time::Duration,
23};
24
/// Client connection to the crash handler sidecar process.
///
/// Set once the crash handler has initialized and the client has connected to it. The client
/// sits behind a mutex because the keep-alive loop in `init` replaces it when a ping fails.
pub static CRASH_HANDLER: OnceLock<Mutex<Client>> = OnceLock::new();
/// Set when the first minidump request is made to avoid generating duplicate crash reports.
pub static REQUESTED_MINIDUMP: AtomicBool = AtomicBool::new(false);
/// Timeout handed to the crash server's `run` call.
// NOTE(review): presumably the time after which a client that stopped pinging is considered
// stale by minidumper — confirm against the minidumper documentation.
const CRASH_HANDLER_PING_TIMEOUT: Duration = Duration::from_secs(60);
/// How long the crash server waits for a first client connection before it requests shutdown.
const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

/// Mach thread port of the thread that ran `panic_hook`, stored right before the hook triggers
/// the crash. Stays `0` until a panic is reported.
#[cfg(target_os = "macos")]
static PANIC_THREAD_ID: AtomicU32 = AtomicU32::new(0);
34
35pub async fn spawn_sidecar(crash_init: InitCrashHandler) -> Client {
36 let exe = env::current_exe().expect("unable to find ourselves");
37 let zed_pid = process::id();
38 // TODO: we should be able to get away with using 1 crash-handler process per machine,
39 // but for now we append the PID of the current process which makes it unique per remote
40 // server or interactive zed instance. This solves an issue where occasionally the socket
41 // used by the crash handler isn't destroyed correctly which causes it to stay on the file
42 // system and block further attempts to initialize crash handlers with that socket path.
43 let socket_name = paths::temp_dir().join(format!("zed-crash-handler-{zed_pid}"));
44 let crash_handler = Command::new(exe)
45 .arg("--crash-handler")
46 .arg(&socket_name)
47 .spawn()
48 .expect("unable to spawn server process");
49
50 let server_pid = crash_handler.id();
51 info!("spawned crash handler process with pid: {server_pid}");
52
53 let mut elapsed = Duration::ZERO;
54 let retry_frequency = Duration::from_millis(100);
55 let mut maybe_client = None;
56 while maybe_client.is_none() {
57 if let Ok(client) = Client::with_name(socket_name.as_path()) {
58 maybe_client = Some(client);
59 info!("connected to crash handler process after {elapsed:?}");
60 break;
61 }
62 elapsed += retry_frequency;
63 smol::Timer::after(retry_frequency).await;
64 }
65 let client = maybe_client.unwrap();
66 client
67 .send_message(1, serde_json::to_vec(&crash_init).unwrap())
68 .unwrap();
69 client
70}
71
72pub async fn init(crash_init: InitCrashHandler) {
73 let gen_var = match env::var("ZED_GENERATE_MINIDUMPS") {
74 Ok(v) => {
75 if v == "false" || v == "0" {
76 Some(false)
77 } else {
78 Some(true)
79 }
80 }
81 Err(_) => None,
82 };
83
84 match (gen_var, *RELEASE_CHANNEL) {
85 (Some(false), _) | (None, ReleaseChannel::Dev) => {
86 let old_hook = panic::take_hook();
87 panic::set_hook(Box::new(move |info| {
88 unsafe { env::set_var("RUST_BACKTRACE", "1") };
89 old_hook(info);
90 // prevent the macOS crash dialog from popping up
91 std::process::exit(1);
92 }));
93 return;
94 }
95 (Some(true), _) | (None, _) => {
96 panic::set_hook(Box::new(panic_hook));
97 }
98 }
99
100 CRASH_HANDLER
101 .set(Mutex::new(spawn_sidecar(crash_init.clone()).await))
102 .ok();
103
104 let handler = CrashHandler::attach(unsafe {
105 crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
106 // only request a minidump once
107 let res = if REQUESTED_MINIDUMP
108 .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
109 .is_ok()
110 {
111 let Some(mutex) = CRASH_HANDLER.get() else {
112 return false.into();
113 };
114 let Some(client) = mutex.try_lock() else {
115 return false.into();
116 };
117
118 #[cfg(target_os = "macos")]
119 suspend_all_other_threads();
120
121 // on macos this "ping" is needed to ensure that all our
122 // `client.send_message` calls have been processed before we trigger the
123 // minidump request.
124 client.ping().ok();
125 client.request_dump(crash_context).is_ok()
126 } else {
127 true
128 };
129 CrashEventResult::Handled(res)
130 })
131 })
132 .expect("failed to attach signal handler");
133
134 #[cfg(target_os = "linux")]
135 {
136 handler.set_ptracer(Some(server_pid));
137 }
138 std::mem::forget(handler);
139 info!("crash handler registered");
140
141 // This loop keeps the crash handler process alive by repeatedly messaging it, if the
142 // ping ever fails we assume the crash handler has somehow been killed and attempt to
143 // restart it.
144 loop {
145 if let Some(client) = CRASH_HANDLER.get() {
146 let mut client = client.lock().await;
147 if client.ping().is_err() {
148 warn!("failed to ping crash handler process, relaunching it now.");
149 *client = spawn_sidecar(crash_init.clone()).await;
150 }
151 }
152 smol::Timer::after(Duration::from_secs(10)).await;
153 }
154}
155
/// Suspends every thread of the current task except the calling thread and the thread
/// recorded in `PANIC_THREAD_ID`.
///
/// # Safety
///
/// This calls raw Mach APIs and reads the thread list they return without checking the
/// return code of `task_threads`. The suspended threads are never resumed here, so this is
/// only meant to be called while the process is going down.
// NOTE(review): the caller in `init` invokes this from the crash callback — confirm no other
// call sites exist before relaxing these assumptions.
#[cfg(target_os = "macos")]
unsafe fn suspend_all_other_threads() {
    let mut thread_list: mach2::mach_types::thread_act_array_t = std::ptr::null_mut();
    let mut thread_count = 0;
    unsafe {
        let this_task = mach2::traps::current_task();
        mach2::task::task_threads(this_task, &raw mut thread_list, &raw mut thread_count);
    }

    let this_thread = unsafe { mach2::mach_init::mach_thread_self() };
    let panicking_thread = PANIC_THREAD_ID.load(Ordering::SeqCst);

    for index in 0..thread_count {
        let candidate = unsafe { *thread_list.add(index as usize) };
        // Leave the current thread and the panicking thread running.
        if candidate == this_thread || candidate == panicking_thread {
            continue;
        }
        unsafe { mach2::thread_act::thread_suspend(candidate) };
    }
}
173
/// State of the crash handler sidecar; implements `minidumper::ServerHandler`.
///
/// Every field is filled in at most once, from messages and events received from the client.
pub struct CrashServer {
    /// Initialization data received in message kind 1.
    initialization_params: OnceLock<InitCrashHandler>,
    /// Panic details received in message kind 2, if the crash was caused by a panic.
    panic_info: OnceLock<CrashPanic>,
    /// GPU specs received in message kind 3.
    active_gpu: OnceLock<system_specs::GpuSpecs>,
    /// Set to `true` when a client connects; shared with the connect-timeout thread started
    /// in `crash_server`.
    has_connection: Arc<AtomicBool>,
}
180
/// Crash metadata that the crash server writes as JSON to `<logs_dir>/<session_id>.json`
/// after a minidump was requested.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct CrashInfo {
    /// Initialization data of the crashed process.
    pub init: InitCrashHandler,
    /// Panic details, present when the client reported a panic before crashing.
    pub panic: Option<CrashPanic>,
    /// Debug-formatted error when the minidump could not be created.
    pub minidump_error: Option<String>,
    /// GPUs collected by the crash server; only populated on Linux and FreeBSD.
    pub gpus: Vec<system_specs::GpuInfo>,
    /// GPU specs reported by the client, if any.
    pub active_gpu: Option<system_specs::GpuSpecs>,
}
189
/// Initialization data the client sends to the crash server (message kind 1, JSON encoded)
/// right after connecting.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct InitCrashHandler {
    /// Used as the file stem of the minidump (`.dmp`) and crash metadata (`.json`) files.
    pub session_id: String,
    pub zed_version: String,
    pub binary: String,
    pub release_channel: String,
    pub commit_sha: String,
}
198
/// Panic details the panic hook sends to the crash server (message kind 2, JSON encoded).
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct CrashPanic {
    /// The panic payload as a string, or `"Box<Any>"` when the payload is not a string.
    pub message: String,
    /// Panic location formatted as `file:line`; empty when the location is unknown.
    pub span: String,
}
204
205impl minidumper::ServerHandler for CrashServer {
206 fn create_minidump_file(&self) -> Result<(File, PathBuf), io::Error> {
207 let err_message = "Missing initialization data";
208 let dump_path = paths::logs_dir()
209 .join(
210 &self
211 .initialization_params
212 .get()
213 .expect(err_message)
214 .session_id,
215 )
216 .with_extension("dmp");
217 let file = File::create(&dump_path)?;
218 Ok((file, dump_path))
219 }
220
221 fn on_minidump_created(&self, result: Result<MinidumpBinary, minidumper::Error>) -> LoopAction {
222 let minidump_error = match result {
223 Ok(MinidumpBinary { mut file, path, .. }) => {
224 use io::Write;
225 file.flush().ok();
226 // TODO: clean this up once https://github.com/EmbarkStudios/crash-handling/issues/101 is addressed
227 drop(file);
228 let original_file = File::open(&path).unwrap();
229 let compressed_path = path.with_extension("zstd");
230 let compressed_file = File::create(&compressed_path).unwrap();
231 zstd::stream::copy_encode(original_file, compressed_file, 0).ok();
232 fs::rename(&compressed_path, path).unwrap();
233 None
234 }
235 Err(e) => Some(format!("{e:?}")),
236 };
237
238 #[cfg(not(any(target_os = "linux", target_os = "freebsd")))]
239 let gpus = vec![];
240
241 #[cfg(any(target_os = "linux", target_os = "freebsd"))]
242 let gpus = match system_specs::read_gpu_info_from_sys_class_drm() {
243 Ok(gpus) => gpus,
244 Err(err) => {
245 warn!("Failed to collect GPU information for crash report: {err}");
246 vec![]
247 }
248 };
249
250 let crash_info = CrashInfo {
251 init: self
252 .initialization_params
253 .get()
254 .expect("not initialized")
255 .clone(),
256 panic: self.panic_info.get().cloned(),
257 minidump_error,
258 active_gpu: self.active_gpu.get().cloned(),
259 gpus,
260 };
261
262 let crash_data_path = paths::logs_dir()
263 .join(&crash_info.init.session_id)
264 .with_extension("json");
265
266 fs::write(crash_data_path, serde_json::to_vec(&crash_info).unwrap()).ok();
267
268 LoopAction::Exit
269 }
270
271 fn on_message(&self, kind: u32, buffer: Vec<u8>) {
272 match kind {
273 1 => {
274 let init_data =
275 serde_json::from_slice::<InitCrashHandler>(&buffer).expect("invalid init data");
276 self.initialization_params
277 .set(init_data)
278 .expect("already initialized");
279 }
280 2 => {
281 let panic_data =
282 serde_json::from_slice::<CrashPanic>(&buffer).expect("invalid panic data");
283 self.panic_info.set(panic_data).expect("already panicked");
284 }
285 3 => {
286 let gpu_specs: system_specs::GpuSpecs =
287 bincode::deserialize(&buffer).expect("gpu specs");
288 self.active_gpu
289 .set(gpu_specs)
290 .expect("already set active gpu");
291 }
292 _ => {
293 panic!("invalid message kind");
294 }
295 }
296 }
297
298 fn on_client_disconnected(&self, _clients: usize) -> LoopAction {
299 LoopAction::Exit
300 }
301
302 fn on_client_connected(&self, _clients: usize) -> LoopAction {
303 self.has_connection.store(true, Ordering::SeqCst);
304 LoopAction::Continue
305 }
306}
307
308pub fn panic_hook(info: &PanicHookInfo) {
309 // Don't handle a panic on threads that are not relevant to the main execution.
310 if extension_host::wasm_host::IS_WASM_THREAD.with(|v| v.load(Ordering::Acquire)) {
311 return;
312 }
313
314 let message = info
315 .payload()
316 .downcast_ref::<&str>()
317 .map(|s| s.to_string())
318 .or_else(|| info.payload().downcast_ref::<String>().cloned())
319 .unwrap_or_else(|| "Box<Any>".to_string());
320
321 let span = info
322 .location()
323 .map(|loc| format!("{}:{}", loc.file(), loc.line()))
324 .unwrap_or_default();
325
326 // wait 500ms for the crash handler process to start up
327 // if it's still not there just write panic info and no minidump
328 let retry_frequency = Duration::from_millis(100);
329 for _ in 0..5 {
330 if let Some(client) = CRASH_HANDLER.get().map(|c| c.try_lock()).flatten() {
331 client
332 .send_message(
333 2,
334 serde_json::to_vec(&CrashPanic { message, span }).unwrap(),
335 )
336 .ok();
337 error!("triggering a crash to generate a minidump...");
338
339 #[cfg(target_os = "macos")]
340 PANIC_THREAD_ID.store(
341 unsafe { mach2::mach_init::mach_thread_self() },
342 Ordering::SeqCst,
343 );
344
345 cfg_if::cfg_if! {
346 if #[cfg(target_os = "windows")] {
347 // https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-
348 CrashHandler.simulate_exception(Some(234)); // (MORE_DATA_AVAILABLE)
349 break;
350 } else {
351 std::process::abort();
352 }
353 }
354 }
355 thread::sleep(retry_frequency);
356 }
357}
358
359pub fn crash_server(socket: &Path) {
360 let Ok(mut server) = minidumper::Server::with_name(socket) else {
361 info!("couldn't create socket, there may already be a running crash server");
362 return;
363 };
364
365 let shutdown = Arc::new(AtomicBool::new(false));
366 let has_connection = Arc::new(AtomicBool::new(false));
367
368 thread::Builder::new()
369 .name("CrashServerTimeout".to_owned())
370 .spawn({
371 let shutdown = shutdown.clone();
372 let has_connection = has_connection.clone();
373 move || {
374 std::thread::sleep(CRASH_HANDLER_CONNECT_TIMEOUT);
375 if !has_connection.load(Ordering::SeqCst) {
376 shutdown.store(true, Ordering::SeqCst);
377 }
378 }
379 })
380 .unwrap();
381
382 server
383 .run(
384 Box::new(CrashServer {
385 initialization_params: OnceLock::new(),
386 panic_info: OnceLock::new(),
387 has_connection,
388 active_gpu: OnceLock::new(),
389 }),
390 &shutdown,
391 Some(CRASH_HANDLER_PING_TIMEOUT),
392 )
393 .expect("failed to run server");
394}