reliability.rs

  1use anyhow::{Context, Result};
  2use backtrace::{self, Backtrace};
  3use chrono::Utc;
  4use client::telemetry;
  5use db::kvp::KEY_VALUE_STORE;
  6use gpui::{AppContext, SemanticVersion};
  7use http_client::{HttpRequestExt, Method};
  8
  9use http_client::{self, HttpClient, HttpClientWithUrl};
 10use paths::{crashes_dir, crashes_retired_dir};
 11use release_channel::ReleaseChannel;
 12use release_channel::RELEASE_CHANNEL;
 13use settings::Settings;
 14use smol::stream::StreamExt;
 15use std::{
 16    env,
 17    ffi::OsStr,
 18    sync::{atomic::Ordering, Arc},
 19};
 20use std::{io::Write, panic, sync::atomic::AtomicU32, thread};
 21use telemetry_events::LocationData;
 22use telemetry_events::Panic;
 23use telemetry_events::PanicRequest;
 24use util::ResultExt;
 25
 26use crate::stdout_is_a_pty;
 27static PANIC_COUNT: AtomicU32 = AtomicU32::new(0);
 28
 29pub fn init_panic_hook(
 30    app_version: SemanticVersion,
 31    system_id: Option<String>,
 32    installation_id: Option<String>,
 33    session_id: String,
 34) {
 35    let is_pty = stdout_is_a_pty();
 36
 37    panic::set_hook(Box::new(move |info| {
 38        let prior_panic_count = PANIC_COUNT.fetch_add(1, Ordering::SeqCst);
 39        if prior_panic_count > 0 {
 40            // Give the panic-ing thread time to write the panic file
 41            loop {
 42                std::thread::yield_now();
 43            }
 44        }
 45
 46        let thread = thread::current();
 47        let thread_name = thread.name().unwrap_or("<unnamed>");
 48
 49        let payload = info
 50            .payload()
 51            .downcast_ref::<&str>()
 52            .map(|s| s.to_string())
 53            .or_else(|| info.payload().downcast_ref::<String>().cloned())
 54            .unwrap_or_else(|| "Box<Any>".to_string());
 55
 56        if *release_channel::RELEASE_CHANNEL == ReleaseChannel::Dev {
 57            let location = info.location().unwrap();
 58            let backtrace = Backtrace::new();
 59            eprintln!(
 60                "Thread {:?} panicked with {:?} at {}:{}:{}\n{:?}",
 61                thread_name,
 62                payload,
 63                location.file(),
 64                location.line(),
 65                location.column(),
 66                backtrace,
 67            );
 68            std::process::exit(-1);
 69        }
 70
 71        let backtrace = Backtrace::new();
 72        let mut backtrace = backtrace
 73            .frames()
 74            .iter()
 75            .flat_map(|frame| {
 76                frame
 77                    .symbols()
 78                    .iter()
 79                    .filter_map(|frame| Some(format!("{:#}", frame.name()?)))
 80            })
 81            .collect::<Vec<_>>();
 82
 83        // Strip out leading stack frames for rust panic-handling.
 84        if let Some(ix) = backtrace
 85            .iter()
 86            .position(|name| name == "rust_begin_unwind")
 87        {
 88            backtrace.drain(0..=ix);
 89        }
 90
 91        let panic_data = telemetry_events::Panic {
 92            thread: thread_name.into(),
 93            payload,
 94            location_data: info.location().map(|location| LocationData {
 95                file: location.file().into(),
 96                line: location.line(),
 97            }),
 98            app_version: app_version.to_string(),
 99            release_channel: RELEASE_CHANNEL.display_name().into(),
100            os_name: telemetry::os_name(),
101            os_version: Some(telemetry::os_version()),
102            architecture: env::consts::ARCH.into(),
103            panicked_on: Utc::now().timestamp_millis(),
104            backtrace,
105            system_id: system_id.clone(),
106            installation_id: installation_id.clone(),
107            session_id: session_id.clone(),
108        };
109
110        if let Some(panic_data_json) = serde_json::to_string_pretty(&panic_data).log_err() {
111            log::error!("{}", panic_data_json);
112        }
113
114        if !is_pty {
115            if let Some(panic_data_json) = serde_json::to_string(&panic_data).log_err() {
116                let timestamp = chrono::Utc::now().format("%Y_%m_%d %H_%M_%S").to_string();
117                let panic_file_path = paths::logs_dir().join(format!("zed-{timestamp}.panic"));
118                let panic_file = std::fs::OpenOptions::new()
119                    .append(true)
120                    .create(true)
121                    .open(&panic_file_path)
122                    .log_err();
123                if let Some(mut panic_file) = panic_file {
124                    writeln!(&mut panic_file, "{panic_data_json}").log_err();
125                    panic_file.flush().log_err();
126                }
127            }
128        }
129
130        std::process::abort();
131    }));
132}
133
134pub fn init(
135    http_client: Arc<HttpClientWithUrl>,
136    installation_id: Option<String>,
137    cx: &mut AppContext,
138) {
139    #[cfg(target_os = "macos")]
140    monitor_main_thread_hangs(http_client.clone(), installation_id.clone(), cx);
141
142    upload_panics_and_crashes(http_client, installation_id, cx)
143}
144
/// Watches the main thread for hangs and reports a backtrace when one is suspected.
///
/// Returns immediately unless the release channel is `Dev`, `Nightly` or `Preview`.
///
/// How the pieces fit together:
/// - a foreground task drains a small bounded channel;
/// - a background task pushes into that channel once per second; when the push fails
///   because the channel is full, the foreground is presumed stuck and `SIGUSR2` is sent
///   to the main thread (only the first hang is detected);
/// - the `SIGUSR2` handler captures raw frames into a pre-reserved static buffer and
///   notifies a second background task, which symbolicates the frames, logs them, and
///   posts a `HangReport` to `/telemetry/hangs`.
///
/// If diagnostics telemetry is disabled, the reporting task exits on the first
/// notification without logging or uploading anything.
#[cfg(target_os = "macos")]
pub fn monitor_main_thread_hangs(
    http_client: Arc<HttpClientWithUrl>,
    installation_id: Option<String>,
    cx: &AppContext,
) {
    // This is too noisy to ship to stable for now.
    if !matches!(
        ReleaseChannel::global(cx),
        ReleaseChannel::Dev | ReleaseChannel::Nightly | ReleaseChannel::Preview
    ) {
        return;
    }

    use nix::sys::signal::{
        sigaction, SaFlags, SigAction, SigHandler, SigSet,
        Signal::{self, SIGUSR2},
    };

    use parking_lot::Mutex;

    use http_client::Method;
    use std::{
        ffi::c_int,
        sync::{mpsc, OnceLock},
        time::Duration,
    };
    use telemetry_events::{BacktraceFrame, HangReport};

    use nix::sys::pthread;

    let foreground_executor = cx.foreground_executor();
    let background_executor = cx.background_executor();
    let telemetry_settings = *client::TelemetrySettings::get_global(cx);

    // Initialize SIGUSR2 handler to send a backtrace to a channel.
    let (backtrace_tx, backtrace_rx) = mpsc::channel();
    static BACKTRACE: Mutex<Vec<backtrace::Frame>> = Mutex::new(Vec::new());
    static BACKTRACE_SENDER: OnceLock<mpsc::Sender<()>> = OnceLock::new();
    BACKTRACE_SENDER.get_or_init(|| backtrace_tx);
    // Reserve up front so the signal handler can push frames without growing the buffer.
    BACKTRACE.lock().reserve(100);

    /// Installs the one-shot `SIGUSR2` handler that records the current backtrace.
    fn handle_backtrace_signal() {
        unsafe {
            extern "C" fn handle_sigusr2(_i: c_int) {
                unsafe {
                    // ASYNC SIGNAL SAFETY: This lock is only accessed one other time,
                    // which can only be triggered by this signal handler. In addition,
                    // this signal handler is immediately removed by SA_RESETHAND, and this
                    // signal handler cannot be re-entrant due to the SIGUSR2 mask defined
                    // below
                    let mut bt = BACKTRACE.lock();
                    bt.clear();
                    backtrace::trace_unsynchronized(|frame| {
                        // Stop walking once the reserved capacity is used up, so the
                        // handler never reallocates.
                        if bt.len() < bt.capacity() {
                            bt.push(frame.clone());
                            true
                        } else {
                            false
                        }
                    });
                }

                BACKTRACE_SENDER.get().unwrap().send(()).ok();
            }

            // Block SIGUSR2 while the handler itself is running.
            let mut mask = SigSet::empty();
            mask.add(SIGUSR2);
            sigaction(
                Signal::SIGUSR2,
                &SigAction::new(
                    SigHandler::Handler(handle_sigusr2),
                    SaFlags::SA_RESTART | SaFlags::SA_RESETHAND,
                    mask,
                ),
            )
            .log_err();
        }
    }

    handle_backtrace_signal();
    let main_thread = pthread::pthread_self();

    // Heartbeat channel: the foreground task drains it, the background task fills it.
    let (mut tx, mut rx) = futures::channel::mpsc::channel(3);
    foreground_executor
        .spawn(async move { while (rx.next().await).is_some() {} })
        .detach();

    background_executor
        .spawn({
            let background_executor = background_executor.clone();
            async move {
                loop {
                    background_executor.timer(Duration::from_secs(1)).await;
                    match tx.try_send(()) {
                        Ok(_) => continue,
                        Err(e) => {
                            // A full channel means the foreground has stopped draining it.
                            if e.into_send_error().is_full() {
                                pthread::pthread_kill(main_thread, SIGUSR2).log_err();
                            }
                            // Only detect the first hang
                            break;
                        }
                    }
                }
            }
        })
        .detach();

    let app_version = release_channel::AppVersion::global(cx);
    let os_name = client::telemetry::os_name();

    background_executor
        .clone()
        .spawn(async move {
            let os_version = client::telemetry::os_version();

            // `recv` only fails once the sender is gone, after which no further
            // notification can ever arrive; end the task then instead of retrying in a
            // busy loop.
            while backtrace_rx.recv().is_ok() {
                if !telemetry_settings.diagnostics {
                    return;
                }

                // ASYNC SIGNAL SAFETY: This lock is only accessed _after_
                // the backtrace transmitter has fired, which itself is only done
                // by the signal handler. And due to SA_RESETHAND the signal handler
                // will not run again until `handle_backtrace_signal` is called.
                let raw_backtrace = BACKTRACE.lock().drain(..).collect::<Vec<_>>();
                let backtrace: Vec<_> = raw_backtrace
                    .into_iter()
                    .map(|frame| {
                        let mut btf = BacktraceFrame {
                            ip: frame.ip() as usize,
                            symbol_addr: frame.symbol_address() as usize,
                            base: frame.module_base_address().map(|addr| addr as usize),
                            symbols: vec![],
                        };

                        backtrace::resolve_frame(&frame, |symbol| {
                            if let Some(name) = symbol.name() {
                                btf.symbols.push(name.to_string());
                            }
                        });

                        btf
                    })
                    .collect();

                // IMPORTANT: Don't move this to before `BACKTRACE.lock()`
                handle_backtrace_signal();

                // Log the first resolved symbol of each frame, one per line.
                log::error!(
                    "Suspected hang on main thread:\n{}",
                    backtrace
                        .iter()
                        .flat_map(|bt| bt.symbols.first().as_ref().map(|s| s.as_str()))
                        .collect::<Vec<_>>()
                        .join("\n")
                );

                let report = HangReport {
                    backtrace,
                    app_version: Some(app_version),
                    os_name: os_name.clone(),
                    os_version: Some(os_version.clone()),
                    architecture: env::consts::ARCH.into(),
                    installation_id: installation_id.clone(),
                };

                let Some(json_bytes) = serde_json::to_vec(&report).log_err() else {
                    continue;
                };

                let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes)
                else {
                    continue;
                };

                let Ok(url) = http_client.build_zed_api_url("/telemetry/hangs", &[]) else {
                    continue;
                };

                let Ok(request) = http_client::Request::builder()
                    .method(Method::POST)
                    .uri(url.as_ref())
                    .header("x-zed-checksum", checksum)
                    .body(json_bytes.into())
                else {
                    continue;
                };

                if let Some(response) = http_client.send(request).await.log_err() {
                    // Any 2xx counts as delivered, matching the panic and crash uploaders.
                    if !response.status().is_success() {
                        log::error!("Failed to send hang report: HTTP {:?}", response.status());
                    }
                }
            }
        })
        .detach()
}
346
347fn upload_panics_and_crashes(
348    http: Arc<HttpClientWithUrl>,
349    installation_id: Option<String>,
350    cx: &AppContext,
351) {
352    let telemetry_settings = *client::TelemetrySettings::get_global(cx);
353    cx.background_executor()
354        .spawn(async move {
355            let most_recent_panic = upload_previous_panics(http.clone(), telemetry_settings)
356                .await
357                .log_err()
358                .flatten();
359            upload_previous_crashes(http, most_recent_panic, installation_id, telemetry_settings)
360                .await
361                .log_err()
362        })
363        .detach()
364}
365
366/// Uploads panics via `zed.dev`.
367async fn upload_previous_panics(
368    http: Arc<HttpClientWithUrl>,
369    telemetry_settings: client::TelemetrySettings,
370) -> Result<Option<(i64, String)>> {
371    let panic_report_url = http.build_zed_api_url("/telemetry/panics", &[])?;
372    let mut children = smol::fs::read_dir(paths::logs_dir()).await?;
373
374    let mut most_recent_panic = None;
375
376    while let Some(child) = children.next().await {
377        let child = child?;
378        let child_path = child.path();
379
380        if child_path.extension() != Some(OsStr::new("panic")) {
381            continue;
382        }
383        let filename = if let Some(filename) = child_path.file_name() {
384            filename.to_string_lossy()
385        } else {
386            continue;
387        };
388
389        if !filename.starts_with("zed") {
390            continue;
391        }
392
393        if telemetry_settings.diagnostics {
394            let panic_file_content = smol::fs::read_to_string(&child_path)
395                .await
396                .context("error reading panic file")?;
397
398            let panic: Option<Panic> = serde_json::from_str(&panic_file_content)
399                .ok()
400                .or_else(|| {
401                    panic_file_content
402                        .lines()
403                        .next()
404                        .and_then(|line| serde_json::from_str(line).ok())
405                })
406                .unwrap_or_else(|| {
407                    log::error!("failed to deserialize panic file {:?}", panic_file_content);
408                    None
409                });
410
411            if let Some(panic) = panic {
412                most_recent_panic = Some((panic.panicked_on, panic.payload.clone()));
413
414                let json_bytes = serde_json::to_vec(&PanicRequest { panic }).unwrap();
415
416                let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes) else {
417                    continue;
418                };
419
420                let Ok(request) = http_client::Request::builder()
421                    .method(Method::POST)
422                    .uri(panic_report_url.as_ref())
423                    .header("x-zed-checksum", checksum)
424                    .body(json_bytes.into())
425                else {
426                    continue;
427                };
428
429                let response = http.send(request).await.context("error sending panic")?;
430                if !response.status().is_success() {
431                    log::error!("Error uploading panic to server: {}", response.status());
432                }
433            }
434        }
435
436        // We've done what we can, delete the file
437        std::fs::remove_file(child_path)
438            .context("error removing panic")
439            .log_err();
440    }
441    Ok::<_, anyhow::Error>(most_recent_panic)
442}
443
444const LAST_CRASH_UPLOADED: &str = "LAST_CRASH_UPLOADED";
445
446/// upload crashes from apple's diagnostic reports to our server.
447/// (only if telemetry is enabled)
448async fn upload_previous_crashes(
449    http: Arc<HttpClientWithUrl>,
450    most_recent_panic: Option<(i64, String)>,
451    installation_id: Option<String>,
452    telemetry_settings: client::TelemetrySettings,
453) -> Result<()> {
454    if !telemetry_settings.diagnostics {
455        return Ok(());
456    }
457    let last_uploaded = KEY_VALUE_STORE
458        .read_kvp(LAST_CRASH_UPLOADED)?
459        .unwrap_or("zed-2024-01-17-221900.ips".to_string()); // don't upload old crash reports from before we had this.
460    let mut uploaded = last_uploaded.clone();
461
462    let crash_report_url = http.build_zed_api_url("/telemetry/crashes", &[])?;
463
464    // Crash directories are only set on macOS.
465    for dir in [crashes_dir(), crashes_retired_dir()]
466        .iter()
467        .filter_map(|d| d.as_deref())
468    {
469        let mut children = smol::fs::read_dir(&dir).await?;
470        while let Some(child) = children.next().await {
471            let child = child?;
472            let Some(filename) = child
473                .path()
474                .file_name()
475                .map(|f| f.to_string_lossy().to_lowercase())
476            else {
477                continue;
478            };
479
480            if !filename.starts_with("zed-") || !filename.ends_with(".ips") {
481                continue;
482            }
483
484            if filename <= last_uploaded {
485                continue;
486            }
487
488            let body = smol::fs::read_to_string(&child.path())
489                .await
490                .context("error reading crash file")?;
491
492            let mut request = http_client::Request::post(&crash_report_url.to_string())
493                .follow_redirects(http_client::RedirectPolicy::FollowAll)
494                .header("Content-Type", "text/plain");
495
496            if let Some((panicked_on, payload)) = most_recent_panic.as_ref() {
497                request = request
498                    .header("x-zed-panicked-on", format!("{panicked_on}"))
499                    .header("x-zed-panic", payload)
500            }
501            if let Some(installation_id) = installation_id.as_ref() {
502                request = request.header("x-zed-installation-id", installation_id);
503            }
504
505            let request = request.body(body.into())?;
506
507            let response = http.send(request).await.context("error sending crash")?;
508            if !response.status().is_success() {
509                log::error!("Error uploading crash to server: {}", response.status());
510            }
511
512            if uploaded < filename {
513                uploaded.clone_from(&filename);
514                KEY_VALUE_STORE
515                    .write_kvp(LAST_CRASH_UPLOADED.to_string(), filename)
516                    .await?;
517            }
518        }
519    }
520
521    Ok(())
522}