reliability.rs

  1use crate::stdout_is_a_pty;
  2use anyhow::{Context as _, Result};
  3use backtrace::{self, Backtrace};
  4use chrono::Utc;
  5use client::{TelemetrySettings, telemetry};
  6use db::kvp::KEY_VALUE_STORE;
  7use gpui::{App, AppContext as _, SemanticVersion};
  8use http_client::{self, HttpClient, HttpClientWithUrl, HttpRequestExt, Method};
  9use paths::{crashes_dir, crashes_retired_dir};
 10use project::Project;
 11use release_channel::{AppCommitSha, RELEASE_CHANNEL, ReleaseChannel};
 12use settings::Settings;
 13use smol::stream::StreamExt;
 14use std::{
 15    env,
 16    ffi::{OsStr, c_void},
 17    sync::{Arc, atomic::Ordering},
 18};
 19use std::{io::Write, panic, sync::atomic::AtomicU32, thread};
 20use telemetry_events::{LocationData, Panic, PanicRequest};
 21use url::Url;
 22use util::ResultExt;
 23
 24static PANIC_COUNT: AtomicU32 = AtomicU32::new(0);
 25
 26pub fn init_panic_hook(
 27    app_version: SemanticVersion,
 28    app_commit_sha: Option<AppCommitSha>,
 29    system_id: Option<String>,
 30    installation_id: Option<String>,
 31    session_id: String,
 32) {
 33    let is_pty = stdout_is_a_pty();
 34
 35    panic::set_hook(Box::new(move |info| {
 36        let prior_panic_count = PANIC_COUNT.fetch_add(1, Ordering::SeqCst);
 37        if prior_panic_count > 0 {
 38            // Give the panic-ing thread time to write the panic file
 39            loop {
 40                std::thread::yield_now();
 41            }
 42        }
 43
 44        let thread = thread::current();
 45        let thread_name = thread.name().unwrap_or("<unnamed>");
 46
 47        let payload = info
 48            .payload()
 49            .downcast_ref::<&str>()
 50            .map(|s| s.to_string())
 51            .or_else(|| info.payload().downcast_ref::<String>().cloned())
 52            .unwrap_or_else(|| "Box<Any>".to_string());
 53
 54        if *release_channel::RELEASE_CHANNEL == ReleaseChannel::Dev {
 55            let location = info.location().unwrap();
 56            let backtrace = Backtrace::new();
 57            eprintln!(
 58                "Thread {:?} panicked with {:?} at {}:{}:{}\n{}{:?}",
 59                thread_name,
 60                payload,
 61                location.file(),
 62                location.line(),
 63                location.column(),
 64                match app_commit_sha.as_ref() {
 65                    Some(commit_sha) => format!(
 66                        "https://github.com/zed-industries/zed/blob/{}/src/{}#L{} \
 67                        (may not be uploaded, line may be incorrect if files modified)\n",
 68                        commit_sha.full(),
 69                        location.file(),
 70                        location.line()
 71                    ),
 72                    None => "".to_string(),
 73                },
 74                backtrace,
 75            );
 76            std::process::exit(-1);
 77        }
 78        let main_module_base_address = get_main_module_base_address();
 79
 80        let backtrace = Backtrace::new();
 81        let mut symbols = backtrace
 82            .frames()
 83            .iter()
 84            .flat_map(|frame| {
 85                let base = frame
 86                    .module_base_address()
 87                    .unwrap_or(main_module_base_address);
 88                frame.symbols().iter().map(move |symbol| {
 89                    format!(
 90                        "{}+{}",
 91                        symbol
 92                            .name()
 93                            .as_ref()
 94                            .map_or("<unknown>".to_owned(), <_>::to_string),
 95                        (frame.ip() as isize).saturating_sub(base as isize)
 96                    )
 97                })
 98            })
 99            .collect::<Vec<_>>();
100
101        // Strip out leading stack frames for rust panic-handling.
102        if let Some(ix) = symbols
103            .iter()
104            .position(|name| name == "rust_begin_unwind" || name == "_rust_begin_unwind")
105        {
106            symbols.drain(0..=ix);
107        }
108
109        let panic_data = telemetry_events::Panic {
110            thread: thread_name.into(),
111            payload,
112            location_data: info.location().map(|location| LocationData {
113                file: location.file().into(),
114                line: location.line(),
115            }),
116            app_version: app_version.to_string(),
117            app_commit_sha: app_commit_sha.as_ref().map(|sha| sha.full()),
118            release_channel: RELEASE_CHANNEL.dev_name().into(),
119            target: env!("TARGET").to_owned().into(),
120            os_name: telemetry::os_name(),
121            os_version: Some(telemetry::os_version()),
122            architecture: env::consts::ARCH.into(),
123            panicked_on: Utc::now().timestamp_millis(),
124            backtrace: symbols,
125            system_id: system_id.clone(),
126            installation_id: installation_id.clone(),
127            session_id: session_id.clone(),
128        };
129
130        if let Some(panic_data_json) = serde_json::to_string_pretty(&panic_data).log_err() {
131            log::error!("{}", panic_data_json);
132        }
133        zlog::flush();
134
135        if !is_pty {
136            if let Some(panic_data_json) = serde_json::to_string(&panic_data).log_err() {
137                let timestamp = chrono::Utc::now().format("%Y_%m_%d %H_%M_%S").to_string();
138                let panic_file_path = paths::logs_dir().join(format!("zed-{timestamp}.panic"));
139                let panic_file = std::fs::OpenOptions::new()
140                    .append(true)
141                    .create(true)
142                    .open(&panic_file_path)
143                    .log_err();
144                if let Some(mut panic_file) = panic_file {
145                    writeln!(&mut panic_file, "{panic_data_json}").log_err();
146                    panic_file.flush().log_err();
147                }
148            }
149        }
150
151        std::process::abort();
152    }));
153}
154
155#[cfg(not(target_os = "windows"))]
156fn get_main_module_base_address() -> *mut c_void {
157    let mut dl_info = libc::Dl_info {
158        dli_fname: std::ptr::null(),
159        dli_fbase: std::ptr::null_mut(),
160        dli_sname: std::ptr::null(),
161        dli_saddr: std::ptr::null_mut(),
162    };
163    unsafe {
164        libc::dladdr(get_main_module_base_address as _, &mut dl_info);
165    }
166    dl_info.dli_fbase
167}
168
169#[cfg(target_os = "windows")]
170fn get_main_module_base_address() -> *mut c_void {
171    std::ptr::null_mut()
172}
173
174pub fn init(
175    http_client: Arc<HttpClientWithUrl>,
176    system_id: Option<String>,
177    installation_id: Option<String>,
178    session_id: String,
179    cx: &mut App,
180) {
181    #[cfg(target_os = "macos")]
182    monitor_main_thread_hangs(http_client.clone(), installation_id.clone(), cx);
183
184    let Some(panic_report_url) = http_client
185        .build_zed_api_url("/telemetry/panics", &[])
186        .log_err()
187    else {
188        return;
189    };
190
191    upload_panics_and_crashes(
192        http_client.clone(),
193        panic_report_url.clone(),
194        installation_id.clone(),
195        cx,
196    );
197
198    cx.observe_new(move |project: &mut Project, _, cx| {
199        let http_client = http_client.clone();
200        let panic_report_url = panic_report_url.clone();
201        let session_id = session_id.clone();
202        let installation_id = installation_id.clone();
203        let system_id = system_id.clone();
204
205        if let Some(ssh_client) = project.ssh_client() {
206            ssh_client.update(cx, |client, cx| {
207                if TelemetrySettings::get_global(cx).diagnostics {
208                    let request = client.proto_client().request(proto::GetPanicFiles {});
209                    cx.background_spawn(async move {
210                        let panic_files = request.await?;
211                        for file in panic_files.file_contents {
212                            let panic: Option<Panic> = serde_json::from_str(&file)
213                                .log_err()
214                                .or_else(|| {
215                                    file.lines()
216                                        .next()
217                                        .and_then(|line| serde_json::from_str(line).ok())
218                                })
219                                .unwrap_or_else(|| {
220                                    log::error!("failed to deserialize panic file {:?}", file);
221                                    None
222                                });
223
224                            if let Some(mut panic) = panic {
225                                panic.session_id = session_id.clone();
226                                panic.system_id = system_id.clone();
227                                panic.installation_id = installation_id.clone();
228
229                                upload_panic(&http_client, &panic_report_url, panic, &mut None)
230                                    .await?;
231                            }
232                        }
233
234                        anyhow::Ok(())
235                    })
236                    .detach_and_log_err(cx);
237                }
238            })
239        }
240    })
241    .detach();
242}
243
/// Watches for main-thread hangs and reports a backtrace of the main thread when one is
/// suspected.
///
/// Only active on the `Dev`, `Nightly` and `Preview` release channels. Three pieces cooperate:
/// - A foreground task drains a bounded channel.
/// - A background task pushes into that channel once per second. If the channel is full, the
///   foreground is not draining it, so `SIGUSR2` is sent to the thread that called this
///   function and the heartbeat stops.
/// - The `SIGUSR2` handler captures a raw backtrace and wakes a second background task, which
///   resolves symbols, logs the hang and, if telemetry diagnostics are enabled, posts a
///   [`telemetry_events::HangReport`] to `/telemetry/hangs`.
#[cfg(target_os = "macos")]
pub fn monitor_main_thread_hangs(
    http_client: Arc<HttpClientWithUrl>,
    installation_id: Option<String>,
    cx: &App,
) {
    // This is too noisy to ship to stable for now.
    if !matches!(
        ReleaseChannel::global(cx),
        ReleaseChannel::Dev | ReleaseChannel::Nightly | ReleaseChannel::Preview
    ) {
        return;
    }

    use nix::sys::signal::{
        SaFlags, SigAction, SigHandler, SigSet,
        Signal::{self, SIGUSR2},
        sigaction,
    };

    use parking_lot::Mutex;

    use http_client::Method;
    use std::{
        ffi::c_int,
        sync::{OnceLock, mpsc},
        time::Duration,
    };
    use telemetry_events::{BacktraceFrame, HangReport};

    use nix::sys::pthread;

    let foreground_executor = cx.foreground_executor();
    let background_executor = cx.background_executor();
    let telemetry_settings = *client::TelemetrySettings::get_global(cx);

    // Initialize SIGUSR2 handler to send a backtrace to a channel.
    let (backtrace_tx, backtrace_rx) = mpsc::channel();
    static BACKTRACE: Mutex<Vec<backtrace::Frame>> = Mutex::new(Vec::new());
    static BACKTRACE_SENDER: OnceLock<mpsc::Sender<()>> = OnceLock::new();
    // Only the first call installs its sender; on later calls `backtrace_tx` is dropped here.
    BACKTRACE_SENDER.get_or_init(|| backtrace_tx);
    // Reserve up front so the signal handler can push frames without growing the vector.
    BACKTRACE.lock().reserve(100);

    /// Installs the one-shot `SIGUSR2` handler that captures the current thread's backtrace.
    fn handle_backtrace_signal() {
        unsafe {
            extern "C" fn handle_sigusr2(_i: c_int) {
                unsafe {
                    // ASYNC SIGNAL SAFETY: This lock is only accessed one other time,
                    // which can only be triggered by this signal handler. In addition,
                    // this signal handler is immediately removed by SA_RESETHAND, and this
                    // signal handler cannot be re-entrant due to the SIGUSR2 mask defined
                    // below
                    let mut bt = BACKTRACE.lock();
                    bt.clear();
                    backtrace::trace_unsynchronized(|frame| {
                        // Stop at the reserved capacity rather than reallocating.
                        if bt.len() < bt.capacity() {
                            bt.push(frame.clone());
                            true
                        } else {
                            false
                        }
                    });
                }

                BACKTRACE_SENDER.get().unwrap().send(()).ok();
            }

            let mut mask = SigSet::empty();
            mask.add(SIGUSR2);
            sigaction(
                Signal::SIGUSR2,
                &SigAction::new(
                    SigHandler::Handler(handle_sigusr2),
                    SaFlags::SA_RESTART | SaFlags::SA_RESETHAND,
                    mask,
                ),
            )
            .log_err();
        }
    }

    handle_backtrace_signal();
    let main_thread = pthread::pthread_self();

    // Heartbeat channel: the foreground drains it, the background fills it once per second.
    let (mut tx, mut rx) = futures::channel::mpsc::channel(3);
    foreground_executor
        .spawn(async move { while (rx.next().await).is_some() {} })
        .detach();

    background_executor
        .spawn({
            let background_executor = background_executor.clone();
            async move {
                loop {
                    background_executor.timer(Duration::from_secs(1)).await;
                    match tx.try_send(()) {
                        Ok(_) => continue,
                        Err(e) => {
                            // A full channel means the foreground task has stopped draining it.
                            if e.into_send_error().is_full() {
                                pthread::pthread_kill(main_thread, SIGUSR2).log_err();
                            }
                            // Only detect the first hang
                            break;
                        }
                    }
                }
            }
        })
        .detach();

    let app_version = release_channel::AppVersion::global(cx);
    let os_name = client::telemetry::os_name();

    background_executor
        .clone()
        .spawn(async move {
            let os_version = client::telemetry::os_version();

            // `recv` blocks until the signal handler fires. It returns `Err` once every sender
            // is gone, which ends this task instead of spinning on a disconnected channel.
            while backtrace_rx.recv().is_ok() {
                if !telemetry_settings.diagnostics {
                    return;
                }

                // ASYNC SIGNAL SAFETY: This lock is only accessed _after_
                // the backtrace transmitter has fired, which itself is only done
                // by the signal handler. And due to SA_RESETHAND the signal handler
                // will not run again until `handle_backtrace_signal` is called.
                let raw_backtrace = BACKTRACE.lock().drain(..).collect::<Vec<_>>();
                let backtrace: Vec<_> = raw_backtrace
                    .into_iter()
                    .map(|frame| {
                        let mut btf = BacktraceFrame {
                            ip: frame.ip() as usize,
                            symbol_addr: frame.symbol_address() as usize,
                            base: frame.module_base_address().map(|addr| addr as usize),
                            symbols: vec![],
                        };

                        backtrace::resolve_frame(&frame, |symbol| {
                            if let Some(name) = symbol.name() {
                                btf.symbols.push(name.to_string());
                            }
                        });

                        btf
                    })
                    .collect();

                // IMPORTANT: Don't move this to before `BACKTRACE.lock()`
                handle_backtrace_signal();

                log::error!(
                    "Suspected hang on main thread:\n{}",
                    backtrace
                        .iter()
                        .filter_map(|bt| bt.symbols.first().map(String::as_str))
                        .collect::<Vec<_>>()
                        .join("\n")
                );

                let report = HangReport {
                    backtrace,
                    app_version: Some(app_version),
                    os_name: os_name.clone(),
                    os_version: Some(os_version.clone()),
                    architecture: env::consts::ARCH.into(),
                    installation_id: installation_id.clone(),
                };

                let Some(json_bytes) = serde_json::to_vec(&report).log_err() else {
                    continue;
                };

                let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes)
                else {
                    continue;
                };

                let Ok(url) = http_client.build_zed_api_url("/telemetry/hangs", &[]) else {
                    continue;
                };

                let Ok(request) = http_client::Request::builder()
                    .method(Method::POST)
                    .uri(url.as_ref())
                    .header("x-zed-checksum", checksum)
                    .body(json_bytes.into())
                else {
                    continue;
                };

                if let Some(response) = http_client.send(request).await.log_err() {
                    // Treat any non-success status as a failure, as the other uploaders do.
                    if !response.status().is_success() {
                        log::error!("Failed to send hang report: HTTP {:?}", response.status());
                    }
                }
            }
        })
        .detach()
}
446
447fn upload_panics_and_crashes(
448    http: Arc<HttpClientWithUrl>,
449    panic_report_url: Url,
450    installation_id: Option<String>,
451    cx: &App,
452) {
453    let telemetry_settings = *client::TelemetrySettings::get_global(cx);
454    cx.background_spawn(async move {
455        let most_recent_panic =
456            upload_previous_panics(http.clone(), &panic_report_url, telemetry_settings)
457                .await
458                .log_err()
459                .flatten();
460        upload_previous_crashes(http, most_recent_panic, installation_id, telemetry_settings)
461            .await
462            .log_err()
463    })
464    .detach()
465}
466
467/// Uploads panics via `zed.dev`.
468async fn upload_previous_panics(
469    http: Arc<HttpClientWithUrl>,
470    panic_report_url: &Url,
471    telemetry_settings: client::TelemetrySettings,
472) -> anyhow::Result<Option<(i64, String)>> {
473    let mut children = smol::fs::read_dir(paths::logs_dir()).await?;
474
475    let mut most_recent_panic = None;
476
477    while let Some(child) = children.next().await {
478        let child = child?;
479        let child_path = child.path();
480
481        if child_path.extension() != Some(OsStr::new("panic")) {
482            continue;
483        }
484        let filename = if let Some(filename) = child_path.file_name() {
485            filename.to_string_lossy()
486        } else {
487            continue;
488        };
489
490        if !filename.starts_with("zed") {
491            continue;
492        }
493
494        if telemetry_settings.diagnostics {
495            let panic_file_content = smol::fs::read_to_string(&child_path)
496                .await
497                .context("error reading panic file")?;
498
499            let panic: Option<Panic> = serde_json::from_str(&panic_file_content)
500                .log_err()
501                .or_else(|| {
502                    panic_file_content
503                        .lines()
504                        .next()
505                        .and_then(|line| serde_json::from_str(line).ok())
506                })
507                .unwrap_or_else(|| {
508                    log::error!("failed to deserialize panic file {:?}", panic_file_content);
509                    None
510                });
511
512            if let Some(panic) = panic {
513                if !upload_panic(&http, &panic_report_url, panic, &mut most_recent_panic).await? {
514                    continue;
515                }
516            }
517        }
518
519        // We've done what we can, delete the file
520        std::fs::remove_file(child_path)
521            .context("error removing panic")
522            .log_err();
523    }
524    Ok(most_recent_panic)
525}
526
527async fn upload_panic(
528    http: &Arc<HttpClientWithUrl>,
529    panic_report_url: &Url,
530    panic: telemetry_events::Panic,
531    most_recent_panic: &mut Option<(i64, String)>,
532) -> Result<bool> {
533    *most_recent_panic = Some((panic.panicked_on, panic.payload.clone()));
534
535    let json_bytes = serde_json::to_vec(&PanicRequest { panic }).unwrap();
536
537    let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes) else {
538        return Ok(false);
539    };
540
541    let Ok(request) = http_client::Request::builder()
542        .method(Method::POST)
543        .uri(panic_report_url.as_ref())
544        .header("x-zed-checksum", checksum)
545        .body(json_bytes.into())
546    else {
547        return Ok(false);
548    };
549
550    let response = http.send(request).await.context("error sending panic")?;
551    if !response.status().is_success() {
552        log::error!("Error uploading panic to server: {}", response.status());
553    }
554
555    Ok(true)
556}
557const LAST_CRASH_UPLOADED: &str = "LAST_CRASH_UPLOADED";
558
559/// upload crashes from apple's diagnostic reports to our server.
560/// (only if telemetry is enabled)
561async fn upload_previous_crashes(
562    http: Arc<HttpClientWithUrl>,
563    most_recent_panic: Option<(i64, String)>,
564    installation_id: Option<String>,
565    telemetry_settings: client::TelemetrySettings,
566) -> Result<()> {
567    if !telemetry_settings.diagnostics {
568        return Ok(());
569    }
570    let last_uploaded = KEY_VALUE_STORE
571        .read_kvp(LAST_CRASH_UPLOADED)?
572        .unwrap_or("zed-2024-01-17-221900.ips".to_string()); // don't upload old crash reports from before we had this.
573    let mut uploaded = last_uploaded.clone();
574
575    let crash_report_url = http.build_zed_api_url("/telemetry/crashes", &[])?;
576
577    // Crash directories are only set on macOS.
578    for dir in [crashes_dir(), crashes_retired_dir()]
579        .iter()
580        .filter_map(|d| d.as_deref())
581    {
582        let mut children = smol::fs::read_dir(&dir).await?;
583        while let Some(child) = children.next().await {
584            let child = child?;
585            let Some(filename) = child
586                .path()
587                .file_name()
588                .map(|f| f.to_string_lossy().to_lowercase())
589            else {
590                continue;
591            };
592
593            if !filename.starts_with("zed-") || !filename.ends_with(".ips") {
594                continue;
595            }
596
597            if filename <= last_uploaded {
598                continue;
599            }
600
601            let body = smol::fs::read_to_string(&child.path())
602                .await
603                .context("error reading crash file")?;
604
605            let mut request = http_client::Request::post(&crash_report_url.to_string())
606                .follow_redirects(http_client::RedirectPolicy::FollowAll)
607                .header("Content-Type", "text/plain");
608
609            if let Some((panicked_on, payload)) = most_recent_panic.as_ref() {
610                request = request
611                    .header("x-zed-panicked-on", format!("{panicked_on}"))
612                    .header("x-zed-panic", payload)
613            }
614            if let Some(installation_id) = installation_id.as_ref() {
615                request = request.header("x-zed-installation-id", installation_id);
616            }
617
618            let request = request.body(body.into())?;
619
620            let response = http.send(request).await.context("error sending crash")?;
621            if !response.status().is_success() {
622                log::error!("Error uploading crash to server: {}", response.status());
623            }
624
625            if uploaded < filename {
626                uploaded.clone_from(&filename);
627                KEY_VALUE_STORE
628                    .write_kvp(LAST_CRASH_UPLOADED.to_string(), filename)
629                    .await?;
630            }
631        }
632    }
633
634    Ok(())
635}