reliability.rs

  1use crate::stdout_is_a_pty;
  2use anyhow::{Context as _, Result};
  3use backtrace::{self, Backtrace};
  4use chrono::Utc;
  5use client::{telemetry, TelemetrySettings};
  6use db::kvp::KEY_VALUE_STORE;
  7use gpui::{App, SemanticVersion};
  8use http_client::{self, HttpClient, HttpClientWithUrl, HttpRequestExt, Method};
  9use paths::{crashes_dir, crashes_retired_dir};
 10use project::Project;
 11use release_channel::{AppCommitSha, ReleaseChannel, RELEASE_CHANNEL};
 12use settings::Settings;
 13use smol::stream::StreamExt;
 14use std::{
 15    env,
 16    ffi::{c_void, OsStr},
 17    sync::{atomic::Ordering, Arc},
 18};
 19use std::{io::Write, panic, sync::atomic::AtomicU32, thread};
 20use telemetry_events::{LocationData, Panic, PanicRequest};
 21use url::Url;
 22use util::ResultExt;
 23
 24static PANIC_COUNT: AtomicU32 = AtomicU32::new(0);
 25
 26pub fn init_panic_hook(
 27    app_version: SemanticVersion,
 28    app_commit_sha: Option<AppCommitSha>,
 29    system_id: Option<String>,
 30    installation_id: Option<String>,
 31    session_id: String,
 32) {
 33    let is_pty = stdout_is_a_pty();
 34
 35    panic::set_hook(Box::new(move |info| {
 36        let prior_panic_count = PANIC_COUNT.fetch_add(1, Ordering::SeqCst);
 37        if prior_panic_count > 0 {
 38            // Give the panic-ing thread time to write the panic file
 39            loop {
 40                std::thread::yield_now();
 41            }
 42        }
 43
 44        let thread = thread::current();
 45        let thread_name = thread.name().unwrap_or("<unnamed>");
 46
 47        let payload = info
 48            .payload()
 49            .downcast_ref::<&str>()
 50            .map(|s| s.to_string())
 51            .or_else(|| info.payload().downcast_ref::<String>().cloned())
 52            .unwrap_or_else(|| "Box<Any>".to_string());
 53
 54        if *release_channel::RELEASE_CHANNEL == ReleaseChannel::Dev {
 55            let location = info.location().unwrap();
 56            let backtrace = Backtrace::new();
 57            eprintln!(
 58                "Thread {:?} panicked with {:?} at {}:{}:{}\n{}{:?}",
 59                thread_name,
 60                payload,
 61                location.file(),
 62                location.line(),
 63                location.column(),
 64                match app_commit_sha.as_ref() {
 65                    Some(commit_sha) => format!(
 66                        "https://github.com/zed-industries/zed/blob/{}/src/{}#L{} \
 67                        (may not be uploaded, line may be incorrect if files modified)\n",
 68                        commit_sha.0,
 69                        location.file(),
 70                        location.line()
 71                    ),
 72                    None => "".to_string(),
 73                },
 74                backtrace,
 75            );
 76            std::process::exit(-1);
 77        }
 78        let main_module_base_address = get_main_module_base_address();
 79
 80        let backtrace = Backtrace::new();
 81        let mut symbols = backtrace
 82            .frames()
 83            .iter()
 84            .flat_map(|frame| {
 85                let base = frame
 86                    .module_base_address()
 87                    .unwrap_or(main_module_base_address);
 88                frame.symbols().iter().map(move |symbol| {
 89                    format!(
 90                        "{}+{}",
 91                        symbol
 92                            .name()
 93                            .as_ref()
 94                            .map_or("<unknown>".to_owned(), <_>::to_string),
 95                        (frame.ip() as isize).saturating_sub(base as isize)
 96                    )
 97                })
 98            })
 99            .collect::<Vec<_>>();
100
101        // Strip out leading stack frames for rust panic-handling.
102        if let Some(ix) = symbols
103            .iter()
104            .position(|name| name == "rust_begin_unwind" || name == "_rust_begin_unwind")
105        {
106            symbols.drain(0..=ix);
107        }
108
109        let panic_data = telemetry_events::Panic {
110            thread: thread_name.into(),
111            payload,
112            location_data: info.location().map(|location| LocationData {
113                file: location.file().into(),
114                line: location.line(),
115            }),
116            app_version: app_version.to_string(),
117            app_commit_sha: app_commit_sha.as_ref().map(|sha| sha.0.clone()),
118            release_channel: RELEASE_CHANNEL.dev_name().into(),
119            target: env!("TARGET").to_owned().into(),
120            os_name: telemetry::os_name(),
121            os_version: Some(telemetry::os_version()),
122            architecture: env::consts::ARCH.into(),
123            panicked_on: Utc::now().timestamp_millis(),
124            backtrace: symbols,
125            system_id: system_id.clone(),
126            installation_id: installation_id.clone(),
127            session_id: session_id.clone(),
128        };
129
130        if let Some(panic_data_json) = serde_json::to_string_pretty(&panic_data).log_err() {
131            log::error!("{}", panic_data_json);
132        }
133
134        if !is_pty {
135            if let Some(panic_data_json) = serde_json::to_string(&panic_data).log_err() {
136                let timestamp = chrono::Utc::now().format("%Y_%m_%d %H_%M_%S").to_string();
137                let panic_file_path = paths::logs_dir().join(format!("zed-{timestamp}.panic"));
138                let panic_file = std::fs::OpenOptions::new()
139                    .append(true)
140                    .create(true)
141                    .open(&panic_file_path)
142                    .log_err();
143                if let Some(mut panic_file) = panic_file {
144                    writeln!(&mut panic_file, "{panic_data_json}").log_err();
145                    panic_file.flush().log_err();
146                }
147            }
148        }
149
150        std::process::abort();
151    }));
152}
153
154#[cfg(not(target_os = "windows"))]
155fn get_main_module_base_address() -> *mut c_void {
156    let mut dl_info = libc::Dl_info {
157        dli_fname: std::ptr::null(),
158        dli_fbase: std::ptr::null_mut(),
159        dli_sname: std::ptr::null(),
160        dli_saddr: std::ptr::null_mut(),
161    };
162    unsafe {
163        libc::dladdr(get_main_module_base_address as _, &mut dl_info);
164    }
165    dl_info.dli_fbase
166}
167
168#[cfg(target_os = "windows")]
169fn get_main_module_base_address() -> *mut c_void {
170    std::ptr::null_mut()
171}
172
173pub fn init(
174    http_client: Arc<HttpClientWithUrl>,
175    system_id: Option<String>,
176    installation_id: Option<String>,
177    session_id: String,
178    cx: &mut App,
179) {
180    #[cfg(target_os = "macos")]
181    monitor_main_thread_hangs(http_client.clone(), installation_id.clone(), cx);
182
183    let Some(panic_report_url) = http_client
184        .build_zed_api_url("/telemetry/panics", &[])
185        .log_err()
186    else {
187        return;
188    };
189
190    upload_panics_and_crashes(
191        http_client.clone(),
192        panic_report_url.clone(),
193        installation_id.clone(),
194        cx,
195    );
196
197    cx.observe_new(move |project: &mut Project, _, cx| {
198        let http_client = http_client.clone();
199        let panic_report_url = panic_report_url.clone();
200        let session_id = session_id.clone();
201        let installation_id = installation_id.clone();
202        let system_id = system_id.clone();
203
204        if let Some(ssh_client) = project.ssh_client() {
205            ssh_client.update(cx, |client, cx| {
206                if TelemetrySettings::get_global(cx).diagnostics {
207                    let request = client.proto_client().request(proto::GetPanicFiles {});
208                    cx.background_executor()
209                        .spawn(async move {
210                            let panic_files = request.await?;
211                            for file in panic_files.file_contents {
212                                let panic: Option<Panic> = serde_json::from_str(&file)
213                                    .log_err()
214                                    .or_else(|| {
215                                        file.lines()
216                                            .next()
217                                            .and_then(|line| serde_json::from_str(line).ok())
218                                    })
219                                    .unwrap_or_else(|| {
220                                        log::error!("failed to deserialize panic file {:?}", file);
221                                        None
222                                    });
223
224                                if let Some(mut panic) = panic {
225                                    panic.session_id = session_id.clone();
226                                    panic.system_id = system_id.clone();
227                                    panic.installation_id = installation_id.clone();
228
229                                    upload_panic(&http_client, &panic_report_url, panic, &mut None)
230                                        .await?;
231                                }
232                            }
233
234                            anyhow::Ok(())
235                        })
236                        .detach_and_log_err(cx);
237                }
238            })
239        }
240    })
241    .detach();
242}
243
/// Watches for hangs on the main thread and reports them (macOS only).
///
/// Does nothing unless the release channel is `Dev`, `Nightly` or `Preview`.
///
/// How it works:
/// 1. A foreground task drains a bounded channel; a background task pushes
///    into it once per second. When the channel is full the main thread has
///    stopped draining it, so the background task sends `SIGUSR2` to the main
///    thread and stops watching (only the first hang is detected).
/// 2. The `SIGUSR2` handler captures the main thread's backtrace into a
///    static buffer and notifies a second background task through an mpsc
///    channel.
/// 3. That task resolves symbols, logs the suspected hang, and POSTs a
///    `HangReport` to `/telemetry/hangs`. It exits without reporting when
///    diagnostics telemetry is disabled.
#[cfg(target_os = "macos")]
pub fn monitor_main_thread_hangs(
    http_client: Arc<HttpClientWithUrl>,
    installation_id: Option<String>,
    cx: &App,
) {
    // This is too noisy to ship to stable for now.
    if !matches!(
        ReleaseChannel::global(cx),
        ReleaseChannel::Dev | ReleaseChannel::Nightly | ReleaseChannel::Preview
    ) {
        return;
    }

    use nix::sys::signal::{
        sigaction, SaFlags, SigAction, SigHandler, SigSet,
        Signal::{self, SIGUSR2},
    };

    use parking_lot::Mutex;

    use http_client::Method;
    use std::{
        ffi::c_int,
        sync::{mpsc, OnceLock},
        time::Duration,
    };
    use telemetry_events::{BacktraceFrame, HangReport};

    use nix::sys::pthread;

    let foreground_executor = cx.foreground_executor();
    let background_executor = cx.background_executor();
    let telemetry_settings = *client::TelemetrySettings::get_global(cx);

    // Initialize SIGUSR2 handler to send a backtrace to a channel.
    let (backtrace_tx, backtrace_rx) = mpsc::channel();
    static BACKTRACE: Mutex<Vec<backtrace::Frame>> = Mutex::new(Vec::new());
    static BACKTRACE_SENDER: OnceLock<mpsc::Sender<()>> = OnceLock::new();
    // Only the first call installs its sender; on later calls `backtrace_tx`
    // is dropped and `backtrace_rx` reports disconnection immediately.
    BACKTRACE_SENDER.get_or_init(|| backtrace_tx);
    // Reserve up front: the signal handler only pushes while below capacity.
    BACKTRACE.lock().reserve(100);

    /// Installs the one-shot `SIGUSR2` handler that records a backtrace.
    fn handle_backtrace_signal() {
        unsafe {
            extern "C" fn handle_sigusr2(_i: c_int) {
                unsafe {
                    // ASYNC SIGNAL SAFETY: This lock is only accessed one other time,
                    // which can only be triggered by this signal handler. In addition,
                    // this signal handler is immediately removed by SA_RESETHAND, and this
                    // signal handler cannot be re-entrant due to the SIGUSR2 mask defined
                    // below
                    let mut bt = BACKTRACE.lock();
                    bt.clear();
                    backtrace::trace_unsynchronized(|frame| {
                        if bt.len() < bt.capacity() {
                            bt.push(frame.clone());
                            true
                        } else {
                            false
                        }
                    });
                }

                BACKTRACE_SENDER.get().unwrap().send(()).ok();
            }

            let mut mask = SigSet::empty();
            mask.add(SIGUSR2);
            sigaction(
                Signal::SIGUSR2,
                &SigAction::new(
                    SigHandler::Handler(handle_sigusr2),
                    SaFlags::SA_RESTART | SaFlags::SA_RESETHAND,
                    mask,
                ),
            )
            .log_err();
        }
    }

    handle_backtrace_signal();
    let main_thread = pthread::pthread_self();

    // Heartbeat channel: the foreground (main-thread) task drains it, the
    // background task below fills it.
    let (mut tx, mut rx) = futures::channel::mpsc::channel(3);
    foreground_executor
        .spawn(async move { while (rx.next().await).is_some() {} })
        .detach();

    background_executor
        .spawn({
            let background_executor = background_executor.clone();
            async move {
                loop {
                    background_executor.timer(Duration::from_secs(1)).await;
                    match tx.try_send(()) {
                        Ok(_) => continue,
                        Err(e) => {
                            // A full channel means the main thread has not
                            // drained recent heartbeats: ask it for a backtrace.
                            if e.into_send_error().is_full() {
                                pthread::pthread_kill(main_thread, SIGUSR2).log_err();
                            }
                            // Only detect the first hang
                            break;
                        }
                    }
                }
            }
        })
        .detach();

    let app_version = release_channel::AppVersion::global(cx);
    let os_name = client::telemetry::os_name();

    background_executor
        .clone()
        .spawn(async move {
            let os_version = client::telemetry::os_version();

            // `recv` only fails once every sender is gone, after which it can
            // never succeed again; end the task then instead of spinning.
            while backtrace_rx.recv().is_ok() {
                if !telemetry_settings.diagnostics {
                    return;
                }

                // ASYNC SIGNAL SAFETY: This lock is only accessed _after_
                // the backtrace transmitter has fired, which itself is only done
                // by the signal handler. And due to SA_RESETHAND the signal handler
                // will not run again until `handle_backtrace_signal` is called.
                let raw_backtrace = BACKTRACE.lock().drain(..).collect::<Vec<_>>();
                let backtrace: Vec<_> = raw_backtrace
                    .into_iter()
                    .map(|frame| {
                        let mut btf = BacktraceFrame {
                            ip: frame.ip() as usize,
                            symbol_addr: frame.symbol_address() as usize,
                            base: frame.module_base_address().map(|addr| addr as usize),
                            symbols: vec![],
                        };

                        backtrace::resolve_frame(&frame, |symbol| {
                            if let Some(name) = symbol.name() {
                                btf.symbols.push(name.to_string());
                            }
                        });

                        btf
                    })
                    .collect();

                // IMPORTANT: Don't move this to before `BACKTRACE.lock()`
                handle_backtrace_signal();

                log::error!(
                    "Suspected hang on main thread:\n{}",
                    backtrace
                        .iter()
                        .flat_map(|bt| bt.symbols.first().as_ref().map(|s| s.as_str()))
                        .collect::<Vec<_>>()
                        .join("\n")
                );

                let report = HangReport {
                    backtrace,
                    app_version: Some(app_version),
                    os_name: os_name.clone(),
                    os_version: Some(os_version.clone()),
                    architecture: env::consts::ARCH.into(),
                    installation_id: installation_id.clone(),
                };

                let Some(json_bytes) = serde_json::to_vec(&report).log_err() else {
                    continue;
                };

                let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes)
                else {
                    continue;
                };

                let Ok(url) = http_client.build_zed_api_url("/telemetry/hangs", &[]) else {
                    continue;
                };

                let Ok(request) = http_client::Request::builder()
                    .method(Method::POST)
                    .uri(url.as_ref())
                    .header("x-zed-checksum", checksum)
                    .body(json_bytes.into())
                else {
                    continue;
                };

                if let Some(response) = http_client.send(request).await.log_err() {
                    if response.status() != 200 {
                        log::error!("Failed to send hang report: HTTP {:?}", response.status());
                    }
                }
            }
        })
        .detach()
}
445
446fn upload_panics_and_crashes(
447    http: Arc<HttpClientWithUrl>,
448    panic_report_url: Url,
449    installation_id: Option<String>,
450    cx: &App,
451) {
452    let telemetry_settings = *client::TelemetrySettings::get_global(cx);
453    cx.background_executor()
454        .spawn(async move {
455            let most_recent_panic =
456                upload_previous_panics(http.clone(), &panic_report_url, telemetry_settings)
457                    .await
458                    .log_err()
459                    .flatten();
460            upload_previous_crashes(http, most_recent_panic, installation_id, telemetry_settings)
461                .await
462                .log_err()
463        })
464        .detach()
465}
466
467/// Uploads panics via `zed.dev`.
468async fn upload_previous_panics(
469    http: Arc<HttpClientWithUrl>,
470    panic_report_url: &Url,
471    telemetry_settings: client::TelemetrySettings,
472) -> anyhow::Result<Option<(i64, String)>> {
473    let mut children = smol::fs::read_dir(paths::logs_dir()).await?;
474
475    let mut most_recent_panic = None;
476
477    while let Some(child) = children.next().await {
478        let child = child?;
479        let child_path = child.path();
480
481        if child_path.extension() != Some(OsStr::new("panic")) {
482            continue;
483        }
484        let filename = if let Some(filename) = child_path.file_name() {
485            filename.to_string_lossy()
486        } else {
487            continue;
488        };
489
490        if !filename.starts_with("zed") {
491            continue;
492        }
493
494        if telemetry_settings.diagnostics {
495            let panic_file_content = smol::fs::read_to_string(&child_path)
496                .await
497                .context("error reading panic file")?;
498
499            let panic: Option<Panic> = serde_json::from_str(&panic_file_content)
500                .log_err()
501                .or_else(|| {
502                    panic_file_content
503                        .lines()
504                        .next()
505                        .and_then(|line| serde_json::from_str(line).ok())
506                })
507                .unwrap_or_else(|| {
508                    log::error!("failed to deserialize panic file {:?}", panic_file_content);
509                    None
510                });
511
512            if let Some(panic) = panic {
513                if !upload_panic(&http, &panic_report_url, panic, &mut most_recent_panic).await? {
514                    continue;
515                }
516            }
517        }
518
519        // We've done what we can, delete the file
520        std::fs::remove_file(child_path)
521            .context("error removing panic")
522            .log_err();
523    }
524    Ok(most_recent_panic)
525}
526
527async fn upload_panic(
528    http: &Arc<HttpClientWithUrl>,
529    panic_report_url: &Url,
530    panic: telemetry_events::Panic,
531    most_recent_panic: &mut Option<(i64, String)>,
532) -> Result<bool> {
533    *most_recent_panic = Some((panic.panicked_on, panic.payload.clone()));
534
535    let json_bytes = serde_json::to_vec(&PanicRequest { panic }).unwrap();
536
537    let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes) else {
538        return Ok(false);
539    };
540
541    let Ok(request) = http_client::Request::builder()
542        .method(Method::POST)
543        .uri(panic_report_url.as_ref())
544        .header("x-zed-checksum", checksum)
545        .body(json_bytes.into())
546    else {
547        return Ok(false);
548    };
549
550    let response = http.send(request).await.context("error sending panic")?;
551    if !response.status().is_success() {
552        log::error!("Error uploading panic to server: {}", response.status());
553    }
554
555    Ok(true)
556}
557const LAST_CRASH_UPLOADED: &str = "LAST_CRASH_UPLOADED";
558
559/// upload crashes from apple's diagnostic reports to our server.
560/// (only if telemetry is enabled)
561async fn upload_previous_crashes(
562    http: Arc<HttpClientWithUrl>,
563    most_recent_panic: Option<(i64, String)>,
564    installation_id: Option<String>,
565    telemetry_settings: client::TelemetrySettings,
566) -> Result<()> {
567    if !telemetry_settings.diagnostics {
568        return Ok(());
569    }
570    let last_uploaded = KEY_VALUE_STORE
571        .read_kvp(LAST_CRASH_UPLOADED)?
572        .unwrap_or("zed-2024-01-17-221900.ips".to_string()); // don't upload old crash reports from before we had this.
573    let mut uploaded = last_uploaded.clone();
574
575    let crash_report_url = http.build_zed_api_url("/telemetry/crashes", &[])?;
576
577    // Crash directories are only set on macOS.
578    for dir in [crashes_dir(), crashes_retired_dir()]
579        .iter()
580        .filter_map(|d| d.as_deref())
581    {
582        let mut children = smol::fs::read_dir(&dir).await?;
583        while let Some(child) = children.next().await {
584            let child = child?;
585            let Some(filename) = child
586                .path()
587                .file_name()
588                .map(|f| f.to_string_lossy().to_lowercase())
589            else {
590                continue;
591            };
592
593            if !filename.starts_with("zed-") || !filename.ends_with(".ips") {
594                continue;
595            }
596
597            if filename <= last_uploaded {
598                continue;
599            }
600
601            let body = smol::fs::read_to_string(&child.path())
602                .await
603                .context("error reading crash file")?;
604
605            let mut request = http_client::Request::post(&crash_report_url.to_string())
606                .follow_redirects(http_client::RedirectPolicy::FollowAll)
607                .header("Content-Type", "text/plain");
608
609            if let Some((panicked_on, payload)) = most_recent_panic.as_ref() {
610                request = request
611                    .header("x-zed-panicked-on", format!("{panicked_on}"))
612                    .header("x-zed-panic", payload)
613            }
614            if let Some(installation_id) = installation_id.as_ref() {
615                request = request.header("x-zed-installation-id", installation_id);
616            }
617
618            let request = request.body(body.into())?;
619
620            let response = http.send(request).await.context("error sending crash")?;
621            if !response.status().is_success() {
622                log::error!("Error uploading crash to server: {}", response.status());
623            }
624
625            if uploaded < filename {
626                uploaded.clone_from(&filename);
627                KEY_VALUE_STORE
628                    .write_kvp(LAST_CRASH_UPLOADED.to_string(), filename)
629                    .await?;
630            }
631        }
632    }
633
634    Ok(())
635}