reliability.rs

  1use crate::stdout_is_a_pty;
  2use anyhow::{Context as _, Result};
  3use backtrace::{self, Backtrace};
  4use chrono::Utc;
  5use client::{
  6    TelemetrySettings,
  7    telemetry::{self, MINIDUMP_ENDPOINT},
  8};
  9use db::kvp::KEY_VALUE_STORE;
 10use futures::AsyncReadExt;
 11use gpui::{App, AppContext as _, SemanticVersion};
 12use http_client::{self, HttpClient, HttpClientWithUrl, HttpRequestExt, Method};
 13use paths::{crashes_dir, crashes_retired_dir};
 14use project::Project;
 15use proto::{CrashReport, GetCrashFilesResponse};
 16use release_channel::{AppCommitSha, RELEASE_CHANNEL, ReleaseChannel};
 17use reqwest::multipart::{Form, Part};
 18use settings::Settings;
 19use smol::stream::StreamExt;
 20use std::{
 21    env,
 22    ffi::{OsStr, c_void},
 23    fs,
 24    io::Write,
 25    panic,
 26    sync::{
 27        Arc,
 28        atomic::{AtomicU32, Ordering},
 29    },
 30    thread,
 31};
 32use telemetry_events::{LocationData, Panic, PanicRequest};
 33use url::Url;
 34use util::ResultExt;
 35
 36static PANIC_COUNT: AtomicU32 = AtomicU32::new(0);
 37
 38pub fn init_panic_hook(
 39    app_version: SemanticVersion,
 40    app_commit_sha: Option<AppCommitSha>,
 41    system_id: Option<String>,
 42    installation_id: Option<String>,
 43    session_id: String,
 44) {
 45    let is_pty = stdout_is_a_pty();
 46
 47    panic::set_hook(Box::new(move |info| {
 48        let prior_panic_count = PANIC_COUNT.fetch_add(1, Ordering::SeqCst);
 49        if prior_panic_count > 0 {
 50            // Give the panic-ing thread time to write the panic file
 51            loop {
 52                thread::yield_now();
 53            }
 54        }
 55
 56        let payload = info
 57            .payload()
 58            .downcast_ref::<&str>()
 59            .map(|s| s.to_string())
 60            .or_else(|| info.payload().downcast_ref::<String>().cloned())
 61            .unwrap_or_else(|| "Box<Any>".to_string());
 62
 63        crashes::handle_panic(payload.clone(), info.location());
 64
 65        let thread = thread::current();
 66        let thread_name = thread.name().unwrap_or("<unnamed>");
 67
 68        if *release_channel::RELEASE_CHANNEL == ReleaseChannel::Dev {
 69            let location = info.location().unwrap();
 70            let backtrace = Backtrace::new();
 71            eprintln!(
 72                "Thread {:?} panicked with {:?} at {}:{}:{}\n{}{:?}",
 73                thread_name,
 74                payload,
 75                location.file(),
 76                location.line(),
 77                location.column(),
 78                match app_commit_sha.as_ref() {
 79                    Some(commit_sha) => format!(
 80                        "https://github.com/zed-industries/zed/blob/{}/{}#L{} \
 81                        (may not be uploaded, line may be incorrect if files modified)\n",
 82                        commit_sha.full(),
 83                        location.file(),
 84                        location.line()
 85                    ),
 86                    None => "".to_string(),
 87                },
 88                backtrace,
 89            );
 90            std::process::exit(-1);
 91        }
 92        let main_module_base_address = get_main_module_base_address();
 93
 94        let backtrace = Backtrace::new();
 95        let mut symbols = backtrace
 96            .frames()
 97            .iter()
 98            .flat_map(|frame| {
 99                let base = frame
100                    .module_base_address()
101                    .unwrap_or(main_module_base_address);
102                frame.symbols().iter().map(move |symbol| {
103                    format!(
104                        "{}+{}",
105                        symbol
106                            .name()
107                            .as_ref()
108                            .map_or("<unknown>".to_owned(), <_>::to_string),
109                        (frame.ip() as isize).saturating_sub(base as isize)
110                    )
111                })
112            })
113            .collect::<Vec<_>>();
114
115        // Strip out leading stack frames for rust panic-handling.
116        if let Some(ix) = symbols
117            .iter()
118            .position(|name| name == "rust_begin_unwind" || name == "_rust_begin_unwind")
119        {
120            symbols.drain(0..=ix);
121        }
122
123        let panic_data = telemetry_events::Panic {
124            thread: thread_name.into(),
125            payload,
126            location_data: info.location().map(|location| LocationData {
127                file: location.file().into(),
128                line: location.line(),
129            }),
130            app_version: app_version.to_string(),
131            app_commit_sha: app_commit_sha.as_ref().map(|sha| sha.full()),
132            release_channel: RELEASE_CHANNEL.dev_name().into(),
133            target: env!("TARGET").to_owned().into(),
134            os_name: telemetry::os_name(),
135            os_version: Some(telemetry::os_version()),
136            architecture: env::consts::ARCH.into(),
137            panicked_on: Utc::now().timestamp_millis(),
138            backtrace: symbols,
139            system_id: system_id.clone(),
140            installation_id: installation_id.clone(),
141            session_id: session_id.clone(),
142        };
143
144        if let Some(panic_data_json) = serde_json::to_string_pretty(&panic_data).log_err() {
145            log::error!("{}", panic_data_json);
146        }
147        zlog::flush();
148
149        if !is_pty {
150            if let Some(panic_data_json) = serde_json::to_string(&panic_data).log_err() {
151                let timestamp = chrono::Utc::now().format("%Y_%m_%d %H_%M_%S").to_string();
152                let panic_file_path = paths::logs_dir().join(format!("zed-{timestamp}.panic"));
153                let panic_file = fs::OpenOptions::new()
154                    .write(true)
155                    .create_new(true)
156                    .open(&panic_file_path)
157                    .log_err();
158                if let Some(mut panic_file) = panic_file {
159                    writeln!(&mut panic_file, "{panic_data_json}").log_err();
160                    panic_file.flush().log_err();
161                }
162            }
163        }
164
165        std::process::abort();
166    }));
167}
168
169#[cfg(not(target_os = "windows"))]
170fn get_main_module_base_address() -> *mut c_void {
171    let mut dl_info = libc::Dl_info {
172        dli_fname: std::ptr::null(),
173        dli_fbase: std::ptr::null_mut(),
174        dli_sname: std::ptr::null(),
175        dli_saddr: std::ptr::null_mut(),
176    };
177    unsafe {
178        libc::dladdr(get_main_module_base_address as _, &mut dl_info);
179    }
180    dl_info.dli_fbase
181}
182
/// Windows variant: no base address is looked up, so this always yields a null pointer.
#[cfg(target_os = "windows")]
fn get_main_module_base_address() -> *mut c_void {
    std::ptr::null_mut::<c_void>()
}
187
188pub fn init(
189    http_client: Arc<HttpClientWithUrl>,
190    system_id: Option<String>,
191    installation_id: Option<String>,
192    session_id: String,
193    cx: &mut App,
194) {
195    #[cfg(target_os = "macos")]
196    monitor_main_thread_hangs(http_client.clone(), installation_id.clone(), cx);
197
198    let Some(panic_report_url) = http_client
199        .build_zed_api_url("/telemetry/panics", &[])
200        .log_err()
201    else {
202        return;
203    };
204
205    upload_panics_and_crashes(
206        http_client.clone(),
207        panic_report_url.clone(),
208        installation_id.clone(),
209        cx,
210    );
211
212    cx.observe_new(move |project: &mut Project, _, cx| {
213        let http_client = http_client.clone();
214        let panic_report_url = panic_report_url.clone();
215        let session_id = session_id.clone();
216        let installation_id = installation_id.clone();
217        let system_id = system_id.clone();
218
219        let Some(ssh_client) = project.ssh_client() else {
220            return;
221        };
222        ssh_client.update(cx, |client, cx| {
223            if !TelemetrySettings::get_global(cx).diagnostics {
224                return;
225            }
226            let request = client.proto_client().request(proto::GetCrashFiles {});
227            cx.background_spawn(async move {
228                let GetCrashFilesResponse {
229                    legacy_panics,
230                    crashes,
231                } = request.await?;
232
233                for panic in legacy_panics {
234                    if let Some(mut panic) = serde_json::from_str::<Panic>(&panic).log_err() {
235                        panic.session_id = session_id.clone();
236                        panic.system_id = system_id.clone();
237                        panic.installation_id = installation_id.clone();
238                        upload_panic(&http_client, &panic_report_url, panic, &mut None).await?;
239                    }
240                }
241
242                let Some(endpoint) = MINIDUMP_ENDPOINT.as_ref() else {
243                    return Ok(());
244                };
245                for CrashReport {
246                    metadata,
247                    minidump_contents,
248                } in crashes
249                {
250                    if let Some(metadata) = serde_json::from_str(&metadata).log_err() {
251                        upload_minidump(
252                            http_client.clone(),
253                            endpoint,
254                            minidump_contents,
255                            &metadata,
256                        )
257                        .await
258                        .log_err();
259                    }
260                }
261
262                anyhow::Ok(())
263            })
264            .detach_and_log_err(cx);
265        })
266    })
267    .detach();
268}
269
/// Watches the calling thread for hangs and reports a backtrace when one is suspected.
///
/// Does nothing outside the `Dev`, `Nightly` and `Preview` release channels.
///
/// How it works:
/// - A foreground task drains a bounded channel; a background task tries to push into that
///   channel once per second. If the channel is full, the foreground executor has stopped
///   making progress, so `SIGUSR2` is sent to the thread that called this function.
/// - The `SIGUSR2` handler records the interrupted thread's frames into a preallocated
///   buffer and pings a second channel.
/// - A background task waits on that second channel, symbolizes the frames, re-installs the
///   signal handler, logs the backtrace and POSTs a `HangReport` to `/telemetry/hangs`.
///   It stops without reporting if the `diagnostics` telemetry setting (read once, when this
///   function is called) is disabled.
#[cfg(target_os = "macos")]
pub fn monitor_main_thread_hangs(
    http_client: Arc<HttpClientWithUrl>,
    installation_id: Option<String>,
    cx: &App,
) {
    // This is too noisy to ship to stable for now.
    if !matches!(
        ReleaseChannel::global(cx),
        ReleaseChannel::Dev | ReleaseChannel::Nightly | ReleaseChannel::Preview
    ) {
        return;
    }

    use nix::sys::signal::{
        SaFlags, SigAction, SigHandler, SigSet,
        Signal::{self, SIGUSR2},
        sigaction,
    };

    use parking_lot::Mutex;

    use http_client::Method;
    use std::{
        ffi::c_int,
        sync::{OnceLock, mpsc},
        time::Duration,
    };
    use telemetry_events::{BacktraceFrame, HangReport};

    use nix::sys::pthread;

    let foreground_executor = cx.foreground_executor();
    let background_executor = cx.background_executor();
    let telemetry_settings = *client::TelemetrySettings::get_global(cx);

    // Initialize SIGUSR2 handler to send a backtrace to a channel.
    let (backtrace_tx, backtrace_rx) = mpsc::channel();
    static BACKTRACE: Mutex<Vec<backtrace::Frame>> = Mutex::new(Vec::new());
    static BACKTRACE_SENDER: OnceLock<mpsc::Sender<()>> = OnceLock::new();
    BACKTRACE_SENDER.get_or_init(|| backtrace_tx);
    // Reserve up front: the signal handler only pushes while spare capacity remains.
    BACKTRACE.lock().reserve(100);

    fn handle_backtrace_signal() {
        unsafe {
            extern "C" fn handle_sigusr2(_i: c_int) {
                unsafe {
                    // ASYNC SIGNAL SAFETY: This lock is only accessed one other time,
                    // which can only be triggered by this signal handler. In addition,
                    // this signal handler is immediately removed by SA_RESETHAND, and this
                    // signal handler cannot be re-entrant due to the SIGUSR2 mask defined
                    // below
                    let mut bt = BACKTRACE.lock();
                    bt.clear();
                    backtrace::trace_unsynchronized(|frame| {
                        if bt.len() < bt.capacity() {
                            bt.push(frame.clone());
                            true
                        } else {
                            false
                        }
                    });
                }

                BACKTRACE_SENDER.get().unwrap().send(()).ok();
            }

            let mut mask = SigSet::empty();
            mask.add(SIGUSR2);
            sigaction(
                Signal::SIGUSR2,
                &SigAction::new(
                    SigHandler::Handler(handle_sigusr2),
                    SaFlags::SA_RESTART | SaFlags::SA_RESETHAND,
                    mask,
                ),
            )
            .log_err();
        }
    }

    handle_backtrace_signal();
    let main_thread = pthread::pthread_self();

    // Heartbeat channel: the foreground task drains it, the background task fills it.
    let (mut tx, mut rx) = futures::channel::mpsc::channel(3);
    foreground_executor
        .spawn(async move { while (rx.next().await).is_some() {} })
        .detach();

    background_executor
        .spawn({
            let background_executor = background_executor.clone();
            async move {
                loop {
                    background_executor.timer(Duration::from_secs(1)).await;
                    match tx.try_send(()) {
                        Ok(_) => continue,
                        Err(e) => {
                            // A full channel means the foreground task has not drained it.
                            if e.into_send_error().is_full() {
                                pthread::pthread_kill(main_thread, SIGUSR2).log_err();
                            }
                            // Only detect the first hang
                            break;
                        }
                    }
                }
            }
        })
        .detach();

    let app_version = release_channel::AppVersion::global(cx);
    let os_name = client::telemetry::os_name();

    background_executor
        .clone()
        .spawn(async move {
            let os_version = client::telemetry::os_version();

            // Ends when the channel disconnects. The sender normally lives in a static, but
            // if an earlier call already populated `BACKTRACE_SENDER`, this call's sender was
            // dropped and `recv` fails immediately; retrying in an outer loop would busy-spin
            // forever.
            while backtrace_rx.recv().is_ok() {
                if !telemetry_settings.diagnostics {
                    return;
                }

                // ASYNC SIGNAL SAFETY: This lock is only accessed _after_
                // the backtrace transmitter has fired, which itself is only done
                // by the signal handler. And due to SA_RESETHAND the signal handler
                // will not run again until `handle_backtrace_signal` is called.
                let raw_backtrace = BACKTRACE.lock().drain(..).collect::<Vec<_>>();
                let backtrace: Vec<_> = raw_backtrace
                    .into_iter()
                    .map(|frame| {
                        let mut btf = BacktraceFrame {
                            ip: frame.ip() as usize,
                            symbol_addr: frame.symbol_address() as usize,
                            base: frame.module_base_address().map(|addr| addr as usize),
                            symbols: vec![],
                        };

                        backtrace::resolve_frame(&frame, |symbol| {
                            if let Some(name) = symbol.name() {
                                btf.symbols.push(name.to_string());
                            }
                        });

                        btf
                    })
                    .collect();

                // IMPORTANT: Don't move this to before `BACKTRACE.lock()`
                handle_backtrace_signal();

                log::error!(
                    "Suspected hang on main thread:\n{}",
                    backtrace
                        .iter()
                        .flat_map(|bt| bt.symbols.first().as_ref().map(|s| s.as_str()))
                        .collect::<Vec<_>>()
                        .join("\n")
                );

                let report = HangReport {
                    backtrace,
                    app_version: Some(app_version),
                    os_name: os_name.clone(),
                    os_version: Some(os_version.clone()),
                    architecture: env::consts::ARCH.into(),
                    installation_id: installation_id.clone(),
                };

                let Some(json_bytes) = serde_json::to_vec(&report).log_err() else {
                    continue;
                };

                let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes)
                else {
                    continue;
                };

                let Ok(url) = http_client.build_zed_api_url("/telemetry/hangs", &[]) else {
                    continue;
                };

                let Ok(request) = http_client::Request::builder()
                    .method(Method::POST)
                    .uri(url.as_ref())
                    .header("x-zed-checksum", checksum)
                    .body(json_bytes.into())
                else {
                    continue;
                };

                if let Some(response) = http_client.send(request).await.log_err() {
                    if response.status() != 200 {
                        log::error!("Failed to send hang report: HTTP {:?}", response.status());
                    }
                }
            }
        })
        .detach()
}
472
473fn upload_panics_and_crashes(
474    http: Arc<HttpClientWithUrl>,
475    panic_report_url: Url,
476    installation_id: Option<String>,
477    cx: &App,
478) {
479    if !client::TelemetrySettings::get_global(cx).diagnostics {
480        return;
481    }
482    cx.background_spawn(async move {
483        upload_previous_minidumps(http.clone()).await.warn_on_err();
484        let most_recent_panic = upload_previous_panics(http.clone(), &panic_report_url)
485            .await
486            .log_err()
487            .flatten();
488        upload_previous_crashes(http, most_recent_panic, installation_id)
489            .await
490            .log_err();
491    })
492    .detach()
493}
494
495/// Uploads panics via `zed.dev`.
496async fn upload_previous_panics(
497    http: Arc<HttpClientWithUrl>,
498    panic_report_url: &Url,
499) -> anyhow::Result<Option<(i64, String)>> {
500    let mut children = smol::fs::read_dir(paths::logs_dir()).await?;
501
502    let mut most_recent_panic = None;
503
504    while let Some(child) = children.next().await {
505        let child = child?;
506        let child_path = child.path();
507
508        if child_path.extension() != Some(OsStr::new("panic")) {
509            continue;
510        }
511        let filename = if let Some(filename) = child_path.file_name() {
512            filename.to_string_lossy()
513        } else {
514            continue;
515        };
516
517        if !filename.starts_with("zed") {
518            continue;
519        }
520
521        let panic_file_content = smol::fs::read_to_string(&child_path)
522            .await
523            .context("error reading panic file")?;
524
525        let panic: Option<Panic> = serde_json::from_str(&panic_file_content)
526            .log_err()
527            .or_else(|| {
528                panic_file_content
529                    .lines()
530                    .next()
531                    .and_then(|line| serde_json::from_str(line).ok())
532            })
533            .unwrap_or_else(|| {
534                log::error!("failed to deserialize panic file {:?}", panic_file_content);
535                None
536            });
537
538        if let Some(panic) = panic
539            && upload_panic(&http, &panic_report_url, panic, &mut most_recent_panic).await?
540        {
541            // We've done what we can, delete the file
542            fs::remove_file(child_path)
543                .context("error removing panic")
544                .log_err();
545        }
546    }
547
548    Ok(most_recent_panic)
549}
550
551pub async fn upload_previous_minidumps(http: Arc<HttpClientWithUrl>) -> anyhow::Result<()> {
552    let Some(minidump_endpoint) = MINIDUMP_ENDPOINT.as_ref() else {
553        log::warn!("Minidump endpoint not set");
554        return Ok(());
555    };
556
557    let mut children = smol::fs::read_dir(paths::logs_dir()).await?;
558    while let Some(child) = children.next().await {
559        let child = child?;
560        let child_path = child.path();
561        if child_path.extension() != Some(OsStr::new("dmp")) {
562            continue;
563        }
564        let mut json_path = child_path.clone();
565        json_path.set_extension("json");
566        if let Ok(metadata) = serde_json::from_slice(&smol::fs::read(&json_path).await?) {
567            if upload_minidump(
568                http.clone(),
569                &minidump_endpoint,
570                smol::fs::read(&child_path)
571                    .await
572                    .context("Failed to read minidump")?,
573                &metadata,
574            )
575            .await
576            .log_err()
577            .is_some()
578            {
579                fs::remove_file(child_path).ok();
580                fs::remove_file(json_path).ok();
581            }
582        }
583    }
584    Ok(())
585}
586
587async fn upload_minidump(
588    http: Arc<HttpClientWithUrl>,
589    endpoint: &str,
590    minidump: Vec<u8>,
591    metadata: &crashes::CrashInfo,
592) -> Result<()> {
593    let mut form = Form::new()
594        .part(
595            "upload_file_minidump",
596            Part::bytes(minidump)
597                .file_name("minidump.dmp")
598                .mime_str("application/octet-stream")?,
599        )
600        .text(
601            "sentry[tags][channel]",
602            metadata.init.release_channel.clone(),
603        )
604        .text("sentry[tags][version]", metadata.init.zed_version.clone())
605        .text("sentry[release]", metadata.init.commit_sha.clone())
606        .text("platform", "rust");
607    if let Some(panic_info) = metadata.panic.as_ref() {
608        form = form.text("sentry[logentry][formatted]", panic_info.message.clone());
609        form = form.text("span", panic_info.span.clone());
610        // TODO: add gpu-context, feature-flag-context, and more of device-context like gpu
611        // name, screen resolution, available ram, device model, etc
612    }
613
614    let mut response_text = String::new();
615    let mut response = http.send_multipart_form(endpoint, form).await?;
616    response
617        .body_mut()
618        .read_to_string(&mut response_text)
619        .await?;
620    if !response.status().is_success() {
621        anyhow::bail!("failed to upload minidump: {response_text}");
622    }
623    log::info!("Uploaded minidump. event id: {response_text}");
624    Ok(())
625}
626
627async fn upload_panic(
628    http: &Arc<HttpClientWithUrl>,
629    panic_report_url: &Url,
630    panic: telemetry_events::Panic,
631    most_recent_panic: &mut Option<(i64, String)>,
632) -> Result<bool> {
633    *most_recent_panic = Some((panic.panicked_on, panic.payload.clone()));
634
635    let json_bytes = serde_json::to_vec(&PanicRequest { panic }).unwrap();
636
637    let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes) else {
638        return Ok(false);
639    };
640
641    let Ok(request) = http_client::Request::builder()
642        .method(Method::POST)
643        .uri(panic_report_url.as_ref())
644        .header("x-zed-checksum", checksum)
645        .body(json_bytes.into())
646    else {
647        return Ok(false);
648    };
649
650    let response = http.send(request).await.context("error sending panic")?;
651    if !response.status().is_success() {
652        log::error!("Error uploading panic to server: {}", response.status());
653    }
654
655    Ok(true)
656}
657const LAST_CRASH_UPLOADED: &str = "LAST_CRASH_UPLOADED";
658
659/// upload crashes from apple's diagnostic reports to our server.
660/// (only if telemetry is enabled)
661async fn upload_previous_crashes(
662    http: Arc<HttpClientWithUrl>,
663    most_recent_panic: Option<(i64, String)>,
664    installation_id: Option<String>,
665) -> Result<()> {
666    let last_uploaded = KEY_VALUE_STORE
667        .read_kvp(LAST_CRASH_UPLOADED)?
668        .unwrap_or("zed-2024-01-17-221900.ips".to_string()); // don't upload old crash reports from before we had this.
669    let mut uploaded = last_uploaded.clone();
670
671    let crash_report_url = http.build_zed_api_url("/telemetry/crashes", &[])?;
672
673    // Crash directories are only set on macOS.
674    for dir in [crashes_dir(), crashes_retired_dir()]
675        .iter()
676        .filter_map(|d| d.as_deref())
677    {
678        let mut children = smol::fs::read_dir(&dir).await?;
679        while let Some(child) = children.next().await {
680            let child = child?;
681            let Some(filename) = child
682                .path()
683                .file_name()
684                .map(|f| f.to_string_lossy().to_lowercase())
685            else {
686                continue;
687            };
688
689            if !filename.starts_with("zed-") || !filename.ends_with(".ips") {
690                continue;
691            }
692
693            if filename <= last_uploaded {
694                continue;
695            }
696
697            let body = smol::fs::read_to_string(&child.path())
698                .await
699                .context("error reading crash file")?;
700
701            let mut request = http_client::Request::post(&crash_report_url.to_string())
702                .follow_redirects(http_client::RedirectPolicy::FollowAll)
703                .header("Content-Type", "text/plain");
704
705            if let Some((panicked_on, payload)) = most_recent_panic.as_ref() {
706                request = request
707                    .header("x-zed-panicked-on", format!("{panicked_on}"))
708                    .header("x-zed-panic", payload)
709            }
710            if let Some(installation_id) = installation_id.as_ref() {
711                request = request.header("x-zed-installation-id", installation_id);
712            }
713
714            let request = request.body(body.into())?;
715
716            let response = http.send(request).await.context("error sending crash")?;
717            if !response.status().is_success() {
718                log::error!("Error uploading crash to server: {}", response.status());
719            }
720
721            if uploaded < filename {
722                uploaded.clone_from(&filename);
723                KEY_VALUE_STORE
724                    .write_kvp(LAST_CRASH_UPLOADED.to_string(), filename)
725                    .await?;
726            }
727        }
728    }
729
730    Ok(())
731}