reliability.rs

  1use anyhow::{Context, Result};
  2use backtrace::{self, Backtrace};
  3use chrono::Utc;
  4use client::{telemetry, TelemetrySettings};
  5use db::kvp::KEY_VALUE_STORE;
  6use gpui::{AppContext, SemanticVersion};
  7use http_client::{HttpRequestExt, Method};
  8
  9use http_client::{self, HttpClient, HttpClientWithUrl};
 10use paths::{crashes_dir, crashes_retired_dir};
 11use project::Project;
 12use release_channel::ReleaseChannel;
 13use release_channel::RELEASE_CHANNEL;
 14use settings::Settings;
 15use smol::stream::StreamExt;
 16use std::{
 17    env,
 18    ffi::OsStr,
 19    sync::{atomic::Ordering, Arc},
 20};
 21use std::{io::Write, panic, sync::atomic::AtomicU32, thread};
 22use telemetry_events::LocationData;
 23use telemetry_events::Panic;
 24use telemetry_events::PanicRequest;
 25use url::Url;
 26use util::ResultExt;
 27
 28use crate::stdout_is_a_pty;
 29static PANIC_COUNT: AtomicU32 = AtomicU32::new(0);
 30
 31pub fn init_panic_hook(
 32    app_version: SemanticVersion,
 33    system_id: Option<String>,
 34    installation_id: Option<String>,
 35    session_id: String,
 36) {
 37    let is_pty = stdout_is_a_pty();
 38
 39    panic::set_hook(Box::new(move |info| {
 40        let prior_panic_count = PANIC_COUNT.fetch_add(1, Ordering::SeqCst);
 41        if prior_panic_count > 0 {
 42            // Give the panic-ing thread time to write the panic file
 43            loop {
 44                std::thread::yield_now();
 45            }
 46        }
 47
 48        let thread = thread::current();
 49        let thread_name = thread.name().unwrap_or("<unnamed>");
 50
 51        let payload = info
 52            .payload()
 53            .downcast_ref::<&str>()
 54            .map(|s| s.to_string())
 55            .or_else(|| info.payload().downcast_ref::<String>().cloned())
 56            .unwrap_or_else(|| "Box<Any>".to_string());
 57
 58        if *release_channel::RELEASE_CHANNEL == ReleaseChannel::Dev {
 59            let location = info.location().unwrap();
 60            let backtrace = Backtrace::new();
 61            eprintln!(
 62                "Thread {:?} panicked with {:?} at {}:{}:{}\n{:?}",
 63                thread_name,
 64                payload,
 65                location.file(),
 66                location.line(),
 67                location.column(),
 68                backtrace,
 69            );
 70            std::process::exit(-1);
 71        }
 72
 73        let backtrace = Backtrace::new();
 74        let mut backtrace = backtrace
 75            .frames()
 76            .iter()
 77            .flat_map(|frame| {
 78                frame
 79                    .symbols()
 80                    .iter()
 81                    .filter_map(|frame| Some(format!("{:#}", frame.name()?)))
 82            })
 83            .collect::<Vec<_>>();
 84
 85        // Strip out leading stack frames for rust panic-handling.
 86        if let Some(ix) = backtrace
 87            .iter()
 88            .position(|name| name == "rust_begin_unwind")
 89        {
 90            backtrace.drain(0..=ix);
 91        }
 92
 93        let panic_data = telemetry_events::Panic {
 94            thread: thread_name.into(),
 95            payload,
 96            location_data: info.location().map(|location| LocationData {
 97                file: location.file().into(),
 98                line: location.line(),
 99            }),
100            app_version: app_version.to_string(),
101            release_channel: RELEASE_CHANNEL.display_name().into(),
102            os_name: telemetry::os_name(),
103            os_version: Some(telemetry::os_version()),
104            architecture: env::consts::ARCH.into(),
105            panicked_on: Utc::now().timestamp_millis(),
106            backtrace,
107            system_id: system_id.clone(),
108            installation_id: installation_id.clone(),
109            session_id: session_id.clone(),
110        };
111
112        if let Some(panic_data_json) = serde_json::to_string_pretty(&panic_data).log_err() {
113            log::error!("{}", panic_data_json);
114        }
115
116        if !is_pty {
117            if let Some(panic_data_json) = serde_json::to_string(&panic_data).log_err() {
118                let timestamp = chrono::Utc::now().format("%Y_%m_%d %H_%M_%S").to_string();
119                let panic_file_path = paths::logs_dir().join(format!("zed-{timestamp}.panic"));
120                let panic_file = std::fs::OpenOptions::new()
121                    .append(true)
122                    .create(true)
123                    .open(&panic_file_path)
124                    .log_err();
125                if let Some(mut panic_file) = panic_file {
126                    writeln!(&mut panic_file, "{panic_data_json}").log_err();
127                    panic_file.flush().log_err();
128                }
129            }
130        }
131
132        std::process::abort();
133    }));
134}
135
136pub fn init(
137    http_client: Arc<HttpClientWithUrl>,
138    system_id: Option<String>,
139    installation_id: Option<String>,
140    session_id: String,
141    cx: &mut AppContext,
142) {
143    #[cfg(target_os = "macos")]
144    monitor_main_thread_hangs(http_client.clone(), installation_id.clone(), cx);
145
146    let Some(panic_report_url) = http_client
147        .build_zed_api_url("/telemetry/panics", &[])
148        .log_err()
149    else {
150        return;
151    };
152
153    upload_panics_and_crashes(
154        http_client.clone(),
155        panic_report_url.clone(),
156        installation_id.clone(),
157        cx,
158    );
159
160    cx.observe_new_models(move |project: &mut Project, cx| {
161        let http_client = http_client.clone();
162        let panic_report_url = panic_report_url.clone();
163        let session_id = session_id.clone();
164        let installation_id = installation_id.clone();
165        let system_id = system_id.clone();
166
167        if let Some(ssh_client) = project.ssh_client() {
168            ssh_client.update(cx, |client, cx| {
169                if TelemetrySettings::get_global(cx).diagnostics {
170                    let request = client.proto_client().request(proto::GetPanicFiles {});
171                    cx.background_executor()
172                        .spawn(async move {
173                            let panic_files = request.await?;
174                            for file in panic_files.file_contents {
175                                let panic: Option<Panic> = serde_json::from_str(&file)
176                                    .log_err()
177                                    .or_else(|| {
178                                        file.lines()
179                                            .next()
180                                            .and_then(|line| serde_json::from_str(line).ok())
181                                    })
182                                    .unwrap_or_else(|| {
183                                        log::error!("failed to deserialize panic file {:?}", file);
184                                        None
185                                    });
186
187                                if let Some(mut panic) = panic {
188                                    panic.session_id = session_id.clone();
189                                    panic.system_id = system_id.clone();
190                                    panic.installation_id = installation_id.clone();
191
192                                    upload_panic(&http_client, &panic_report_url, panic, &mut None)
193                                        .await?;
194                                }
195                            }
196
197                            anyhow::Ok(())
198                        })
199                        .detach_and_log_err(cx);
200                }
201            })
202        }
203    })
204    .detach();
205}
206
/// Watches the main thread for hangs and reports them to the hang telemetry endpoint.
///
/// Does nothing unless the release channel is `Dev`, `Nightly` or `Preview`.
///
/// How it works:
/// - A foreground task drains a bounded channel while a background task pushes a tick
///   into it once per second. If a tick cannot be pushed because the channel is full,
///   the foreground task has stopped draining it, so `SIGUSR2` is sent to the thread
///   that called this function.
/// - The `SIGUSR2` handler records that thread's stack frames into `BACKTRACE` and
///   notifies a second background task through `BACKTRACE_SENDER`.
/// - If diagnostics telemetry was enabled when this function was called, that task
///   resolves symbols, logs the suspected hang and POSTs a `HangReport` to
///   `/telemetry/hangs`; otherwise it exits on the first notification.
///
/// Only the first hang is detected: the ticking task stops after its first failed send.
#[cfg(target_os = "macos")]
pub fn monitor_main_thread_hangs(
    http_client: Arc<HttpClientWithUrl>,
    installation_id: Option<String>,
    cx: &AppContext,
) {
    // This is too noisy to ship to stable for now.
    if !matches!(
        ReleaseChannel::global(cx),
        ReleaseChannel::Dev | ReleaseChannel::Nightly | ReleaseChannel::Preview
    ) {
        return;
    }

    use nix::sys::signal::{
        sigaction, SaFlags, SigAction, SigHandler, SigSet,
        Signal::{self, SIGUSR2},
    };

    use parking_lot::Mutex;

    use http_client::Method;
    use std::{
        ffi::c_int,
        sync::{mpsc, OnceLock},
        time::Duration,
    };
    use telemetry_events::{BacktraceFrame, HangReport};

    use nix::sys::pthread;

    let foreground_executor = cx.foreground_executor();
    let background_executor = cx.background_executor();
    let telemetry_settings = *client::TelemetrySettings::get_global(cx);

    // Initialize SIGUSR2 handler to send a backtrace to a channel.
    let (backtrace_tx, backtrace_rx) = mpsc::channel();
    static BACKTRACE: Mutex<Vec<backtrace::Frame>> = Mutex::new(Vec::new());
    static BACKTRACE_SENDER: OnceLock<mpsc::Sender<()>> = OnceLock::new();
    BACKTRACE_SENDER.get_or_init(|| backtrace_tx);
    // Reserve space up front: the signal handler only pushes while there is spare capacity.
    BACKTRACE.lock().reserve(100);

    /// Installs the one-shot `SIGUSR2` handler that captures a backtrace.
    fn handle_backtrace_signal() {
        unsafe {
            extern "C" fn handle_sigusr2(_i: c_int) {
                unsafe {
                    // ASYNC SIGNAL SAFETY: This lock is only accessed one other time,
                    // which can only be triggered by this signal handler. In addition,
                    // this signal handler is immediately removed by SA_RESETHAND, and this
                    // signal handler cannot be re-entrant due to the SIGUSR2 mask defined
                    // below
                    let mut bt = BACKTRACE.lock();
                    bt.clear();
                    backtrace::trace_unsynchronized(|frame| {
                        if bt.len() < bt.capacity() {
                            bt.push(frame.clone());
                            true
                        } else {
                            false
                        }
                    });
                }

                BACKTRACE_SENDER.get().unwrap().send(()).ok();
            }

            let mut mask = SigSet::empty();
            mask.add(SIGUSR2);
            sigaction(
                Signal::SIGUSR2,
                &SigAction::new(
                    SigHandler::Handler(handle_sigusr2),
                    SaFlags::SA_RESTART | SaFlags::SA_RESETHAND,
                    mask,
                ),
            )
            .log_err();
        }
    }

    handle_backtrace_signal();
    let main_thread = pthread::pthread_self();

    // Heartbeat channel: the foreground task drains it, the background task fills it.
    let (mut tx, mut rx) = futures::channel::mpsc::channel(3);
    foreground_executor
        .spawn(async move { while (rx.next().await).is_some() {} })
        .detach();

    background_executor
        .spawn({
            let background_executor = background_executor.clone();
            async move {
                loop {
                    background_executor.timer(Duration::from_secs(1)).await;
                    match tx.try_send(()) {
                        Ok(_) => continue,
                        Err(e) => {
                            // A full channel means the foreground task is not keeping up.
                            if e.into_send_error().is_full() {
                                pthread::pthread_kill(main_thread, SIGUSR2).log_err();
                            }
                            // Only detect the first hang
                            break;
                        }
                    }
                }
            }
        })
        .detach();

    let app_version = release_channel::AppVersion::global(cx);
    let os_name = client::telemetry::os_name();

    background_executor
        .clone()
        .spawn(async move {
            let os_version = client::telemetry::os_version();

            // `recv` only fails once every sender is gone (for example when this function
            // is called again: `BACKTRACE_SENDER` is already set, so the fresh sender is
            // dropped). From then on it fails forever, so end the task instead of
            // retrying in a busy loop.
            while backtrace_rx.recv().is_ok() {
                if !telemetry_settings.diagnostics {
                    return;
                }

                // ASYNC SIGNAL SAFETY: This lock is only accessed _after_
                // the backtrace transmitter has fired, which itself is only done
                // by the signal handler. And due to SA_RESETHAND the signal handler
                // will not run again until `handle_backtrace_signal` is called.
                let raw_backtrace = BACKTRACE.lock().drain(..).collect::<Vec<_>>();
                let backtrace: Vec<_> = raw_backtrace
                    .into_iter()
                    .map(|frame| {
                        let mut btf = BacktraceFrame {
                            ip: frame.ip() as usize,
                            symbol_addr: frame.symbol_address() as usize,
                            base: frame.module_base_address().map(|addr| addr as usize),
                            symbols: vec![],
                        };

                        backtrace::resolve_frame(&frame, |symbol| {
                            if let Some(name) = symbol.name() {
                                btf.symbols.push(name.to_string());
                            }
                        });

                        btf
                    })
                    .collect();

                // IMPORTANT: Don't move this to before `BACKTRACE.lock()`
                handle_backtrace_signal();

                log::error!(
                    "Suspected hang on main thread:\n{}",
                    backtrace
                        .iter()
                        .flat_map(|bt| bt.symbols.first().as_ref().map(|s| s.as_str()))
                        .collect::<Vec<_>>()
                        .join("\n")
                );

                let report = HangReport {
                    backtrace,
                    app_version: Some(app_version),
                    os_name: os_name.clone(),
                    os_version: Some(os_version.clone()),
                    architecture: env::consts::ARCH.into(),
                    installation_id: installation_id.clone(),
                };

                let Some(json_bytes) = serde_json::to_vec(&report).log_err() else {
                    continue;
                };

                let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes)
                else {
                    continue;
                };

                let Ok(url) = http_client.build_zed_api_url("/telemetry/hangs", &[]) else {
                    continue;
                };

                let Ok(request) = http_client::Request::builder()
                    .method(Method::POST)
                    .uri(url.as_ref())
                    .header("x-zed-checksum", checksum)
                    .body(json_bytes.into())
                else {
                    continue;
                };

                if let Some(response) = http_client.send(request).await.log_err() {
                    if response.status() != 200 {
                        log::error!("Failed to send hang report: HTTP {:?}", response.status());
                    }
                }
            }
        })
        .detach()
}
408
409fn upload_panics_and_crashes(
410    http: Arc<HttpClientWithUrl>,
411    panic_report_url: Url,
412    installation_id: Option<String>,
413    cx: &AppContext,
414) {
415    let telemetry_settings = *client::TelemetrySettings::get_global(cx);
416    cx.background_executor()
417        .spawn(async move {
418            let most_recent_panic =
419                upload_previous_panics(http.clone(), &panic_report_url, telemetry_settings)
420                    .await
421                    .log_err()
422                    .flatten();
423            upload_previous_crashes(http, most_recent_panic, installation_id, telemetry_settings)
424                .await
425                .log_err()
426        })
427        .detach()
428}
429
430/// Uploads panics via `zed.dev`.
431async fn upload_previous_panics(
432    http: Arc<HttpClientWithUrl>,
433    panic_report_url: &Url,
434    telemetry_settings: client::TelemetrySettings,
435) -> anyhow::Result<Option<(i64, String)>> {
436    let mut children = smol::fs::read_dir(paths::logs_dir()).await?;
437
438    let mut most_recent_panic = None;
439
440    while let Some(child) = children.next().await {
441        let child = child?;
442        let child_path = child.path();
443
444        if child_path.extension() != Some(OsStr::new("panic")) {
445            continue;
446        }
447        let filename = if let Some(filename) = child_path.file_name() {
448            filename.to_string_lossy()
449        } else {
450            continue;
451        };
452
453        if !filename.starts_with("zed") {
454            continue;
455        }
456
457        if telemetry_settings.diagnostics {
458            let panic_file_content = smol::fs::read_to_string(&child_path)
459                .await
460                .context("error reading panic file")?;
461
462            let panic: Option<Panic> = serde_json::from_str(&panic_file_content)
463                .log_err()
464                .or_else(|| {
465                    panic_file_content
466                        .lines()
467                        .next()
468                        .and_then(|line| serde_json::from_str(line).ok())
469                })
470                .unwrap_or_else(|| {
471                    log::error!("failed to deserialize panic file {:?}", panic_file_content);
472                    None
473                });
474
475            if let Some(panic) = panic {
476                if !upload_panic(&http, &panic_report_url, panic, &mut most_recent_panic).await? {
477                    continue;
478                }
479            }
480        }
481
482        // We've done what we can, delete the file
483        std::fs::remove_file(child_path)
484            .context("error removing panic")
485            .log_err();
486    }
487    Ok(most_recent_panic)
488}
489
490async fn upload_panic(
491    http: &Arc<HttpClientWithUrl>,
492    panic_report_url: &Url,
493    panic: telemetry_events::Panic,
494    most_recent_panic: &mut Option<(i64, String)>,
495) -> Result<bool> {
496    *most_recent_panic = Some((panic.panicked_on, panic.payload.clone()));
497
498    let json_bytes = serde_json::to_vec(&PanicRequest { panic }).unwrap();
499
500    let Some(checksum) = client::telemetry::calculate_json_checksum(&json_bytes) else {
501        return Ok(false);
502    };
503
504    let Ok(request) = http_client::Request::builder()
505        .method(Method::POST)
506        .uri(panic_report_url.as_ref())
507        .header("x-zed-checksum", checksum)
508        .body(json_bytes.into())
509    else {
510        return Ok(false);
511    };
512
513    let response = http.send(request).await.context("error sending panic")?;
514    if !response.status().is_success() {
515        log::error!("Error uploading panic to server: {}", response.status());
516    }
517
518    Ok(true)
519}
520const LAST_CRASH_UPLOADED: &str = "LAST_CRASH_UPLOADED";
521
522/// upload crashes from apple's diagnostic reports to our server.
523/// (only if telemetry is enabled)
524async fn upload_previous_crashes(
525    http: Arc<HttpClientWithUrl>,
526    most_recent_panic: Option<(i64, String)>,
527    installation_id: Option<String>,
528    telemetry_settings: client::TelemetrySettings,
529) -> Result<()> {
530    if !telemetry_settings.diagnostics {
531        return Ok(());
532    }
533    let last_uploaded = KEY_VALUE_STORE
534        .read_kvp(LAST_CRASH_UPLOADED)?
535        .unwrap_or("zed-2024-01-17-221900.ips".to_string()); // don't upload old crash reports from before we had this.
536    let mut uploaded = last_uploaded.clone();
537
538    let crash_report_url = http.build_zed_api_url("/telemetry/crashes", &[])?;
539
540    // Crash directories are only set on macOS.
541    for dir in [crashes_dir(), crashes_retired_dir()]
542        .iter()
543        .filter_map(|d| d.as_deref())
544    {
545        let mut children = smol::fs::read_dir(&dir).await?;
546        while let Some(child) = children.next().await {
547            let child = child?;
548            let Some(filename) = child
549                .path()
550                .file_name()
551                .map(|f| f.to_string_lossy().to_lowercase())
552            else {
553                continue;
554            };
555
556            if !filename.starts_with("zed-") || !filename.ends_with(".ips") {
557                continue;
558            }
559
560            if filename <= last_uploaded {
561                continue;
562            }
563
564            let body = smol::fs::read_to_string(&child.path())
565                .await
566                .context("error reading crash file")?;
567
568            let mut request = http_client::Request::post(&crash_report_url.to_string())
569                .follow_redirects(http_client::RedirectPolicy::FollowAll)
570                .header("Content-Type", "text/plain");
571
572            if let Some((panicked_on, payload)) = most_recent_panic.as_ref() {
573                request = request
574                    .header("x-zed-panicked-on", format!("{panicked_on}"))
575                    .header("x-zed-panic", payload)
576            }
577            if let Some(installation_id) = installation_id.as_ref() {
578                request = request.header("x-zed-installation-id", installation_id);
579            }
580
581            let request = request.body(body.into())?;
582
583            let response = http.send(request).await.context("error sending crash")?;
584            if !response.status().is_success() {
585                log::error!("Error uploading crash to server: {}", response.status());
586            }
587
588            if uploaded < filename {
589                uploaded.clone_from(&filename);
590                KEY_VALUE_STORE
591                    .write_kvp(LAST_CRASH_UPLOADED.to_string(), filename)
592                    .await?;
593            }
594        }
595    }
596
597    Ok(())
598}